Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 1
-rw-r--r-- | block/Kconfig.iosched | 6
-rw-r--r-- | block/Makefile | 2
-rw-r--r-- | block/bfq-cgroup.c | 480
-rw-r--r-- | block/bfq-iosched.c | 2580
-rw-r--r-- | block/bfq-sched.c | 439
-rw-r--r-- | block/bfq.h | 708
-rw-r--r-- | block/bio.c | 11
-rw-r--r-- | block/blk-core.c | 31
-rw-r--r-- | block/blk-flush.c | 11
-rw-r--r-- | block/blk-lib.c | 184
-rw-r--r-- | block/blk-mq-sysfs.c | 47
-rw-r--r-- | block/blk-mq-tag.c | 17
-rw-r--r-- | block/blk-mq.c | 70
-rw-r--r-- | block/blk-mq.h | 3
-rw-r--r-- | block/blk-settings.c | 59
-rw-r--r-- | block/blk-stat.c | 185
-rw-r--r-- | block/blk-stat.h | 17
-rw-r--r-- | block/blk-sysfs.c | 184
-rw-r--r-- | block/blk-throttle.c | 5
-rw-r--r-- | block/cfq-iosched.c | 12
-rw-r--r-- | block/genhd.c | 1
-rw-r--r-- | block/ioctl.c | 33
-rw-r--r-- | block/ioprio.c | 2
-rw-r--r-- | block/partitions/efi.c | 4
-rw-r--r-- | block/partitions/ldm.c | 60
26 files changed, 3147 insertions, 2005 deletions
diff --git a/block/Kconfig b/block/Kconfig index 0363cd731..d4c2ff4b9 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -4,6 +4,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select WBT help Provide block layer support for the kernel. diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 1fc1a4dc5..6d9257924 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -41,7 +41,7 @@ config CFQ_GROUP_IOSCHED config IOSCHED_BFQ tristate "BFQ I/O scheduler" - default y + default n ---help--- The BFQ I/O scheduler tries to distribute bandwidth among all processes according to their weights. @@ -53,14 +53,14 @@ config IOSCHED_BFQ config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" - depends on CGROUPS && IOSCHED_BFQ=y + depends on IOSCHED_BFQ && BLK_CGROUP default n ---help--- Enable hierarchical scheduling in BFQ, using the blkio controller. choice prompt "Default I/O scheduler" - default DEFAULT_BFQ + default DEFAULT_CFQ help Select the I/O scheduler which will be used by default for all block devices. diff --git a/block/Makefile b/block/Makefile index 37afa486e..3f2bee907 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ uuid.o genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 5ee99ecbd..c83d90c99 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -162,7 +162,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) { struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); - BUG_ON(!pd); return pd_to_bfqg(pd); } @@ -224,14 +223,6 @@ static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) blkg_rwstat_add(&bfqg->stats.merged, rw, 1); } -static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); -} - static void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -248,17 +239,11 @@ static void bfqg_stats_update_completion(struct bfq_group *bfqg, /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { - if (!stats) - return; - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); blkg_stat_reset(&stats->time); - blkg_stat_reset(&stats->unaccounted_time); blkg_stat_reset(&stats->avg_queue_size_sum); blkg_stat_reset(&stats->avg_queue_size_samples); blkg_stat_reset(&stats->dequeue); @@ -268,21 +253,19 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) } /* @to += @from */ -static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) +static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) { if (!to || !from) return; /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); - 
blkg_rwstat_add_aux(&to->serviced, &from->serviced); blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); blkg_stat_add_aux(&from->time, &from->time); - blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->avg_queue_size_samples, + &from->avg_queue_size_samples); blkg_stat_add_aux(&to->dequeue, &from->dequeue); blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); blkg_stat_add_aux(&to->idle_time, &from->idle_time); @@ -308,10 +291,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) if (unlikely(!parent)) return; - bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); - bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); + bfqg_stats_add_aux(&parent->stats, &bfqg->stats); bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); } static void bfq_init_entity(struct bfq_entity *entity, @@ -332,15 +313,11 @@ static void bfq_init_entity(struct bfq_entity *entity, static void bfqg_stats_exit(struct bfqg_stats *stats) { - blkg_rwstat_exit(&stats->service_bytes); - blkg_rwstat_exit(&stats->serviced); blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); - blkg_stat_exit(&stats->sectors); blkg_stat_exit(&stats->time); - blkg_stat_exit(&stats->unaccounted_time); blkg_stat_exit(&stats->avg_queue_size_sum); blkg_stat_exit(&stats->avg_queue_size_samples); blkg_stat_exit(&stats->dequeue); @@ -351,15 +328,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { - if (blkg_rwstat_init(&stats->service_bytes, gfp) || - blkg_rwstat_init(&stats->serviced, gfp) || - blkg_rwstat_init(&stats->merged, gfp) || + if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || - blkg_stat_init(&stats->sectors, gfp) || blkg_stat_init(&stats->time, gfp) || - blkg_stat_init(&stats->unaccounted_time, gfp) || blkg_stat_init(&stats->avg_queue_size_sum, gfp) || blkg_stat_init(&stats->avg_queue_size_samples, gfp) || blkg_stat_init(&stats->dequeue, gfp) || @@ -374,20 +347,36 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) - { +{ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; - } +} static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) { return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +{ + struct bfq_group_data *bgd; + + bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); + if (!bgd) + return NULL; + return &bgd->pd; +} + static void bfq_cpd_init(struct blkcg_policy_data *cpd) { struct bfq_group_data *d = cpd_to_bfqgd(cpd); - d->weight = BFQ_DEFAULT_GRP_WEIGHT; + d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; +} + +static void bfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_bfqgd(cpd)); } static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) @@ -398,8 +387,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) if (!bfqg) return NULL; - if (bfqg_stats_init(&bfqg->stats, gfp) || - bfqg_stats_init(&bfqg->dead_stats, gfp)) { + if (bfqg_stats_init(&bfqg->stats, gfp)) { kfree(bfqg); return NULL; } @@ -407,27 +395,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) return &bfqg->pd; } -static void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) +static void bfq_pd_init(struct blkg_policy_data *pd) { + struct blkcg_gq *blkg; + struct bfq_group *bfqg; + struct bfq_data *bfqd; struct bfq_entity *entity; + struct bfq_group_data *d; - BUG_ON(!parent); - BUG_ON(!bfqg); - BUG_ON(bfqg == parent); - + blkg = pd_to_blkg(pd); + BUG_ON(!blkg); + bfqg = blkg_to_bfqg(blkg); + bfqd = blkg->q->elevator->elevator_data; entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; -} - -static void bfq_pd_init(struct blkg_policy_data *pd) -{ - struct blkcg_gq *blkg = pd_to_blkg(pd); - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = blkg->q->elevator->elevator_data; - struct bfq_entity *entity = &bfqg->entity; - struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); + d = blkcg_to_bfqgd(blkg->blkcg); entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; @@ -445,70 +426,53 @@ static void bfq_pd_free(struct blkg_policy_data *pd) struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); - bfqg_stats_exit(&bfqg->dead_stats); - return kfree(bfqg); } -/* offset delta from bfqg->stats to bfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - - offsetof(struct bfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { - u64 sum = 0; + struct bfq_group *bfqg = pd_to_bfqg(pd); - sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); - sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, - off + dead_stats_off_delta); - return sum; + bfqg_stats_reset(&bfqg->stats); } -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) { - struct blkg_rwstat a, b; + struct bfq_entity *entity; + + BUG_ON(!parent); + BUG_ON(!bfqg); + BUG_ON(bfqg == parent); - a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); - b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, - off + dead_stats_off_delta); - blkg_rwstat_add_aux(&a, &b); - return a; + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; } -static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, + struct blkcg *blkcg) { - struct bfq_group *bfqg = pd_to_bfqg(pd); + struct blkcg_gq *blkg; - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); + blkg = blkg_lookup(blkcg, bfqd->queue); + if (likely(blkg)) + 
return blkg_to_bfqg(blkg); + return NULL; } -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - struct request_queue *q = bfqd->queue; - struct bfq_group *bfqg = NULL, *parent; - struct bfq_entity *entity = NULL; + struct bfq_group *bfqg, *parent; + struct bfq_entity *entity; assert_spin_locked(bfqd->queue->queue_lock); - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - bfqg = bfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - bfqg = blkg_to_bfqg(blkg); - else /* fallback to root_group */ - bfqg = bfqd->root_group; - } + bfqg = bfq_lookup_bfqg(bfqd, blkcg); - BUG_ON(!bfqg); + if (unlikely(!bfqg)) + return NULL; /* * Update chain of bfq_groups as we might be handling a leaf group @@ -531,13 +495,18 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, return bfqg; } -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); +static void bfq_pos_tree_add_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. * @bfqq: the queue to move. - * @entity: @bfqq's entity. * @bfqg: the group to move to. * * Move @bfqq to @bfqg, deactivating it from its old group and reactivating @@ -548,26 +517,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) * rcu_read_lock()). */ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_entity *entity, struct bfq_group *bfqg) + struct bfq_group *bfqg) { - int busy, resume; - - busy = bfq_bfqq_busy(bfqq); - resume = !RB_EMPTY_ROOT(&bfqq->sort_list); + struct bfq_entity *entity = &bfqq->entity; - BUG_ON(resume && !entity->on_st); - BUG_ON(busy && !resume && entity->on_st && + BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); + BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) + && entity->on_st && bfqq != bfqd->in_service_queue); + BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); + + /* If bfqq is empty, then bfq_bfqq_expire also invokes + * bfq_del_bfqq_busy, thereby removing bfqq and its entity + * from data structures related to current group. Otherwise we + * need to remove bfqq explicitly with bfq_deactivate_bfqq, as + * we do below. 
+ */ + if (bfqq == bfqd->in_service_queue) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + + BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) + && &bfq_entity_service_tree(entity)->idle != + entity->tree); - if (busy) { - BUG_ON(atomic_read(&bfqq->ref) < 2); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - if (!resume) - bfq_del_bfqq_busy(bfqd, bfqq, 0); - else - bfq_deactivate_bfqq(bfqd, bfqq, 0); - } else if (entity->on_st) + if (bfq_bfqq_busy(bfqq)) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + else if (entity->on_st) { + BUG_ON(&bfq_entity_service_tree(entity)->idle != + entity->tree); bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + } bfqg_put(bfqq_group(bfqq)); /* @@ -579,14 +562,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, entity->sched_data = &bfqg->sched_data; bfqg_get(bfqg); - if (busy) { + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); + if (bfq_bfqq_busy(bfqq)) { bfq_pos_tree_add_move(bfqd, bfqq); - if (resume) - bfq_activate_bfqq(bfqd, bfqq); + bfq_activate_bfqq(bfqd, bfqq); } if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); + BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) + && &bfq_entity_service_tree(entity)->idle != + entity->tree); } /** @@ -613,7 +599,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, lockdep_assert_held(bfqd->queue->queue_lock); - bfqg = bfq_find_alloc_group(bfqd, blkcg); + bfqg = bfq_find_set_group(bfqd, blkcg); if (async_bfqq) { entity = &async_bfqq->entity; @@ -621,7 +607,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, bic_set_bfqq(bic, NULL, 0); bfq_log_bfqq(bfqd, async_bfqq, "bic_change_group: %p %d", - async_bfqq, atomic_read(&async_bfqq->ref)); + async_bfqq, + async_bfqq->ref); bfq_put_queue(async_bfqq); } } @@ -629,7 +616,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, if (sync_bfqq) { entity = &sync_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); } return bfqg; @@ -638,25 +625,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct blkcg *blkcg; struct bfq_group *bfqg = NULL; - uint64_t id; + uint64_t serial_nr; rcu_read_lock(); - blkcg = bio_blkcg(bio); - id = blkcg->css.serial_nr; - rcu_read_unlock(); + serial_nr = bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. 
*/ - if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) - return; + if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) + goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); - BUG_ON(!bfqg); - bic->blkcg_id = id; + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bic->blkcg_serial_nr = serial_nr; +out: + rcu_read_unlock(); } /** @@ -682,8 +667,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); BUG_ON(!bfqq); - bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); - return; + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** @@ -711,16 +695,15 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.in_service_entity); - - return; } /** - * bfq_destroy_group - destroy @bfqg. - * @bfqg: the group being destroyed. + * bfq_pd_offline - deactivate the entity associated with @pd, + * and reparent its children entities. + * @pd: descriptor of the policy going offline. * - * Destroy @bfqg, making sure that it is not referenced from its parent. - * blkio already grabs the queue_lock for us, so no need to use RCU-based magic + * blkio already grabs the queue_lock for us, so no need to use + * RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) { @@ -779,6 +762,12 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) bfq_put_async_queues(bfqd, bfqg); BUG_ON(entity->tree); + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... + */ bfqg_stats_xfer_dead(bfqg); } @@ -788,46 +777,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); + BUG_ON(!bfqg); bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, - struct cftype *cftype) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - int ret = -EINVAL; - - spin_lock_irq(&blkcg->lock); - ret = bfqgd->weight; - spin_unlock_irq(&blkcg->lock); - - return ret; -} - -static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) +static int bfq_io_show_weight(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + unsigned int val = 0; - spin_lock_irq(&blkcg->lock); - seq_printf(sf, "%u\n", bfqgd->weight); - spin_unlock_irq(&blkcg->lock); + if (bfqgd) + val = bfqgd->weight; + + seq_printf(sf, "%u\n", val); return 0; } -static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 val) +static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); struct blkcg_gq *blkg; - int ret = -EINVAL; + int ret = -ERANGE; if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) return ret; @@ -837,6 +815,7 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, bfqgd->weight = (unsigned short)val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); + if (!bfqg) continue; /* @@ -871,13 
+850,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, return ret; } -static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { + u64 weight; /* First unsigned long found in the file is used */ - return bfqio_cgroup_weight_write(of_css(of), NULL, - simple_strtoull(strim(buf), NULL, 0)); + int ret = kstrtoull(strim(buf), 0, &weight); + + if (ret) + return ret; + + return bfq_io_set_weight_legacy(of_css(of), NULL, weight); } static int bfqg_print_stat(struct seq_file *sf, void *v) @@ -897,16 +881,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = bfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, + off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -926,6 +911,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); + return 0; +} + +static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, + false); + return 0; +} + + static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -950,7 +970,8 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) return 0; } -static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +static struct bfq_group * +bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { int ret; @@ -958,41 +979,18 @@ static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int n if (ret) return NULL; - return blkg_to_bfqg(bfqd->queue->root_blkg); + return blkg_to_bfqg(bfqd->queue->root_blkg); } -static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -{ - struct bfq_group_data *bgd; - - bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); - if (!bgd) - return NULL; - return &bgd->pd; -} - -static void bfq_cpd_free(struct blkcg_policy_data *cpd) -{ - kfree(cpd_to_bfqgd(cpd)); -} - -static struct cftype bfqio_files_dfl[] = { +static struct cftype bfq_blkcg_legacy_files[] = { { - .name = "weight", + .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfqio_cgroup_weight_read_dfl, - .write = 
bfqio_cgroup_weight_write_dfl, + .seq_show = bfq_io_show_weight, + .write_u64 = bfq_io_set_weight_legacy, }, - {} /* terminate */ -}; -static struct cftype bfqio_files[] = { - { - .name = "bfq.weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - /* statistics, cover only the tasks in the bfqg */ + /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), @@ -1000,18 +998,17 @@ static struct cftype bfqio_files[] = { }, { .name = "bfq.sectors", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat, + .seq_show = bfqg_print_stat_sectors, }, { .name = "bfq.io_service_bytes", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "bfq.io_serviced", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios, }, { .name = "bfq.io_service_time", @@ -1042,18 +1039,17 @@ static struct cftype bfqio_files[] = { }, { .name = "bfq.sectors_recursive", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat_recursive, + .seq_show = bfqg_print_stat_sectors_recursive, }, { .name = "bfq.io_service_bytes_recursive", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "bfq.io_serviced_recursive", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "bfq.io_service_time_recursive", @@ -1099,32 +1095,35 @@ static struct cftype bfqio_files[] = { .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, - { - .name = "bfq.unaccounted_time", - .private = offsetof(struct bfq_group, stats.unaccounted_time), - .seq_show = bfqg_print_stat, - }, { } /* terminate */ }; -static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfqio_files_dfl, - .legacy_cftypes = bfqio_files, - - .pd_alloc_fn = bfq_pd_alloc, - .pd_init_fn = bfq_pd_init, - .pd_offline_fn = bfq_pd_offline, - .pd_free_fn = bfq_pd_free, - .pd_reset_stats_fn = bfq_pd_reset_stats, - - .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, - .cpd_free_fn = bfq_cpd_free, - +static struct cftype bfq_blkg_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, + {} /* terminate */ }; -#else +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, int rw) { } +static inline void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) { } +static inline void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) { } +static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, int rw) { } +static inline void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, +struct bfq_group *curr_bfqg) { } +static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } +static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } 
+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } static void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) @@ -1146,27 +1145,20 @@ bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) return bfqd->root_group; } -static void bfq_bfqq_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) -{ -} - static void bfq_end_wr_async(struct bfq_data *bfqd) { bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static void bfq_disconnect_groups(struct bfq_data *bfqd) +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - bfq_put_async_queues(bfqd, bfqd->root_group); + return bfqd->root_group; } -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { - return bfqd->root_group; + return bfqq->bfqd->root_group; } static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d1f648d05..c2cb29873 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7,25 +7,26 @@ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * - * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> + * Copyright (C) 2016 Paolo Valente <paolo.valente@unimore.it> * * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ * file. * - * BFQ is a proportional-share storage-I/O scheduling algorithm based on - * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, - * measured in number of sectors, to processes instead of time slices. The - * device is not granted to the in-service process for a given time slice, - * but until it has exhausted its assigned budget. This change from the time - * to the service domain allows BFQ to distribute the device throughput - * among processes as desired, without any distortion due to ZBR, workload - * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, - * called B-WF2Q+, to schedule processes according to their budgets. More - * precisely, BFQ schedules queues associated to processes. Thanks to the - * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to - * I/O-bound processes issuing sequential requests (to boost the - * throughput), and yet guarantee a low latency to interactive and soft - * real-time applications. + * BFQ is a proportional-share storage-I/O scheduling algorithm based + * on the slice-by-slice service scheme of CFQ. But BFQ assigns + * budgets, measured in number of sectors, to processes instead of + * time slices. The device is not granted to the in-service process + * for a given time slice, but until it has exhausted its assigned + * budget. This change from the time to the service domain enables BFQ + * to distribute the device throughput among processes as desired, + * without any distortion due to throughput fluctuations, or to device + * internal queueing. BFQ uses an ad hoc internal scheduler, called + * B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated with processes. 
Thanks to + * the accurate policy of B-WF2Q+, BFQ can afford to assign high + * budgets to I/O-bound processes issuing sequential requests (to + * boost the throughput), and yet guarantee a low latency to + * interactive and soft real-time applications. * * BFQ is described in [1], where also a reference to the initial, more * theoretical paper on BFQ can be found. The interested reader can find @@ -87,7 +88,6 @@ static const int bfq_stats_min_budgets = 194; /* Default maximum budget values, in sectors and number of requests. */ static const int bfq_default_max_budget = 16 * 1024; -static const int bfq_max_budget_async_rq = 4; /* * Async to sync throughput distribution is controlled as follows: @@ -97,8 +97,7 @@ static const int bfq_max_budget_async_rq = 4; static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. */ -static const int bfq_timeout_sync = HZ / 8; -static int bfq_timeout_async = HZ / 25; +static const int bfq_timeout = HZ / 8; struct kmem_cache *bfq_pool; @@ -109,8 +108,9 @@ struct kmem_cache *bfq_pool; #define BFQ_HW_QUEUE_THRESHOLD 4 #define BFQ_HW_QUEUE_SAMPLES 32 -#define BFQQ_SEEK_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) +#define BFQQ_SEEK_THR (sector_t)(8 * 100) +#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) /* Min samples used for peak rate estimation (for autotuning). */ #define BFQ_PEAK_RATE_SAMPLES 32 @@ -141,16 +141,24 @@ struct kmem_cache *bfq_pool; * The device's speed class is dynamically (re)detected in * bfq_update_peak_rate() every time the estimated peak rate is updated. * - * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] - * are the reference values for a slow/fast rotational device, whereas - * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for - * a slow/fast non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. + * In the following definitions, R_slow[0]/R_fast[0] and + * T_slow[0]/T_fast[0] are the reference values for a slow/fast + * rotational device, whereas R_slow[1]/R_fast[1] and + * T_slow[1]/T_fast[1] are the reference values for a slow/fast + * non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. The reference + * rates are not the actual peak rates of the devices used as a + * reference, but slightly lower values. The reason for using these + * slightly lower values is that the peak-rate estimator tends to + * yield slightly lower values than the actual peak rate (it can yield + * the actual peak rate only if there is only one process doing I/O, + * and the process does sequential I/O). + * * Both the reference peak rates and the thresholds are measured in * sectors/usec, left-shifted by BFQ_RATE_SHIFT. 
*/ -static int R_slow[2] = {1536, 10752}; -static int R_fast[2] = {17415, 34791}; +static int R_slow[2] = {1000, 10700}; +static int R_fast[2] = {14000, 33000}; /* * To improve readability, a conversion function is used to initialize the * following arrays, which entails that they can be initialized only in a @@ -410,11 +418,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) */ static bool bfq_symmetric_scenario(struct bfq_data *bfqd) { - return -#ifdef CONFIG_BFQ_GROUP_IOSCHED - !bfqd->active_numerous_groups && -#endif - !bfq_differentiated_weights(bfqd); + return !bfq_differentiated_weights(bfqd); } /* @@ -534,9 +538,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { - return blk_rq_sectors(rq) * - (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * - bfq_async_charge_factor)); + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) + return blk_rq_sectors(rq); + + /* + * If there are no weight-raised queues, then amplify service + * by just the async charge factor; otherwise amplify service + * by twice the async charge factor, to further reduce latency + * for weight-raised queues. + */ + if (bfqq->bfqd->wr_busy_queues == 0) + return blk_rq_sectors(rq) * bfq_async_charge_factor; + + return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; } /** @@ -591,12 +605,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) dur = bfqd->RT_prod; do_div(dur, bfqd->peak_rate); - return dur; -} + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. + */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); -static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) -{ - return bfqq->bic ? bfqq->bic->cooperations : 0; + return dur; } static void @@ -606,31 +631,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfq_mark_bfqq_idle_window(bfqq); else bfq_clear_bfqq_idle_window(bfqq); + if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); - /* Assuming that the flag in_large_burst is already correctly set */ - if (bic->wr_time_left && bfqq->bfqd->low_latency && - !bfq_bfqq_in_large_burst(bfqq) && - bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { - /* - * Start a weight raising period with the duration given by - * the raising_time_left snapshot. - */ - if (bfq_bfqq_busy(bfqq)) - bfqq->bfqd->wr_busy_queues++; - bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bic->wr_time_left; - bfqq->last_wr_start_finish = jiffies; - bfqq->entity.prio_changed = 1; - } - /* - * Clear wr_time_left to prevent bfq_bfqq_save_state() from - * getting confused about the queue's need of a weight-raising - * period. 
- */ - bic->wr_time_left = 0; } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -640,7 +645,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) lockdep_assert_held(bfqq->bfqd->queue->queue_lock); io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; - process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); return process_refs; } @@ -655,6 +660,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) hlist_del_init(&item->burst_list_node); hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); bfqd->burst_size = 1; + bfqd->burst_parent_entity = bfqq->entity.parent; } /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ @@ -663,6 +669,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* Increment burst size to take into account also bfqq */ bfqd->burst_size++; + bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); + + BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { struct bfq_queue *pos, *bfqq_item; struct hlist_node *n; @@ -672,15 +682,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * other to consider this burst as large. */ bfqd->large_burst = true; + bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); /* * We can now mark all queues in the burst list as * belonging to a large burst. */ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, - burst_list_node) + burst_list_node) { bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); + } bfq_mark_bfqq_in_large_burst(bfqq); + bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); /* * From now on, and until the current burst finishes, any @@ -692,67 +706,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, burst_list_node) hlist_del_init(&pos->burst_list_node); - } else /* burst not yet large: add bfqq to the burst list */ + } else /* + * Burst not yet large: add bfqq to the burst list. Do + * not increment the ref counter for bfqq, because bfqq + * is removed from the burst list before freeing bfqq + * in put_queue. + */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } /* - * If many queues happen to become active shortly after each other, then, - * to help the processes associated to these queues get their job done as - * soon as possible, it is usually better to not grant either weight-raising - * or device idling to these queues. In this comment we describe, firstly, - * the reasons why this fact holds, and, secondly, the next function, which - * implements the main steps needed to properly mark these queues so that - * they can then be treated in a different way. + * If many queues belonging to the same group happen to be created + * shortly after each other, then the processes associated with these + * queues have typically a common goal. In particular, bursts of queue + * creations are usually caused by services or applications that spawn + * many parallel threads/processes. Examples are systemd during boot, + * or git grep. To help these processes get their job done as soon as + * possible, it is usually better to not grant either weight-raising + * or device idling to their queues. 
* - * As for the terminology, we say that a queue becomes active, i.e., - * switches from idle to backlogged, either when it is created (as a - * consequence of the arrival of an I/O request), or, if already existing, - * when a new request for the queue arrives while the queue is idle. - * Bursts of activations, i.e., activations of different queues occurring - * shortly after each other, are typically caused by services or applications - * that spawn or reactivate many parallel threads/processes. Examples are - * systemd during boot or git grep. + * In this comment we describe, firstly, the reasons why this fact + * holds, and, secondly, the next function, which implements the main + * steps needed to properly mark these queues so that they can then be + * treated in a different way. * - * These services or applications benefit mostly from a high throughput: - * the quicker the requests of the activated queues are cumulatively served, - * the sooner the target job of these queues gets completed. As a consequence, - * weight-raising any of these queues, which also implies idling the device - * for it, is almost always counterproductive: in most cases it just lowers - * throughput. + * The above services or applications benefit mostly from a high + * throughput: the quicker the requests of the activated queues are + * cumulatively served, the sooner the target job of these queues gets + * completed. As a consequence, weight-raising any of these queues, + * which also implies idling the device for it, is almost always + * counterproductive. In most cases it just lowers throughput. * - * On the other hand, a burst of activations may be also caused by the start - * of an application that does not consist in a lot of parallel I/O-bound - * threads. In fact, with a complex application, the burst may be just a - * consequence of the fact that several processes need to be executed to - * start-up the application. To start an application as quickly as possible, - * the best thing to do is to privilege the I/O related to the application - * with respect to all other I/O. Therefore, the best strategy to start as - * quickly as possible an application that causes a burst of activations is - * to weight-raise all the queues activated during the burst. This is the + * On the other hand, a burst of queue creations may be caused also by + * the start of an application that does not consist of a lot of + * parallel I/O-bound threads. In fact, with a complex application, + * several short processes may need to be executed to start-up the + * application. In this respect, to start an application as quickly as + * possible, the best thing to do is in any case to privilege the I/O + * related to the application with respect to all other + * I/O. Therefore, the best strategy to start as quickly as possible + * an application that causes a burst of queue creations is to + * weight-raise all the queues created during the burst. This is the * exact opposite of the best strategy for the other type of bursts. * - * In the end, to take the best action for each of the two cases, the two - * types of bursts need to be distinguished. Fortunately, this seems - * relatively easy to do, by looking at the sizes of the bursts. In - * particular, we found a threshold such that bursts with a larger size - * than that threshold are apparently caused only by services or commands - * such as systemd or git grep. For brevity, hereafter we call just 'large' - * these bursts. 
BFQ *does not* weight-raise queues whose activations occur - * in a large burst. In addition, for each of these queues BFQ performs or - * does not perform idling depending on which choice boosts the throughput - * most. The exact choice depends on the device and request pattern at + * In the end, to take the best action for each of the two cases, the + * two types of bursts need to be distinguished. Fortunately, this + * seems relatively easy, by looking at the sizes of the bursts. In + * particular, we found a threshold such that only bursts with a + * larger size than that threshold are apparently caused by + * services or commands such as systemd or git grep. For brevity, + * hereafter we call just 'large' these bursts. BFQ *does not* + * weight-raise queues whose creation occurs in a large burst. In + * addition, for each of these queues BFQ performs or does not perform + * idling depending on which choice boosts the throughput more. The + * exact choice depends on the device and request pattern at * hand. * - * Turning back to the next function, it implements all the steps needed - * to detect the occurrence of a large burst and to properly mark all the - * queues belonging to it (so that they can then be treated in a different - * way). This goal is achieved by maintaining a special "burst list" that - * holds, temporarily, the queues that belong to the burst in progress. The - * list is then used to mark these queues as belonging to a large burst if - * the burst does become large. The main steps are the following. + * Unfortunately, false positives may occur while an interactive task + * is starting (e.g., an application is being started). The + * consequence is that the queues associated with the task do not + * enjoy weight raising as expected. Fortunately these false positives + * are very rare. They typically occur if some service happens to + * start doing I/O exactly when the interactive task starts. * - * . when the very first queue is activated, the queue is inserted into the + * Turning back to the next function, it implements all the steps + * needed to detect the occurrence of a large burst and to properly + * mark all the queues belonging to it (so that they can then be + * treated in a different way). This goal is achieved by maintaining a + * "burst list" that holds, temporarily, the queues that belong to the + * burst in progress. The list is then used to mark these queues as + * belonging to a large burst if the burst does become large. The main + * steps are the following. + * + * . when the very first queue is created, the queue is inserted into the * list (as it could be the first queue in a possible burst) * * . if the current burst has not yet become large, and a queue Q that does @@ -773,13 +799,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * * . the device enters a large-burst mode * - * . if a queue Q that does not belong to the burst is activated while + * . if a queue Q that does not belong to the burst is created while * the device is in large-burst mode and shortly after the last time * at which a queue either entered the burst list or was marked as * belonging to the current large burst, then Q is immediately marked * as belonging to a large burst. * - * . if a queue Q that does not belong to the burst is activated a while + * . 
if a queue Q that does not belong to the burst is created a while * later, i.e., not shortly after, than the last time at which a queue * either entered the burst list or was marked as belonging to the * current large burst, then the current burst is deemed as finished and: @@ -792,52 +818,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * in a possible new burst (then the burst list contains just Q * after this step). */ -static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool idle_for_long_time) +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* - * If bfqq happened to be activated in a burst, but has been idle - * for at least as long as an interactive queue, then we assume - * that, in the overall I/O initiated in the burst, the I/O - * associated to bfqq is finished. So bfqq does not need to be - * treated as a queue belonging to a burst anymore. Accordingly, - * we reset bfqq's in_large_burst flag if set, and remove bfqq - * from the burst list if it's there. We do not decrement instead - * burst_size, because the fact that bfqq does not need to belong - * to the burst list any more does not invalidate the fact that - * bfqq may have been activated during the current burst. - */ - if (idle_for_long_time) { - hlist_del_init(&bfqq->burst_list_node); - bfq_clear_bfqq_in_large_burst(bfqq); - } - - /* * If bfqq is already in the burst list or is part of a large - * burst, then there is nothing else to do. + * burst, or finally has just been split, then there is + * nothing else to do. */ if (!hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq)) + bfq_bfqq_in_large_burst(bfqq) || + time_is_after_eq_jiffies(bfqq->split_time + + msecs_to_jiffies(10))) return; /* - * If bfqq's activation happens late enough, then the current - * burst is finished, and related data structures must be reset. + * If bfqq's creation happens late enough, or bfqq belongs to + * a different group than the burst group, then the current + * burst is finished, and related data structures must be + * reset. * - * In this respect, consider the special case where bfqq is the very - * first queue being activated. In this case, last_ins_in_burst is - * not yet significant when we get here. But it is easy to verify - * that, whether or not the following condition is true, bfqq will - * end up being inserted into the burst list. In particular the - * list will happen to contain only bfqq. And this is exactly what - * has to happen, as bfqq may be the first queue in a possible + * In this respect, consider the special case where bfqq is + * the very first queue created after BFQ is selected for this + * device. In this case, last_ins_in_burst and + * burst_parent_entity are not yet significant when we get + * here. But it is easy to verify that, whether or not the + * following condition is true, bfqq will end up being + * inserted into the burst list. In particular the list will + * happen to contain only bfqq. And this is exactly what has + * to happen, as bfqq may be the first queue of the first * burst. 
*/ if (time_is_before_jiffies(bfqd->last_ins_in_burst + - bfqd->bfq_burst_interval)) { + bfqd->bfq_burst_interval) || + bfqq->entity.parent != bfqd->burst_parent_entity) { bfqd->large_burst = false; bfq_reset_burst_list(bfqd, bfqq); - return; + bfq_log_bfqq(bfqd, bfqq, + "handle_burst: late activation or different group"); + goto end; } /* @@ -846,8 +864,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, * bfqq as belonging to this large burst immediately. */ if (bfqd->large_burst) { + bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); bfq_mark_bfqq_in_large_burst(bfqq); - return; + goto end; } /* @@ -856,25 +875,498 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, * queue. Then we add bfqq to the burst. */ bfq_add_to_burst(bfqd, bfqq); +end: + /* + * At this point, bfqq either has been added to the current + * burst or has caused the current burst to terminate and a + * possible new burst to start. In particular, in the second + * case, bfqq has become the first queue in the possible new + * burst. In both cases last_ins_in_burst needs to be moved + * forward. + */ + bfqd->last_ins_in_burst = jiffies; + +} + +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + return entity->budget - entity->service; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static int bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static int bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/* + * The next function, invoked after the input queue bfqq switches from + * idle to busy, updates the budget of bfqq. The function also tells + * whether the in-service queue should be expired, by returning + * true. The purpose of expiring the in-service queue is to give bfqq + * the chance to possibly preempt the in-service queue, and the reason + * for preempting the in-service queue is to achieve one of the two + * goals below. + * + * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has + * expired because it has remained idle. In particular, bfqq may have + * expired for one of the following two reasons: + * + * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and + * did not make it to issue a new request before its last request + * was served; + * + * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue + * a new request before the expiration of the idling-time. + * + * Even if bfqq has expired for one of the above reasons, the process + * associated with the queue may be however issuing requests greedily, + * and thus be sensitive to the bandwidth it receives (bfqq may have + * remained idle for other reasons: CPU high load, bfqq not enjoying + * idling, I/O throttling somewhere in the path from the process to + * the I/O scheduler, ...). 
But if, after every expiration for one of + * the above two reasons, bfqq has to wait for the service of at least + * one full budget of another queue before being served again, then + * bfqq is likely to get a much lower bandwidth or resource time than + * its reserved ones. To address this issue, two countermeasures need + * to be taken. + * + * First, the budget and the timestamps of bfqq need to be updated in + * a special way on bfqq reactivation: they need to be updated as if + * bfqq did not remain idle and did not expire. In fact, if they are + * computed as if bfqq expired and remained idle until reactivation, + * then the process associated with bfqq is treated as if, instead of + * being greedy, it stopped issuing requests when bfqq remained idle, + * and restarts issuing requests only on this reactivation. In other + * words, the scheduler does not help the process recover the "service + * hole" between bfqq expiration and reactivation. As a consequence, + * the process receives a lower bandwidth than its reserved one. In + * contrast, to recover this hole, the budget must be updated as if + * bfqq was not expired at all before this reactivation, i.e., it must + * be set to the value of the remaining budget when bfqq was + * expired. Along the same line, timestamps need to be assigned the + * value they had the last time bfqq was selected for service, i.e., + * before last expiration. Thus timestamps need to be back-shifted + * with respect to their normal computation (see [1] for more details + * on this tricky aspect). + * + * Secondly, to allow the process to recover the hole, the in-service + * queue must be expired too, to give bfqq the chance to preempt it + * immediately. In fact, if bfqq has to wait for a full budget of the + * in-service queue to be completed, then it may become impossible to + * let the process recover the hole, even if the back-shifted + * timestamps of bfqq are lower than those of the in-service queue. If + * this happens for most or all of the holes, then the process may not + * receive its reserved bandwidth. In this respect, it is worth noting + * that, being the service of outstanding requests unpreemptible, a + * little fraction of the holes may however be unrecoverable, thereby + * causing a little loss of bandwidth. + * + * The last important point is detecting whether bfqq does need this + * bandwidth recovery. In this respect, the next function deems the + * process associated with bfqq greedy, and thus allows it to recover + * the hole, if: 1) the process is waiting for the arrival of a new + * request (which implies that bfqq expired for one of the above two + * reasons), and 2) such a request has arrived soon. The first + * condition is controlled through the flag non_blocking_wait_rq, + * while the second through the flag arrived_in_time. If both + * conditions hold, then the function computes the budget in the + * above-described special way, and signals that the in-service queue + * should be expired. Timestamp back-shifting is done later in + * __bfq_activate_entity. + * + * 2. Reduce latency. Even if timestamps are not backshifted to let + * the process associated with bfqq recover a service hole, bfqq may + * however happen to have, after being (re)activated, a lower finish + * timestamp than the in-service queue. That is, the next budget of + * bfqq may have to be completed before the one of the in-service + * queue. 
If this is the case, then preempting the in-service queue + * allows this goal to be achieved, apart from the unpreemptible, + * outstanding requests mentioned above. + * + * Unfortunately, regardless of which of the above two goals one wants + * to achieve, service trees need first to be updated to know whether + * the in-service queue must be preempted. To have service trees + * correctly updated, the in-service queue must be expired and + * rescheduled, and bfqq must be scheduled too. This is one of the + * most costly operations (in future versions, the scheduling + * mechanism may be re-designed in such a way to make it possible to + * know whether preemption is needed without needing to update service + * trees). In addition, queue preemptions almost always cause random + * I/O, and thus loss of throughput. Because of these facts, the next + * function adopts the following simple scheme to avoid both costly + * operations and too frequent preemptions: it requests the expiration + * of the in-service queue (unconditionally) only for queues that need + * to recover a hole, or that either are weight-raised or deserve to + * be weight-raised. + */ +static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool arrived_in_time, + bool wr_or_deserves_wr) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { + /* + * We do not clear the flag non_blocking_wait_rq here, as + * the latter is used in bfq_activate_bfqq to signal + * that timestamps need to be back-shifted (and is + * cleared right after). + */ + + /* + * In next assignment we rely on that either + * entity->service or entity->budget are not updated + * on expiration if bfqq is empty (see + * __bfq_bfqq_recalc_budget). Thus both quantities + * remain unchanged after such an expiration, and the + * following statement therefore assigns to + * entity->budget the remaining budget on such an + * expiration. For clarity, entity->service is not + * updated on expiration in any case, and, in normal + * operation, is reset only when bfqq is selected for + * service (see bfq_get_next_queue). + */ + entity->budget = min_t(unsigned long, + bfq_bfqq_budget_left(bfqq), + bfqq->max_budget); + + BUG_ON(entity->budget < 0); + return true; + } + + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(bfqq->next_rq,bfqq)); + BUG_ON(entity->budget < 0); + + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return wr_or_deserves_wr; +} + +static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + unsigned int old_wr_coeff, + bool wr_or_deserves_wr, + bool interactive, + bool in_burst, + bool soft_rt) +{ + if (old_wr_coeff == 1 && wr_or_deserves_wr) { + /* start a weight-raising period */ + if (interactive) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else { + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + /* + * If needed, further reduce budget to make sure it is + * close to bfqq's backlog, so as to reduce the + * scheduling-error component due to a too large + * budget. Do not care about throughput consequences, + * but only about latency. Finally, do not assign a + * too small budget either, to avoid increasing + * latency by causing too frequent expirations. 
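The budget reduction just described amounts to capping the current budget at twice the minimum budget; a sketch of the clamp applied right below (names are illustrative):

/* Keep a newly weight-raised queue on a small budget: latency over throughput. */
static unsigned long sketch_clamp_wr_budget(unsigned long cur_budget,
					    unsigned long min_budget)
{
	unsigned long cap = 2 * min_budget;

	return cur_budget < cap ? cur_budget : cap;
}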
+ */ + bfqq->entity.budget = min_t(unsigned long, + bfqq->entity.budget, + 2 * bfq_min_budget(bfqd)); + + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) { /* update wr coeff and duration */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else if (in_burst) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (time_before( + bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time, + jiffies + + bfqd->bfq_wr_rt_max_time) && + soft_rt) { + /* + * The remaining weight-raising time is lower + * than bfqd->bfq_wr_rt_max_time, which means + * that the application is enjoying weight + * raising either because deemed soft-rt in + * the near past, or because deemed interactive + * a long ago. + * In both cases, resetting now the current + * remaining weight-raising time for the + * application to the weight-raising duration + * for soft rt applications would not cause any + * latency increase for the application (as the + * new duration would be higher than the + * remaining time). + * + * In addition, the application is now meeting + * the requirements for being deemed soft rt. + * In the end we can correctly and safely + * (re)charge the weight-raising duration for + * the application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfq_log_bfqq(bfqd, bfqq, + "switching to soft_rt wr, or " + " just moving forward duration"); + } + } +} + +static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return bfqq->dispatched == 0 && + time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); +} + +static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int old_wr_coeff, + struct request *rq, + bool *interactive) +{ + bool soft_rt, in_burst, wr_or_deserves_wr, + bfqq_wants_to_preempt, + idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), + /* + * See the comments on + * bfq_bfqq_update_budg_for_activation for + * details on the usage of the next variable. 
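The arrived_in_time computation right below can be modelled, without the jiffies wraparound macros, as follows (a sketch; 'now' stands for jiffies and the factor 3 matches the hunk):

#include <stdbool.h>

/*
 * A request arrives "in time" if it shows up within three idle slices
 * of the completion of the queue's previous request.
 */
static bool sketch_arrived_in_time(unsigned long now,
				   unsigned long last_end_request,
				   unsigned long slice_idle)
{
	return now - last_end_request < 3 * slice_idle;
}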
+ */ + arrived_in_time = time_is_after_jiffies( + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle * 3); + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request non-busy: " + "jiffies %lu, in_time %d, idle_long %d busyw %d " + "wr_coeff %u", + jiffies, arrived_in_time, + idle_for_long_time, + bfq_bfqq_non_blocking_wait_rq(bfqq), + old_wr_coeff); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfqq == bfqd->in_service_queue); + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, + rq->cmd_flags); + + /* + * bfqq deserves to be weight-raised if: + * - it is sync, + * - it does not belong to a large burst, + * - it has been idle for enough time or is soft real-time, + * - is linked to a bfq_io_cq (it is not shared in any sense) + */ + in_burst = bfq_bfqq_in_large_burst(bfqq); + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + *interactive = + !in_burst && + idle_for_long_time; + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || + (bfq_bfqq_sync(bfqq) && + bfqq->bic && (*interactive || soft_rt))); + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request: " + "in_burst %d, " + "soft_rt %d (next %lu), inter %d, bic %p", + bfq_bfqq_in_large_burst(bfqq), soft_rt, + bfqq->soft_rt_next_start, + *interactive, + bfqq->bic); + + /* + * Using the last flag, update budget and check whether bfqq + * may want to preempt the in-service queue. + */ + bfqq_wants_to_preempt = + bfq_bfqq_update_budg_for_activation(bfqd, bfqq, + arrived_in_time, + wr_or_deserves_wr); + + /* + * If bfqq happened to be activated in a burst, but has been + * idle for much more than an interactive queue, then we + * assume that, in the overall I/O initiated in the burst, the + * I/O associated with bfqq is finished. So bfqq does not need + * to be treated as a queue belonging to a burst + * anymore. Accordingly, we reset bfqq's in_large_burst flag + * if set, and remove bfqq from the burst list if it's + * there. We do not decrement burst_size, because the fact + * that bfqq does not need to belong to the burst list any + * more does not invalidate the fact that bfqq was created in + * a burst. + */ + if (likely(!bfq_bfqq_just_created(bfqq)) && + idle_for_long_time && + time_is_before_jiffies( + bfqq->budget_timeout + + msecs_to_jiffies(10000))) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + bfq_clear_bfqq_just_created(bfqq); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (arrived_in_time) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + bfq_log_bfqq(bfqd, bfqq, "requests in time %d", + bfqq->requests_within_timer); + } + + if (bfqd->low_latency) { + if (unlikely(time_is_after_jiffies(bfqq->split_time))) + /* wraparound */ + bfqq->split_time = + jiffies - bfqd->bfq_wr_min_idle_time - 1; + + if (time_is_before_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) { + bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, + old_wr_coeff, + wr_or_deserves_wr, + *interactive, + in_burst, + soft_rt); + + if (old_wr_coeff != bfqq->wr_coeff) + bfqq->entity.prio_changed = 1; + } + } + + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + + bfq_add_bfqq_busy(bfqd, bfqq); + + /* + * Expire in-service queue only if preemption may be needed + * for guarantees. 
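The weight-raising eligibility computed a few lines above reduces to one boolean predicate; a sketch with the flags flattened into parameters (in the hunk they are derived from bfqq and bfqd state):

#include <stdbool.h>

/* Does the queue already have, or deserve, weight raising? */
static bool sketch_wr_or_deserves_wr(bool low_latency, unsigned int wr_coeff,
				     bool sync, bool has_bic,
				     bool interactive, bool soft_rt)
{
	return low_latency &&
	       (wr_coeff > 1 ||
		(sync && has_bic && (interactive || soft_rt)));
}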
In this respect, the function + * next_queue_may_preempt just checks a simple, necessary + * condition, and not a sufficient condition based on + * timestamps. In fact, for the latter condition to be + * evaluated, timestamps would need first to be updated, and + * this operation is quite costly (see the comments on the + * function bfq_bfqq_update_budg_for_activation). + */ + if (bfqd->in_service_queue && bfqq_wants_to_preempt && + bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && + next_queue_may_preempt(bfqd)) { + struct bfq_queue *in_serv = + bfqd->in_service_queue; + BUG_ON(in_serv == bfqq); + + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + BUG_ON(in_serv->entity.budget < 0); + } } static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_entity *entity = &bfqq->entity; struct bfq_data *bfqd = bfqq->bfqd; struct request *next_rq, *prev; - unsigned long old_wr_coeff = bfqq->wr_coeff; + unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", + blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); + + if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + bfqq->queued[rq_is_sync(rq)]++; bfqd->queued++; elv_rb_add(&bfqq->sort_list, rq); /* - * Check if this request is a better next-serve candidate. + * Check if this request is a better next-to-serve candidate. */ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); @@ -887,160 +1379,10 @@ static void bfq_add_request(struct request *rq) if (prev != bfqq->next_rq) bfq_pos_tree_add_move(bfqd, bfqq); - if (!bfq_bfqq_busy(bfqq)) { - bool soft_rt, coop_or_in_burst, - idle_for_long_time = time_is_before_jiffies( - bfqq->budget_timeout + - bfqd->bfq_wr_min_idle_time); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, - rq->cmd_flags); -#endif - if (bfq_bfqq_sync(bfqq)) { - bool already_in_burst = - !hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq); - bfq_handle_burst(bfqd, bfqq, idle_for_long_time); - /* - * If bfqq was not already in the current burst, - * then, at this point, bfqq either has been - * added to the current burst or has caused the - * current burst to terminate. In particular, in - * the second case, bfqq has become the first - * queue in a possible new burst. - * In both cases last_ins_in_burst needs to be - * moved forward. 
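Condensing the preemption check that closes bfq_bfqq_handle_idle_busy_switch (earlier in this hunk) into a sketch, with next_queue_may_preempt treated as an opaque boolean:

#include <stdbool.h>

/* Should the in-service queue be expired to let the newly busy queue in? */
static bool sketch_should_preempt(bool have_in_service, bool wants_to_preempt,
				  unsigned int in_service_wr_coeff,
				  unsigned int new_wr_coeff,
				  bool next_queue_may_preempt)
{
	return have_in_service && wants_to_preempt &&
	       in_service_wr_coeff < new_wr_coeff &&
	       next_queue_may_preempt;
}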
- */ - if (!already_in_burst) - bfqd->last_ins_in_burst = jiffies; - } - - coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || - bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; - soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && - !coop_or_in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); - interactive = !coop_or_in_burst && idle_for_long_time; - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (time_before(jiffies, - RQ_BIC(rq)->ttime.last_end_request + - bfqd->bfq_slice_idle)) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - } - - if (!bfqd->low_latency) - goto add_bfqq_busy; - - if (bfq_bfqq_just_split(bfqq)) - goto set_prio_changed; - - /* - * If the queue: - * - is not being boosted, - * - has been idle for enough time, - * - is not a sync queue or is linked to a bfq_io_cq (it is - * shared "for its nature" or it is not shared and its - * requests have not been redirected to a shared queue) - * start a weight-raising period. - */ - if (old_wr_coeff == 1 && (interactive || soft_rt) && - (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - bfq_log_bfqq(bfqd, bfqq, - "wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } else if (old_wr_coeff > 1) { - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else if (coop_or_in_burst || - (bfqq->wr_cur_max_time == - bfqd->bfq_wr_rt_max_time && - !soft_rt)) { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq-> - wr_cur_max_time)); - } else if (time_before( - bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time, - jiffies + - bfqd->bfq_wr_rt_max_time) && - soft_rt) { - /* - * - * The remaining weight-raising time is lower - * than bfqd->bfq_wr_rt_max_time, which means - * that the application is enjoying weight - * raising either because deemed soft-rt in - * the near past, or because deemed interactive - * a long ago. - * In both cases, resetting now the current - * remaining weight-raising time for the - * application to the weight-raising duration - * for soft rt applications would not cause any - * latency increase for the application (as the - * new duration would be higher than the - * remaining time). - * - * In addition, the application is now meeting - * the requirements for being deemed soft rt. - * In the end we can correctly and safely - * (re)charge the weight-raising duration for - * the application with the weight-raising - * duration for soft rt applications. 
- * - * In particular, doing this recharge now, i.e., - * before the weight-raising period for the - * application finishes, reduces the probability - * of the following negative scenario: - * 1) the weight of a soft rt application is - * raised at startup (as for any newly - * created application), - * 2) since the application is not interactive, - * at a certain time weight-raising is - * stopped for the application, - * 3) at that time the application happens to - * still have pending requests, and hence - * is destined to not have a chance to be - * deemed soft rt before these requests are - * completed (see the comments to the - * function bfq_bfqq_softrt_next_start() - * for details on soft rt detection), - * 4) these pending requests experience a high - * latency because the application is not - * weight-raised while they are pending. - */ - bfqq->last_wr_start_finish = jiffies; - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - } - } -set_prio_changed: - if (old_wr_coeff != bfqq->wr_coeff) - entity->prio_changed = 1; -add_bfqq_busy: - bfqq->last_idle_bklogged = jiffies; - bfqq->service_from_backlogged = 0; - bfq_clear_bfqq_softrt_update(bfqq); - bfq_add_bfqq_busy(bfqd, bfqq); - } else { + if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ + bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, + rq, &interactive); + else { if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && time_is_before_jiffies( bfqq->last_wr_start_finish + @@ -1049,16 +1391,43 @@ add_bfqq_busy: bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqd->wr_busy_queues++; - entity->prio_changed = 1; + bfqq->entity.prio_changed = 1; bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); + "non-idle wrais starting, " + "wr_max_time %u wr_busy %d", + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqd->wr_busy_queues); } if (prev != bfqq->next_rq) bfq_updated_next_req(bfqd, bfqq); } + /* + * Assign jiffies to last_wr_start_finish in the following + * cases: + * + * . if bfqq is not going to be weight-raised, because, for + * non weight-raised queues, last_wr_start_finish stores the + * arrival time of the last request; as of now, this piece + * of information is used only for deciding whether to + * weight-raise async queues + * + * . if bfqq is not weight-raised, because, if bfqq is now + * switching to weight-raised, then last_wr_start_finish + * stores the time when weight-raising starts + * + * . if bfqq is interactive, because, regardless of whether + * bfqq is currently weight-raised, the weight-raising + * period must start or restart (this case is considered + * separately because it is not detected by the above + * conditions, if bfqq is already weight-raised) + * + * last_wr_start_finish has to be updated also if bfqq is soft + * real-time, because the weight-raising period is constantly + * restarted on idle-to-busy transitions for these queues, but + * this is already done in bfq_bfqq_handle_idle_busy_switch if + * needed. 
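The three cases listed above collapse into the single condition tested right below; a sketch:

#include <stdbool.h>

/* Should last_wr_start_finish be stamped with the current time? */
static bool sketch_update_last_wr_start_finish(bool low_latency,
					       unsigned int old_wr_coeff,
					       unsigned int new_wr_coeff,
					       bool interactive)
{
	return low_latency &&
	       (old_wr_coeff == 1 || new_wr_coeff == 1 || interactive);
}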
+ */ if (bfqd->low_latency && (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) bfqq->last_wr_start_finish = jiffies; @@ -1106,6 +1475,9 @@ static void bfq_remove_request(struct request *rq) struct bfq_data *bfqd = bfqq->bfqd; const int sync = rq_is_sync(rq); + BUG_ON(bfqq->entity.service > bfqq->entity.budget && + bfqq == bfqd->in_service_queue); + if (bfqq->next_rq == rq) { bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); bfq_updated_next_req(bfqd, bfqq); @@ -1119,8 +1491,25 @@ static void bfq_remove_request(struct request *rq) elv_rb_del(&bfqq->sort_list, rq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) + BUG_ON(bfqq->entity.budget < 0); + + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { bfq_del_bfqq_busy(bfqd, bfqq, 1); + + /* bfqq emptied. In normal operation, when + * bfqq is empty, bfqq->entity.service and + * bfqq->entity.budget must contain, + * respectively, the service received and the + * budget used last time bfqq emptied. These + * facts do not hold in this case, as at least + * this last removal occurred while bfqq is + * not in service. To avoid inconsistencies, + * reset both bfqq->entity.service and + * bfqq->entity.budget. + */ + bfqq->entity.budget = bfqq->entity.service = 0; + } + /* * Remove queue from request-position tree as it is empty. */ @@ -1134,9 +1523,7 @@ static void bfq_remove_request(struct request *rq) BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -#endif } static int bfq_merge(struct request_queue *q, struct request **req, @@ -1221,21 +1608,25 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(next); -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -#endif } /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { BUG_ON(!bfqq); + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; - /* Trigger a weight change on the next activation of the queue */ + /* + * Trigger a weight change on the next invocation of + * __bfq_entity_update_weight_prio. + */ bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", + bfqq->bfqd->wr_busy_queues); } static void bfq_end_wr_async_queues(struct bfq_data *bfqd, @@ -1278,7 +1669,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request, sector_t sector) { return abs(bfq_io_struct_pos(io_struct, request) - sector) <= - BFQQ_SEEK_THR; + BFQQ_CLOSE_THR; } static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, @@ -1400,7 +1791,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * throughput. */ bfqq->new_bfqq = new_bfqq; - atomic_add(process_refs, &new_bfqq->ref); + new_bfqq->ref += process_refs; return new_bfqq; } @@ -1431,9 +1822,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, } /* - * Attempt to schedule a merge of bfqq with the currently in-service queue - * or with a close queue among the scheduled queues. - * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * If this function returns true, then bfqq cannot be merged. The idea + * is that true cooperation happens very early after processes start + * to do I/O. Usually, late cooperations are just accidental false + * positives. 
In case bfqq is weight-raised, such false positives + * would evidently degrade latency guarantees for bfqq. + */ +bool wr_from_too_long(struct bfq_queue *bfqq) +{ + return bfqq->wr_coeff > 1 && + time_is_before_jiffies(bfqq->last_wr_start_finish + + msecs_to_jiffies(100)); +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. Return + * NULL if no merge was scheduled, a pointer to the shared bfq_queue * structure otherwise. * * The OOM queue is not allowed to participate to cooperation: in fact, since @@ -1442,6 +1847,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, * handle merging with the OOM queue would be quite complex and expensive * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. + * + * Weight-raised queues can be merged only if their weight-raising + * period has just started. In fact cooperating processes are usually + * started together. Thus, with this filter we avoid false positives + * that would jeopardize low-latency guarantees. + * + * WARNING: queue merging may impair fairness among non-weight raised + * queues, for at least two reasons: 1) the original weight of a + * merged queue may change during the merged state, 2) even being the + * weight the same, a merged queue may be bloated with many more + * requests than the ones produced by its originally-associated + * process. */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -1451,16 +1868,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->new_bfqq) return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + + if (io_struct && wr_from_too_long(bfqq) && + likely(bfqq != &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have looked for coop, but bfq%d wr", + bfqq->pid); + + if (!io_struct || + wr_from_too_long(bfqq) || + unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; - /* If device has only one backlogged bfq_queue, don't search. */ + + /* If there is only one backlogged queue, don't search. 
*/ if (bfqd->busy_queues == 1) return NULL; in_service_bfqq = bfqd->in_service_queue; + if (in_service_bfqq && in_service_bfqq != bfqq && + bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) + && likely(in_service_bfqq == &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have tried merge with in-service-queue, but wr"); + if (!in_service_bfqq || in_service_bfqq == bfqq || - !bfqd->in_service_bic || + !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || unlikely(in_service_bfqq == &bfqd->oom_bfqq)) goto check_scheduled; @@ -1482,7 +1915,15 @@ check_scheduled: BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && + if (new_bfqq && wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have merged with bfq%d, but wr", + new_bfqq->pid); + + if (new_bfqq && !wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); @@ -1498,46 +1939,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) */ if (!bfqq->bic) return; - if (bfqq->bic->wr_time_left) - /* - * This is the queue of a just-started process, and would - * deserve weight raising: we set wr_time_left to the full - * weight-raising duration to trigger weight-raising when - * and if the queue is split and the first request of the - * queue is enqueued. - */ - bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); - else if (bfqq->wr_coeff > 1) { - unsigned long wr_duration = - jiffies - bfqq->last_wr_start_finish; - /* - * It may happen that a queue's weight raising period lasts - * longer than its wr_cur_max_time, as weight raising is - * handled only when a request is enqueued or dispatched (it - * does not use any timer). If the weight raising period is - * about to end, don't save it. - */ - if (bfqq->wr_cur_max_time <= wr_duration) - bfqq->bic->wr_time_left = 0; - else - bfqq->bic->wr_time_left = - bfqq->wr_cur_max_time - wr_duration; - /* - * The bfq_queue is becoming shared or the requests of the - * process owning the queue are being redirected to a shared - * queue. Stop the weight raising period of the queue, as in - * both cases it should not be owned by an interactive or - * soft real-time application. - */ - bfq_bfqq_end_wr(bfqq); - } else - bfqq->bic->wr_time_left = 0; + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bfqq->bic->cooperations++; - bfqq->bic->failed_cooperations = 0; } static void bfq_get_bic_reference(struct bfq_queue *bfqq) @@ -1562,6 +1968,40 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, if (bfq_bfqq_IO_bound(bfqq)) bfq_mark_bfqq_IO_bound(new_bfqq); bfq_clear_bfqq_IO_bound(bfqq); + + /* + * If bfqq is weight-raised, then let new_bfqq inherit + * weight-raising. To reduce false positives, neglect the case + * where bfqq has just been created, but has not yet made it + * to be weight-raised (which may happen because EQM may merge + * bfqq even before bfq_add_request is executed for the first + * time for bfqq). Handling this case would however be very + * easy, thanks to the flag just_created. 
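A stripped-down model of the weight-raising handover performed right below (a tiny struct stands in for the relevant bfq_queue fields; busy-queue accounting is omitted):

struct sketch_wr_state {
	unsigned int wr_coeff;
	unsigned long wr_cur_max_time;
	unsigned long last_wr_start_finish;
};

/* Let the merged queue inherit weight raising from the queue being merged. */
static void sketch_inherit_wr(struct sketch_wr_state *merged,
			      struct sketch_wr_state *old)
{
	if (merged->wr_coeff == 1 && old->wr_coeff > 1) {
		merged->wr_coeff = old->wr_coeff;
		merged->wr_cur_max_time = old->wr_cur_max_time;
		merged->last_wr_start_finish = old->last_wr_start_finish;
	}
	if (old->wr_coeff > 1)	/* the old queue has given its wr away */
		old->wr_coeff = 1;
}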
+ */ + if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { + new_bfqq->wr_coeff = bfqq->wr_coeff; + new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; + new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; + if (bfq_bfqq_busy(new_bfqq)) + bfqd->wr_busy_queues++; + new_bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, new_bfqq, + "wr starting after merge with %d, " + "rais_max_time %u", + bfqq->pid, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ + bfqq->wr_coeff = 1; + bfqq->entity.prio_changed = 1; + if (bfq_bfqq_busy(bfqq)) + bfqd->wr_busy_queues--; + } + + bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", + bfqd->wr_busy_queues); + /* * Grab a reference to the bic, to prevent it from being destroyed * before being possibly touched by a bfq_split_bfqq(). @@ -1588,18 +2028,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } -static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) -{ - struct bfq_io_cq *bic = bfqq->bic; - struct bfq_data *bfqd = bfqq->bfqd; - - if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { - bic->failed_cooperations++; - if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) - bic->cooperations = 0; - } -} - static int bfq_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -1637,30 +2065,86 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * to decide whether bio and rq can be merged. */ bfqq = new_bfqq; - } else - bfq_bfqq_increase_failed_cooperations(bfqq); + } } return bfqq == RQ_BFQQ(rq); } +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the throughput. + * In practice, a time-slice service scheme is used with seeky + * processes. + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + unsigned int timeout_coeff; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); +} + static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -#endif bfq_mark_bfqq_must_alloc(bfqq); - bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + BUG_ON(bfqq == bfqd->in_service_queue); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_before_jiffies(bfqq->budget_timeout)) { + /* + * For soft real-time queues, move the start + * of the weight-raising period forward by the + * time the queue has not received any + * service. Otherwise, a relatively long + * service delay is likely to cause the + * weight-raising period of the queue to end, + * because of the short duration of the + * weight-raising period of a soft real-time + * queue. It is worth noting that this move + * is not so dangerous for the other queues, + * because soft real-time queues are not + * greedy. 
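The deadline set by bfq_set_budget_timeout (earlier in this hunk) can be summarized as follows (a sketch; 'now' stands for jiffies, and in_soft_rt_wr_period models the wr_cur_max_time == bfq_wr_rt_max_time test):

#include <stdbool.h>

/* Absolute time by which the in-service queue must consume its budget. */
static unsigned long sketch_budget_timeout(unsigned long now,
					   unsigned long base_timeout,
					   unsigned int weight,
					   unsigned int orig_weight,
					   bool in_soft_rt_wr_period)
{
	unsigned int coeff = in_soft_rt_wr_period ? 1 : weight / orig_weight;

	return now + base_timeout * coeff;
}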
+ * + * To not add a further variable, we use the + * overloaded field budget_timeout to + * determine for how long the queue has not + * received service, i.e., how much time has + * elapsed since the queue expired. However, + * this is a little imprecise, because + * budget_timeout is set to jiffies if bfqq + * not only expires, but also remains with no + * request. + */ + bfqq->last_wr_start_finish += jiffies - + bfqq->budget_timeout; + } + + bfq_set_budget_timeout(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, "set_in_service_queue, cur-budget = %d", bfqq->entity.budget); - } + } else + bfq_log(bfqd, "set_in_service_queue: NULL"); bfqd->in_service_queue = bfqq; } @@ -1676,31 +2160,6 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) return bfqq; } -/* - * If enough samples have been computed, return the current max budget - * stored in bfqd, which is dynamically updated according to the - * estimated disk peak rate; otherwise return the default max budget - */ -static int bfq_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget; -} - -/* - * Return min budget, which is a fraction of the current or default - * max budget (trying with 1/32) - */ -static int bfq_min_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget / 32; - else - return bfqd->bfq_max_budget / 32; -} - static void bfq_arm_slice_timer(struct bfq_data *bfqd) { struct bfq_queue *bfqq = bfqd->in_service_queue; @@ -1729,58 +2188,30 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) sl = bfqd->bfq_slice_idle; /* * Unless the queue is being weight-raised or the scenario is - * asymmetric, grant only minimum idle time if the queue either - * has been seeky for long enough or has already proved to be - * constantly seeky. + * asymmetric, grant only minimum idle time if the queue + * is seeky. A long idling is preserved for a weight-raised + * queue, or, more in general, in an asymemtric scenario, + * because a long idling is needed for guaranteeing to a queue + * its reserved share of the throughput (in particular, it is + * needed if the queue has a higher weight than some other + * queue). */ - if (bfq_sample_valid(bfqq->seek_samples) && - ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > - bfq_max_budget(bfqq->bfqd) / 8) || - bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && + if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && bfq_symmetric_scenario(bfqd)) sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); - else if (bfqq->wr_coeff > 1) - sl = sl * 3; + bfqd->last_idling_start = ktime_get(); mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -#endif bfq_log(bfqd, "arm idle: %u/%u ms", jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); } /* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the disk - * throughput (always guaranteed with a time slice scheme as in CFQ). 
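The idle-slice choice made by bfq_arm_slice_timer just above reduces to the following (a sketch; min_tt stands for the BFQ_MIN_TT think-time bound converted to jiffies):

#include <stdbool.h>

/* How long to idle waiting for the next request of the in-service queue. */
static unsigned long sketch_idle_slice(unsigned long slice_idle,
				       unsigned long min_tt,
				       bool seeky, bool weight_raised,
				       bool symmetric_scenario)
{
	unsigned long sl = slice_idle;

	/* seeky, non-raised queues in a symmetric scenario get only min_tt */
	if (seeky && !weight_raised && symmetric_scenario && min_tt < sl)
		sl = min_tt;
	return sl;
}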
- */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->in_service_queue; - unsigned int timeout_coeff; - if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) - timeout_coeff = 1; - else - timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - - bfqd->last_budget_start = ktime_get(); - - bfq_clear_bfqq_budget_new(bfqq); - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; - - bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", - jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * - timeout_coeff)); -} - -/* - * Move request from internal lists to the request queue dispatch list. + * Move request from internal lists to the dispatch list of the request queue */ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) { - struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq = RQ_BFQQ(rq); /* @@ -1794,15 +2225,9 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) * incrementing bfqq->dispatched. */ bfqq->dispatched++; + bfq_remove_request(rq); elv_dispatch_sort(q, rq); - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight++; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), - rq->cmd_flags); -#endif } /* @@ -1822,18 +2247,12 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) rq = rq_entry_fifo(bfqq->fifo.next); - if (time_before(jiffies, rq->fifo_time)) + if (time_is_after_jiffies(rq->fifo_time)) return NULL; return rq; } -static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - return entity->budget - entity->service; -} - static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { BUG_ON(bfqq != bfqd->in_service_queue); @@ -1850,12 +2269,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_split_coop(bfqq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* - * Overloading budget_timeout field to store the time - * at which the queue remains with no backlog; used by - * the weight-raising mechanism. - */ - bfqq->budget_timeout = jiffies; + if (bfqq->dispatched == 0) + /* + * Overloading budget_timeout field to store + * the time at which the queue remains with no + * backlog and no outstanding request; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, 1); } else { bfq_activate_bfqq(bfqd, bfqq); @@ -1882,10 +2304,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct request *next_rq; int budget, min_budget; - budget = bfqq->max_budget; + BUG_ON(bfqq != bfqd->in_service_queue); + min_budget = bfq_min_budget(bfqd); - BUG_ON(bfqq != bfqd->in_service_queue); + if (bfqq->wr_coeff == 1) + budget = bfqq->max_budget; + else /* + * Use a constant, low budget for weight-raised queues, + * to help achieve a low latency. Keep it slightly higher + * than the minimum possible budget, to cause a little + * bit fewer expirations. 
+ */ + budget = 2 * min_budget; bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); @@ -1894,7 +2325,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - if (bfq_bfqq_sync(bfqq)) { + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { switch (reason) { /* * Caveat: in all the following cases we trade latency @@ -1936,14 +2367,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, break; case BFQ_BFQQ_BUDGET_TIMEOUT: /* - * We double the budget here because: 1) it - * gives the chance to boost the throughput if - * this is not a seeky process (which may have - * bumped into this timeout because of, e.g., - * ZBR), 2) together with charge_full_budget - * it helps give seeky processes higher - * timestamps, and hence be served less - * frequently. + * We double the budget here because it gives + * the chance to boost the throughput if this + * is not a seeky process (and has bumped into + * this timeout because of, e.g., ZBR). */ budget = min(budget * 2, bfqd->bfq_max_budget); break; @@ -1960,17 +2387,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, budget = min(budget * 4, bfqd->bfq_max_budget); break; case BFQ_BFQQ_NO_MORE_REQUESTS: - /* - * Leave the budget unchanged. - */ + /* + * For queues that expire for this reason, it + * is particularly important to keep the + * budget close to the actual service they + * need. Doing so reduces the timestamp + * misalignment problem described in the + * comments in the body of + * __bfq_activate_entity. In fact, suppose + * that a queue systematically expires for + * BFQ_BFQQ_NO_MORE_REQUESTS and presents a + * new request in time to enjoy timestamp + * back-shifting. The larger the budget of the + * queue is with respect to the service the + * queue actually requests in each service + * slot, the more times the queue can be + * reactivated with the same virtual finish + * time. It follows that, even if this finish + * time is pushed to the system virtual time + * to reduce the consequent timestamp + * misalignment, the queue unjustly enjoys for + * many re-activations a lower finish time + * than all newly activated queues. + * + * The service needed by bfqq is measured + * quite precisely by bfqq->entity.service. + * Since bfqq does not enjoy device idling, + * bfqq->entity.service is equal to the number + * of sectors that the process associated with + * bfqq requested to read/write before waiting + * for request completions, or blocking for + * other reasons. + */ + budget = max_t(int, bfqq->entity.service, min_budget); + break; default: return; } - } else + } else if (!bfq_bfqq_sync(bfqq)) /* - * Async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). + * Async queues get always the maximum possible + * budget, as for them we do not care about latency + * (in addition, their ability to dispatch is limited + * by the charging factor). */ budget = bfqd->bfq_max_budget; @@ -1981,65 +2440,105 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); /* - * Make sure that we have enough budget for the next request. 
- * Since the finish time of the bfqq must be kept in sync with - * the budget, be sure to call __bfq_bfqq_expire() after the + * If there is still backlog, then assign a new budget, making + * sure that it is large enough for the next request. Since + * the finish time of bfqq must be kept in sync with the + * budget, be sure to call __bfq_bfqq_expire() *after* this * update. + * + * If there is no backlog, then no need to update the budget; + * it will be updated on the arrival of a new request. */ next_rq = bfqq->next_rq; - if (next_rq) + if (next_rq) { + BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || + reason == BFQ_BFQQ_NO_MORE_REQUESTS); bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); - else - bfqq->entity.budget = bfqq->max_budget; + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + } bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", next_rq ? blk_rq_sectors(next_rq) : 0, bfqq->entity.budget); } -static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) +static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) { - unsigned long max_budget; - /* * The max_budget calculated when autotuning is equal to the - * amount of sectors transfered in timeout_sync at the + * amount of sectors transfered in timeout at the * estimated peak rate. */ - max_budget = (unsigned long)(peak_rate * 1000 * - timeout >> BFQ_RATE_SHIFT); - - return max_budget; + return bfqd->peak_rate * 1000 * jiffies_to_msecs(bfqd->bfq_timeout) >> + BFQ_RATE_SHIFT; } /* - * In addition to updating the peak rate, checks whether the process - * is "slow", and returns 1 if so. This slow flag is used, in addition - * to the budget timeout, to reduce the amount of service provided to - * seeky processes, and hence reduce their chances to lower the - * throughput. See the code for more details. + * Update the read peak rate (quantity used for auto-tuning) as a + * function of the rate at which bfqq has been served, and check + * whether the process associated with bfqq is "slow". Return true if + * the process is slow. The slow flag is used, in addition to the + * budget timeout, to reduce the amount of service provided to seeky + * processes, and hence reduce their chances to lower the + * throughput. More details in the body of the function. + * + * An important observation is in order: with devices with internal + * queues, it is hard if ever possible to know when and for how long + * an I/O request is processed by the device (apart from the trivial + * I/O pattern where a new request is dispatched only after the + * previous one has been completed). This makes it hard to evaluate + * the real rate at which the I/O requests of each bfq_queue are + * served. In fact, for an I/O scheduler like BFQ, serving a + * bfq_queue means just dispatching its requests during its service + * slot, i.e., until the budget of the queue is exhausted, or the + * queue remains idle, or, finally, a timeout fires. But, during the + * service slot of a bfq_queue, the device may be still processing + * requests of bfq_queues served in previous service slots. On the + * opposite end, the requests of the in-service bfq_queue may be + * completed after the service slot of the queue finishes. 
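The autotuned maximum budget computed by bfq_calc_max_budget (earlier in this hunk) is simply the number of sectors transferable at the estimated peak rate within one budget timeout; a sketch with the fixed-point scaling made explicit (the shift value here is illustrative, not necessarily BFQ_RATE_SHIFT):

#include <stdint.h>

#define SKETCH_RATE_SHIFT	16	/* illustrative fixed-point shift */

/* Sectors the device can transfer, at peak_rate, within timeout_ms. */
static unsigned long sketch_calc_max_budget(uint64_t peak_rate_fixed,
					    unsigned int timeout_ms)
{
	return (unsigned long)((peak_rate_fixed * 1000 * timeout_ms)
			       >> SKETCH_RATE_SHIFT);
}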
Anyway, + * unless more sophisticated solutions are used (where possible), the + * sum of the sizes of the requests dispatched during the service slot + * of a bfq_queue is probably the only approximation available for + * the service received by the bfq_queue during its service slot. And, + * as written above, this sum is the quantity used in this function to + * evaluate the peak rate. */ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason) + bool compensate, enum bfqq_expiration reason, + unsigned long *delta_ms) { - u64 bw, usecs, expected, timeout; - ktime_t delta; + u64 bw, bwdiv10, delta_usecs, delta_ms_tmp; + ktime_t delta_ktime; int update = 0; + bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ - if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + if (!bfq_bfqq_sync(bfqq)) return false; if (compensate) - delta = bfqd->last_idling_start; + delta_ktime = bfqd->last_idling_start; else - delta = ktime_get(); - delta = ktime_sub(delta, bfqd->last_budget_start); - usecs = ktime_to_us(delta); + delta_ktime = ktime_get(); + delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); + delta_usecs = ktime_to_us(delta_ktime); /* Don't trust short/unrealistic values. */ - if (usecs < 100 || usecs >= LONG_MAX) - return false; + if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { + if (blk_queue_nonrot(bfqd->queue)) + *delta_ms = BFQ_MIN_TT; /* give same worst-case + guarantees as + idling for seeky + */ + else /* Charge at least one seek */ + *delta_ms = jiffies_to_msecs(bfq_slice_idle); + return slow; + } + + delta_ms_tmp = delta_usecs; + do_div(delta_ms_tmp, 1000); + *delta_ms = delta_ms_tmp; /* * Calculate the bandwidth for the last slice. We use a 64 bit @@ -2048,32 +2547,51 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * and to avoid overflows. */ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; - do_div(bw, (unsigned long)usecs); - - timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + do_div(bw, (unsigned long)delta_usecs); + bfq_log(bfqd, "measured bw = %llu sects/sec", + (1000000*bw)>>BFQ_RATE_SHIFT); /* * Use only long (> 20ms) intervals to filter out spikes for * the peak rate estimation. */ - if (usecs > 20000) { + if (delta_usecs > 20000) { + bool fully_sequential = bfqq->seek_history == 0; + /* + * Soft real-time queues are not good candidates for + * evaluating bw, as they are likely to be slow even + * if sequential. 
+ */ + bool non_soft_rt = bfqq->wr_coeff == 1 || + bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time; + bool consumed_large_budget = + reason == BFQ_BFQQ_BUDGET_EXHAUSTED && + bfqq->entity.budget >= bfqd->bfq_max_budget * 2 / 3; + bool served_for_long_time = + reason == BFQ_BFQQ_BUDGET_TIMEOUT || + consumed_large_budget; + + BUG_ON(bfqq->seek_history == 0 && + hweight32(bfqq->seek_history) != 0); + if (bw > bfqd->peak_rate || - (!BFQQ_SEEKY(bfqq) && - reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { - bfq_log(bfqd, "measured bw =%llu", bw); + (bfq_bfqq_sync(bfqq) && fully_sequential && non_soft_rt && + served_for_long_time)) { /* * To smooth oscillations use a low-pass filter with - * alpha=7/8, i.e., - * new_rate = (7/8) * old_rate + (1/8) * bw + * alpha=9/10, i.e., + * new_rate = (9/10) * old_rate + (1/10) * bw */ - do_div(bw, 8); - if (bw == 0) - return 0; - bfqd->peak_rate *= 7; - do_div(bfqd->peak_rate, 8); - bfqd->peak_rate += bw; + bwdiv10 = bw; + do_div(bwdiv10, 10); + if (bwdiv10 == 0) + return false; /* bw too low to be used */ + bfqd->peak_rate *= 9; + do_div(bfqd->peak_rate, 10); + bfqd->peak_rate += bwdiv10; update = 1; - bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); + bfq_log(bfqd, "new peak_rate = %llu sects/sec", + (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT); } update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; @@ -2086,9 +2604,8 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, int dev_type = blk_queue_nonrot(bfqd->queue); if (bfqd->bfq_user_max_budget == 0) { bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd->peak_rate, - timeout); - bfq_log(bfqd, "new max_budget=%d", + bfq_calc_max_budget(bfqd); + bfq_log(bfqd, "new max_budget = %d", bfqd->bfq_max_budget); } if (bfqd->device_speed == BFQ_BFQD_FAST && @@ -2102,38 +2619,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->RT_prod = R_fast[dev_type] * T_fast[dev_type]; } + bfq_log(bfqd, "dev_speed_class = %d (%d sects/sec), " + "thresh %d setcs/sec", + bfqd->device_speed, + bfqd->device_speed == BFQ_BFQD_FAST ? + (1000000*R_fast[dev_type])>>BFQ_RATE_SHIFT : + (1000000*R_slow[dev_type])>>BFQ_RATE_SHIFT, + (1000000*device_speed_thresh[dev_type])>> + BFQ_RATE_SHIFT); } + /* + * Caveat: processes doing IO in the slower disk zones + * tend to be slow(er) even if not seeky. In this + * respect, the estimated peak rate is likely to be an + * average over the disk surface. Accordingly, to not + * be too harsh with unlucky processes, a process is + * deemed slow only if its bw has been lower than half + * of the estimated peak rate. + */ + slow = bw < bfqd->peak_rate / 2; } - /* - * If the process has been served for a too short time - * interval to let its possible sequential accesses prevail on - * the initial seek time needed to move the disk head on the - * first sector it requested, then give the process a chance - * and for the moment return false. - */ - if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return false; - - /* - * A process is considered ``slow'' (i.e., seeky, so that we - * cannot treat it fairly in the service domain, as it would - * slow down too much the other processes) if, when a slice - * ends for whatever reason, it has received service at a - * rate that would not be high enough to complete the budget - * before the budget timeout expiration. 
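The smoothing filter and the slow-process test introduced in this hunk can be modelled as below (a sketch; integer division as in the original, and a sample below 10 units is discarded just as bwdiv10 == 0 is):

#include <stdbool.h>
#include <stdint.h>

/* Low-pass filter: new_rate = (9/10) * old_rate + (1/10) * sample. */
static uint64_t sketch_update_peak_rate(uint64_t old_rate, uint64_t sample)
{
	uint64_t sample_div10 = sample / 10;

	if (sample_div10 == 0)
		return old_rate;	/* sample too low to be usable */
	return (old_rate * 9) / 10 + sample_div10;
}

/* A process is deemed slow if its measured rate is below half the peak. */
static bool sketch_is_slow(uint64_t measured_bw, uint64_t peak_rate)
{
	return measured_bw < peak_rate / 2;
}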
- */ - expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + bfq_log_bfqq(bfqd, bfqq, + "update_peak_rate: bw %llu sect/s, peak rate %llu, " + "slow %d", + (1000000*bw)>>BFQ_RATE_SHIFT, + (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT, + bw < bfqd->peak_rate / 2); - /* - * Caveat: processes doing IO in the slower disk zones will - * tend to be slow(er) even if not seeky. And the estimated - * peak rate will actually be an average over the disk - * surface. Hence, to not be too harsh with unlucky processes, - * we keep a budget/3 margin of safety before declaring a - * process slow. - */ - return expected > (4 * bfqq->entity.budget) / 3; + return slow; } /* @@ -2191,6 +2705,15 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + bfq_log_bfqq(bfqd, bfqq, + "softrt_next_start: service_blkg %lu " + "soft_rate %u sects/sec" + "interval %u", + bfqq->service_from_backlogged, + bfqd->bfq_wr_max_softrt_rate, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate)); + return max(bfqq->last_idle_bklogged + HZ * bfqq->service_from_backlogged / bfqd->bfq_wr_max_softrt_rate, @@ -2198,13 +2721,21 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, } /* - * Return the largest-possible time instant such that, for as long as possible, - * the current time will be lower than this time instant according to the macro - * time_is_before_jiffies(). + * Return the farthest future time instant according to jiffies + * macros. */ -static unsigned long bfq_infinity_from_now(unsigned long now) +static unsigned long bfq_greatest_from_now(void) { - return now + ULONG_MAX / 2; + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; } /** @@ -2214,28 +2745,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now) * @compensate: if true, compensate for the time spent idling. * @reason: the reason causing the expiration. * + * If the process associated with bfqq does slow I/O (e.g., because it + * issues random requests), we charge bfqq with the time it has been + * in service instead of the service it has received (see + * bfq_bfqq_charge_time for details on how this goal is achieved). As + * a consequence, bfqq will typically get higher timestamps upon + * reactivation, and hence it will be rescheduled as if it had + * received more service than what it has actually received. In the + * end, bfqq receives less service in proportion to how slowly its + * associated process consumes its budgets (and hence how seriously it + * tends to lower the throughput). In addition, this time-charging + * strategy guarantees time fairness among slow processes. In + * contrast, if the process associated with bfqq is not slow, we + * charge bfqq exactly with the service it has received. * - * If the process associated to the queue is slow (i.e., seeky), or in - * case of budget timeout, or, finally, if it is async, we - * artificially charge it an entire budget (independently of the - * actual service it received). As a consequence, the queue will get - * higher timestamps than the correct ones upon reactivation, and - * hence it will be rescheduled as if it had received more service - * than what it actually received. 
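The value returned by bfq_bfqq_softrt_next_start (earlier in this hunk) is the time needed to consume, at the soft real-time maximum rate, the service received while backlogged, measured from the instant the queue last emptied, floored by a lower bound. A sketch (lower_bound stands for the second argument of the max(), which falls outside the visible context):

/* Earliest time at which the queue may again be deemed soft real-time. */
static unsigned long sketch_softrt_next_start(unsigned long last_idle_bklogged,
					      unsigned long hz,
					      unsigned long service_from_backlogged,
					      unsigned int max_softrt_rate,
					      unsigned long lower_bound)
{
	unsigned long candidate = last_idle_bklogged +
		hz * service_from_backlogged / max_softrt_rate;

	return candidate > lower_bound ? candidate : lower_bound;
}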
In the end, this class of processes - * will receive less service in proportion to how slowly they consume - * their budgets (and hence how seriously they tend to lower the - * throughput). - * - * In contrast, when a queue expires because it has been idling for - * too much or because it exhausted its budget, we do not touch the - * amount of service it has received. Hence when the queue will be - * reactivated and its timestamps updated, the latter will be in sync - * with the actual service received by the queue until expiration. - * - * Charging a full budget to the first type of queues and the exact - * service to the others has the effect of using the WF2Q+ policy to - * schedule the former on a timeslice basis, without violating the - * service domain guarantees of the latter. + * Charging time to the first type of queues and the exact service to + * the other has the effect of using the WF2Q+ policy to schedule the + * former on a timeslice basis, without violating service domain + * guarantees among the latter. */ static void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -2243,40 +2770,53 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, enum bfqq_expiration reason) { bool slow; + unsigned long delta = 0; + struct bfq_entity *entity = &bfqq->entity; + BUG_ON(bfqq != bfqd->in_service_queue); /* - * Update disk peak rate for autotuning and check whether the + * Update device peak rate for autotuning and check whether the * process is slow (see bfq_update_peak_rate). */ - slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason, &delta); /* - * As above explained, 'punish' slow (i.e., seeky), timed-out - * and async queues, to favor sequential sync workloads. + * Increase service_from_backlogged before next statement, + * because the possible next invocation of + * bfq_bfqq_charge_time would likely inflate + * entity->service. In contrast, service_from_backlogged must + * contain real service, to enable the soft real-time + * heuristic to correctly compute the bandwidth consumed by + * bfqq. + */ + bfqq->service_from_backlogged += entity->service; + + /* + * As above explained, charge slow (typically seeky) and + * timed-out queues with the time and not the service + * received, to favor sequential workloads. * - * Processes doing I/O in the slower disk zones will tend to be - * slow(er) even if not seeky. Hence, since the estimated peak - * rate is actually an average over the disk surface, these - * processes may timeout just for bad luck. To avoid punishing - * them we do not charge a full budget to a process that - * succeeded in consuming at least 2/3 of its budget. + * Processes doing I/O in the slower disk zones will tend to + * be slow(er) even if not seeky. Therefore, since the + * estimated peak rate is actually an average over the disk + * surface, these processes may timeout just for bad luck. To + * avoid punishing them, do not charge time to processes that + * succeeded in consuming at least 2/3 of their budget. This + * allows BFQ to preserve enough elasticity to still perform + * bandwidth, and not time, distribution with little unlucky + * or quasi-sequential processes. 
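The decision of whether to charge time rather than service, applied right below, condenses to the following (a sketch with flattened inputs):

#include <stdbool.h>

/* Charge wall-clock time instead of service to slow or timed-out queues. */
static bool sketch_charge_time_instead_of_service(unsigned int wr_coeff,
						  bool slow,
						  bool budget_timeout,
						  unsigned long budget_left,
						  unsigned long budget)
{
	return wr_coeff == 1 &&
	       (slow ||
		(budget_timeout && budget_left >= budget / 3));
}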
*/ - if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) - bfq_bfqq_charge_full_budget(bfqq); + if (bfqq->wr_coeff == 1 && + (slow || + (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) + bfq_bfqq_charge_time(bfqd, bfqq, delta); - bfqq->service_from_backlogged += bfqq->entity.service; - - if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && - !bfq_bfqq_constantly_seeky(bfqq)) { - bfq_mark_bfqq_constantly_seeky(bfqq); - if (!blk_queue_nonrot(bfqd->queue)) - bfqd->const_seeky_busy_in_flight_queues++; - } + BUG_ON(bfqq->entity.budget < bfqq->entity.service); if (reason == BFQ_BFQQ_TOO_IDLE && - bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) + entity->service <= 2 * entity->budget / 10 ) bfq_clear_bfqq_IO_bound(bfqq); if (bfqd->low_latency && bfqq->wr_coeff == 1) @@ -2285,19 +2825,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && RB_EMPTY_ROOT(&bfqq->sort_list)) { /* - * If we get here, and there are no outstanding requests, - * then the request pattern is isochronous (see the comments - * to the function bfq_bfqq_softrt_next_start()). Hence we - * can compute soft_rt_next_start. If, instead, the queue - * still has outstanding requests, then we have to wait - * for the completion of all the outstanding requests to + * If we get here, and there are no outstanding + * requests, then the request pattern is isochronous + * (see the comments on the function + * bfq_bfqq_softrt_next_start()). Thus we can compute + * soft_rt_next_start. If, instead, the queue still + * has outstanding requests, then we have to wait for + * the completion of all the outstanding requests to * discover whether the request pattern is actually * isochronous. */ - if (bfqq->dispatched == 0) + BUG_ON(bfqd->busy_queues < 1); + if (bfqq->dispatched == 0) { bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); - else { + bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", + bfqq->soft_rt_next_start); + } else { /* * The application is still waiting for the * completion of one or more requests: @@ -2314,7 +2858,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * happened to be in the past. */ bfqq->soft_rt_next_start = - bfq_infinity_from_now(jiffies); + bfq_greatest_from_now(); /* * Schedule an update of soft_rt_next_start to when * the task may be discovered to be isochronous. @@ -2324,15 +2868,27 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, - slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", + reason, slow, bfqq->dispatched, + bfq_bfqq_idle_window(bfqq), entity->weight); /* * Increase, decrease or leave budget unchanged according to * reason. 
*/ + BUG_ON(bfqq->entity.budget < bfqq->entity.service); __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); __bfq_bfqq_expire(bfqd, bfqq); + + BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && + !bfq_class_idle(bfqq)); + + if (!bfq_bfqq_busy(bfqq) && + reason != BFQ_BFQQ_BUDGET_TIMEOUT && + reason != BFQ_BFQQ_BUDGET_EXHAUSTED) + bfq_mark_bfqq_non_blocking_wait_rq(bfqq); } /* @@ -2342,20 +2898,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, */ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { - if (bfq_bfqq_budget_new(bfqq) || - time_before(jiffies, bfqq->budget_timeout)) - return false; - return true; + return time_is_before_eq_jiffies(bfqq->budget_timeout); } /* - * If we expire a queue that is waiting for the arrival of a new - * request, we may prevent the fictitious timestamp back-shifting that - * allows the guarantees of the queue to be preserved (see [1] for - * this tricky aspect). Hence we return true only if this condition - * does not hold, or if the queue is slow enough to deserve only to be - * kicked off for preserving a high throughput. -*/ + * If we expire a queue that is actively waiting (i.e., with the + * device idled) for the arrival of a new request, then we may incur + * the timestamp misalignment problem described in the body of the + * function __bfq_activate_entity. Hence we return true only if this + * condition does not hold, or if the queue is slow enough to deserve + * only to be kicked off for preserving a high throughput. + */ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, @@ -2397,10 +2950,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; bool idling_boosts_thr, idling_boosts_thr_without_issues, - all_queues_seeky, on_hdd_and_not_all_queues_seeky, idling_needed_for_service_guarantees, asymmetric_scenario; + if (bfqd->strict_guarantees) + return true; + /* * The next variable takes into account the cases where idling * boosts the throughput. @@ -2422,7 +2977,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) */ idling_boosts_thr = !bfqd->hw_tag || (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && - bfq_bfqq_idle_window(bfqq)) ; + bfq_bfqq_idle_window(bfqq)); /* * The value of the next variable, @@ -2463,74 +3018,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) bfqd->wr_busy_queues == 0; /* - * There are then two cases where idling must be performed not + * There is then a case where idling must be performed not * for throughput concerns, but to preserve service - * guarantees. In the description of these cases, we say, for - * short, that a queue is sequential/random if the process - * associated to the queue issues sequential/random requests - * (in the second case the queue may be tagged as seeky or - * even constantly_seeky). - * - * To introduce the first case, we note that, since - * bfq_bfqq_idle_window(bfqq) is false if the device is - * NCQ-capable and bfqq is random (see - * bfq_update_idle_window()), then, from the above two - * assignments it follows that - * idling_boosts_thr_without_issues is false if the device is - * NCQ-capable and bfqq is random. Therefore, for this case, - * device idling would never be allowed if we used just - * idling_boosts_thr_without_issues to decide whether to allow - * it. 
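The simplified budget-timeout check a few hunks up, like bfq_greatest_from_now() and bfq_smallest_from_now(), relies on wrap-safe jiffies comparisons. A userspace sketch of the signed-subtraction idea behind them (this models the concept, not the kernel macros themselves):

#include <stdio.h>
#include <limits.h>

/* same idea as the kernel's signed-subtraction time macros, not the macros */
static int before_eq_now(unsigned long t, unsigned long now)
{
	return (long)(t - now) <= 0;
}

int main(void)
{
	unsigned long now = 16;			/* pretend jiffies, just wrapped */
	unsigned long old = ULONG_MAX - 15;	/* stamp taken before the wrap */

	printf("old stamp expired: %d\n", before_eq_now(old, now));		/* 1 */
	printf("future stamp expired: %d\n", before_eq_now(now + 100, now));	/* 0 */
	return 0;
}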
And, beneficially, this would imply that throughput - * would always be boosted also with random I/O on NCQ-capable - * HDDs. + * guarantees. * - * But we must be careful on this point, to avoid an unfair - * treatment for bfqq. In fact, because of the same above - * assignments, idling_boosts_thr_without_issues is, on the - * other hand, true if 1) the device is an HDD and bfqq is - * sequential, and 2) there are no busy weight-raised - * queues. As a consequence, if we used just - * idling_boosts_thr_without_issues to decide whether to idle - * the device, then with an HDD we might easily bump into a - * scenario where queues that are sequential and I/O-bound - * would enjoy idling, whereas random queues would not. The - * latter might then get a low share of the device throughput, - * simply because the former would get many requests served - * after being set as in service, while the latter would not. - * - * To address this issue, we start by setting to true a - * sentinel variable, on_hdd_and_not_all_queues_seeky, if the - * device is rotational and not all queues with pending or - * in-flight requests are constantly seeky (i.e., there are - * active sequential queues, and bfqq might then be mistreated - * if it does not enjoy idling because it is random). - */ - all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && - bfqd->busy_in_flight_queues == - bfqd->const_seeky_busy_in_flight_queues; - - on_hdd_and_not_all_queues_seeky = - !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; - - /* - * To introduce the second case where idling needs to be - * performed to preserve service guarantees, we can note that - * allowing the drive to enqueue more than one request at a - * time, and hence delegating de facto final scheduling - * decisions to the drive's internal scheduler, causes loss of - * control on the actual request service order. In particular, - * the critical situation is when requests from different - * processes happens to be present, at the same time, in the - * internal queue(s) of the drive. In such a situation, the - * drive, by deciding the service order of the - * internally-queued requests, does determine also the actual - * throughput distribution among these processes. But the - * drive typically has no notion or concern about per-process - * throughput distribution, and makes its decisions only on a - * per-request basis. Therefore, the service distribution - * enforced by the drive's internal scheduler is likely to - * coincide with the desired device-throughput distribution - * only in a completely symmetric scenario where: + * To introduce this case, we can note that allowing the drive + * to enqueue more than one request at a time, and hence + * delegating de facto final scheduling decisions to the + * drive's internal scheduler, entails loss of control on the + * actual request service order. In particular, the critical + * situation is when requests from different processes happen + * to be present, at the same time, in the internal queue(s) + * of the drive. In such a situation, the drive, by deciding + * the service order of the internally-queued requests, does + * determine also the actual throughput distribution among + * these processes. But the drive typically has no notion or + * concern about per-process throughput distribution, and + * makes its decisions only on a per-request basis. 
Therefore, + * the service distribution enforced by the drive's internal + * scheduler is likely to coincide with the desired + * device-throughput distribution only in a completely + * symmetric scenario where: * (i) each of these processes must get the same throughput as * the others; * (ii) all these processes have the same I/O pattern @@ -2552,26 +3060,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * words, only if sub-condition (i) holds, then idling is * allowed, and the device tends to be prevented from queueing * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that, first, - * in the case of an HDD, the asymmetry in terms of types of - * I/O patterns is already taken in to account in the above - * sentinel variable - * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a - * flash-based device, we prefer however to privilege - * throughput (and idling lowers throughput for this type of - * devices), for the following reasons: - * 1) differently from HDDs, the service time of random - * requests is not orders of magnitudes lower than the service - * time of sequential requests; thus, even if processes doing - * sequential I/O get a preferential treatment with respect to - * others doing random I/O, the consequences are not as - * dramatic as with HDDs; - * 2) if a process doing random I/O does need strong - * throughput guarantees, it is hopefully already being - * weight-raised, or the user is likely to have assigned it a - * higher weight than the other processes (and thus - * sub-condition (i) is likely to be false, which triggers - * idling). + * for not controlling also sub-condition (ii) is that we + * exploit preemption to preserve guarantees in case of + * symmetric scenarios, even if (ii) does not hold, as + * explained in the next two paragraphs. + * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next + * request of Q arrives soon (see the comments on + * bfq_bfqq_update_budg_for_activation). If all queues and + * groups have the same weight, this form of preemption, + * combined with the hole-recovery heuristic described in the + * comments on function bfq_bfqq_update_budg_for_activation, + * are enough to preserve a correct bandwidth distribution in + * the mid term, even without idling. In fact, even if not + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a + * minimum of mid-term fairness. The motivation for using + * preemption instead of idling is that, by not idling, + * service guarantees are preserved without minimally + * sacrificing throughput. In other words, both a high + * throughput and its desired distribution are obtained. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per + * second. This can be seen with a simple example. Suppose + * that there are two queues with the same weight, but that + * the first queue receives requests of 8 sectors, while the + * second queue receives requests of 1024 sectors. In + * addition, suppose that each of the two queues contains at + * most one request at a time, which implies that each queue + * always remains idle after it is served. Finally, after + * remaining idle, each queue receives very quickly a new + * request. 
It follows that the two queues are served + * alternatively, preempting each other if needed. This + * implies that, although both queues have the same weight, + * the queue with large requests receives a service that is + * 1024/8 times as high as the service received by the other + * queue. + * + * On the other hand, device idling is performed, and thus + * pure sector-domain guarantees are provided, for the + * following queues, which are likely to need stronger + * throughput guarantees: weight-raised queues, and queues + * with a higher weight than other queues. When such queues + * are active, sub-condition (i) is false, which triggers + * device idling. * * According to the above considerations, the next variable is * true (only) if sub-condition (i) holds. To compute the @@ -2579,7 +3114,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * the function bfq_symmetric_scenario(), but also check * whether bfqq is being weight-raised, because * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments to + * weight-raised queues (see comments on * bfq_weights_tree_add()). * * As a side note, it is worth considering that the above @@ -2601,17 +3136,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * bfqq. Such a case is when bfqq became active in a burst of * queue activations. Queues that became active during a large * burst benefit only from throughput, as discussed in the - * comments to bfq_handle_burst. Thus, if bfqq became active + * comments on bfq_handle_burst. Thus, if bfqq became active * in a burst and not idling the device maximizes throughput, * then the device must no be idled, because not idling the * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the two cases above, we - * can now establish when idling is actually needed to - * preserve service guarantees. + * maximum benefit. Combining this and the above case, we can + * now establish when idling is actually needed to preserve + * service guarantees. */ idling_needed_for_service_guarantees = - (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && - !bfq_bfqq_in_large_burst(bfqq); + asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); /* * We have now all the components we need to compute the return @@ -2621,6 +3155,14 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * 2) idling either boosts the throughput (without issues), or * is necessary to preserve service guarantees. */ + bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d " + "wr_busy %d boosts %d IO-bound %d guar %d", + bfq_bfqq_sync(bfqq), idling_boosts_thr, + bfqd->wr_busy_queues, + idling_boosts_thr_without_issues, + bfq_bfqq_IO_bound(bfqq), + idling_needed_for_service_guarantees); + return bfq_bfqq_sync(bfqq) && (idling_boosts_thr_without_issues || idling_needed_for_service_guarantees); @@ -2632,7 +3174,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * 1) the queue must remain in service and cannot be expired, and * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments to the function bfq_bfqq_may_idle for the reasons + * See the comments on the function bfq_bfqq_may_idle for the reasons * why performing device idling is the best choice to boost the throughput * and preserve service guarantees when bfq_bfqq_may_idle itself * returns true. 
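The 1024/8 outcome described above follows from plain arithmetic; a tiny standalone program that replays the example:

#include <stdio.h>

int main(void)
{
	unsigned int small_rq = 8, large_rq = 1024;	/* sectors per request */
	unsigned int rounds = 1000;			/* one request each per round */

	unsigned long small_total = (unsigned long)rounds * small_rq;
	unsigned long large_total = (unsigned long)rounds * large_rq;

	/* equal IOPS and equal weights, yet a 128:1 split of the bandwidth */
	printf("sectors served: small %lu, large %lu (ratio %lu:1)\n",
	       small_total, large_total, large_total / small_total);
	return 0;
}

With idling, the small-request queue would instead hold the device for its slice, pulling the split back toward sector fairness, which is why idling is reserved for the queues listed above.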
@@ -2698,9 +3240,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) */ bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif } goto keep_queue; } @@ -2745,14 +3285,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); /* - * If the queue was activated in a burst, or - * too much time has elapsed from the beginning - * of this weight-raising period, or the queue has - * exceeded the acceptable number of cooperations, - * then end weight raising. + * If the queue was activated in a burst, or too much + * time has elapsed from the beginning of this + * weight-raising period, then end weight raising. */ if (bfq_bfqq_in_large_burst(bfqq) || - bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time)) { bfqq->last_wr_start_finish = jiffies; @@ -2811,13 +3348,29 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, */ if (!bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); + BUG_ON(bfqq->entity.budget < bfqq->entity.service); goto expire; } + BUG_ON(bfqq->entity.budget < bfqq->entity.service); /* Finally, insert request into driver dispatch list. */ bfq_bfqq_served(bfqq, service_to_charge); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + bfq_dispatch_insert(bfqd->queue, rq); + /* + * If weight raising has to terminate for bfqq, then next + * function causes an immediate update of bfqq's weight, + * without waiting for next activation. As a consequence, on + * expiration, bfqq will be timestamped as if has never been + * weight-raised during this service slot, even if it has + * received part or even most of the service as a + * weight-raised queue. This inflates bfqq's timestamps, which + * is beneficial, as bfqq is then more willing to leave the + * device immediately to possible other weight-raised queues. 
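A stripped-down model of the weight-raising cut-off used in bfq_update_wr_data() above, with the wrap-safe time macro replaced by a plain comparison and all values invented:

#include <stdio.h>

int main(void)
{
	unsigned long jiffies = 100000;			/* pretend current time */
	unsigned long last_wr_start_finish = 98000;	/* when this wr period began */
	unsigned long wr_cur_max_time = 300;		/* allowed length, in jiffies */
	int in_large_burst = 0;

	/* mirrors the two cut-off conditions, minus wrap handling */
	int end_wr = in_large_burst ||
		     jiffies > last_wr_start_finish + wr_cur_max_time;

	printf("end weight raising: %d (elapsed %lu, max %lu)\n",
	       end_wr, jiffies - last_wr_start_finish, wr_cur_max_time);
	return 0;
}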
+ */ bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, @@ -2833,9 +3386,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfqd->in_service_bic = RQ_BIC(rq); } - if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && - dispatched >= bfqd->bfq_max_budget_async_rq) || - bfq_class_idle(bfqq))) + if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) goto expire; return dispatched; @@ -2881,8 +3432,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) st = bfq_entity_service_tree(&bfqq->entity); dispatched += __bfq_forced_dispatch_bfqq(bfqq); - bfqq->max_budget = bfq_max_budget(bfqd); + bfqq->max_budget = bfq_max_budget(bfqd); bfq_forget_idle(st); } @@ -2895,9 +3446,9 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq; - int max_dispatch; bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) return 0; @@ -2908,21 +3459,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) if (!bfqq) return 0; - if (bfq_class_idle(bfqq)) - max_dispatch = 1; - - if (!bfq_bfqq_sync(bfqq)) - max_dispatch = bfqd->bfq_max_budget_async_rq; - - if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { - if (bfqd->busy_queues > 1) - return 0; - if (bfqq->dispatched >= 4 * max_dispatch) - return 0; - } - - if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) - return 0; + BUG_ON(bfqq->entity.budget < bfqq->entity.service); bfq_clear_bfqq_wait_request(bfqq); BUG_ON(timer_pending(&bfqd->idle_slice_timer)); @@ -2933,6 +3470,8 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", bfq_bfqq_sync(bfqq) ? "sync" : "async"); + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); return 1; } @@ -2944,23 +3483,22 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) */ static void bfq_put_queue(struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqg = bfqq_group(bfqq); #endif - BUG_ON(atomic_read(&bfqq->ref) <= 0); + BUG_ON(bfqq->ref <= 0); - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, - atomic_read(&bfqq->ref)); - if (!atomic_dec_and_test(&bfqq->ref)) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); + bfqq->ref--; + if (bfqq->ref) return; BUG_ON(rb_first(&bfqq->sort_list)); BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqd->in_service_queue == bfqq); + BUG_ON(bfqq->bfqd->in_service_queue == bfqq); if (bfq_bfqq_sync(bfqq)) /* @@ -2973,7 +3511,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) */ hlist_del_init(&bfqq->burst_list_node); - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -3007,8 +3545,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_schedule_dispatch(bfqd); } - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, - atomic_read(&bfqq->ref)); + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); bfq_put_cooperator(bfqq); @@ -3019,26 +3556,7 @@ static void bfq_init_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); - bic->ttime.last_end_request = jiffies; - /* - * A newly created bic indicates that the process has just - * started doing I/O, and is probably 
mapping into memory its - * executable and libraries: it definitely needs weight raising. - * There is however the possibility that the process performs, - * for a while, I/O close to some other process. EQM intercepts - * this behavior and may merge the queue corresponding to the - * process with some other queue, BEFORE the weight of the queue - * is raised. Merged queues are not weight-raised (they are assumed - * to belong to processes that benefit only from high throughput). - * If the merge is basically the consequence of an accident, then - * the queue will be split soon and will get back its old weight. - * It is then important to write down somewhere that this queue - * does need weight raising, even if it did not make it to get its - * weight raised before being merged. To this purpose, we overload - * the field raising_time_left and assign 1 to it, to mark the queue - * as needing weight raising. - */ - bic->wr_time_left = 1; + bic->ttime.last_end_request = bfq_smallest_from_now(); } static void bfq_exit_icq(struct io_cq *icq) @@ -3046,21 +3564,21 @@ static void bfq_exit_icq(struct io_cq *icq) struct bfq_io_cq *bic = icq_to_bic(icq); struct bfq_data *bfqd = bic_to_bfqd(bic); - if (bic->bfqq[BLK_RW_ASYNC]) { - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); - bic->bfqq[BLK_RW_ASYNC] = NULL; + if (bic_to_bfqq(bic, false)) { + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); + bic_set_bfqq(bic, NULL, false); } - if (bic->bfqq[BLK_RW_SYNC]) { + if (bic_to_bfqq(bic, true)) { /* * If the bic is using a shared queue, put the reference * taken on the io_context when the bic started using a * shared bfq_queue. */ - if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) + if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) put_io_context(icq->ioc); - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); - bic->bfqq[BLK_RW_SYNC] = NULL; + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); + bic_set_bfqq(bic, NULL, true); } } @@ -3068,7 +3586,8 @@ static void bfq_exit_icq(struct io_cq *icq) * Update the entity prio values; note that the new values will not * be used until the next (re)activation. */ -static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + struct bfq_io_cq *bic) { struct task_struct *tsk = current; int ioprio_class; @@ -3100,7 +3619,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b break; } - if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_BE_NR) { printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); BUG(); @@ -3108,45 +3627,40 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "set_next_ioprio_data: bic_class %d prio %d class %d", + ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); } static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { - struct bfq_data *bfqd; - struct bfq_queue *bfqq, *new_bfqq; + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_queue *bfqq; unsigned long uninitialized_var(flags); int ioprio = bic->icq.ioc->ioprio; - bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), - &flags); /* * This condition may trigger on a newly created bic, be sure to * drop the lock before returning. 
*/ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) - goto out; + return; bic->ioprio = ioprio; - bfqq = bic->bfqq[BLK_RW_ASYNC]; + bfqq = bic_to_bfqq(bic, false); if (bfqq) { - new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, - GFP_ATOMIC); - if (new_bfqq) { - bic->bfqq[BLK_RW_ASYNC] = new_bfqq; - bfq_log_bfqq(bfqd, bfqq, - "check_ioprio_change: bfqq %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bic_set_bfqq(bic, bfqq, false); + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, bfqq->ref); } - bfqq = bic->bfqq[BLK_RW_SYNC]; + bfqq = bic_to_bfqq(bic, true); if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); - -out: - bfq_put_bfqd_unlock(bfqd, &flags); } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -3155,8 +3669,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - atomic_set(&bfqq->ref, 0); + bfqq->ref = 0; bfqq->bfqd = bfqd; if (bic) @@ -3166,6 +3681,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfq_class_idle(bfqq)) bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); + bfq_mark_bfqq_just_created(bfqq); } else bfq_clear_bfqq_sync(bfqq); bfq_mark_bfqq_IO_bound(bfqq); @@ -3175,72 +3691,17 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->pid = pid; bfqq->wr_coeff = 1; - bfqq->last_wr_start_finish = 0; + bfqq->last_wr_start_finish = bfq_smallest_from_now(); + bfqq->budget_timeout = bfq_smallest_from_now(); + bfqq->split_time = bfq_smallest_from_now(); /* * Set to the value for which bfqq will not be deemed as * soft rt when it becomes backlogged. */ - bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); -} - -static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, - gfp_t gfp_mask) -{ - struct bfq_group *bfqg; - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct blkcg *blkcg; + bfqq->soft_rt_next_start = bfq_greatest_from_now(); -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - bfqg = bfq_find_alloc_group(bfqd, blkcg); - /* bic always exists here */ - bfqq = bic_to_bfqq(bic, is_sync); - - /* - * Always try a new alloc if we fall back to the OOM bfqq - * originally, since it should just be a temporary situation. 
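bfq_check_ioprio_change() above reacts to a new bic->icq.ioc->ioprio, a single value that packs both the scheduling class and the per-class level, and bfq_get_queue() below unpacks it the same way. A quick standalone illustration of that encoding, using the 13-bit class shift of the kernel's ioprio interface:

#include <stdio.h>

#define IOPRIO_CLASS_SHIFT 13
#define IOPRIO_PRIO_VALUE(cls, data) (((cls) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_PRIO_CLASS(mask)      ((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask)       ((mask) & ((1 << IOPRIO_CLASS_SHIFT) - 1))

int main(void)
{
	int ioprio = IOPRIO_PRIO_VALUE(2 /* best-effort class */, 4 /* level */);

	printf("raw %#x -> class %d, data %d\n", ioprio,
	       IOPRIO_PRIO_CLASS(ioprio), IOPRIO_PRIO_DATA(ioprio));
	return 0;
}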
- */ - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = NULL; - if (new_bfqq) { - bfqq = new_bfqq; - new_bfqq = NULL; - } else if (gfpflags_allow_blocking(gfp_mask)) { - rcu_read_unlock(); - spin_unlock_irq(bfqd->queue->queue_lock); - new_bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq) - goto retry; - } else { - bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - } - - if (bfqq) { - bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); - bfq_init_entity(&bfqq->entity, bfqg); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { - bfqq = &bfqd->oom_bfqq; - bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); - } - } - - if (new_bfqq) - kmem_cache_free(bfq_pool, new_bfqq); - - rcu_read_unlock(); - - return bfqq; + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, @@ -3263,44 +3724,60 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, gfp_t gfp_mask) + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq = NULL; + struct bfq_queue *bfqq; + struct bfq_group *bfqg; - if (!is_sync) { - struct blkcg *blkcg; - struct bfq_group *bfqg; + rcu_read_lock(); + + bfqg = bfq_find_set_group(bfqd,bio_blkcg(bio)); + if (!bfqg) { + bfqq = &bfqd->oom_bfqq; + goto out; + } - rcu_read_lock(); - blkcg = bio_blkcg(bio); - rcu_read_unlock(); - bfqg = bfq_find_alloc_group(bfqd, blkcg); + if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; + if (bfqq) + goto out; } - if (!bfqq) - bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); + bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, + bfqd->queue->node); + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + goto out; + } /* * Pin the queue now that it's allocated, scheduler exit will * prune it. */ - if (!is_sync && !(*async_bfqq)) { - atomic_inc(&bfqq->ref); + if (async_bfqq) { + bfqq->ref++; bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); *async_bfqq = bfqq; } - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, - atomic_read(&bfqq->ref)); +out: + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + rcu_read_unlock(); return bfqq; } @@ -3316,37 +3793,21 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, bic->ttime.ttime_samples; } -static void bfq_update_io_seektime(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *rq) -{ - sector_t sdist; - u64 total; - if (bfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - bfqq->last_request_pos; - else - sdist = bfqq->last_request_pos - blk_rq_pos(rq); - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc. 
- */ - if (bfqq->seek_samples == 0) /* first request, not really a seek */ - sdist = 0; - else if (bfqq->seek_samples <= 60) /* second & third seek */ - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); - - bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; - bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; - total = bfqq->seek_total + (bfqq->seek_samples/2); - do_div(total, bfqq->seek_samples); - bfqq->seek_mean = (sector_t)total; +static void +bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + sector_t sdist = 0; + if (bfqq->last_request_pos) { + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + } - bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, - (u64)bfqq->seek_mean); + bfqq->seek_history <<= 1; + bfqq->seek_history |= (sdist > BFQQ_SEEK_THR); } /* @@ -3364,7 +3825,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, return; /* Idle window just restored, statistics are meaningless. */ - if (bfq_bfqq_just_split(bfqq)) + if (time_is_after_eq_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) return; enable_idle = bfq_bfqq_idle_window(bfqq); @@ -3404,22 +3866,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_update_io_thinktime(bfqd, bic); bfq_update_io_seektime(bfqd, bfqq, rq); - if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { - bfq_clear_bfqq_constantly_seeky(bfqq); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || !BFQQ_SEEKY(bfqq)) bfq_update_idle_window(bfqd, bfqq, bic); - bfq_clear_bfqq_just_split(bfqq); bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), - (long long unsigned)bfqq->seek_mean); + "rq_enqueued: idle_window=%d (seeky %d)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); @@ -3433,14 +3886,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * is small and the queue is not to be expired, then * just exit. * - * In this way, if the disk is being idled to wait for - * a new request from the in-service queue, we avoid - * unplugging the device and committing the disk to serve - * just a small request. On the contrary, we wait for - * the block layer to decide when to unplug the device: - * hopefully, new requests will be merged to this one - * quickly, then the device will be unplugged and - * larger requests will be dispatched. + * In this way, if the device is being idled to wait + * for a new request from the in-service queue, we + * avoid unplugging the device and committing the + * device to serve just a small request. On the + * contrary, we wait for the block layer to decide + * when to unplug the device: hopefully, new requests + * will be merged to this one quickly, then the device + * will be unplugged and larger requests will be + * dispatched. 
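The replacement seek detection above keeps a one-bit-per-request history instead of a running mean. A userspace model of it, where the distance threshold, the cutoff and the popcount evaluation are assumptions standing in for BFQQ_SEEK_THR and BFQQ_SEEKY() from bfq.h:

#include <stdio.h>
#include <stdint.h>

#define SEEK_THR	(8 * 1024)	/* hypothetical distance threshold, sectors */
#define SEEKY_MIN_HITS	4		/* hypothetical cutoff for "seeky" */

static uint32_t seek_history;

static void record_request(uint64_t last_pos, uint64_t pos)
{
	uint64_t sdist = pos > last_pos ? pos - last_pos : last_pos - pos;

	/* mirrors: seek_history <<= 1; seek_history |= (sdist > BFQQ_SEEK_THR) */
	seek_history <<= 1;
	seek_history |= (sdist > SEEK_THR);
}

static int queue_is_seeky(void)
{
	/* stand-in for BFQQ_SEEKY(): count the seeky samples recorded so far */
	return __builtin_popcount(seek_history) >= SEEKY_MIN_HITS;
}

int main(void)
{
	uint64_t pos = 0;

	for (int i = 0; i < 8; i++) {		/* random-looking access pattern */
		uint64_t next = pos + 1000000 + i;

		record_request(pos, next);
		pos = next;
	}
	printf("seeky: %d (history 0x%x)\n", queue_is_seeky(), seek_history);
	return 0;
}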
*/ if (small_req && !budget_timeout) return; @@ -3453,9 +3907,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif /* * The queue is not empty, because a new request just @@ -3499,27 +3951,19 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) */ new_bfqq->allocated[rq_data_dir(rq)]++; bfqq->allocated[rq_data_dir(rq)]--; - atomic_inc(&new_bfqq->ref); + new_bfqq->ref++; + bfq_clear_bfqq_just_created(bfqq); bfq_put_queue(bfqq); if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); rq->elv.priv[1] = new_bfqq; bfqq = new_bfqq; - } else - bfq_bfqq_increase_failed_cooperations(bfqq); + } } bfq_add_request(rq); - /* - * Here a newly-created bfq_queue has already started a weight-raising - * period: clear raising_time_left to prevent bfq_bfqq_save_state() - * from assigning it a full weight-raising period. See the detailed - * comments about this field in bfq_init_icq(). - */ - if (bfqq->bic) - bfqq->bic->wr_time_left = 0; rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); @@ -3528,8 +3972,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) static void bfq_update_hw_tag(struct bfq_data *bfqd) { - bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, - bfqd->rq_in_driver); + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, + bfqd->rq_in_driver); if (bfqd->hw_tag == 1) return; @@ -3555,48 +3999,45 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; - bool sync = bfq_bfqq_sync(bfqq); - bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", - blk_rq_sectors(rq), sync); + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", + blk_rq_sectors(rq)); + assert_spin_locked(bfqd->queue->queue_lock); bfq_update_hw_tag(bfqd); BUG_ON(!bfqd->rq_in_driver); BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_completion(bfqq_group(bfqq), rq_start_time_ns(rq), rq_io_start_time_ns(rq), rq->cmd_flags); -#endif if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + /* + * Set budget_timeout (which we overload to store the + * time at which the queue remains with no backlog and + * no outstanding request; used by the weight-raising + * mechanism). + */ + bfqq->budget_timeout = jiffies; + bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->busy_in_flight_queues); - bfqd->busy_in_flight_queues--; - if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON(!bfqd-> - const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } } - if (sync) { - bfqd->sync_flight--; - RQ_BIC(rq)->ttime.last_end_request = jiffies; - } + RQ_BIC(rq)->ttime.last_end_request = jiffies; /* - * If we are waiting to discover whether the request pattern of the - * task associated with the queue is actually isochronous, and - * both requisites for this condition to hold are satisfied, then - * compute soft_rt_next_start (see the comments to the function - * bfq_bfqq_softrt_next_start()). 
+ * If we are waiting to discover whether the request pattern + * of the task associated with the queue is actually + * isochronous, and both requisites for this condition to hold + * are now satisfied, then compute soft_rt_next_start (see the + * comments on the function bfq_bfqq_softrt_next_start()). We + * schedule this delayed check when bfqq expires, if it still + * has in-flight requests. */ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && RB_EMPTY_ROOT(&bfqq->sort_list)) @@ -3608,10 +4049,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) * or if we want to idle in case it has no pending requests. */ if (bfqd->in_service_queue == bfqq) { - if (bfq_bfqq_budget_new(bfqq)) - bfq_set_budget_timeout(bfqd); - - if (bfq_bfqq_must_idle(bfqq)) { + if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { bfq_arm_slice_timer(bfqd); goto out; } else if (bfq_may_expire_for_budg_timeout(bfqq)) @@ -3682,14 +4120,14 @@ static void bfq_put_request(struct request *rq) rq->elv.priv[1] = NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); } } /* * Returns NULL if a new bfqq should be allocated, or the old bfqq if this - * was the last process referring to said bfqq. + * was the last process referring to that bfqq. */ static struct bfq_queue * bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) @@ -3727,11 +4165,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, unsigned long flags; bool split = false; - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - - bfq_check_ioprio_change(bic, bio); - spin_lock_irqsave(q->queue_lock, flags); + bfq_check_ioprio_change(bic, bio); if (!bic) goto queue_fail; @@ -3741,23 +4176,47 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, new_queue: bfqq = bic_to_bfqq(bic, is_sync); if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: was_in_list %d " + "was_in_large_burst %d " + "large burst in progress %d", + bic->was_in_burst_list, + bic->saved_in_large_burst, + bfqd->large_burst); + if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + bic->saved_in_large_burst) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: marking in " + "large burst"); bfq_mark_bfqq_in_large_burst(bfqq); - else { - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); + } else { + bfq_log_bfqq(bfqd, bfqq, + "set_request: clearing in " + "large burst"); + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); } + bfqq->split_time = jiffies; } } else { /* If the queue was seeky for too long, break it apart. 
*/ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->saved_in_large_burst = true; + bfqq = bfq_split_bfqq(bic, bfqq); split = true; if (!bfqq) @@ -3766,9 +4225,8 @@ new_queue: } bfqq->allocated[rw]++; - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, - atomic_read(&bfqq->ref)); + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; @@ -3783,7 +4241,6 @@ new_queue: if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { bfqq->bic = bic; if (split) { - bfq_mark_bfqq_just_split(bfqq); /* * If the queue has just been split from a shared * queue, restore the idle window and the possible @@ -3793,6 +4250,9 @@ new_queue: } } + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -3872,6 +4332,7 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) cancel_work_sync(&bfqd->unplug_work); } +#ifdef CONFIG_BFQ_GROUP_IOSCHED static void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { @@ -3880,9 +4341,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, bfq_log(bfqd, "put_async_bfqq: %p", bfqq); if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_bfqq_move(bfqd, bfqq, root_group); bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); *bfqq_ptr = NULL; } @@ -3904,6 +4365,7 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); } +#endif static void bfq_exit_queue(struct elevator_queue *e) { @@ -3923,8 +4385,6 @@ static void bfq_exit_queue(struct elevator_queue *e) bfq_shutdown_timer_wq(bfqd); - synchronize_rcu(); - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -3973,11 +4433,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * will not attempt to free it. */ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); - atomic_inc(&bfqd->oom_bfqq.ref); + bfqd->oom_bfqq.ref++; bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; bfqd->oom_bfqq.entity.new_weight = bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + + /* oom_bfqq does not participate to bursts */ + bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); /* * Trigger weight initialization, according to ioprio, at the * oom_bfqq's first activation. 
The oom_bfqq's ioprio and ioprio @@ -3996,9 +4459,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqd->active_numerous_groups = 0; -#endif init_timer(&bfqd->idle_slice_timer); bfqd->idle_slice_timer.function = bfq_idle_slice_timer; @@ -4023,20 +4483,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_back_penalty = bfq_back_penalty; bfqd->bfq_slice_idle = bfq_slice_idle; bfqd->bfq_class_idle_last_service = 0; - bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; - bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; - bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + bfqd->bfq_timeout = bfq_timeout; - bfqd->bfq_coop_thresh = 2; - bfqd->bfq_failed_cooperations = 7000; bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 11; - bfqd->bfq_burst_interval = msecs_to_jiffies(500); + bfqd->bfq_large_burst_thresh = 8; + bfqd->bfq_burst_interval = msecs_to_jiffies(180); bfqd->low_latency = true; - bfqd->bfq_wr_coeff = 20; + /* + * Trade-off between responsiveness and fairness. + */ + bfqd->bfq_wr_coeff = 30; bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); bfqd->bfq_wr_max_time = 0; bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); @@ -4048,16 +4507,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * video. */ bfqd->wr_busy_queues = 0; - bfqd->busy_in_flight_queues = 0; - bfqd->const_seeky_busy_in_flight_queues = 0; /* - * Begin by assuming, optimistically, that the device peak rate is - * equal to the highest reference rate. + * Begin by assuming, optimistically, that the device is a + * high-speed one, and that its peak rate is equal to 2/3 of + * the highest reference rate. 
*/ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * T_fast[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; bfqd->device_speed = BFQ_BFQD_FAST; return 0; @@ -4161,10 +4619,8 @@ SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -SHOW_FUNCTION(bfq_max_budget_async_rq_show, - bfqd->bfq_max_budget_async_rq, 0); -SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); -SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); +SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); @@ -4199,10 +4655,6 @@ STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, INT_MAX, 0); STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, - 1, INT_MAX, 0); -STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, - INT_MAX, 1); STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, @@ -4224,10 +4676,8 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) { - u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) - return bfq_calc_max_budget(bfqd->peak_rate, timeout); + return bfq_calc_max_budget(bfqd); else return bfq_default_max_budget; } @@ -4252,6 +4702,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, return ret; } +/* + * Leaving this name to preserve name compatibility with cfq + * parameters, but this timeout is used for both sync and async. 
+ */ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, const char *page, size_t count) { @@ -4264,13 +4718,31 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, else if (__data > INT_MAX) __data = INT_MAX; - bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + bfqd->bfq_timeout = msecs_to_jiffies(__data); if (bfqd->bfq_user_max_budget == 0) bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); return ret; } +static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (!bfqd->strict_guarantees && __data == 1 + && bfqd->bfq_slice_idle < msecs_to_jiffies(8)) + bfqd->bfq_slice_idle = msecs_to_jiffies(8); + + bfqd->strict_guarantees = __data; + + return ret; +} + static ssize_t bfq_low_latency_store(struct elevator_queue *e, const char *page, size_t count) { @@ -4297,9 +4769,8 @@ static struct elv_fs_entry bfq_attrs[] = { BFQ_ATTR(back_seek_penalty), BFQ_ATTR(slice_idle), BFQ_ATTR(max_budget), - BFQ_ATTR(max_budget_async_rq), BFQ_ATTR(timeout_sync), - BFQ_ATTR(timeout_async), + BFQ_ATTR(strict_guarantees), BFQ_ATTR(low_latency), BFQ_ATTR(wr_coeff), BFQ_ATTR(wr_max_time), @@ -4342,9 +4813,28 @@ static struct elevator_type iosched_bfq = { .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, +}; +#endif + static int __init bfq_init(void) { int ret; + char msg[50] = "BFQ I/O-scheduler: v8r2"; /* * Can be 0 on HZ < 1000 setups. @@ -4352,9 +4842,6 @@ static int __init bfq_init(void) if (bfq_slice_idle == 0) bfq_slice_idle = 1; - if (bfq_timeout_async == 0) - bfq_timeout_async = 1; - #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); if (ret) @@ -4370,23 +4857,34 @@ static int __init bfq_init(void) * installed on the reference devices (see the comments before the * definitions of the two arrays). */ - T_slow[0] = msecs_to_jiffies(2600); - T_slow[1] = msecs_to_jiffies(1000); - T_fast[0] = msecs_to_jiffies(5500); - T_fast[1] = msecs_to_jiffies(2000); + T_slow[0] = msecs_to_jiffies(3500); + T_slow[1] = msecs_to_jiffies(1500); + T_fast[0] = msecs_to_jiffies(8000); + T_fast[1] = msecs_to_jiffies(3000); /* - * Thresholds that determine the switch between speed classes (see - * the comments before the definition of the array). + * Thresholds that determine the switch between speed classes + * (see the comments before the definition of the array + * device_speed_thresh). These thresholds are biased towards + * transitions to the fast class. This is safer than the + * opposite bias. In fact, a wrong transition to the slow + * class results in short weight-raising periods, because the + * speed of the device then tends to be higher that the + * reference peak rate. On the opposite end, a wrong + * transition to the fast class tends to increase + * weight-raising periods, because of the opposite reason. 
*/ - device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; - device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; + device_speed_thresh[0] = (4 * R_slow[0]) / 3; + device_speed_thresh[1] = (4 * R_slow[1]) / 3; ret = elv_register(&iosched_bfq); if (ret) goto err_pol_unreg; - pr_info("BFQ I/O-scheduler: v7r11"); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + strcat(msg, " (with cgroups support)"); +#endif + pr_info("%s", msg); return 0; diff --git a/block/bfq-sched.c b/block/bfq-sched.c index a64fec119..475a9a6e1 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -7,9 +7,11 @@ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * - * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> + * Copyright (C) 2016 Paolo Valente <paolo.valente@unimore.it> */ +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + #ifdef CONFIG_BFQ_GROUP_IOSCHED #define for_each_entity(entity) \ for (; entity ; entity = entity->parent) @@ -22,8 +24,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, int extract, struct bfq_data *bfqd); -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static void bfq_update_budget(struct bfq_entity *next_in_service) { struct bfq_entity *bfqg_entity; @@ -48,6 +48,7 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) static int bfq_update_next_in_service(struct bfq_sched_data *sd) { struct bfq_entity *next_in_service; + struct bfq_queue *bfqq; if (sd->in_service_entity) /* will update/requeue at the end of service */ @@ -65,14 +66,29 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) if (next_in_service) bfq_update_budget(next_in_service); + else + goto exit; + bfqq = bfq_entity_to_bfqq(next_in_service); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_next_in_service: chosen this queue"); + else { + struct bfq_group *bfqg = + container_of(next_in_service, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_next_in_service: chosen this entity"); + } +exit: return 1; } static void bfq_check_next_in_service(struct bfq_sched_data *sd, struct bfq_entity *entity) { - BUG_ON(sd->next_in_service != entity); + WARN_ON(sd->next_in_service != entity); } #else #define for_each_entity(entity) \ @@ -151,20 +167,35 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - + unsigned long long start, finish, delta ; BUG_ON(entity->weight == 0); entity->finish = entity->start + bfq_delta(service, entity->weight); + start = ((entity->start>>10)*1000)>>12; + finish = ((entity->finish>>10)*1000)>>12; + delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; + if (bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: serv %lu, w %d", service, entity->weight); bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: start %llu, finish %llu, delta %llu", - entity->start, entity->finish, - bfq_delta(service, entity->weight)); + start, finish, delta); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_finish group: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_finish group: start %llu, finish %llu, delta %llu", + start, finish, delta); +#endif } } @@ -386,8 +417,6 @@ static void 
bfq_active_insert(struct bfq_service_tree *st, BUG_ON(!bfqg); BUG_ON(!bfqd); bfqg->active_entities++; - if (bfqg->active_entities == 2) - bfqd->active_numerous_groups++; } #endif } @@ -399,7 +428,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, static unsigned short bfq_ioprio_to_weight(int ioprio) { BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; + return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF ; } /** @@ -422,9 +451,9 @@ static void bfq_get_entity(struct bfq_entity *entity) struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); if (bfqq) { - atomic_inc(&bfqq->ref); + bfqq->ref++; bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); } } @@ -499,10 +528,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, BUG_ON(!bfqd); BUG_ON(!bfqg->active_entities); bfqg->active_entities--; - if (bfqg->active_entities == 1) { - BUG_ON(!bfqd->active_numerous_groups); - bfqd->active_numerous_groups--; - } } #endif } @@ -552,7 +577,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, if (bfqq) { sd = entity->sched_data; bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); } } @@ -602,7 +627,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned short prev_weight, new_weight; + unsigned int prev_weight, new_weight; struct bfq_data *bfqd = NULL; struct rb_root *root; #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -628,10 +653,12 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->new_weight != entity->orig_weight) { if (entity->new_weight < BFQ_MIN_WEIGHT || entity->new_weight > BFQ_MAX_WEIGHT) { - printk(KERN_CRIT "update_weight_prio: " - "new_weight %d\n", + pr_crit("update_weight_prio: new_weight %d\n", entity->new_weight); - BUG(); + if (entity->new_weight < BFQ_MIN_WEIGHT) + entity->new_weight = BFQ_MIN_WEIGHT; + else + entity->new_weight = BFQ_MAX_WEIGHT; } entity->orig_weight = entity->new_weight; if (bfqq) @@ -662,6 +689,13 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * associated with its new weight. */ if (prev_weight != new_weight) { + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "weight changed %d %d(%d %d)", + prev_weight, new_weight, + entity->orig_weight, + bfqq->wr_coeff); + root = bfqq ? &bfqd->queue_weights_tree : &bfqd->group_weights_tree; bfq_weights_tree_remove(bfqd, entity, root); @@ -708,7 +742,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st = bfq_entity_service_tree(entity); entity->service += served; - BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); st->vtime += bfq_delta(served, st->wsum); @@ -717,31 +751,69 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) #ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); #endif - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); + st = bfq_entity_service_tree(&bfqq->entity); + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", + served, ((st->vtime>>10)*1000)>>12, st); } /** - * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * bfq_bfqq_charge_time - charge an amount of service equivalent to the length + * of the time interval during which bfqq has been in + * service. 
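The reworked bfq_ioprio_to_weight() above changes the spread of the mapping, not just its form. A quick comparison, assuming the usual IOPRIO_BE_NR of 8 and a conversion coefficient of 10 (the coefficient is defined in bfq.h and assumed here):

#include <stdio.h>

#define IOPRIO_BE_NR	8
#define WEIGHT_COEFF	10	/* stands in for BFQ_WEIGHT_CONVERSION_COEFF */

int main(void)
{
	for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++) {
		int old_w = IOPRIO_BE_NR * WEIGHT_COEFF - ioprio;	/* removed line */
		int new_w = (IOPRIO_BE_NR - ioprio) * WEIGHT_COEFF;	/* added line */

		printf("ioprio %d: old %2d -> new %2d\n", ioprio, old_w, new_w);
	}
	/* the new mapping makes weights proportional to (8 - ioprio), so
	 * ioprio 0 now gets 8x the weight of ioprio 7 instead of ~1.1x */
	return 0;
}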
+ * @bfqd: the device * @bfqq: the queue that needs a service update. + * @time_ms: the amount of time during which the queue has received service + * + * If a queue does not consume its budget fast enough, then providing + * the queue with service fairness may impair throughput, more or less + * severely. For this reason, queues that consume their budget slowly + * are provided with time fairness instead of service fairness. This + * goal is achieved through the BFQ scheduling engine, even if such an + * engine works in the service, and not in the time domain. The trick + * is charging these queues with an inflated amount of service, equal + * to the amount of service that they would have received during their + * service slot if they had been fast, i.e., if their requests had + * been dispatched at a rate equal to the estimated peak rate. * - * When it's not possible to be fair in the service domain, because - * a queue is not consuming its budget fast enough (the meaning of - * fast depends on the timeout parameter), we charge it a full - * budget. In this way we should obtain a sort of time-domain - * fairness among all the seeky/slow queues. + * It is worth noting that time fairness can cause important + * distortions in terms of bandwidth distribution, on devices with + * internal queueing. The reason is that I/O requests dispatched + * during the service slot of a queue may be served after that service + * slot is finished, and may have a total processing time loosely + * correlated with the duration of the service slot. This is + * especially true for short service slots. */ -static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, + unsigned long time_ms) { struct bfq_entity *entity = &bfqq->entity; + int tot_serv_to_charge = entity->service; + unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); + + if (time_ms > 0 && time_ms < timeout_ms) + tot_serv_to_charge = + (bfqd->bfq_max_budget * time_ms) / timeout_ms; + + if (tot_serv_to_charge < entity->service) + tot_serv_to_charge = entity->service; - bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "charge_time: %lu/%u ms, %d/%d/%d sectors", + time_ms, timeout_ms, entity->service, + tot_serv_to_charge, entity->budget); - bfq_bfqq_served(bfqq, entity->budget - entity->service); + /* Increase budget to avoid inconsistencies */ + if (tot_serv_to_charge > entity->budget) + entity->budget = tot_serv_to_charge; + + bfq_bfqq_served(bfqq, + max_t(int, 0, tot_serv_to_charge - entity->service)); } /** * __bfq_activate_entity - activate an entity. * @entity: the entity being activated. + * @non_blocking_wait_rq: true if this entity was waiting for a request * * Called whenever an entity is activated, i.e., it is not active and one * of its children receives a new request, or has to be reactivated due to @@ -749,11 +821,16 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) * service received if @entity is active) of the queue to calculate its * timestamps. 
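bfq_bfqq_charge_time() above bills a slow queue for the share of the maximum budget that corresponds to the time it actually spent in service, never for less than the service it already received. The following is a minimal userspace model of that arithmetic; the budget, slice and time values are illustrative, not taken from a real device.

#include <stdio.h>

/* charge for the elapsed fraction of the maximum budget */
static int serv_to_charge(int service, int max_budget,
			  unsigned long time_ms, unsigned int timeout_ms)
{
	int tot = service;

	if (time_ms > 0 && time_ms < timeout_ms)
		tot = (int)((long long)max_budget * time_ms / timeout_ms);

	return tot < service ? service : tot;
}

int main(void)
{
	int service = 512, budget = 8192, max_budget = 16384; /* sectors */
	int tot = serv_to_charge(service, max_budget, 40, 125);

	if (tot > budget)	/* mirror the "increase budget" step above */
		budget = tot;

	/* the queue is billed ~40/125 of max_budget, not just 512 sectors */
	printf("charged %d sectors, budget now %d\n", tot, budget);
	return 0;
}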
*/ -static void __bfq_activate_entity(struct bfq_entity *entity) +static void __bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) { struct bfq_sched_data *sd = entity->sched_data; struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + bool backshifted = false; + BUG_ON(!sd); + BUG_ON(!st); if (entity == sd->in_service_entity) { BUG_ON(entity->tree); /* @@ -771,45 +848,133 @@ static void __bfq_activate_entity(struct bfq_entity *entity) * old start time. */ bfq_active_extract(st, entity); - } else if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); - entity->start = bfq_gt(st->vtime, entity->finish) ? - st->vtime : entity->finish; } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ - entity->start = st->vtime; - st->wsum += entity->weight; - bfq_get_entity(entity); + unsigned long long min_vstart; - BUG_ON(entity->on_st); - entity->on_st = 1; + /* See comments on bfq_fqq_update_budg_for_activation */ + if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { + backshifted = true; + min_vstart = entity->finish; + } else + min_vstart = st->vtime; + + if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = min_vstart; + st->wsum += entity->weight; + bfq_get_entity(entity); + + BUG_ON(entity->on_st); + entity->on_st = 1; + } } st = __bfq_entity_update_weight_prio(st, entity); bfq_calc_finish(entity, entity->budget); + + /* + * If some queues enjoy backshifting for a while, then their + * (virtual) finish timestamps may happen to become lower and + * lower than the system virtual time. In particular, if + * these queues often happen to be idle for short time + * periods, and during such time periods other queues with + * higher timestamps happen to be busy, then the backshifted + * timestamps of the former queues can become much lower than + * the system virtual time. In fact, to serve the queues with + * higher timestamps while the ones with lower timestamps are + * idle, the system virtual time may be pushed-up to much + * higher values than the finish timestamps of the idle + * queues. As a consequence, the finish timestamps of all new + * or newly activated queues may end up being much larger than + * those of lucky queues with backshifted timestamps. The + * latter queues may then monopolize the device for a lot of + * time. This would simply break service guarantees. + * + * To reduce this problem, push up a little bit the + * backshifted timestamps of the queue associated with this + * entity (only a queue can happen to have the backshifted + * flag set): just enough to let the finish timestamp of the + * queue be equal to the current value of the system virtual + * time. This may introduce a little unfairness among queues + * with backshifted timestamps, but it does not break + * worst-case fairness guarantees. 
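Several of the new log lines in this file print timestamps as ((x>>10)*1000)>>12, i.e. roughly x * 1000 / 2^22, which undoes the service-to-virtual-time scaling performed by bfq_delta(). The sketch below shows how a B-WF2Q+ finish timestamp is derived and then converted for logging; the 22-bit shift is an assumption inferred from that inverse conversion (the macro name used here is not part of this hunk), and the sample numbers are arbitrary.

#include <stdint.h>
#include <stdio.h>

#define SERVICE_SHIFT	22	/* assumed service-to-virtual-time scaling */

/* virtual-time width of 'service' sectors at the given weight */
static uint64_t delta_vtime(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << SERVICE_SHIFT) / weight;
}

/* the unit change used by the log lines: ((x>>10)*1000)>>12 ~ x*1000/2^22 */
static uint64_t vtime_to_log_units(uint64_t v)
{
	return ((v >> 10) * 1000) >> 12;
}

int main(void)
{
	uint64_t start = 0;
	unsigned long service = 2048, weight = 100;	/* illustrative */
	uint64_t finish = start + delta_vtime(service, weight);

	printf("finish %llu, logged as %llu\n",
	       (unsigned long long)finish,
	       (unsigned long long)vtime_to_log_units(finish));
	return 0;
}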
+ * + * As a special case, if bfqq is weight-raised, push up + * timestamps much less, to keep very low the probability that + * this push up causes the backshifted finish timestamps of + * weight-raised queues to become higher than the backshifted + * finish timestamps of non weight-raised queues. + */ + if (backshifted && bfq_gt(st->vtime, entity->finish)) { + unsigned long delta = st->vtime - entity->finish; + + if (bfqq) + delta /= bfqq->wr_coeff; + + entity->start += delta; + entity->finish += delta; + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: new queue finish %llu", + ((entity->finish>>10)*1000)>>12); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__activate_entity: new group finish %llu", + ((entity->finish>>10)*1000)>>12); +#endif + } + } + bfq_active_insert(st, entity); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: queue %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__activate_entity: group %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +#endif + } } /** * bfq_activate_entity - activate an entity and its ancestors if necessary. * @entity: the entity to activate. + * @non_blocking_wait_rq: true if this entity was waiting for a request * * Activate @entity and all the entities on the path from it to the root. */ -static void bfq_activate_entity(struct bfq_entity *entity) +static void bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) { struct bfq_sched_data *sd; for_each_entity(entity) { - __bfq_activate_entity(entity); + BUG_ON(!entity); + __bfq_activate_entity(entity, non_blocking_wait_rq); sd = entity->sched_data; if (!bfq_update_next_in_service(sd)) @@ -890,23 +1055,24 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) if (!__bfq_deactivate_entity(entity, requeue)) /* - * The parent entity is still backlogged, and - * we don't need to update it as it is still - * in service. + * next_in_service has not been changed, so + * no upwards update is needed */ break; if (sd->next_in_service) /* - * The parent entity is still backlogged and - * the budgets on the path towards the root - * need to be updated. + * The parent entity is still backlogged, + * because next_in_service is not NULL, and + * next_in_service has been updated (see + * comment on the body of the above if): + * upwards update of the schedule is needed. */ goto update; /* - * If we reach there the parent is no more backlogged and - * we want to propagate the dequeue upwards. + * If we get here, then the parent is no more backlogged and + * we want to propagate the deactivation upwards. 
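The push-up performed just above can be reproduced in isolation: the gap between the tree's virtual time and the entity's finish time is added to both timestamps, divided by wr_coeff so that weight-raised queues are pushed up less and keep more of their backshift. A small standalone model follows; the wr_coeff value of 20 is only illustrative.

#include <stdio.h>

struct ent {
	unsigned long long start, finish;
	unsigned int wr_coeff;	/* 1 if the queue is not weight-raised */
};

/* push a backshifted entity up to the tree's current virtual time */
static void push_up(struct ent *e, unsigned long long vtime)
{
	unsigned long long delta;

	if (vtime <= e->finish)
		return;			/* nothing to push up */

	delta = (vtime - e->finish) / e->wr_coeff;
	e->start += delta;
	e->finish += delta;
}

int main(void)
{
	struct ent plain = { .start = 100, .finish = 200, .wr_coeff = 1 };
	struct ent raised = { .start = 100, .finish = 200, .wr_coeff = 20 };

	push_up(&plain, 500);
	push_up(&raised, 500);

	printf("plain finish %llu, weight-raised finish %llu\n",
	       plain.finish, raised.finish);	/* 500 vs 215 */
	return 0;
}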
*/ requeue = 1; } @@ -916,9 +1082,23 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) update: entity = parent; for_each_entity(entity) { - __bfq_activate_entity(entity); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + __bfq_activate_entity(entity, false); sd = entity->sched_data; + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "invoking udpdate_next for this queue"); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "invoking udpdate_next for this entity"); + } +#endif if (!bfq_update_next_in_service(sd)) break; } @@ -997,10 +1177,11 @@ left: * Update the virtual time in @st and return the first eligible entity * it contains. */ -static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - bool force) +static struct bfq_entity * +__bfq_lookup_next_entity(struct bfq_service_tree *st, bool force) { struct bfq_entity *entity, *new_next_in_service = NULL; + struct bfq_queue *bfqq; if (RB_EMPTY_ROOT(&st->active)) return NULL; @@ -1009,6 +1190,24 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, entity = bfq_first_active_entity(st); BUG_ON(bfq_gt(entity->start, st->vtime)); + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((st->vtime>>10)*1000)>>12, st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((st->vtime>>10)*1000)>>12, st); + } +#endif + /* * If the chosen entity does not match with the sched_data's * next_in_service and we are forcedly serving the IDLE priority @@ -1045,10 +1244,28 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, BUG_ON(sd->in_service_entity); if (bfqd && - jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + jiffies - bfqd->bfq_class_idle_last_service > + BFQ_CL_IDLE_TIMEOUT) { entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); if (entity) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "idle chosen from st %p %d", + st + BFQ_IOPRIO_CLASSES - 1, + BFQ_IOPRIO_CLASSES - 1) ; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "idle chosen from st %p %d", + st + BFQ_IOPRIO_CLASSES - 1, + BFQ_IOPRIO_CLASSES - 1) ; + } +#endif i = BFQ_IOPRIO_CLASSES - 1; bfqd->bfq_class_idle_last_service = jiffies; sd->next_in_service = entity; @@ -1057,6 +1274,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, for (; i < BFQ_IOPRIO_CLASSES; i++) { entity = __bfq_lookup_next_entity(st + i, false); if (entity) { + if (bfqd != NULL) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "chosen from st %p %d", + st + i, i) ; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "chosen from st %p %d", + st + i, i) ; + } +#endif + } + if (extract) { bfq_check_next_in_service(sd, entity); bfq_active_extract(st + i, entity); @@ -1070,6 +1305,13 @@ static struct bfq_entity 
*bfq_lookup_next_entity(struct bfq_sched_data *sd, return entity; } +static bool next_queue_may_preempt(struct bfq_data *bfqd) +{ + struct bfq_sched_data *sd = &bfqd->root_group->sched_data; + + return sd->next_in_service != sd->in_service_entity; +} + /* * Get next queue for service. */ @@ -1086,7 +1328,36 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) sd = &bfqd->root_group->sched_data; for (; sd ; sd = entity->my_sched_data) { +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (entity) { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "get_next_queue: lookup in this group"); + } else + bfq_log_bfqg(bfqd, bfqd->root_group, + "get_next_queue: lookup in root group"); +#endif + entity = bfq_lookup_next_entity(sd, 1, bfqd); + + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "get_next_queue: this queue, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "get_next_queue: this entity, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); + } +#endif + BUG_ON(!entity); entity->service = 0; } @@ -1113,9 +1384,7 @@ static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_entity *entity = &bfqq->entity; - if (bfqq == bfqd->in_service_queue) - __bfq_bfqd_reset_in_service(bfqd); - + BUG_ON(bfqq == bfqd->in_service_queue); bfq_deactivate_entity(entity, requeue); } @@ -1123,12 +1392,11 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; - bfq_activate_entity(entity); + bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -#endif /* * Called when the bfqq no longer has requests pending, remove it from @@ -1139,6 +1407,7 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, { BUG_ON(!bfq_bfqq_busy(bfqq)); BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(bfqq == bfqd->in_service_queue); bfq_log_bfqq(bfqd, bfqq, "del from busy"); @@ -1147,27 +1416,20 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, BUG_ON(bfqd->busy_queues == 0); bfqd->busy_queues--; - if (!bfqq->dispatched) { + if (!bfqq->dispatched) bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->busy_in_flight_queues); - bfqd->busy_in_flight_queues--; - if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON(!bfqd-> - const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } - } + if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_dequeue(bfqq_group(bfqq)); -#endif + + BUG_ON(bfqq->entity.budget < 0); bfq_deactivate_bfqq(bfqd, bfqq, requeue); + + BUG_ON(bfqq->entity.budget < 0); } /* @@ -1185,16 +1447,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_busy(bfqq); bfqd->busy_queues++; - if (!bfqq->dispatched) { + if (!bfqq->dispatched) if (bfqq->wr_coeff == 1) bfq_weights_tree_add(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - bfqd->busy_in_flight_queues++; - if (bfq_bfqq_constantly_seeky(bfqq)) - bfqd->const_seeky_busy_in_flight_queues++; - } - } + 
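bfq_lookup_next_entity() above scans the per-class service trees in strict priority order (RT, BE, IDLE), but first force-serves the idle class if it has not been served for longer than BFQ_CL_IDLE_TIMEOUT. The toy version below models only that control flow; the timeout value and the per-class "next entity" representation are simplified stand-ins for the real service trees.

#include <stdio.h>

#define NR_CLASSES	3		/* RT, BE, IDLE */
#define IDLE_CLASS	(NR_CLASSES - 1)
#define IDLE_TIMEOUT	110		/* illustrative, in "jiffies" */

struct tree { int next; };		/* -1: class has nothing queued */

/* strict class priority, plus anti-starvation for the idle class */
static int lookup_next(const struct tree st[NR_CLASSES], unsigned long now,
		       unsigned long *idle_last_service)
{
	if (now - *idle_last_service > IDLE_TIMEOUT &&
	    st[IDLE_CLASS].next >= 0) {
		*idle_last_service = now;
		return st[IDLE_CLASS].next;
	}

	for (int c = 0; c < NR_CLASSES; c++)
		if (st[c].next >= 0)
			return st[c].next;
	return -1;
}

int main(void)
{
	struct tree st[NR_CLASSES] = { { -1 }, { 7 }, { 9 } };
	unsigned long idle_last = 0;

	printf("t=50:  serve entity %d\n", lookup_next(st, 50, &idle_last));
	printf("t=200: serve entity %d\n", lookup_next(st, 200, &idle_last));
	return 0;	/* BE entity 7 first, then the starved idle entity 9 */
}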
if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; } diff --git a/block/bfq.h b/block/bfq.h index 32dfceead..c6ba0994f 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v7r11 for 4.4.0: data structures and common functions prototypes. + * BFQ-v8r2 for 4.7.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> @@ -28,20 +28,21 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_WEIGHT_LEGACY_DFL 100 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE +/* + * Soft real-time applications are extremely more latency sensitive + * than interactive ones. Over-raise the weight of the former to + * privilege them against the latter. + */ +#define BFQ_SOFTRT_WEIGHT_FACTOR 100 + struct bfq_entity; /** * struct bfq_service_tree - per ioprio_class service tree. - * @active: tree for active entities (i.e., those backlogged). - * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). - * @first_idle: idle entity with minimum F_i. - * @last_idle: idle entity with maximum F_i. - * @vtime: scheduler virtual time. - * @wsum: scheduler weight sum; active and idle entities contribute to it. * * Each service tree represents a B-WF2Q+ scheduler on its own. Each * ioprio_class has its own independent scheduler, and so its own @@ -49,27 +50,28 @@ struct bfq_entity; * of the containing bfqd. */ struct bfq_service_tree { + /* tree for active entities (i.e., those backlogged) */ struct rb_root active; + /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ struct rb_root idle; - struct bfq_entity *first_idle; - struct bfq_entity *last_idle; + struct bfq_entity *first_idle; /* idle entity with minimum F_i */ + struct bfq_entity *last_idle; /* idle entity with maximum F_i */ - u64 vtime; + u64 vtime; /* scheduler virtual time */ + /* scheduler weight sum; active and idle entities contribute to it */ unsigned long wsum; }; /** * struct bfq_sched_data - multi-class scheduler. - * @in_service_entity: entity in service. - * @next_in_service: head-of-the-line entity in the scheduler. - * @service_tree: array of service trees, one per ioprio_class. * * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as - * an intermediate queue on a hierarchical setup. - * @next_in_service points to the active entity of the sched_data - * service trees that will be scheduled next. + * ioprio_classes, and can be used either as a toplevel queue or as an + * intermediate queue on a hierarchical setup. @next_in_service + * points to the active entity of the sched_data service trees that + * will be scheduled next. It is used to reduce the number of steps + * needed for each hierarchical-schedule update. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. @@ -79,48 +81,29 @@ struct bfq_service_tree { * All the fields are protected by the queue lock of the containing bfqd. 
*/ struct bfq_sched_data { - struct bfq_entity *in_service_entity; + struct bfq_entity *in_service_entity; /* entity in service */ + /* head-of-the-line entity in the scheduler (see comments above) */ struct bfq_entity *next_in_service; + /* array of service trees, one per ioprio_class */ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; }; /** * struct bfq_weight_counter - counter of the number of all active entities * with a given weight. - * @weight: weight of the entities that this counter refers to. - * @num_active: number of active entities with this weight. - * @weights_node: weights tree member (see bfq_data's @queue_weights_tree - * and @group_weights_tree). */ struct bfq_weight_counter { - short int weight; - unsigned int num_active; + unsigned int weight; /* weight of the entities this counter refers to */ + unsigned int num_active; /* nr of active entities with this weight */ + /* + * Weights tree member (see bfq_data's @queue_weights_tree and + * @group_weights_tree) + */ struct rb_node weights_node; }; /** * struct bfq_entity - schedulable entity. - * @rb_node: service_tree member. - * @weight_counter: pointer to the weight counter associated with this entity. - * @on_st: flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree). - * @finish: B-WF2Q+ finish timestamp (aka F_i). - * @start: B-WF2Q+ start timestamp (aka S_i). - * @tree: tree the entity is enqueued into; %NULL if not on a tree. - * @min_start: minimum start time of the (active) subtree rooted at - * this entity; used for O(log N) lookups into active trees. - * @service: service received during the last round of service. - * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. - * @weight: weight of the queue - * @parent: parent entity, for hierarchical scheduling. - * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the - * associated scheduler queue, %NULL on leaf nodes. - * @sched_data: the scheduler queue this entity belongs to. - * @ioprio: the ioprio in use. - * @new_weight: when a weight change is requested, the new weight value. - * @orig_weight: original weight, used to implement weight boosting - * @prio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. * * A bfq_entity is used to represent either a bfq_queue (leaf node in the * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each @@ -147,27 +130,52 @@ struct bfq_weight_counter { * containing bfqd. */ struct bfq_entity { - struct rb_node rb_node; + struct rb_node rb_node; /* service_tree member */ + /* pointer to the weight counter associated with this entity */ struct bfq_weight_counter *weight_counter; + /* + * flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). 
+ */ int on_st; - u64 finish; - u64 start; + u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ + u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ + /* tree the entity is enqueued into; %NULL if not on a tree */ struct rb_root *tree; + /* + * minimum start time of the (active) subtree rooted at this + * entity; used for O(log N) lookups into active trees + */ u64 min_start; - int service, budget; - unsigned short weight, new_weight; - unsigned short orig_weight; + /* amount of service received during the last service slot */ + int service; + + /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ + int budget; + + unsigned int weight; /* weight of the queue */ + unsigned int new_weight; /* next weight if a change is in progress */ + + /* original weight, used to implement weight boosting */ + unsigned int orig_weight; + /* parent entity, for hierarchical scheduling */ struct bfq_entity *parent; + /* + * For non-leaf nodes in the hierarchy, the associated + * scheduler queue, %NULL on leaf nodes. + */ struct bfq_sched_data *my_sched_data; + /* the scheduler queue this entity belongs to */ struct bfq_sched_data *sched_data; + /* flag, set to request a weight, ioprio or ioprio_class change */ int prio_changed; }; @@ -175,56 +183,6 @@ struct bfq_group; /** * struct bfq_queue - leaf schedulable entity. - * @ref: reference counter. - * @bfqd: parent bfq_data. - * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. - * @new_bfqq: shared bfq_queue if queue is cooperating with - * one or more other queues. - * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). - * @sort_list: sorted list of pending requests. - * @next_rq: if fifo isn't expired, next request to serve. - * @queued: nr of requests queued in @sort_list. - * @allocated: currently allocated requests. - * @meta_pending: pending metadata requests. - * @fifo: fifo list of requests in sort_list. - * @entity: entity representing this queue in the scheduler. - * @max_budget: maximum budget allowed from the feedback mechanism. - * @budget_timeout: budget expiration (in jiffies). - * @dispatched: number of requests on the dispatch list or inside driver. - * @flags: status flags. - * @bfqq_list: node for active/idle bfqq list inside our bfqd. - * @burst_list_node: node for the device's burst list. - * @seek_samples: number of seeks sampled - * @seek_total: sum of the distances of the seeks sampled - * @seek_mean: mean seek distance - * @last_request_pos: position of the last request enqueued - * @requests_within_timer: number of consecutive pairs of request completion - * and arrival, such that the queue becomes idle - * after the completion, but the next request arrives - * within an idle time slice; used only if the queue's - * IO_bound has been cleared. - * @pid: pid of the process owning the queue, used for logging purposes. 
- * @last_wr_start_finish: start time of the current weight-raising period if - * the @bfq-queue is being weight-raised, otherwise - * finish time of the last weight-raising period - * @wr_cur_max_time: current max raising time for this queue - * @soft_rt_next_start: minimum time instant such that, only if a new - * request is enqueued after this time instant in an - * idle @bfq_queue with no outstanding requests, then - * the task associated with the queue it is deemed as - * soft real-time (see the comments to the function - * bfq_bfqq_softrt_next_start()) - * @last_idle_bklogged: time of the last transition of the @bfq_queue from - * idle to backlogged - * @service_from_backlogged: cumulative service received from the @bfq_queue - * since the last transition from idle to - * backlogged - * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the - * queue is shared * * A bfq_queue is a leaf request queue; it can be associated with an * io_context or more, if it is async or shared between cooperating @@ -235,117 +193,163 @@ struct bfq_group; * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_queue { - atomic_t ref; + /* reference counter */ + int ref; + /* parent bfq_data */ struct bfq_data *bfqd; - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; + /* current ioprio and ioprio class */ + unsigned short ioprio, ioprio_class; + /* next ioprio and ioprio class if a change is in progress */ + unsigned short new_ioprio, new_ioprio_class; - /* fields for cooperating queues handling */ + /* + * Shared bfq_queue if queue is cooperating with one or more + * other queues. + */ struct bfq_queue *new_bfqq; + /* request-position tree member (see bfq_group's @rq_pos_tree) */ struct rb_node pos_node; + /* request-position tree root (see bfq_group's @rq_pos_tree) */ struct rb_root *pos_root; + /* sorted list of pending requests */ struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ struct request *next_rq; + /* number of sync and async requests queued */ int queued[2]; + /* number of sync and async requests currently allocated */ int allocated[2]; + /* number of pending metadata requests */ int meta_pending; + /* fifo list of requests in sort_list */ struct list_head fifo; + /* entity representing this queue in the scheduler */ struct bfq_entity entity; + /* maximum budget allowed from the feedback mechanism */ int max_budget; + /* budget expiration (in jiffies) */ unsigned long budget_timeout; + /* number of requests on the dispatch list or inside driver */ int dispatched; - unsigned int flags; + unsigned int flags; /* status flags.*/ + /* node for active/idle bfqq list inside parent bfqd */ struct list_head bfqq_list; + /* bit vector: a 1 for each seeky requests in history */ + u32 seek_history; + + /* node for the device's burst list */ struct hlist_node burst_list_node; - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; + /* position of the last request enqueued */ sector_t last_request_pos; + /* Number of consecutive pairs of request completion and + * arrival, such that the queue becomes idle after the + * completion, but the next request arrives within an idle + * time slice; used only if the queue's IO_bound flag has been + * cleared. + */ unsigned int requests_within_timer; + /* pid of the process owning the queue, used for logging purposes */ pid_t pid; + + /* + * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL + * if the queue is shared. 
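The new seek_history field above replaces the seek_samples/seek_total/seek_mean trio with a fixed 32-bit window of per-request observations. Below is a userspace sketch of how such a bit vector can be maintained and queried; the distance threshold and the way a request is classified as seeky are illustrative, not BFQ's exact rules.

#include <stdint.h>
#include <stdio.h>

#define SEEK_THR	8192	/* illustrative seek-distance threshold */

/* shift one observation per request into a 32-bit history window */
static uint32_t update_seek_history(uint32_t history, long long dist)
{
	int seeky = dist > SEEK_THR || dist < -SEEK_THR;

	return (history << 1) | (uint32_t)seeky;
}

/* how many of the last 32 requests were seeky */
static int popcount32(uint32_t v)
{
	int n = 0;

	for (; v; v &= v - 1)
		n++;
	return n;
}

int main(void)
{
	uint32_t hist = 0;

	hist = update_seek_history(hist, 4096);		/* roughly sequential */
	hist = update_seek_history(hist, 900000);	/* long forward seek */
	hist = update_seek_history(hist, -700000);	/* long backward seek */

	printf("seeky requests in window: %d\n", popcount32(hist));	/* 2 */
	return 0;
}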
+ */ struct bfq_io_cq *bic; - /* weight-raising fields */ + /* current maximum weight-raising time for this queue */ unsigned long wr_cur_max_time; + /* + * Minimum time instant such that, only if a new request is + * enqueued after this time instant in an idle @bfq_queue with + * no outstanding requests, then the task associated with the + * queue it is deemed as soft real-time (see the comments on + * the function bfq_bfqq_softrt_next_start()) + */ unsigned long soft_rt_next_start; + /* + * Start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period. + */ unsigned long last_wr_start_finish; + /* factor by which the weight of this queue is multiplied */ unsigned int wr_coeff; + /* + * Time of the last transition of the @bfq_queue from idle to + * backlogged. + */ unsigned long last_idle_bklogged; + /* + * Cumulative service received from the @bfq_queue since the + * last transition from idle to backlogged. + */ unsigned long service_from_backlogged; + + unsigned long split_time; /* time of last split */ }; /** * struct bfq_ttime - per process thinktime stats. - * @ttime_total: total process thinktime - * @ttime_samples: number of thinktime samples - * @ttime_mean: average process thinktime */ struct bfq_ttime { - unsigned long last_end_request; + unsigned long last_end_request; /* completion time of last request */ + + unsigned long ttime_total; /* total process thinktime */ + unsigned long ttime_samples; /* number of thinktime samples */ + unsigned long ttime_mean; /* average process thinktime */ - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; }; /** * struct bfq_io_cq - per (request_queue, io_context) structure. - * @icq: associated io_cq structure - * @bfqq: array of two process queues, the sync and the async - * @ttime: associated @bfq_ttime struct - * @ioprio: per (request_queue, blkcg) ioprio. - * @blkcg_id: id of the blkcg the related io_cq belongs to. 
- * @wr_time_left: snapshot of the time left before weight raising ends - * for the sync queue associated to this process; this - * snapshot is taken to remember this value while the weight - * raising is suspended because the queue is merged with a - * shared queue, and is used to set @raising_cur_max_time - * when the queue is split from the shared queue and its - * weight is raised again - * @saved_idle_window: same purpose as the previous field for the idle - * window - * @saved_IO_bound: same purpose as the previous two fields for the I/O - * bound classification of a queue - * @saved_in_large_burst: same purpose as the previous fields for the - * value of the field keeping the queue's belonging - * to a large burst - * @was_in_burst_list: true if the queue belonged to a burst list - * before its merge with another cooperating queue - * @cooperations: counter of consecutive successful queue merges underwent - * by any of the process' @bfq_queues - * @failed_cooperations: counter of consecutive failed queue merges of any - * of the process' @bfq_queues */ struct bfq_io_cq { + /* associated io_cq structure */ struct io_cq icq; /* must be the first member */ + /* array of two process queues, the sync and the async */ struct bfq_queue *bfqq[2]; + /* associated @bfq_ttime struct */ struct bfq_ttime ttime; + /* per (request_queue, blkcg) ioprio */ int ioprio; - #ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ + uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif - unsigned int wr_time_left; + /* + * Snapshot of the idle window before merging; taken to + * remember this value while the queue is merged, so as to be + * able to restore it in case of split. + */ bool saved_idle_window; + /* + * Same purpose as the previous two fields for the I/O bound + * classification of a queue. + */ bool saved_IO_bound; + /* + * Same purpose as the previous fields for the value of the + * field keeping the queue's belonging to a large burst + */ bool saved_in_large_burst; + /* + * True if the queue belonged to a burst list before its merge + * with another cooperating queue. + */ bool was_in_burst_list; - - unsigned int cooperations; - unsigned int failed_cooperations; }; enum bfq_device_speed { @@ -354,224 +358,216 @@ enum bfq_device_speed { }; /** - * struct bfq_data - per device data structure. - * @queue: request queue for the managed device. - * @root_group: root bfq_group for the device. - * @active_numerous_groups: number of bfq_groups containing more than one - * active @bfq_entity. - * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by - * weight. Used to keep track of whether all @bfq_queues - * have the same weight. The tree contains one counter - * for each distinct weight associated to some active - * and not weight-raised @bfq_queue (see the comments to - * the functions bfq_weights_tree_[add|remove] for - * further details). - * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted - * by weight. Used to keep track of whether all - * @bfq_groups have the same weight. The tree contains - * one counter for each distinct weight associated to - * some active @bfq_group (see the comments to the - * functions bfq_weights_tree_[add|remove] for further - * details). - * @busy_queues: number of bfq_queues containing requests (including the - * queue in service, even if it is idling). 
- * @busy_in_flight_queues: number of @bfq_queues containing pending or - * in-flight requests, plus the @bfq_queue in - * service, even if idle but waiting for the - * possible arrival of its next sync request. This - * field is updated only if the device is rotational, - * but used only if the device is also NCQ-capable. - * The reason why the field is updated also for non- - * NCQ-capable rotational devices is related to the - * fact that the value of @hw_tag may be set also - * later than when busy_in_flight_queues may need to - * be incremented for the first time(s). Taking also - * this possibility into account, to avoid unbalanced - * increments/decrements, would imply more overhead - * than just updating busy_in_flight_queues - * regardless of the value of @hw_tag. - * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues - * (that is, seeky queues that expired - * for budget timeout at least once) - * containing pending or in-flight - * requests, including the in-service - * @bfq_queue if constantly seeky. This - * field is updated only if the device - * is rotational, but used only if the - * device is also NCQ-capable (see the - * comments to @busy_in_flight_queues). - * @wr_busy_queues: number of weight-raised busy @bfq_queues. - * @queued: number of queued requests. - * @rq_in_driver: number of requests dispatched and waiting for completion. - * @sync_flight: number of sync requests in the driver. - * @max_rq_in_driver: max number of reqs in driver in the last - * @hw_tag_samples completed requests. - * @hw_tag_samples: nr of samples used to calculate hw_tag. - * @hw_tag: flag set to one if the driver is showing a queueing behavior. - * @budgets_assigned: number of budgets assigned. - * @idle_slice_timer: timer set when idling for the next sequential request - * from the queue in service. - * @unplug_work: delayed work to restart dispatching on the request queue. - * @in_service_queue: bfq_queue in service. - * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. - * @last_position: on-disk position of the last served request. - * @last_budget_start: beginning of the last budget. - * @last_idling_start: beginning of the last idle slice. - * @peak_rate: peak transfer rate observed for a budget. - * @peak_rate_samples: number of samples used to calculate @peak_rate. - * @bfq_max_budget: maximum budget allotted to a bfq_queue before - * rescheduling. - * @active_list: list of all the bfq_queues active on the device. - * @idle_list: list of all the bfq_queues idle on the device. - * @bfq_fifo_expire: timeout for async/sync requests; when it expires - * requests are served in fifo order. - * @bfq_back_penalty: weight of backward seeks wrt forward ones. - * @bfq_back_max: maximum allowed backward seek. - * @bfq_slice_idle: maximum idling time. - * @bfq_user_max_budget: user-configured max budget value - * (0 for auto-tuning). - * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to - * async queues. - * @bfq_timeout: timeout for bfq_queues to consume their budget; used to - * to prevent seeky queues to impose long latencies to well - * behaved ones (this also implies that seeky queues cannot - * receive guarantees in the service domain; after a timeout - * they are charged for the whole allocated budget, to try - * to preserve a behavior reasonably fair among them, but - * without service-domain guarantees). - * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is - * no more granted any weight-raising. 
- * @bfq_failed_cooperations: number of consecutive failed cooperation - * chances after which weight-raising is restored - * to a queue subject to more than bfq_coop_thresh - * queue merges. - * @bfq_requests_within_timer: number of consecutive requests that must be - * issued within the idle time slice to set - * again idling to a queue which was marked as - * non-I/O-bound (see the definition of the - * IO_bound flag for further details). - * @last_ins_in_burst: last time at which a queue entered the current - * burst of queues being activated shortly after - * each other; for more details about this and the - * following parameters related to a burst of - * activations, see the comments to the function - * @bfq_handle_burst. - * @bfq_burst_interval: reference time interval used to decide whether a - * queue has been activated shortly after - * @last_ins_in_burst. - * @burst_size: number of queues in the current burst of queue activations. - * @bfq_large_burst_thresh: maximum burst size above which the current - * queue-activation burst is deemed as 'large'. - * @large_burst: true if a large queue-activation burst is in progress. - * @burst_list: head of the burst list (as for the above fields, more details - * in the comments to the function bfq_handle_burst). - * @low_latency: if set to true, low-latency heuristics are enabled. - * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised - * queue is multiplied. - * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). - * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. - * @bfq_wr_min_idle_time: minimum idle period after which weight-raising - * may be reactivated for a queue (in jiffies). - * @bfq_wr_min_inter_arr_async: minimum period between request arrivals - * after which weight-raising may be - * reactivated for an already busy queue - * (in jiffies). - * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, - * sectors per seconds. - * @RT_prod: cached value of the product R*T used for computing the maximum - * duration of the weight raising automatically. - * @device_speed: device-speed class for the low-latency heuristic. - * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. + * struct bfq_data - per-device data structure. * * All the fields are protected by the @queue lock. */ struct bfq_data { + /* request queue for the device */ struct request_queue *queue; + /* root bfq_group for the device */ struct bfq_group *root_group; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - int active_numerous_groups; -#endif - + /* + * rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active and not + * weight-raised @bfq_queue (see the comments to the functions + * bfq_weights_tree_[add|remove] for further details). + */ struct rb_root queue_weights_tree; + /* + * rbtree of non-queue @bfq_entity weight counters, sorted by + * weight. Used to keep track of whether all @bfq_groups have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active @bfq_group (see + * the comments to the functions bfq_weights_tree_[add|remove] + * for further details). + */ struct rb_root group_weights_tree; + /* + * Number of bfq_queues containing requests (including the + * queue in service, even if it is idling). 
+ */ int busy_queues; - int busy_in_flight_queues; - int const_seeky_busy_in_flight_queues; + /* number of weight-raised busy @bfq_queues */ int wr_busy_queues; + /* number of queued requests */ int queued; + /* number of requests dispatched and waiting for completion */ int rq_in_driver; - int sync_flight; + /* + * Maximum number of requests in driver in the last + * @hw_tag_samples completed requests. + */ int max_rq_in_driver; + /* number of samples used to calculate hw_tag */ int hw_tag_samples; + /* flag set to one if the driver is showing a queueing behavior */ int hw_tag; + /* number of budgets assigned */ int budgets_assigned; + /* + * Timer set when idling (waiting) for the next request from + * the queue in service. + */ struct timer_list idle_slice_timer; + /* delayed work to restart dispatching on the request queue */ struct work_struct unplug_work; + /* bfq_queue in service */ struct bfq_queue *in_service_queue; + /* bfq_io_cq (bic) associated with the @in_service_queue */ struct bfq_io_cq *in_service_bic; + /* on-disk position of the last served request */ sector_t last_position; + /* beginning of the last budget */ ktime_t last_budget_start; + /* beginning of the last idle slice */ ktime_t last_idling_start; + /* number of samples used to calculate @peak_rate */ int peak_rate_samples; + /* peak transfer rate observed for a budget */ u64 peak_rate; + /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; + /* list of all the bfq_queues active on the device */ struct list_head active_list; + /* list of all the bfq_queues idle on the device */ struct list_head idle_list; + /* + * Timeout for async/sync requests; when it fires, requests + * are served in fifo order. + */ unsigned int bfq_fifo_expire[2]; + /* weight of backward seeks wrt forward ones */ unsigned int bfq_back_penalty; + /* maximum allowed backward seek */ unsigned int bfq_back_max; + /* maximum idling time */ unsigned int bfq_slice_idle; + /* last time CLASS_IDLE was served */ u64 bfq_class_idle_last_service; + /* user-configured max budget value (0 for auto-tuning) */ int bfq_user_max_budget; - int bfq_max_budget_async_rq; - unsigned int bfq_timeout[2]; - - unsigned int bfq_coop_thresh; - unsigned int bfq_failed_cooperations; + /* + * Timeout for bfq_queues to consume their budget; used to + * prevent seeky queues from imposing long latencies to + * sequential or quasi-sequential ones (this also implies that + * seeky queues cannot receive guarantees in the service + * domain; after a timeout they are charged for the time they + * have been in service, to preserve fairness among them, but + * without service-domain guarantees). + */ + unsigned int bfq_timeout; + + /* + * Number of consecutive requests that must be issued within + * the idle time slice to set again idling to a queue which + * was marked as non-I/O-bound (see the definition of the + * IO_bound flag for further details). + */ unsigned int bfq_requests_within_timer; + /* + * Force device idling whenever needed to provide accurate + * service guarantees, without caring about throughput + * issues. CAVEAT: this may even increase latencies, in case + * of useless idling for processes that did stop doing I/O. + */ + bool strict_guarantees; + + /* + * Last time at which a queue entered the current burst of + * queues being activated shortly after each other; for more + * details about this and the following parameters related to + * a burst of activations, see the comments on the function + * bfq_handle_burst. 
+ */ unsigned long last_ins_in_burst; + /* + * Reference time interval used to decide whether a queue has + * been activated shortly after @last_ins_in_burst. + */ unsigned long bfq_burst_interval; + /* number of queues in the current burst of queue activations */ int burst_size; + + /* common parent entity for the queues in the burst */ + struct bfq_entity *burst_parent_entity; + /* Maximum burst size above which the current queue-activation + * burst is deemed as 'large'. + */ unsigned long bfq_large_burst_thresh; + /* true if a large queue-activation burst is in progress */ bool large_burst; + /* + * Head of the burst list (as for the above fields, more + * details in the comments on the function bfq_handle_burst). + */ struct hlist_head burst_list; + /* if set to true, low-latency heuristics are enabled */ bool low_latency; - - /* parameters of the low_latency heuristics */ + /* + * Maximum factor by which the weight of a weight-raised queue + * is multiplied. + */ unsigned int bfq_wr_coeff; + /* maximum duration of a weight-raising period (jiffies) */ unsigned int bfq_wr_max_time; + + /* Maximum weight-raising duration for soft real-time processes */ unsigned int bfq_wr_rt_max_time; + /* + * Minimum idle period after which weight-raising may be + * reactivated for a queue (in jiffies). + */ unsigned int bfq_wr_min_idle_time; + /* + * Minimum period between request arrivals after which + * weight-raising may be reactivated for an already busy async + * queue (in jiffies). + */ unsigned long bfq_wr_min_inter_arr_async; + + /* Max service-rate for a soft real-time queue, in sectors/sec */ unsigned int bfq_wr_max_softrt_rate; + /* + * Cached value of the product R*T, used for computing the + * maximum duration of weight raising automatically. + */ u64 RT_prod; + /* device-speed class for the low-latency heuristic */ enum bfq_device_speed device_speed; + /* fallback dummy bfqq for extreme OOM conditions */ struct bfq_queue oom_bfqq; }; enum bfqq_state_flags { - BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ + BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ + BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* + * waiting for a request + * without idling the device + */ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ BFQ_BFQQ_FLAG_IO_bound, /* * bfqq has timed-out at least once * having consumed at most 2/10 of @@ -581,17 +577,12 @@ enum bfqq_state_flags { * bfqq activated in a large burst, * see comments to bfq_handle_burst. 
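The last_ins_in_burst, bfq_burst_interval, burst_size and bfq_large_burst_thresh fields above carry the burst bookkeeping referred to by bfq_handle_burst(). The snippet below is a deliberately simplified model of that bookkeeping (the real code also maintains a burst list and a common parent entity); the interval and threshold values are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define BURST_INTERVAL		20	/* illustrative, in "jiffies" */
#define LARGE_BURST_THRESH	4	/* illustrative threshold */

struct burst_state {
	unsigned long last_ins;	/* last activation that joined the burst */
	int size;
	bool large;
};

/* grow or restart the current burst on each queue activation */
static void handle_activation(struct burst_state *b, unsigned long now)
{
	if (now - b->last_ins > BURST_INTERVAL) {
		b->size = 1;		/* too far apart: start a new burst */
		b->large = false;
	} else if (++b->size > LARGE_BURST_THRESH) {
		b->large = true;
	}
	b->last_ins = now;
}

int main(void)
{
	struct burst_state b = { .last_ins = 0, .size = 0, .large = false };

	for (unsigned long t = 100; t < 130; t += 5)
		handle_activation(&b, t);	/* six closely spaced activations */

	printf("burst size %d, large: %s\n", b.size, b.large ? "yes" : "no");
	return 0;	/* size 6, large: yes */
}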
*/ - BFQ_BFQQ_FLAG_constantly_seeky, /* - * bfqq has proved to be slow and - * seeky until budget timeout - */ BFQ_BFQQ_FLAG_softrt_update, /* * may need softrt-next-start * update */ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ - BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ + BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ }; #define BFQ_BFQQ_FNS(name) \ @@ -608,25 +599,53 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ } +BFQ_BFQQ_FNS(just_created); BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(must_alloc); BFQ_BFQQ_FNS(fifo_expire); BFQ_BFQQ_FNS(idle_window); BFQ_BFQQ_FNS(sync); -BFQ_BFQQ_FNS(budget_new); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); -BFQ_BFQQ_FNS(constantly_seeky); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); -BFQ_BFQQ_FNS(just_split); BFQ_BFQQ_FNS(softrt_update); #undef BFQ_BFQQ_FNS /* Logging facilities. */ -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log(bfqd, fmt, args...) 
\ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) @@ -640,15 +659,12 @@ enum bfqq_expiration { BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ + BFQ_BFQQ_PREEMPTED /* preemption in progress */ }; -#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfqg_stats { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -657,12 +673,8 @@ struct bfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; /* sum of number of ios queued across all samples */ struct blkg_stat avg_queue_size_sum; /* count of samples taken for average */ @@ -680,8 +692,10 @@ struct bfqg_stats { uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; +#endif }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* * struct bfq_group_data - per-blkcg storage for the blkio subsystem. * @@ -692,7 +706,7 @@ struct bfq_group_data { /* must be the first member */ struct blkcg_policy_data pd; - unsigned short weight; + unsigned int weight; }; /** @@ -712,7 +726,7 @@ struct bfq_group_data { * unused for the root group. Used to know whether there * are groups with more than one active @bfq_entity * (see the comments to the function - * bfq_bfqq_must_not_expire()). + * bfq_bfqq_may_idle()). * @rq_pos_tree: rbtree sorted by next_request position, used when * determining if two or more queues have interleaving * requests (see bfq_find_close_cooperator()). @@ -745,7 +759,6 @@ struct bfq_group { struct rb_root rq_pos_tree; struct bfqg_stats stats; - struct bfqg_stats dead_stats; /* stats pushed from dead children */ }; #else @@ -767,11 +780,25 @@ bfq_entity_service_tree(struct bfq_entity *entity) struct bfq_sched_data *sched_data = entity->sched_data; struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS; + BFQ_DEFAULT_GRP_CLASS - 1; BUG_ON(idx >= BFQ_IOPRIO_CLASSES); BUG_ON(sched_data == NULL); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx) ; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx) ; + } +#endif return sched_data->service_tree + idx; } @@ -791,47 +818,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) return bic->icq.q->elevator->elevator_data; } -/** - * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. - * @ptr: a pointer to a bfqd. - * @flags: storage for the flags to be saved. - * - * This function allows bfqg->bfqd to be protected by the - * queue lock of the bfqd they reference; the pointer is dereferenced - * under RCU, so the storage for bfqd is assured to be safe as long - * as the RCU read side critical section does not end. 
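The bfq_entity_service_tree() hunk above changes the fallback index for group entities from BFQ_DEFAULT_GRP_CLASS to BFQ_DEFAULT_GRP_CLASS - 1, keeping it consistent with the 1-based ioprio-class numbering used for queues. A tiny sketch of the resulting mapping, assuming the standard IOPRIO_CLASS_* values (RT = 1, BE = 2, IDLE = 3):

#include <stdio.h>

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

#define BFQ_IOPRIO_CLASSES	3
#define DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

/* service-tree index for a queue of a given class; groups fall back to BE */
static unsigned int st_index(int is_queue, unsigned short ioprio_class)
{
	return is_queue ? ioprio_class - 1 : DEFAULT_GRP_CLASS - 1;
}

int main(void)
{
	printf("RT queue -> st[%u], BE queue -> st[%u], group -> st[%u]\n",
	       st_index(1, IOPRIO_CLASS_RT),
	       st_index(1, IOPRIO_CLASS_BE),
	       st_index(0, 0));
	return 0;	/* all three indices stay within [0, BFQ_IOPRIO_CLASSES) */
}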
After the - * bfqd->queue->queue_lock is taken the pointer is rechecked, to be - * sure that no other writer accessed it. If we raced with a writer, - * the function returns NULL, with the queue unlocked, otherwise it - * returns the dereferenced pointer, with the queue locked. - */ -static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) -{ - struct bfq_data *bfqd; - - rcu_read_lock(); - bfqd = rcu_dereference(*(struct bfq_data **)ptr); - - if (bfqd != NULL) { - spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (ptr == NULL) - printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); - else if (*ptr == bfqd) - goto out; - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); - } - - bfqd = NULL; -out: - rcu_read_unlock(); - return bfqd; -} - -static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) -{ - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -} - #ifdef CONFIG_BFQ_GROUP_IOSCHED static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) @@ -857,11 +843,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, gfp_t gfp_mask); + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic); static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +#ifdef CONFIG_BFQ_GROUP_IOSCHED static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +#endif static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); #endif /* _BFQ_H */ diff --git a/block/bio.c b/block/bio.c index 807d25e46..0e4aa42bc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -311,17 +311,6 @@ static void bio_chain_endio(struct bio *bio) bio_endio(__bio_chain_endio(bio)); } -/* - * Increment chain count for the bio. Make sure the CHAIN flag update - * is visible before the raised count. - */ -static inline void bio_inc_remaining(struct bio *bio) -{ - bio_set_flag(bio, BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - /** * bio_chain - chain bio completions * @bio: the target bio diff --git a/block/blk-core.c b/block/blk-core.c index 88d6e981e..3545520c7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,6 +33,7 @@ #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> +#include <linux/wbt.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, fail: blk_free_flush_queue(q->fq); + wbt_exit(q->rq_wb); + q->rq_wb = NULL; return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1397,6 +1400,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->wb_stat); if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); @@ -1487,6 +1491,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + wbt_done(q->rq_wb, &req->wb_stat); + /* * Request may not have originated from ll_rw_blk. 
if not, * it didn't come out of our reserved rq pools @@ -1525,6 +1531,7 @@ EXPORT_SYMBOL(blk_put_request); * blk_add_request_payload - add a payload to a request * @rq: request to update * @page: page backing the payload + * @offset: offset in page * @len: length of the payload. * * This allows to later add a payload to an already submitted request by @@ -1535,12 +1542,12 @@ EXPORT_SYMBOL(blk_put_request); * discard requests should ever use it. */ void blk_add_request_payload(struct request *rq, struct page *page, - unsigned int len) + int offset, unsigned int len) { struct bio *bio = rq->bio; bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = 0; + bio->bi_io_vec->bv_offset = offset; bio->bi_io_vec->bv_len = len; bio->bi_iter.bi_size = len; @@ -1716,6 +1723,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; + bool wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1768,6 +1776,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, q->queue_lock); + /* * This sync check and mask will be re-done in init_request_from_bio(), * but we need to set it earlier to expose the sync flag to the @@ -1783,11 +1793,16 @@ get_rq: */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { + if (wb_acct) + __wbt_done(q->rq_wb); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + if (wb_acct) + wbt_mark_tracked(&req->wb_stat); + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). @@ -1965,7 +1980,8 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. */ - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && + !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); if (!nr_sectors) { err = 0; @@ -2518,6 +2534,8 @@ void blk_start_request(struct request *req) { blk_dequeue_request(req); + wbt_issue(req->q->rq_wb, &req->wb_stat); + /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. 
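Taken together, the wbt hooks added through blk-core.c form one accounting lifecycle per request: wbt_wait() may throttle the submitter and reports whether the request will be tracked, wbt_mark_tracked() tags it, wbt_issue() records the hand-off to hardware, and wbt_done()/wbt_requeue() close or rewind the accounting. A minimal userspace sketch of that shape, using illustrative toy_* names rather than the kernel's wbt API, looks like this:

/*
 * Toy model of the per-request accounting lifecycle behind the
 * wbt_wait()/wbt_mark_tracked()/wbt_issue()/wbt_done() hooks.
 * Every name here is illustrative; none of it is kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_wb {
	int inflight;	/* tracked writes currently outstanding */
	int limit;	/* admission limit a real rq_wb would scale */
};

/* Stand-in for wbt_wait(): only writes are throttled and tracked. */
static bool toy_wb_wait(struct toy_wb *wb, bool is_write)
{
	if (!is_write)
		return false;
	/* the real code sleeps here while inflight >= limit */
	wb->inflight++;
	return true;
}

/* Stand-in for wbt_done()/__wbt_done(): drop the accounting again. */
static void toy_wb_done(struct toy_wb *wb, bool tracked)
{
	if (tracked)
		wb->inflight--;
}

int main(void)
{
	struct toy_wb wb = { .inflight = 0, .limit = 16 };
	bool tracked;

	tracked = toy_wb_wait(&wb, true);	/* get_rq: admission */
	/* ... request allocated, issued and completed ... */
	toy_wb_done(&wb, tracked);		/* completion or failed alloc */
	printf("inflight after completion: %d\n", wb.inflight);
	return 0;
}

The error paths in blk_queue_bio() follow the same rule the sketch models: whenever wbt_wait() accounted the bio, exactly one later call to __wbt_done() (failed request allocation) or wbt_done() (completion) must undo it.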
@@ -2585,6 +2603,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) trace_block_rq_complete(req->q, req, nr_bytes); + blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req); + if (!req->bio) return false; @@ -2752,9 +2772,10 @@ void blk_finish_request(struct request *req, int error) blk_account_io_done(req); - if (req->end_io) + if (req->end_io) { + wbt_done(req->q->rq_wb, &req->wb_stat); req->end_io(req, error); - else { + } else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index 9c423e533..b1c91d229 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,17 +95,18 @@ enum { static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq); -static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) +static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { unsigned int policy = 0; if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; - if (fflags & REQ_FLUSH) { + if (fflags & (1UL << QUEUE_FLAG_WC)) { if (rq->cmd_flags & REQ_FLUSH) policy |= REQ_FSEQ_PREFLUSH; - if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && + (rq->cmd_flags & REQ_FUA)) policy |= REQ_FSEQ_POSTFLUSH; } return policy; @@ -384,7 +385,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) void blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned long fflags = q->queue_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); @@ -393,7 +394,7 @@ void blk_insert_flush(struct request *rq) * REQ_FLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_FLUSH; - if (!(fflags & REQ_FUA)) + if (!(fflags & (1UL << QUEUE_FLAG_FUA))) rq->cmd_flags &= ~REQ_FUA; /* diff --git a/block/blk-lib.c b/block/blk-lib.c index 9ebf65379..9e29dc351 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -9,82 +9,46 @@ #include "blk.h" -struct bio_batch { - atomic_t done; - int error; - struct completion *wait; -}; - -static void bio_batch_end_io(struct bio *bio) +static struct bio *next_bio(struct bio *bio, int rw, unsigned int nr_pages, + gfp_t gfp) { - struct bio_batch *bb = bio->bi_private; + struct bio *new = bio_alloc(gfp, nr_pages); + + if (bio) { + bio_chain(bio, new); + submit_bio(rw, bio); + } - if (bio->bi_error && bio->bi_error != -EOPNOTSUPP) - bb->error = bio->bi_error; - if (atomic_dec_and_test(&bb->done)) - complete(bb->wait); - bio_put(bio); + return new; } -/** - * blkdev_issue_discard - queue a discard - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour - * - * Description: - * Issue a discard request for the sectors in question. 
- */ -int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int type, struct bio **biop) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); - int type = REQ_WRITE | REQ_DISCARD; + struct bio *bio = *biop; unsigned int granularity; int alignment; - struct bio_batch bb; - struct bio *bio; - int ret = 0; - struct blk_plug plug; if (!q) return -ENXIO; - if (!blk_queue_discard(q)) return -EOPNOTSUPP; + if ((type & REQ_SECURE) && !blk_queue_secdiscard(q)) + return -EOPNOTSUPP; /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; - if (flags & BLKDEV_DISCARD_SECURE) { - if (!blk_queue_secdiscard(q)) - return -EOPNOTSUPP; - type |= REQ_SECURE; - } - - atomic_set(&bb.done, 1); - bb.error = 0; - bb.wait = &wait; - - blk_start_plug(&plug); while (nr_sects) { unsigned int req_sects; sector_t end_sect, tmp; - bio = bio_alloc(gfp_mask, 1); - if (!bio) { - ret = -ENOMEM; - break; - } - /* Make sure bi_size doesn't overflow */ req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); - /* + /** * If splitting a request, and the next starting sector would be * misaligned, stop the discard at the previous aligned sector. */ @@ -98,18 +62,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, req_sects = end_sect - sector; } + bio = next_bio(bio, type, 1, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_end_io = bio_batch_end_io; bio->bi_bdev = bdev; - bio->bi_private = &bb; bio->bi_iter.bi_size = req_sects << 9; nr_sects -= req_sects; sector = end_sect; - atomic_inc(&bb.done); - submit_bio(type, bio); - /* * We can loop for a long time in here, if someone does * full device discards (like mkfs). Be nice and allow @@ -118,14 +78,45 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, */ cond_resched(); } - blk_finish_plug(&plug); - /* Wait for bios in-flight */ - if (!atomic_dec_and_test(&bb.done)) - wait_for_completion_io(&wait); + *biop = bio; + return 0; +} +EXPORT_SYMBOL(__blkdev_issue_discard); + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Issue a discard request for the sectors in question. 
+ */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + int type = REQ_WRITE | REQ_DISCARD; + struct bio *bio = NULL; + struct blk_plug plug; + int ret; + + if (flags & BLKDEV_DISCARD_SECURE) + type |= REQ_SECURE; + + blk_start_plug(&plug); + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, type, + &bio); + if (!ret && bio) { + ret = submit_bio_wait(type, bio); + if (ret == -EOPNOTSUPP) + ret = 0; + bio_put(bio); + } + blk_finish_plug(&plug); - if (bb.error) - return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); @@ -145,11 +136,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); unsigned int max_write_same_sectors; - struct bio_batch bb; - struct bio *bio; + struct bio *bio = NULL; int ret = 0; if (!q) @@ -158,21 +147,10 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, /* Ensure that max_write_same_sectors doesn't overflow bi_size */ max_write_same_sectors = UINT_MAX >> 9; - atomic_set(&bb.done, 1); - bb.error = 0; - bb.wait = &wait; - while (nr_sects) { - bio = bio_alloc(gfp_mask, 1); - if (!bio) { - ret = -ENOMEM; - break; - } - + bio = next_bio(bio, REQ_WRITE | REQ_WRITE_SAME, 1, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_end_io = bio_batch_end_io; bio->bi_bdev = bdev; - bio->bi_private = &bb; bio->bi_vcnt = 1; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; @@ -186,18 +164,13 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, bio->bi_iter.bi_size = nr_sects << 9; nr_sects = 0; } - - atomic_inc(&bb.done); - submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio); } - /* Wait for bios in-flight */ - if (!atomic_dec_and_test(&bb.done)) - wait_for_completion_io(&wait); - - if (bb.error) - return bb.error; - return ret; + if (bio) { + ret = submit_bio_wait(REQ_WRITE | REQ_WRITE_SAME, bio); + bio_put(bio); + } + return ret != -EOPNOTSUPP ? 
ret : 0; } EXPORT_SYMBOL(blkdev_issue_write_same); @@ -216,28 +189,15 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask) { int ret; - struct bio *bio; - struct bio_batch bb; + struct bio *bio = NULL; unsigned int sz; - DECLARE_COMPLETION_ONSTACK(wait); - atomic_set(&bb.done, 1); - bb.error = 0; - bb.wait = &wait; - - ret = 0; while (nr_sects != 0) { - bio = bio_alloc(gfp_mask, - min(nr_sects, (sector_t)BIO_MAX_PAGES)); - if (!bio) { - ret = -ENOMEM; - break; - } - + bio = next_bio(bio, WRITE, + min(nr_sects, (sector_t)BIO_MAX_PAGES), + gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; - bio->bi_end_io = bio_batch_end_io; - bio->bi_private = &bb; while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); @@ -247,18 +207,14 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if (ret < (sz << 9)) break; } - ret = 0; - atomic_inc(&bb.done); - submit_bio(WRITE, bio); } - /* Wait for bios in-flight */ - if (!atomic_dec_and_test(&bb.done)) - wait_for_completion_io(&wait); - - if (bb.error) - return bb.error; - return ret; + if (bio) { + ret = submit_bio_wait(WRITE, bio); + bio_put(bio); + return ret; + } + return 0; } /** diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4ea4dd8a1..2f68015f8 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + unsigned int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } +} + +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t count) +{ + blk_mq_stat_clear(hctx); + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_stat_init(&stat[0]); + blk_stat_init(&stat[1]); + + blk_hctx_stat_get(hctx, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_sysfs_dispatched_show, @@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { .attr = {.name = "io_poll", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { + .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, + .show = blk_mq_hw_sysfs_stat_show, + .store = blk_mq_hw_sysfs_stat_store, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, + &blk_mq_hw_sysfs_stat.attr, NULL, }; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index abdbb4740..56a0c37a3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -464,15 +464,26 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, } } -void 
blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, - void *priv) +static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, + busy_tag_iter_fn *fn, void *priv) { if (tags->nr_reserved_tags) bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, false); } -EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); + +void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, + busy_tag_iter_fn *fn, void *priv) +{ + int i; + + for (i = 0; i < tagset->nr_hw_queues; i++) { + if (tagset->tags && tagset->tags[i]) + blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv); + } +} +EXPORT_SYMBOL(blk_mq_tagset_busy_iter); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1699baf39..23ff76a40 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -22,6 +22,7 @@ #include <linux/sched/sysctl.h> #include <linux/delay.h> #include <linux/crash_dump.h> +#include <linux/wbt.h> #include <trace/events/block.h> @@ -29,6 +30,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -274,6 +276,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, if (rq->cmd_flags & REQ_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + + wbt_done(q->rq_wb, &rq->wb_stat); rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); @@ -306,6 +310,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) blk_account_io_done(rq); if (rq->end_io) { + wbt_done(rq->q->rq_wb, &rq->wb_stat); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -356,10 +361,19 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)]; + + blk_stat_add(stat, rq); +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -403,6 +417,8 @@ void blk_mq_start_request(struct request *rq) if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + wbt_issue(q->rq_wb, &rq->wb_stat); + blk_add_timer(rq); /* @@ -438,6 +454,7 @@ static void __blk_mq_requeue_request(struct request *rq) struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->wb_stat); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -1122,8 +1139,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { init_request_from_bio(rq, bio); - if (blk_do_io_stat(rq)) - blk_account_io_start(rq, 1); + blk_account_io_start(rq, 1); } static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) @@ -1253,6 +1269,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1263,16 +1280,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (!is_flush_fua && !blk_queue_nomerges(q)) { - if (blk_attempt_plug_merge(q, bio, &request_count, - &same_queue_rq)) - return BLK_QC_T_NONE; - } else - request_count = blk_plug_queued_count(q); + if (!is_flush_fua && !blk_queue_nomerges(q) && + 
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + return BLK_QC_T_NONE; + + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL); rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __wbt_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + wbt_mark_tracked(&rq->wb_stat); cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1349,6 +1371,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) struct blk_map_ctx data; struct request *rq; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1359,13 +1382,23 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return BLK_QC_T_NONE; + if (!is_flush_fua && !blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); + + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL); rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __wbt_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + wbt_mark_tracked(&rq->wb_stat); cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1496,7 +1529,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, int to_do; void *p; - while (left < order_to_size(this_order - 1) && this_order) + while (this_order && left < order_to_size(this_order - 1)) this_order--; do { @@ -1761,6 +1794,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[0]); + blk_stat_init(&__ctx->stat[1]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) @@ -2021,7 +2056,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!q->queue_ctx) - return ERR_PTR(-ENOMEM); + goto err_exit; q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, set->numa_node); @@ -2085,6 +2120,8 @@ err_map: kfree(q->queue_hw_ctx); err_percpu: free_percpu(q->queue_ctx); +err_exit: + q->mq_ops = NULL; return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(blk_mq_init_allocated_queue); @@ -2097,6 +2134,9 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + wbt_exit(q->rq_wb); + q->rq_wb = NULL; + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b1103..e107f700f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -20,6 +22,7 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-settings.c b/block/blk-settings.c index 331e4eee0..746dc9fee 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -820,31 +820,54 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); +void blk_queue_flush_queueable(struct request_queue *q, bool queueable) +{ + spin_lock_irq(q->queue_lock); 
+ if (queueable) + clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + else + set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + /** - * blk_queue_flush - configure queue's cache flush capability + * blk_set_queue_depth - tell the block layer about the device queue depth * @q: the request queue for the device - * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA + * @depth: queue depth * - * Tell block layer cache flush capability of @q. If it supports - * flushing, REQ_FLUSH should be set. If it supports bypassing - * write cache for individual writes, REQ_FUA should be set. */ -void blk_queue_flush(struct request_queue *q, unsigned int flush) +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { - WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); - - if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) - flush &= ~REQ_FUA; - - q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); + q->queue_depth = depth; + wbt_set_queue_depth(q->rq_wb, depth); } -EXPORT_SYMBOL_GPL(blk_queue_flush); +EXPORT_SYMBOL(blk_set_queue_depth); -void blk_queue_flush_queueable(struct request_queue *q, bool queueable) -{ - q->flush_not_queueable = !queueable; -} -EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); +/** + * blk_queue_write_cache - configure queue's write cache + * @q: the request queue for the device + * @wc: write back cache on or off + * @fua: device supports FUA writes, if true + * + * Tell the block layer about the write cache of @q. + */ +void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) +{ + spin_lock_irq(q->queue_lock); + if (wc) + queue_flag_set(QUEUE_FLAG_WC, q); + else + queue_flag_clear(QUEUE_FLAG_WC, q); + if (fua) + queue_flag_set(QUEUE_FLAG_FUA, q); + else + queue_flag_clear(QUEUE_FLAG_FUA, q); + spin_unlock_irq(q->queue_lock); + + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); +} +EXPORT_SYMBOL_GPL(blk_queue_write_cache); static int __init blk_settings_init(void) { diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 000000000..8e3974d87 --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,185 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/blk-mq.h> + +#include "blk-stat.h" +#include "blk-mq.h" + +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j, nr; + + blk_stat_init(&dst[0]); + blk_stat_init(&dst[1]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if 
(ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. + */ + } while (!nr); +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat)); + memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->time = time_now & BLK_STAT_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 delta, now, value; + u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat); + + now = ktime_to_ns(ktime_get()); + if (now < rq_time) + return; + + if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK)) + __blk_stat_init(stat, now); + + value = now - rq_time; + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + delta = value - stat->mean; + if (delta) + stat->mean += div64_s64(delta, stat->nr_samples + 1); + + stat->nr_samples++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } + } + } else { + blk_stat_init(&q->rq_stats[0]); + blk_stat_init(&q->rq_stats[1]); + } +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 000000000..d77548dbf --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,17 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_MASK ~(BLK_STAT_NSEC - 1) + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *q); +void blk_stat_init(struct blk_rq_stat *); +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 995b58d46..df194bf93 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -10,6 +10,7 @@ #include <linux/blktrace_api.h> #include <linux/blk-mq.h> #include <linux/blk-cgroup.h> +#include <linux/wbt.h> #include "blk.h" #include "blk-mq.h" @@ -41,6 +42,19 @@ 
queue_var_store(unsigned long *var, const char *page, size_t count) return count; } +static ssize_t queue_var_store64(u64 *var, const char *page) +{ + int err; + u64 v; + + err = kstrtou64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, (page)); @@ -347,6 +361,110 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wb_win_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000)); +} + +static ssize_t queue_wb_win_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->win_nsec = val * 1000ULL; + wbt_update_limits(q->rq_wb); + return count; +} + +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->min_lat_nsec = val * 1000ULL; + wbt_update_limits(q->rq_wb); + return count; +} + +static ssize_t queue_wc_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return sprintf(page, "write back\n"); + + return sprintf(page, "write through\n"); +} + +static ssize_t queue_wc_store(struct request_queue *q, const char *page, + size_t count) +{ + int set = -1; + + if (!strncmp(page, "write back", 10)) + set = 1; + else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) + set = 0; + + if (set == -1) + return -EINVAL; + + spin_lock_irq(q->queue_lock); + if (set) + queue_flag_set(QUEUE_FLAG_WC, q); + else + queue_flag_clear(QUEUE_FLAG_WC, q); + spin_unlock_irq(q->queue_lock); + + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -478,6 +596,29 @@ static struct queue_sysfs_entry queue_poll_entry = { .store = queue_poll_store, }; +static struct queue_sysfs_entry queue_wc_entry = { + .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wc_show, + .store = queue_wc_store, +}; + +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + +static struct queue_sysfs_entry queue_wb_lat_entry = { + .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_lat_show, + .store = queue_wb_lat_store, +}; + +static struct queue_sysfs_entry queue_wb_win_entry = { + 
.attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_win_show, + .store = queue_wb_win_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -503,6 +644,10 @@ static struct attribute *default_attrs[] = { &queue_iostats_entry.attr, &queue_random_entry.attr, &queue_poll_entry.attr, + &queue_wc_entry.attr, + &queue_stats_entry.attr, + &queue_wb_lat_entry.attr, + &queue_wb_win_entry.attr, NULL, }; @@ -617,6 +762,43 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat) +{ + blk_queue_stat_get(data, stat); +} + +static void blk_wb_stat_clear(void *data) +{ + blk_stat_clear(data); +} + +static struct wb_stat_ops wb_stat_ops = { + .get = blk_wb_stat_get, + .clear = blk_wb_stat_clear, +}; + +static void blk_wb_init(struct request_queue *q) +{ + struct rq_wb *rwb; + + rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q); + + /* + * If this fails, we don't get throttling + */ + if (IS_ERR(rwb)) + return; + + if (blk_queue_nonrot(q)) + rwb->min_lat_nsec = 2000000ULL; + else + rwb->min_lat_nsec = 75000000ULL; + + wbt_set_queue_depth(rwb, blk_queue_depth(q)); + wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + q->rq_wb = rwb; +} + int blk_register_queue(struct gendisk *disk) { int ret; @@ -656,6 +838,8 @@ int blk_register_queue(struct gendisk *disk) if (q->mq_ops) blk_mq_register_disk(disk); + blk_wb_init(q); + if (!q->request_fn) return 0; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2149a1ddb..47a3e5406 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -211,15 +211,14 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) * * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a * throtl_grp; otherwise, just "throtl". - * - * TODO: this should be made a function and name formatting should happen - * after testing whether blktrace is enabled. */ #define throtl_log(sq, fmt, args...) do { \ struct throtl_grp *__tg = sq_to_tg((sq)); \ struct throtl_data *__td = sq_to_td((sq)); \ \ (void)__td; \ + if (likely(!blk_trace_note_message_enabled(__td->queue))) \ + break; \ if ((__tg)) { \ char __pbuf[128]; \ \ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4a349787b..49707ef43 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3751,6 +3751,18 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) return; /* + * If we have a non-root cgroup, we can depend on that to + * do proper throttling of writes. Turn off wbt for that + * case. + */ + if (bio_blkcg(bio) != &blkcg_root) { + struct request_queue *q = cfqd->queue; + + if (q->rq_wb) + wbt_disable(q->rq_wb); + } + + /* * Drop reference to queues. New queues will be assigned in new * group upon arrival of fresh requests. 
*/ diff --git a/block/genhd.c b/block/genhd.c index 7a6a655d8..cc778461d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -858,6 +858,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } diff --git a/block/ioctl.c b/block/ioctl.c index 4ff1f92f8..ed2397f8d 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,7 +4,6 @@ #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> -#include <linux/badblocks.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> @@ -407,35 +406,6 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } -#ifdef CONFIG_FS_DAX -bool blkdev_dax_capable(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - - if (!disk->fops->direct_access) - return false; - - /* - * If the partition is not aligned on a page boundary, we can't - * do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) - || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) - return false; - - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. - * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - - return true; -} -#endif - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -598,9 +568,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); - case BLKDAXGET: - return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); - break; case IOC_PR_REGISTER: return blkdev_pr_register(bdev, argp); case IOC_PR_RESERVE: diff --git a/block/ioprio.c b/block/ioprio.c index cc7800e9e..01b811629 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -150,8 +150,10 @@ static int get_task_ioprio(struct task_struct *p) if (ret) goto out; ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + task_unlock(p); out: return ret; } diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 26cb624ac..bcd86e5cd 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -430,7 +430,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { - pr_debug("GUID Partitition Entry Size check failed.\n"); + pr_debug("GUID Partition Entry Size check failed.\n"); goto fail; } @@ -443,7 +443,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, le32_to_cpu((*gpt)->sizeof_partition_entry)); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { - pr_debug("GUID Partitition Entry Array CRC check failed.\n"); + pr_debug("GUID Partition Entry Array CRC check failed.\n"); goto fail_ptes; } diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index e507cfbd0..edcea7067 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -27,6 +27,8 @@ #include <linux/pagemap.h> #include <linux/stringify.h> #include <linux/kernel.h> +#include <linux/uuid.h> + #include "ldm.h" #include "check.h" #include "msdos.h" @@ -66,60 +68,6 @@ void _ldm_printk(const char *level, const char *function, const char *fmt, ...) } /** - * ldm_parse_hexbyte - Convert a ASCII hex number to a byte - * @src: Pointer to at least 2 characters to convert. 
- * - * Convert a two character ASCII hex string to a number. - * - * Return: 0-255 Success, the byte was parsed correctly - * -1 Error, an invalid character was supplied - */ -static int ldm_parse_hexbyte (const u8 *src) -{ - unsigned int x; /* For correct wrapping */ - int h; - - /* high part */ - x = h = hex_to_bin(src[0]); - if (h < 0) - return -1; - - /* low part */ - h = hex_to_bin(src[1]); - if (h < 0) - return -1; - - return (x << 4) + h; -} - -/** - * ldm_parse_guid - Convert GUID from ASCII to binary - * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba - * @dest: Memory block to hold binary GUID (16 bytes) - * - * N.B. The GUID need not be NULL terminated. - * - * Return: 'true' @dest contains binary GUID - * 'false' @dest contents are undefined - */ -static bool ldm_parse_guid (const u8 *src, u8 *dest) -{ - static const int size[] = { 4, 2, 2, 2, 6 }; - int i, j, v; - - if (src[8] != '-' || src[13] != '-' || - src[18] != '-' || src[23] != '-') - return false; - - for (j = 0; j < 5; j++, src++) - for (i = 0; i < size[j]; i++, src+=2, *dest++ = v) - if ((v = ldm_parse_hexbyte (src)) < 0) - return false; - - return true; -} - -/** * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure * @data: Raw database PRIVHEAD structure loaded from the device * @ph: In-memory privhead structure in which to return parsed information @@ -167,7 +115,7 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) ldm_error("PRIVHEAD disk size doesn't match real disk size"); return false; } - if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) { + if (uuid_be_to_bin(data + 0x0030, (uuid_be *)ph->disk_id)) { ldm_error("PRIVHEAD contains an invalid GUID."); return false; } @@ -944,7 +892,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) disk = &vb->vblk.disk; ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, sizeof (disk->alt_name)); - if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id)) + if (uuid_be_to_bin(buffer + 0x19 + r_name, (uuid_be *)disk->disk_id)) return false; return true; |
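The ldm.c hunks above drop the driver-private ldm_parse_hexbyte()/ldm_parse_guid() helpers in favour of the generic uuid_be_to_bin() from <linux/uuid.h>. A minimal userspace sketch of the same string-to-bytes conversion follows; parse_guid() and hexval() are illustrative names, not the kernel helpers, and the separator positions are not validated as strictly as the removed code did.

/*
 * Userspace sketch of converting a GUID string such as
 * "fa50ff2b-f2e8-45de-83fa-65417f2f49ba" into 16 bytes, in string
 * order, matching what the removed ldm_parse_guid() produced.
 * Illustrative only; not the kernel's uuid_be_to_bin().
 */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

static int hexval(char c)
{
	if (isdigit((unsigned char)c))
		return c - '0';
	c = (char)tolower((unsigned char)c);
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	return -1;
}

static bool parse_guid(const char *src, unsigned char dst[16])
{
	int i = 0;

	while (i < 16) {
		int hi, lo;

		if (*src == '-') {		/* skip the four separators */
			src++;
			continue;
		}
		if (src[0] == '\0' || src[1] == '\0')
			return false;
		hi = hexval(src[0]);
		lo = hexval(src[1]);
		if (hi < 0 || lo < 0)
			return false;
		dst[i++] = (unsigned char)((hi << 4) | lo);
		src += 2;
	}
	return true;
}

int main(void)
{
	unsigned char id[16];
	int i;

	if (parse_guid("fa50ff2b-f2e8-45de-83fa-65417f2f49ba", id)) {
		for (i = 0; i < 16; i++)
			printf("%02x", id[i]);
		printf("\n");
	}
	return 0;
}

As in the removed helper, the bytes come out in plain string order, one hex pair at a time with the four dashes skipped; no field-wise endianness shuffling is applied.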