From 8d91c1e411f55d7ea91b1183a2e9f8088fb4d5be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Tue, 15 Dec 2015 14:52:16 -0300 Subject: Linux-libre 4.3.2-gnu --- block/Kconfig.iosched | 6 +- block/bfq-cgroup.c | 1282 +++++++++++++++++++-------------------------- block/bfq-ioc.c | 6 +- block/bfq-iosched.c | 1092 ++++++++++++++++---------------------- block/bfq-sched.c | 209 ++++---- block/bfq.h | 206 +++----- block/bio-integrity.c | 16 +- block/bio.c | 214 +++----- block/blk-cgroup.c | 524 +++++++++++------- block/blk-core.c | 44 +- block/blk-integrity.c | 3 + block/blk-lib.c | 52 +- block/blk-map.c | 30 +- block/blk-merge.c | 172 ++++-- block/blk-mq-cpumap.c | 9 +- block/blk-mq-sysfs.c | 34 +- block/blk-mq-tag.c | 28 +- block/blk-mq-tag.h | 2 + block/blk-mq.c | 125 +++-- block/blk-mq.h | 3 +- block/blk-settings.c | 44 +- block/blk-sysfs.c | 44 +- block/blk-throttle.c | 505 +++++++----------- block/blk.h | 5 - block/bounce.c | 62 +-- block/cfq-iosched.c | 651 ++++++++++++----------- block/genhd.c | 9 +- block/partition-generic.c | 12 +- block/uuid.c | 6 +- 29 files changed, 2590 insertions(+), 2805 deletions(-) (limited to 'block') diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 1fc1a4dc5..01da733dc 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -51,12 +51,14 @@ config IOSCHED_BFQ applications. If compiled built-in (saying Y here), BFQ can be configured to support hierarchical scheduling. -config BFQ_GROUP_IOSCHED +config CGROUP_BFQIO bool "BFQ hierarchical scheduling support" depends on CGROUPS && IOSCHED_BFQ=y default n ---help--- - Enable hierarchical scheduling in BFQ, using the blkio controller. + Enable hierarchical scheduling in BFQ, using the cgroups + filesystem interface. The name of the subsystem will be + bfqio. choice prompt "Default I/O scheduler" diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index bc34d7a2b..11e2f1d4e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -13,480 +13,254 @@ * file. */ -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO -/* bfqg stats flags */ -enum bfqg_stats_flags { - BFQG_stats_waiting = 0, - BFQG_stats_idling, - BFQG_stats_empty, -}; - -#define BFQG_FLAG_FNS(name) \ -static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -{ \ - stats->flags |= (1 << BFQG_stats_##name); \ -} \ -static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -{ \ - stats->flags &= ~(1 << BFQG_stats_##name); \ -} \ -static int bfqg_stats_##name(struct bfqg_stats *stats) \ -{ \ - return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -} \ - -BFQG_FLAG_FNS(waiting) -BFQG_FLAG_FNS(idling) -BFQG_FLAG_FNS(empty) -#undef BFQG_FLAG_FNS - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -{ - unsigned long long now; - - if (!bfqg_stats_waiting(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) - blkg_stat_add(&stats->group_wait_time, - now - stats->start_group_wait_time); - bfqg_stats_clear_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (bfqg_stats_waiting(stats)) - return; - if (bfqg == curr_bfqg) - return; - stats->start_group_wait_time = sched_clock(); - bfqg_stats_mark_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -{ - unsigned long long now; - - if (!bfqg_stats_empty(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) - blkg_stat_add(&stats->empty_time, - now - stats->start_empty_time); - bfqg_stats_clear_empty(stats); -} - -static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -{ - blkg_stat_add(&bfqg->stats.dequeue, 1); -} +static DEFINE_MUTEX(bfqio_mutex); -static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) +static bool bfqio_is_removed(struct bfqio_cgroup *bgrp) { - struct bfqg_stats *stats = &bfqg->stats; - - if (blkg_rwstat_total(&stats->queued)) - return; - - /* - * group is already marked empty. This can happen if bfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. - */ - if (bfqg_stats_empty(stats)) - return; - - stats->start_empty_time = sched_clock(); - bfqg_stats_mark_empty(stats); + return bgrp ? !bgrp->online : false; } -static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (bfqg_stats_idling(stats)) { - unsigned long long now = sched_clock(); - - if (time_after64(now, stats->start_idle_time)) - blkg_stat_add(&stats->idle_time, - now - stats->start_idle_time); - bfqg_stats_clear_idling(stats); - } -} - -static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - stats->start_idle_time = sched_clock(); - bfqg_stats_mark_idling(stats); -} - -static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_total(&stats->queued)); - blkg_stat_add(&stats->avg_queue_size_samples, 1); - bfqg_stats_update_group_wait_time(stats); -} - -static struct blkcg_policy blkcg_policy_bfq; - -/* - * blk-cgroup policy-related handlers - * The following functions help in converting between blk-cgroup - * internal structures and BFQ-specific structures. - */ - -static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct bfq_group, pd) : NULL; -} +static struct bfqio_cgroup bfqio_root_cgroup = { + .weight = BFQ_DEFAULT_GRP_WEIGHT, + .ioprio = BFQ_DEFAULT_GRP_IOPRIO, + .ioprio_class = BFQ_DEFAULT_GRP_CLASS, +}; -static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) { - return pd_to_blkg(&bfqg->pd); + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; } -static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) +static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css) { - return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); + return css ? container_of(css, struct bfqio_cgroup, css) : NULL; } /* - * bfq_group handlers - * The following functions help in navigating the bfq_group hierarchy - * by allowing to find the parent of a bfq_group or the bfq_group - * associated to a bfq_queue. + * Search the bfq_group for bfqd into the hash table (by now only a list) + * of bgrp. Must be called under rcu_read_lock(). */ - -static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, + struct bfq_data *bfqd) { - struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; - - return pblkg ? blkg_to_bfqg(pblkg) : NULL; -} - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - return group_entity ? container_of(group_entity, struct bfq_group, - entity) : - bfqq->bfqd->root_group; -} - -/* - * The following two functions handle get and put of a bfq_group by - * wrapping the related blk-cgroup hooks. - */ - -static void bfqg_get(struct bfq_group *bfqg) -{ - return blkg_get(bfqg_to_blkg(bfqg)); -} - -static void bfqg_put(struct bfq_group *bfqg) -{ - return blkg_put(bfqg_to_blkg(bfqg)); -} + struct bfq_group *bfqg; + void *key; -static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, - int rw) -{ - blkg_rwstat_add(&bfqg->stats.queued, rw, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -} + hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) { + key = rcu_dereference(bfqg->bfqd); + if (key == bfqd) + return bfqg; + } -static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) -{ - blkg_rwstat_add(&bfqg->stats.queued, rw, -1); + return NULL; } -static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, + struct bfq_group *bfqg) { - blkg_rwstat_add(&bfqg->stats.merged, rw, 1); -} + struct bfq_entity *entity = &bfqg->entity; -static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); + /* + * If the weight of the entity has never been set via the sysfs + * interface, then bgrp->weight == 0. In this case we initialize + * the weight from the current ioprio value. Otherwise, the group + * weight, if set, has priority over the ioprio value. + */ + if (bgrp->weight == 0) { + entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); + entity->new_ioprio = bgrp->ioprio; + } else { + if (bgrp->weight < BFQ_MIN_WEIGHT || + bgrp->weight > BFQ_MAX_WEIGHT) { + printk(KERN_CRIT "bfq_group_init_entity: " + "bgrp->weight %d\n", bgrp->weight); + BUG(); + } + entity->new_weight = bgrp->weight; + entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); + } + entity->orig_weight = entity->weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; + entity->my_sched_data = &bfqg->sched_data; + bfqg->active_entities = 0; } -static void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int rw) +static inline void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) { - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); - if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, rw, - io_start_time - start_time); -} + struct bfq_entity *entity; -/* @stats = 0 */ -static void bfqg_stats_reset(struct bfqg_stats *stats) -{ - if (!stats) - return; - - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); - blkg_stat_reset(&stats->unaccounted_time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -} + BUG_ON(parent == NULL); + BUG_ON(bfqg == NULL); -/* @to += @from */ -static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) -{ - if (!to || !from) - return; - - /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; } -/* - * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' - * recursive stats can still account for the amount used by this bfqg after - * it's gone. +/** + * bfq_group_chain_alloc - allocate a chain of groups. + * @bfqd: queue descriptor. + * @css: the leaf cgroup_subsys_state this chain starts from. + * + * Allocate a chain of groups starting from the one belonging to + * @cgroup up to the root cgroup. Stop if a cgroup on the chain + * to the root has already an allocated group on @bfqd. */ -static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, + struct cgroup_subsys_state *css) { - struct bfq_group *parent; - - if (!bfqg) /* root_group */ - return; + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; - parent = bfqg_parent(bfqg); + for (; css != NULL; css = css->parent) { + bgrp = css_to_bfqio(css); - lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); - - if (unlikely(!parent)) - return; + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) { + /* + * All the cgroups in the path from there to the + * root must have a bfq_group for bfqd, so we don't + * need any more allocations. + */ + break; + } - bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); - bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); -} + bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); + if (bfqg == NULL) + goto cleanup; -static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + bfq_group_init_entity(bgrp, bfqg); + bfqg->my_entity = &bfqg->entity; - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; - bfqg_get(bfqg); + if (leaf == NULL) { + leaf = bfqg; + prev = leaf; + } else { + bfq_group_set_parent(prev, bfqg); + /* + * Build a list of allocated nodes using the bfqd + * filed, that is still unused and will be + * initialized only after the node will be + * connected. + */ + prev->bfqd = bfqg; + prev = bfqg; + } } - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; -} - -static void bfqg_stats_init(struct bfqg_stats *stats) -{ - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); - - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); - - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); -} - -static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) - { - return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; - } - -static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -{ - return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -} - -static void bfq_cpd_init(const struct blkcg *blkcg) -{ - struct bfq_group_data *d = - cpd_to_bfqgd(blkcg->pd[blkcg_policy_bfq.plid]); - - d->weight = BFQ_DEFAULT_GRP_WEIGHT; -} - -static void bfq_pd_init(struct blkcg_gq *blkg) -{ - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = blkg->q->elevator->elevator_data; - struct bfq_entity *entity = &bfqg->entity; - struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); - entity->orig_weight = entity->weight = entity->new_weight = d->weight; - entity->my_sched_data = &bfqg->sched_data; - bfqg->my_entity = entity; /* - * the root_group's will be set to NULL - * in bfq_init_queue() - */ - bfqg->bfqd = bfqd; - bfqg->active_entities = 0; - bfqg->rq_pos_tree = RB_ROOT; + return leaf; - /* if the root_group does not exist, we are handling it right now */ - if (bfqd->root_group && bfqg != bfqd->root_group) - hlist_add_head(&bfqg->bfqd_node, &bfqd->group_list); +cleanup: + while (leaf != NULL) { + prev = leaf; + leaf = leaf->bfqd; + kfree(prev); + } - bfqg_stats_init(&bfqg->stats); - bfqg_stats_init(&bfqg->dead_stats); + return NULL; } -/* offset delta from bfqg->stats to bfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - - offsetof(struct bfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +/** + * bfq_group_chain_link - link an allocated group chain to a cgroup + * hierarchy. + * @bfqd: the queue descriptor. + * @css: the leaf cgroup_subsys_state to start from. + * @leaf: the leaf group (to be associated to @cgroup). + * + * Try to link a chain of groups to a cgroup hierarchy, connecting the + * nodes bottom-up, so we can be sure that when we find a cgroup in the + * hierarchy that already as a group associated to @bfqd all the nodes + * in the path to the root cgroup have one too. + * + * On locking: the queue lock protects the hierarchy (there is a hierarchy + * per device) while the bfqio_cgroup lock protects the list of groups + * belonging to the same cgroup. + */ +static void bfq_group_chain_link(struct bfq_data *bfqd, + struct cgroup_subsys_state *css, + struct bfq_group *leaf) { - u64 sum = 0; + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *next, *prev = NULL; + unsigned long flags; - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; -} + assert_spin_locked(bfqd->queue->queue_lock); -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat a, b; + for (; css != NULL && leaf != NULL; css = css->parent) { + bgrp = css_to_bfqio(css); + next = leaf->bfqd; - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; -} + bfqg = bfqio_lookup_group(bgrp, bfqd); + BUG_ON(bfqg != NULL); -static void bfq_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct bfq_group *bfqg = blkg_to_bfqg(blkg); + spin_lock_irqsave(&bgrp->lock, flags); - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); -} + rcu_assign_pointer(leaf->bfqd, bfqd); + hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); + hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); -static void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) -{ - struct bfq_entity *entity; + spin_unlock_irqrestore(&bgrp->lock, flags); - BUG_ON(!parent); - BUG_ON(!bfqg); - BUG_ON(bfqg == parent); + prev = leaf; + leaf = next; + } - entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; + BUG_ON(css == NULL && leaf != NULL); + if (css != NULL && prev != NULL) { + bgrp = css_to_bfqio(css); + bfqg = bfqio_lookup_group(bgrp, bfqd); + bfq_group_set_parent(prev, bfqg); + } } +/** + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. + * @bfqd: queue descriptor. + * @cgroup: cgroup being searched for. + * + * Return a group associated to @bfqd in @cgroup, allocating one if + * necessary. When a group is returned all the cgroups in the path + * to the root have a group associated to @bfqd. + * + * If the allocation fails, return the root group: this breaks guarantees + * but is a safe fallback. If this loss becomes a problem it can be + * mitigated using the equivalent weight (given by the product of the + * weights of the groups in the path from @group to the root) in the + * root scheduler. + * + * We allocate all the missing nodes in the path from the leaf cgroup + * to the root and we connect the nodes only after all the allocations + * have been successful. + */ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) + struct cgroup_subsys_state *css) { - struct request_queue *q = bfqd->queue; - struct bfq_group *bfqg = NULL, *parent; - struct bfq_entity *entity = NULL; + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + struct bfq_group *bfqg; - assert_spin_locked(bfqd->queue->queue_lock); + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) + return bfqg; - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { + bfqg = bfq_group_chain_alloc(bfqd, css); + if (bfqg != NULL) + bfq_group_chain_link(bfqd, css, bfqg); + else bfqg = bfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - bfqg = blkg_to_bfqg(blkg); - else /* fallback to root_group */ - bfqg = bfqd->root_group; - } - - BUG_ON(!bfqg); - - /* - * Update chain of bfq_groups as we might be handling a leaf group - * which, along with some of its relatives, has not been hooked yet - * to the private hierarchy of BFQ. - */ - entity = &bfqg->entity; - for_each_entity(entity) { - bfqg = container_of(entity, struct bfq_group, entity); - BUG_ON(!bfqg); - if (bfqg != bfqd->root_group) { - parent = bfqg_parent(bfqg); - if (!parent) - parent = bfqd->root_group; - BUG_ON(!parent); - bfq_group_set_parent(bfqg, parent); - } - } return bfqg; } -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); - /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. @@ -522,7 +296,6 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, 0); } else if (entity->on_st) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - bfqg_put(bfqq_group(bfqq)); /* * Here we use a reference to bfqg. We don't need a refcounter @@ -531,15 +304,11 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; - bfqg_get(bfqg); - if (busy) { - bfq_pos_tree_add_move(bfqd, bfqq); - if (resume) - bfq_activate_bfqq(bfqd, bfqq); - } + if (busy && resume) + bfq_activate_bfqq(bfqd, bfqq); - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); } @@ -547,9 +316,9 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * __bfq_bic_change_cgroup - move @bic to @cgroup. * @bfqd: the queue descriptor. * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. + * @cgroup: the cgroup to move to. * - * Move bic to blkcg, assuming that bfqd->queue is locked; the caller + * Move bic to cgroup, assuming that bfqd->queue is locked; the caller * has to make sure that the reference to cgroup is valid across the call. * * NOTE: an alternative approach might have been to store the current @@ -558,17 +327,18 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct blkcg *blkcg) + struct cgroup_subsys_state *css) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; struct bfq_entity *entity; + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; - lockdep_assert_held(bfqd->queue->queue_lock); + bgrp = css_to_bfqio(css); - bfqg = bfq_find_alloc_group(bfqd, blkcg); - if (async_bfqq) { + bfqg = bfq_find_alloc_group(bfqd, css); + if (async_bfqq != NULL) { entity = &async_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) { @@ -580,7 +350,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, } } - if (sync_bfqq) { + if (sync_bfqq != NULL) { entity = &sync_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); @@ -589,39 +359,74 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, return bfqg; } -static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +/** + * bfq_bic_change_cgroup - move @bic to @cgroup. + * @bic: the bic being migrated. + * @cgroup: the destination cgroup. + * + * When the task owning @bic is moved to @cgroup, @bic is immediately + * moved into its new parent group. + */ +static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, + struct cgroup_subsys_state *css) +{ + struct bfq_data *bfqd; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), + &flags); + if (bfqd != NULL) { + __bfq_bic_change_cgroup(bfqd, bic, css); + bfq_put_bfqd_unlock(bfqd, &flags); + } +} + +/** + * bfq_bic_update_cgroup - update the cgroup of @bic. + * @bic: the @bic to update. + * + * Make sure that @bic is enqueued in the cgroup of the current task. + * We need this in addition to moving bics during the cgroup attach + * phase because the task owning @bic could be at its first disk + * access or we may end up in the root cgroup as the result of a + * memory allocation failure and here we try to move to the right + * group. + * + * Must be called under the queue lock. It is safe to use the returned + * value even after the rcu_read_unlock() as the migration/destruction + * paths act under the queue lock too. IOW it is impossible to race with + * group migration/destruction and end up with an invalid group as: + * a) here cgroup has not yet been destroyed, nor its destroy callback + * has started execution, as current holds a reference to it, + * b) if it is destroyed after rcu_read_unlock() [after current is + * migrated to a different cgroup] its attach() callback will have + * taken care of remove all the references to the old cgroup data. + */ +static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct blkcg *blkcg; - struct bfq_group *bfqg = NULL; - uint64_t id; + struct bfq_group *bfqg; + struct cgroup_subsys_state *css; + + BUG_ON(bfqd == NULL); rcu_read_lock(); - blkcg = bio_blkcg(bio); - id = blkcg->css.serial_nr; + css = task_css(current, bfqio_cgrp_id); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, css); rcu_read_unlock(); - /* - * Check whether blkcg has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. - */ - if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) - return; - - bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); - BUG_ON(!bfqg); - bic->blkcg_id = id; + return bfqg; } /** * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. * @st: the service tree being flushed. */ -static void bfq_flush_idle_tree(struct bfq_service_tree *st) +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) { struct bfq_entity *entity = st->first_idle; - for (; entity ; entity = st->first_idle) + for (; entity != NULL; entity = st->first_idle) __bfq_deactivate_entity(entity, 0); } @@ -630,12 +435,12 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) * @bfqd: the device data structure with the root group. * @entity: the entity to move. */ -static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) +static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(!bfqq); + BUG_ON(bfqq == NULL); bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); return; } @@ -649,9 +454,9 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, * * Needs queue_lock to be taken and reference to be valid over the call. */ -static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) +static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) { struct rb_root *active = &st->active; struct bfq_entity *entity = NULL; @@ -659,10 +464,10 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, if (!RB_EMPTY_ROOT(&st->active)) entity = bfq_entity_of(rb_first(active)); - for (; entity ; entity = bfq_entity_of(rb_first(active))) + for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) bfq_reparent_leaf_entity(bfqd, entity); - if (bfqg->sched_data.in_service_entity) + if (bfqg->sched_data.in_service_entity != NULL) bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.in_service_entity); @@ -671,21 +476,20 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, /** * bfq_destroy_group - destroy @bfqg. + * @bgrp: the bfqio_cgroup containing @bfqg. * @bfqg: the group being destroyed. * * Destroy @bfqg, making sure that it is not referenced from its parent. - * blkio already grabs the queue_lock for us, so no need to use RCU-based magic */ -static void bfq_pd_offline(struct blkcg_gq *blkg) +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) { + struct bfq_data *bfqd; struct bfq_service_tree *st; - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = bfqg->bfqd; struct bfq_entity *entity = bfqg->my_entity; + unsigned long uninitialized_var(flags); int i; - if (!entity) /* root group */ - return; + hlist_del(&bfqg->group_node); /* * Empty all service_trees belonging to this group before @@ -714,19 +518,37 @@ static void bfq_pd_offline(struct blkcg_gq *blkg) * There is no need to put the sync queues, as the * scheduler has taken no reference. */ - bfq_reparent_active_entities(bfqd, bfqg, st); + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_put_bfqd_unlock(bfqd, &flags); + } BUG_ON(!RB_EMPTY_ROOT(&st->active)); BUG_ON(!RB_EMPTY_ROOT(&st->idle)); } - BUG_ON(bfqg->sched_data.next_in_service); - BUG_ON(bfqg->sched_data.in_service_entity); + BUG_ON(bfqg->sched_data.next_in_service != NULL); + BUG_ON(bfqg->sched_data.in_service_entity != NULL); - hlist_del(&bfqg->bfqd_node); - __bfq_deactivate_entity(entity, 0); - bfq_put_async_queues(bfqd, bfqg); - BUG_ON(entity->tree); + /* + * We may race with device destruction, take extra care when + * dereferencing bfqg->bfqd. + */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + hlist_del(&bfqg->bfqd_node); + __bfq_deactivate_entity(entity, 0); + bfq_put_async_queues(bfqd, bfqg); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(entity->tree != NULL); - bfqg_stats_xfer_dead(bfqg); + /* + * No need to defer the kfree() to the end of the RCU grace + * period: we are called from the destroy() callback of our + * cgroup, so we can be sure that no one is a) still using + * this cgroup or b) doing lookups in it. + */ + kfree(bfqg); } static void bfq_end_wr_async(struct bfq_data *bfqd) @@ -773,309 +595,312 @@ static void bfq_disconnect_groups(struct bfq_data *bfqd) } } -static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, - struct cftype *cftype) +static inline void bfq_free_root_group(struct bfq_data *bfqd) { - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - int ret = -EINVAL; + struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; + struct bfq_group *bfqg = bfqd->root_group; + + bfq_put_async_queues(bfqd, bfqg); - spin_lock_irq(&blkcg->lock); - ret = bfqgd->weight; - spin_unlock_irq(&blkcg->lock); + spin_lock_irq(&bgrp->lock); + hlist_del_rcu(&bfqg->group_node); + spin_unlock_irq(&bgrp->lock); - return ret; + /* + * No need to synchronize_rcu() here: since the device is gone + * there cannot be any read-side access to its root_group. + */ + kfree(bfqg); } -static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 val) +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) { - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - struct blkcg_gq *blkg; - int ret = -EINVAL; - - if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) - return ret; - - ret = 0; - spin_lock_irq(&blkcg->lock); - bfqgd->weight = (unsigned short)val; - hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - if (!bfqg) - continue; - /* - * Setting the prio_changed flag of the entity - * to 1 with new_weight == weight would re-set - * the value of the weight to its ioprio mapping. - * Set the flag only if necessary. - */ - if ((unsigned short)val != bfqg->entity.new_weight) { - bfqg->entity.new_weight = (unsigned short)val; - /* - * Make sure that the above new value has been - * stored in bfqg->entity.new_weight before - * setting the prio_changed flag. In fact, - * this flag may be read asynchronously (in - * critical sections protected by a different - * lock than that held here), and finding this - * flag set may cause the execution of the code - * for updating parameters whose value may - * depend also on bfqg->entity.new_weight (in - * __bfq_entity_update_weight_prio). - * This barrier makes sure that the new value - * of bfqg->entity.new_weight is correctly - * seen in that code. - */ - smp_wmb(); - bfqg->entity.prio_changed = 1; - } - } - spin_unlock_irq(&blkcg->lock); + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; + int i; - return ret; -} + bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); + if (bfqg == NULL) + return NULL; -static int bfqg_print_stat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_bfq, seq_cft(sf)->private, false); - return 0; -} + bfqg->entity.parent = NULL; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -static int bfqg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_bfq, seq_cft(sf)->private, true); - return 0; + bgrp = &bfqio_root_cgroup; + spin_lock_irq(&bgrp->lock); + rcu_assign_pointer(bfqg->bfqd, bfqd); + hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); + spin_unlock_irq(&bgrp->lock); + + return bfqg; } -static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - u64 sum = bfqg_stat_pd_recursive_sum(pd, off); +#define SHOW_FUNCTION(__VAR) \ +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \ + struct cftype *cftype) \ +{ \ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ + u64 ret = -ENODEV; \ + \ + mutex_lock(&bfqio_mutex); \ + if (bfqio_is_removed(bgrp)) \ + goto out_unlock; \ + \ + spin_lock_irq(&bgrp->lock); \ + ret = bgrp->__VAR; \ + spin_unlock_irq(&bgrp->lock); \ + \ +out_unlock: \ + mutex_unlock(&bfqio_mutex); \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ + struct bfq_group *bfqg; \ + int ret = -EINVAL; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return ret; \ + \ + ret = -ENODEV; \ + mutex_lock(&bfqio_mutex); \ + if (bfqio_is_removed(bgrp)) \ + goto out_unlock; \ + ret = 0; \ + \ + spin_lock_irq(&bgrp->lock); \ + bgrp->__VAR = (unsigned short)val; \ + hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ + /* \ + * Setting the ioprio_changed flag of the entity \ + * to 1 with new_##__VAR == ##__VAR would re-set \ + * the value of the weight to its ioprio mapping. \ + * Set the flag only if necessary. \ + */ \ + if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ + bfqg->entity.new_##__VAR = (unsigned short)val; \ + /* \ + * Make sure that the above new value has been \ + * stored in bfqg->entity.new_##__VAR before \ + * setting the ioprio_changed flag. In fact, \ + * this flag may be read asynchronously (in \ + * critical sections protected by a different \ + * lock than that held here), and finding this \ + * flag set may cause the execution of the code \ + * for updating parameters whose value may \ + * depend also on bfqg->entity.new_##__VAR (in \ + * __bfq_entity_update_weight_prio). \ + * This barrier makes sure that the new value \ + * of bfqg->entity.new_##__VAR is correctly \ + * seen in that code. \ + */ \ + smp_wmb(); \ + bfqg->entity.ioprio_changed = 1; \ + } \ + } \ + spin_unlock_irq(&bgrp->lock); \ + \ +out_unlock: \ + mutex_unlock(&bfqio_mutex); \ + return ret; \ +} + +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION - return __blkg_prfill_u64(sf, pd, sum); -} +static struct cftype bfqio_files[] = { + { + .name = "weight", + .read_u64 = bfqio_cgroup_weight_read, + .write_u64 = bfqio_cgroup_weight_write, + }, + { + .name = "ioprio", + .read_u64 = bfqio_cgroup_ioprio_read, + .write_u64 = bfqio_cgroup_ioprio_write, + }, + { + .name = "ioprio_class", + .read_u64 = bfqio_cgroup_ioprio_class_read, + .write_u64 = bfqio_cgroup_ioprio_class_write, + }, + { }, /* terminate */ +}; -static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state + *parent_css) { - struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); + struct bfqio_cgroup *bgrp; - return __blkg_prfill_rwstat(sf, pd, &sum); -} + if (parent_css != NULL) { + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); + if (bgrp == NULL) + return ERR_PTR(-ENOMEM); + } else + bgrp = &bfqio_root_cgroup; -static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_stat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, false); - return 0; + spin_lock_init(&bgrp->lock); + INIT_HLIST_HEAD(&bgrp->group_data); + bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; + bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; + + return &bgrp->css; } -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +/* + * We cannot support shared io contexts, as we have no means to support + * two tasks with the same ioc in two different groups without major rework + * of the main bic/bfqq data structures. By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int bfqio_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, true); - return 0; + struct task_struct *task; + struct io_context *ioc; + int ret = 0; + + cgroup_taskset_for_each(task, tset) { + /* + * task_lock() is needed to avoid races with + * exit_io_context() + */ + task_lock(task); + ioc = task->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too + * young or exiting: if it has still no ioc the + * ioc can't be shared, if the task is exiting the + * attach will fail anyway, no matter what we + * return here. + */ + ret = -EINVAL; + task_unlock(task); + if (ret) + break; + } + + return ret; } -static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) +static void bfqio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - struct bfq_group *bfqg = pd_to_bfqg(pd); - u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); - u64 v = 0; + struct task_struct *task; + struct io_context *ioc; + struct io_cq *icq; - if (samples) { - v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); - v = div64_u64(v, samples); + /* + * IMPORTANT NOTE: The move of more than one process at a time to a + * new group has not yet been tested. + */ + cgroup_taskset_for_each(task, tset) { + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + /* + * Handle cgroup change here. + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) + if (!strncmp( + icq->q->elevator->type->elevator_name, + "bfq", ELV_NAME_MAX)) + bfq_bic_change_cgroup(icq_to_bic(icq), + css); + rcu_read_unlock(); + put_io_context(ioc); + } } - __blkg_prfill_u64(sf, pd, v); - return 0; } -/* print avg_queue_size */ -static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) +static void bfqio_destroy(struct cgroup_subsys_state *css) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, - 0, false); - return 0; + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + struct hlist_node *tmp; + struct bfq_group *bfqg; + + /* + * Since we are destroying the cgroup, there are no more tasks + * referencing it, and all the RCU grace periods that may have + * referenced it are ended (as the destruction of the parent + * cgroup is RCU-safe); bgrp->group_data will not be accessed by + * anything else and we don't need any synchronization. + */ + hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node) + bfq_destroy_group(bgrp, bfqg); + + BUG_ON(!hlist_empty(&bgrp->group_data)); + + kfree(bgrp); } -static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +static int bfqio_css_online(struct cgroup_subsys_state *css) { - int ret; + struct bfqio_cgroup *bgrp = css_to_bfqio(css); - ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); - if (ret) - return NULL; + mutex_lock(&bfqio_mutex); + bgrp->online = true; + mutex_unlock(&bfqio_mutex); - return blkg_to_bfqg(bfqd->queue->root_blkg); + return 0; } -static struct cftype bfqio_files[] = { - { - .name = "bfq.weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - /* statistics, cover only the tasks in the bfqg */ - { - .name = "bfq.time", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.sectors", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.io_service_bytes", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_serviced", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_service_time", - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_wait_time", - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_merged", - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_queued", - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, +static void bfqio_css_offline(struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); - /* the same statictics which cover the bfqg and its descendants */ - { - .name = "bfq.time_recursive", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.sectors_recursive", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.io_service_bytes_recursive", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_serviced_recursive", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_service_time_recursive", - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_wait_time_recursive", - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_merged_recursive", - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_queued_recursive", - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.avg_queue_size", - .seq_show = bfqg_print_avg_queue_size, - }, - { - .name = "bfq.group_wait_time", - .private = offsetof(struct bfq_group, stats.group_wait_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.idle_time", - .private = offsetof(struct bfq_group, stats.idle_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.empty_time", - .private = offsetof(struct bfq_group, stats.empty_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.dequeue", - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.unaccounted_time", - .private = offsetof(struct bfq_group, stats.unaccounted_time), - .seq_show = bfqg_print_stat, - }, - { } /* terminate */ -}; + mutex_lock(&bfqio_mutex); + bgrp->online = false; + mutex_unlock(&bfqio_mutex); +} -static struct blkcg_policy blkcg_policy_bfq = { - .pd_size = sizeof(struct bfq_group), - .cpd_size = sizeof(struct bfq_group_data), - .cftypes = bfqio_files, - .pd_init_fn = bfq_pd_init, - .cpd_init_fn = bfq_cpd_init, - .pd_offline_fn = bfq_pd_offline, - .pd_reset_stats_fn = bfq_pd_reset_stats, +struct cgroup_subsys bfqio_cgrp_subsys = { + .css_alloc = bfqio_create, + .css_online = bfqio_css_online, + .css_offline = bfqio_css_offline, + .can_attach = bfqio_can_attach, + .attach = bfqio_attach, + .css_free = bfqio_destroy, + .legacy_cftypes = bfqio_files, }; - #else - -static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; - } + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; entity->sched_data = &bfqg->sched_data; } -static struct bfq_group * -bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +static inline struct bfq_group * +bfq_bic_update_cgroup(struct bfq_io_cq *bic) { struct bfq_data *bfqd = bic_to_bfqd(bic); return bfqd->root_group; } -static void bfq_bfqq_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) +static inline void bfq_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_entity *entity, + struct bfq_group *bfqg) { } @@ -1084,24 +909,23 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static void bfq_disconnect_groups(struct bfq_data *bfqd) +static inline void bfq_disconnect_groups(struct bfq_data *bfqd) { bfq_put_async_queues(bfqd, bfqd->root_group); } -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static inline void bfq_free_root_group(struct bfq_data *bfqd) { - return bfqd->root_group; + kfree(bfqd->root_group); } -static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) { struct bfq_group *bfqg; int i; bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (!bfqg) + if (bfqg == NULL) return NULL; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c index fb7bb8f08..7f6b0004c 100644 --- a/block/bfq-ioc.c +++ b/block/bfq-ioc.c @@ -14,7 +14,7 @@ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. * @icq: the iocontext queue. */ -static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) { /* bic->icq is the first member, %NULL will convert to %NULL */ return container_of(icq, struct bfq_io_cq, icq); @@ -27,8 +27,8 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) * * Queue lock must be held. */ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc) +static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) { if (ioc) return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cf487f926..71b51c1b4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -82,9 +82,6 @@ static const int bfq_back_penalty = 2; /* Idling period duration, in jiffies. */ static int bfq_slice_idle = HZ / 125; -/* Minimum number of assigned budgets for which stats are safe to compute. */ -static const int bfq_stats_min_budgets = 194; - /* Default maximum budget values, in sectors and number of requests. */ static const int bfq_default_max_budget = 16 * 1024; static const int bfq_max_budget_async_rq = 4; @@ -166,22 +163,38 @@ static int device_speed_thresh[2]; #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -static void bfq_schedule_dispatch(struct bfq_data *bfqd); +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); #include "bfq-ioc.c" #include "bfq-sched.c" #include "bfq-cgroup.c" -#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_RT) #define bfq_sample_valid(samples) ((samples) > 80) +/* + * The following macro groups conditions that need to be evaluated when + * checking if existing queues and groups form a symmetric scenario + * and therefore idling can be reduced or disabled for some of the + * queues. See the comment to the function bfq_bfqq_must_not_expire() + * for further details. + */ +#ifdef CONFIG_CGROUP_BFQIO +#define symmetric_scenario (!bfqd->active_numerous_groups && \ + !bfq_differentiated_weights(bfqd)) +#else +#define symmetric_scenario (!bfq_differentiated_weights(bfqd)) +#endif + /* * We regard a request as SYNC, if either it's a read or has the SYNC bit * set (in which case it could also be a direct WRITE). */ -static int bfq_bio_sync(struct bio *bio) +static inline int bfq_bio_sync(struct bio *bio) { if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) return 1; @@ -193,7 +206,7 @@ static int bfq_bio_sync(struct bio *bio) * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. */ -static void bfq_schedule_dispatch(struct bfq_data *bfqd) +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) { if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); @@ -217,9 +230,9 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ - if (!rq1 || rq1 == rq2) + if (rq1 == NULL || rq1 == rq2) return rq2; - if (!rq2) + if (rq2 == NULL) return rq1; if (rq_is_sync(rq1) && !rq_is_sync(rq2)) @@ -332,17 +345,17 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", (long long unsigned)sector, - bfqq ? bfqq->pid : 0); + bfqq != NULL ? bfqq->pid : 0); return bfqq; } -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; struct bfq_queue *__bfqq; - if (bfqq->pos_root) { + if (bfqq->pos_root != NULL) { rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } @@ -352,10 +365,10 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (!bfqq->next_rq) return; - bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + bfqq->pos_root = &bfqd->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); - if (!__bfqq) { + if (__bfqq == NULL) { rb_link_node(&bfqq->pos_node, parent, p); rb_insert_color(&bfqq->pos_node, bfqq->pos_root); } else @@ -365,7 +378,7 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * Tell whether there are active queues or groups with differentiated weights. */ -static bool bfq_differentiated_weights(struct bfq_data *bfqd) +static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) { /* * For weights to differ, at least one of the trees must contain @@ -374,7 +387,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && (bfqd->queue_weights_tree.rb_node->rb_left || bfqd->queue_weights_tree.rb_node->rb_right) -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO ) || (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && (bfqd->group_weights_tree.rb_node->rb_left || @@ -383,40 +396,6 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) ); } -/* - * The following function returns true if every queue must receive the - * same share of the throughput (this condition is used when deciding - * whether idling may be disabled, see the comments in the function - * bfq_bfqq_may_idle()). - * - * Such a scenario occurs when: - * 1) all active queues have the same weight, - * 2) all active groups at the same level in the groups tree have the same - * weight, - * 3) all active groups at the same level in the groups tree have the same - * number of children. - * - * Unfortunately, keeping the necessary state for evaluating exactly the - * above symmetry conditions would be quite complex and time-consuming. - * Therefore this function evaluates, instead, the following stronger - * sub-conditions, for which it is much easier to maintain the needed - * state: - * 1) all active queues have the same weight, - * 2) all active groups have the same weight, - * 3) all active groups have at most one active child each. - * In particular, the last two conditions are always true if hierarchical - * support and the cgroups interface are not enabled, thus no state needs - * to be maintained in this case. - */ -static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -{ - return -#ifdef CONFIG_BFQ_GROUP_IOSCHED - !bfqd->active_numerous_groups && -#endif - !bfq_differentiated_weights(bfqd); -} - /* * If the weight-counter tree passed as input contains no counter for * the weight of the input entity, then add that counter; otherwise just @@ -516,10 +495,10 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - if (rbprev) + if (rbprev != NULL) prev = rb_entry_rq(rbprev); - if (rbnext) + if (rbnext != NULL) next = rb_entry_rq(rbnext); else { rbnext = rb_first(&bfqq->sort_list); @@ -531,8 +510,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, } /* see the definition of bfq_async_charge_factor for details */ -static unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) +static inline unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) { return blk_rq_sectors(rq) * (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * @@ -558,7 +537,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, struct request *next_rq = bfqq->next_rq; unsigned long new_budget; - if (!next_rq) + if (next_rq == NULL) return; if (bfqq == bfqd->in_service_queue) @@ -581,7 +560,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } } -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) { u64 dur; @@ -594,12 +573,13 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } -static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) +static inline unsigned +bfq_bfqq_cooperations(struct bfq_queue *bfqq) { return bfqq->bic ? bfqq->bic->cooperations : 0; } -static void +static inline void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { if (bic->saved_idle_window) @@ -623,7 +603,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bic->wr_time_left; bfqq->last_wr_start_finish = jiffies; - bfqq->entity.prio_changed = 1; + bfqq->entity.ioprio_changed = 1; } /* * Clear wr_time_left to prevent bfq_bfqq_save_state() from @@ -633,12 +613,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bic->wr_time_left = 0; } +/* Must be called with the queue_lock held. */ static int bfqq_process_refs(struct bfq_queue *bfqq) { int process_refs, io_refs; - lockdep_assert_held(bfqq->bfqd->queue->queue_lock); - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); @@ -646,7 +625,8 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) } /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static inline void bfq_reset_burst_list(struct bfq_data *bfqd, + struct bfq_queue *bfqq) { struct bfq_queue *item; struct hlist_node *n; @@ -878,14 +858,14 @@ static void bfq_add_request(struct request *rq) */ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(!next_rq); + BUG_ON(next_rq == NULL); bfqq->next_rq = next_rq; /* * Adjust priority tree position, if next_rq changes. */ if (prev != bfqq->next_rq) - bfq_pos_tree_add_move(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); if (!bfq_bfqq_busy(bfqq)) { bool soft_rt, coop_or_in_burst, @@ -893,10 +873,6 @@ static void bfq_add_request(struct request *rq) bfqq->budget_timeout + bfqd->bfq_wr_min_idle_time); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, - rq->cmd_flags); -#endif if (bfq_bfqq_sync(bfqq)) { bool already_in_burst = !hlist_unhashed(&bfqq->burst_list_node) || @@ -941,7 +917,7 @@ static void bfq_add_request(struct request *rq) goto add_bfqq_busy; if (bfq_bfqq_just_split(bfqq)) - goto set_prio_changed; + goto set_ioprio_changed; /* * If the queue: @@ -953,7 +929,7 @@ static void bfq_add_request(struct request *rq) * start a weight-raising period. */ if (old_wr_coeff == 1 && (interactive || soft_rt) && - (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { + (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { bfqq->wr_coeff = bfqd->bfq_wr_coeff; if (interactive) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); @@ -1032,9 +1008,9 @@ static void bfq_add_request(struct request *rq) bfqd->bfq_wr_rt_max_time; } } -set_prio_changed: +set_ioprio_changed: if (old_wr_coeff != bfqq->wr_coeff) - entity->prio_changed = 1; + entity->ioprio_changed = 1; add_bfqq_busy: bfqq->last_idle_bklogged = jiffies; bfqq->service_from_backlogged = 0; @@ -1049,7 +1025,7 @@ add_bfqq_busy: bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqd->wr_busy_queues++; - entity->prio_changed = 1; + entity->ioprio_changed = 1; bfq_log_bfqq(bfqd, bfqq, "non-idle wrais starting at %lu, rais_max_time %u", jiffies, @@ -1072,11 +1048,11 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, struct bfq_queue *bfqq; bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) + if (bic == NULL) return NULL; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); - if (bfqq) + if (bfqq != NULL) return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); return NULL; @@ -1092,7 +1068,8 @@ static void bfq_activate_request(struct request_queue *q, struct request *rq) (long long unsigned)bfqd->last_position); } -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +static inline void bfq_deactivate_request(struct request_queue *q, + struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; @@ -1124,7 +1101,7 @@ static void bfq_remove_request(struct request *rq) /* * Remove queue from request-position tree as it is empty. */ - if (bfqq->pos_root) { + if (bfqq->pos_root != NULL) { rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } @@ -1134,9 +1111,6 @@ static void bfq_remove_request(struct request *rq) BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -#endif } static int bfq_merge(struct request_queue *q, struct request **req, @@ -1146,7 +1120,7 @@ static int bfq_merge(struct request_queue *q, struct request **req, struct request *__rq; __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq && elv_rq_merge_ok(__rq, bio)) { + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; } @@ -1173,7 +1147,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, bfqd->last_position); - BUG_ON(!next_rq); + BUG_ON(next_rq == NULL); bfqq->next_rq = next_rq; /* * If next_rq changes, update both the queue's budget to @@ -1182,19 +1156,11 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, */ if (prev != bfqq->next_rq) { bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); } } } -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) -{ - bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); -} -#endif - static void bfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { @@ -1221,21 +1187,18 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(next); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -#endif } /* Must be called with bfqq != NULL */ -static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { - BUG_ON(!bfqq); + BUG_ON(bfqq == NULL); if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; /* Trigger a weight change on the next activation of the queue */ - bfqq->entity.prio_changed = 1; + bfqq->entity.ioprio_changed = 1; } static void bfq_end_wr_async_queues(struct bfq_data *bfqd, @@ -1245,9 +1208,9 @@ static void bfq_end_wr_async_queues(struct bfq_data *bfqd, for (i = 0; i < 2; i++) for (j = 0; j < IOPRIO_BE_NR; j++) - if (bfqg->async_bfqq[i][j]) + if (bfqg->async_bfqq[i][j] != NULL) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq) + if (bfqg->async_idle_bfqq != NULL) bfq_bfqq_end_wr(bfqg->async_idle_bfqq); } @@ -1266,7 +1229,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) spin_unlock_irq(bfqd->queue->queue_lock); } -static sector_t bfq_io_struct_pos(void *io_struct, bool request) +static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) { if (request) return blk_rq_pos(io_struct); @@ -1274,18 +1237,25 @@ static sector_t bfq_io_struct_pos(void *io_struct, bool request) return ((struct bio *)io_struct)->bi_iter.bi_sector; } -static int bfq_rq_close_to_sector(void *io_struct, bool request, - sector_t sector) +static inline sector_t bfq_dist_from(sector_t pos1, + sector_t pos2) { - return abs64(bfq_io_struct_pos(io_struct, request) - sector) <= - BFQQ_SEEK_THR; + if (pos1 >= pos2) + return pos1 - pos2; + else + return pos2 - pos1; } -static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, - struct bfq_queue *bfqq, +static inline int bfq_rq_close_to_sector(void *io_struct, bool request, sector_t sector) { - struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= + BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) +{ + struct rb_root *root = &bfqd->rq_pos_tree; struct rb_node *parent, *node; struct bfq_queue *__bfqq; @@ -1297,7 +1267,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, * request, choose it. */ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); - if (__bfqq) + if (__bfqq != NULL) return __bfqq; /* @@ -1313,7 +1283,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, node = rb_next(&__bfqq->pos_node); else node = rb_prev(&__bfqq->pos_node); - if (!node) + if (node == NULL) return NULL; __bfqq = rb_entry(node, struct bfq_queue, pos_node); @@ -1323,21 +1293,56 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, return NULL; } -static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, - struct bfq_queue *cur_bfqq, - sector_t sector) +/* + * bfqd - obvious + * cur_bfqq - passed in so that we don't decide that the current queue + * is closely cooperating with itself + * sector - used as a reference point to search for a close queue + */ +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) { struct bfq_queue *bfqq; + if (bfq_class_idle(cur_bfqq)) + return NULL; + if (!bfq_bfqq_sync(cur_bfqq)) + return NULL; + if (BFQQ_SEEKY(cur_bfqq)) + return NULL; + + /* If device has only one backlogged bfq_queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + /* + * We should notice if some of the queues are cooperating, e.g. + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + bfqq = bfqq_close(bfqd, sector); + if (bfqq == NULL || bfqq == cur_bfqq) + return NULL; + + /* + * Do not merge queues from different bfq_groups. + */ + if (bfqq->entity.parent != cur_bfqq->entity.parent) + return NULL; + /* - * We shall notice if some of the queues are cooperating, - * e.g., working closely on the same area of the device. In - * that case, we can group them together and: 1) don't waste - * time idling, and 2) serve the union of their requests in - * the best possible order for throughput. + * It only makes sense to merge sync queues. */ - bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); - if (!bfqq || bfqq == cur_bfqq) + if (!bfq_bfqq_sync(bfqq)) + return NULL; + if (BFQQ_SEEKY(bfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes. + */ + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) return NULL; return bfqq; @@ -1404,32 +1409,6 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) return new_bfqq; } -static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) -{ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; - - /* - * If either of the queues has already been detected as seeky, - * then merging it with the other queue is unlikely to lead to - * sequential I/O. - */ - if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) - return false; - - /* - * Interleaved I/O is known to be done by (some) applications - * only for reads, so it does not make sense to merge async - * queues. - */ - if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) - return false; - - return true; -} - /* * Attempt to schedule a merge of bfqq with the currently in-service queue * or with a close queue among the scheduled queues. @@ -1451,52 +1430,56 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->new_bfqq) return bfqq->new_bfqq; + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; - /* If device has only one backlogged bfq_queue, don't search. */ - if (bfqd->busy_queues == 1) - return NULL; in_service_bfqq = bfqd->in_service_queue; - if (!in_service_bfqq || in_service_bfqq == bfqq || + if (in_service_bfqq == NULL || in_service_bfqq == bfqq || !bfqd->in_service_bic || unlikely(in_service_bfqq == &bfqd->oom_bfqq)) goto check_scheduled; + if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) + goto check_scheduled; + + if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) + goto check_scheduled; + + if (in_service_bfqq->entity.parent != bfqq->entity.parent) + goto check_scheduled; + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { + bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); - if (new_bfqq) - return new_bfqq; + if (new_bfqq != NULL) + return new_bfqq; /* Merge with in-service queue */ } + /* * Check whether there is a cooperator among currently scheduled * queues. The only thing we need is that the bio/request is not * NULL, as we need it to establish whether a cooperator exists. */ check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - - if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) + new_bfqq = bfq_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); return NULL; } -static void bfq_bfqq_save_state(struct bfq_queue *bfqq) +static inline void +bfq_bfqq_save_state(struct bfq_queue *bfqq) { /* - * If !bfqq->bic, the queue is already shared or its requests + * If bfqq->bic == NULL, the queue is already shared or its requests * have already been redirected to a shared queue; both idle window * and weight raising state have already been saved. Do nothing. */ - if (!bfqq->bic) + if (bfqq->bic == NULL) return; if (bfqq->bic->wr_time_left) /* @@ -1540,7 +1523,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bfqq->bic->failed_cooperations = 0; } -static void bfq_get_bic_reference(struct bfq_queue *bfqq) +static inline void +bfq_get_bic_reference(struct bfq_queue *bfqq) { /* * If bfqq->bic has a non-NULL value, the bic to which it belongs @@ -1588,7 +1572,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } -static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) { struct bfq_io_cq *bic = bfqq->bic; struct bfq_data *bfqd = bfqq->bfqd; @@ -1619,7 +1603,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * Queue lock is held here. */ bic = bfq_bic_lookup(bfqd, current->io_context); - if (!bic) + if (bic == NULL) return 0; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); @@ -1627,9 +1611,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. */ - if (bfqq) { + if (bfqq != NULL) { new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); - if (new_bfqq) { + if (new_bfqq != NULL) { bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); /* * If we get here, the bio will be queued in the @@ -1647,10 +1631,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - if (bfqq) { -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -#endif + if (bfqq != NULL) { bfq_mark_bfqq_must_alloc(bfqq); bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); @@ -1658,7 +1639,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; bfq_log_bfqq(bfqd, bfqq, - "set_in_service_queue, cur-budget = %d", + "set_in_service_queue, cur-budget = %lu", bfqq->entity.budget); } @@ -1681,9 +1662,9 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) * stored in bfqd, which is dynamically updated according to the * estimated disk peak rate; otherwise return the default max budget */ -static int bfq_max_budget(struct bfq_data *bfqd) +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) { - if (bfqd->budgets_assigned < bfq_stats_min_budgets) + if (bfqd->budgets_assigned < 194) return bfq_default_max_budget; else return bfqd->bfq_max_budget; @@ -1693,9 +1674,9 @@ static int bfq_max_budget(struct bfq_data *bfqd) * Return min budget, which is a fraction of the current or default * max budget (trying with 1/32) */ -static int bfq_min_budget(struct bfq_data *bfqd) +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) { - if (bfqd->budgets_assigned < bfq_stats_min_budgets) + if (bfqd->budgets_assigned < 194) return bfq_default_max_budget / 32; else return bfqd->bfq_max_budget / 32; @@ -1711,7 +1692,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) /* Processes have exited, don't wait. */ bic = bfqd->in_service_bic; - if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) + if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) return; bfq_mark_bfqq_wait_request(bfqq); @@ -1737,15 +1718,12 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > bfq_max_budget(bfqq->bfqd) / 8) || bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && - bfq_symmetric_scenario(bfqd)) + symmetric_scenario) sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); else if (bfqq->wr_coeff > 1) sl = sl * 3; bfqd->last_idling_start = ktime_get(); mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -#endif bfq_log(bfqd, "arm idle: %u/%u ms", jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); } @@ -1799,10 +1777,6 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) if (bfq_bfqq_sync(bfqq)) bfqd->sync_flight++; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), - rq->cmd_flags); -#endif } /* @@ -1828,7 +1802,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) return rq; } -static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; return entity->budget - entity->service; @@ -1862,7 +1836,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * Resort priority tree of potential close cooperators. */ - bfq_pos_tree_add_move(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); } } @@ -1872,24 +1846,24 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) * @bfqq: queue to update. * @reason: reason for expiration. * - * Handle the feedback on @bfqq budget at queue expiration. - * See the body for detailed comments. + * Handle the feedback on @bfqq budget. See the body for detailed + * comments. */ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct bfq_queue *bfqq, enum bfqq_expiration reason) { struct request *next_rq; - int budget, min_budget; + unsigned long budget, min_budget; budget = bfqq->max_budget; min_budget = bfq_min_budget(bfqd); BUG_ON(bfqq != bfqd->in_service_queue); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", budget, bfq_min_budget(bfqd)); bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); @@ -1966,19 +1940,18 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, default: return; } - } else - /* - * Async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). - */ + } else /* async queue */ + /* async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). + */ budget = bfqd->bfq_max_budget; bfqq->max_budget = budget; - if (bfqd->budgets_assigned >= bfq_stats_min_budgets && - !bfqd->bfq_user_max_budget) - bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && + bfqq->max_budget > bfqd->bfq_max_budget) + bfqq->max_budget = bfqd->bfq_max_budget; /* * Make sure that we have enough budget for the next request. @@ -1987,14 +1960,14 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, * update. */ next_rq = bfqq->next_rq; - if (next_rq) + if (next_rq != NULL) bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); else bfqq->entity.budget = bfqq->max_budget; - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", - next_rq ? blk_rq_sectors(next_rq) : 0, + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", + next_rq != NULL ? blk_rq_sectors(next_rq) : 0, bfqq->entity.budget); } @@ -2020,15 +1993,15 @@ static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) * seeky processes, and hence reduce their chances to lower the * throughput. See the code for more details. */ -static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason) +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int compensate, enum bfqq_expiration reason) { u64 bw, usecs, expected, timeout; ktime_t delta; int update = 0; if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) - return false; + return 0; if (compensate) delta = bfqd->last_idling_start; @@ -2039,7 +2012,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* Don't trust short/unrealistic values. */ if (usecs < 100 || usecs >= LONG_MAX) - return false; + return 0; /* * Calculate the bandwidth for the last slice. We use a 64 bit @@ -2088,7 +2061,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd->peak_rate, timeout); - bfq_log(bfqd, "new max_budget=%d", + bfq_log(bfqd, "new max_budget=%lu", bfqd->bfq_max_budget); } if (bfqd->device_speed == BFQ_BFQD_FAST && @@ -2113,7 +2086,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * and for the moment return false. */ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return false; + return 0; /* * A process is considered ``slow'' (i.e., seeky, so that we @@ -2188,8 +2161,8 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * seems to be quite precise also in embedded systems and KVM/QEMU virtual * machines. */ -static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) +static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) { return max(bfqq->last_idle_bklogged + HZ * bfqq->service_from_backlogged / @@ -2202,7 +2175,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, * the current time will be lower than this time instant according to the macro * time_is_before_jiffies(). */ -static unsigned long bfq_infinity_from_now(unsigned long now) +static inline unsigned long bfq_infinity_from_now(unsigned long now) { return now + ULONG_MAX / 2; } @@ -2239,14 +2212,13 @@ static unsigned long bfq_infinity_from_now(unsigned long now) */ static void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, + int compensate, enum bfqq_expiration reason) { - bool slow; + int slow; BUG_ON(bfqq != bfqd->in_service_queue); - /* - * Update disk peak rate for autotuning and check whether the + /* Update disk peak rate for autotuning and check whether the * process is slow (see bfq_update_peak_rate). */ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); @@ -2340,12 +2312,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * just checked on request arrivals and completions, as well as on * idle timer expirations. */ -static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { if (bfq_bfqq_budget_new(bfqq) || time_before(jiffies, bfqq->budget_timeout)) - return false; - return true; + return 0; + return 1; } /* @@ -2356,7 +2328,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) * does not hold, or if the queue is slow enough to deserve only to be * kicked off for preserving a high throughput. */ -static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "may_budget_timeout: wait_request %d left %d timeout %d", @@ -2371,278 +2343,183 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) } /* - * For a queue that becomes empty, device idling is allowed only if - * this function returns true for that queue. As a consequence, since - * device idling plays a critical role for both throughput boosting - * and service guarantees, the return value of this function plays a - * critical role as well. + * Device idling is allowed only for the queues for which this function + * returns true. For this reason, the return value of this function plays a + * critical role for both throughput boosting and service guarantees. The + * return value is computed through a logical expression. In this rather + * long comment, we try to briefly describe all the details and motivations + * behind the components of this logical expression. + * + * First, the expression is false if bfqq is not sync, or if: bfqq happened + * to become active during a large burst of queue activations, and the + * pattern of requests bfqq contains boosts the throughput if bfqq is + * expired. In fact, queues that became active during a large burst benefit + * only from throughput, as discussed in the comments to bfq_handle_burst. + * In this respect, expiring bfqq certainly boosts the throughput on NCQ- + * capable flash-based devices, whereas, on rotational devices, it boosts + * the throughput only if bfqq contains random requests. + * + * On the opposite end, if (a) bfqq is sync, (b) the above burst-related + * condition does not hold, and (c) bfqq is being weight-raised, then the + * expression always evaluates to true, as device idling is instrumental + * for preserving low-latency guarantees (see [1]). If, instead, conditions + * (a) and (b) do hold, but (c) does not, then the expression evaluates to + * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and + * (2) at least one of the following two conditions holds. + * The first condition is that the device is not performing NCQ, because + * idling the device most certainly boosts the throughput if this condition + * holds and bfqq is I/O-bound and has been granted a non-null idle window. + * The second compound condition is made of the logical AND of two components. + * + * The first component is true only if there is no weight-raised busy + * queue. This guarantees that the device is not idled for a sync non- + * weight-raised queue when there are busy weight-raised queues. The former + * is then expired immediately if empty. Combined with the timestamping + * rules of BFQ (see [1] for details), this causes sync non-weight-raised + * queues to get a lower number of requests served, and hence to ask for a + * lower number of requests from the request pool, before the busy weight- + * raised queues get served again. + * + * This is beneficial for the processes associated with weight-raised + * queues, when the request pool is saturated (e.g., in the presence of + * write hogs). In fact, if the processes associated with the other queues + * ask for requests at a lower rate, then weight-raised processes have a + * higher probability to get a request from the pool immediately (or at + * least soon) when they need one. Hence they have a higher probability to + * actually get a fraction of the disk throughput proportional to their + * high weight. This is especially true with NCQ-capable drives, which + * enqueue several requests in advance and further reorder internally- + * queued requests. + * + * In the end, mistreating non-weight-raised queues when there are busy + * weight-raised queues seems to mitigate starvation problems in the + * presence of heavy write workloads and NCQ, and hence to guarantee a + * higher application and system responsiveness in these hostile scenarios. + * + * If the first component of the compound condition is instead true, i.e., + * there is no weight-raised busy queue, then the second component of the + * compound condition takes into account service-guarantee and throughput + * issues related to NCQ (recall that the compound condition is evaluated + * only if the device is detected as supporting NCQ). * - * In a nutshell, this function returns true only if idling is - * beneficial for throughput or, even if detrimental for throughput, - * idling is however necessary to preserve service guarantees (low - * latency, desired throughput distribution, ...). In particular, on - * NCQ-capable devices, this function tries to return false, so as to - * help keep the drives' internal queues full, whenever this helps the - * device boost the throughput without causing any service-guarantee - * issue. + * As for service guarantees, allowing the drive to enqueue more than one + * request at a time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of control on + * the actual request service order. In this respect, when the drive is + * allowed to enqueue more than one request at a time, the service + * distribution enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution only in the + * following, perfectly symmetric, scenario: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * + * Even in such a scenario, sequential I/O may still receive a preferential + * treatment, but this is not likely to be a big issue with flash-based + * devices, because of their non-dramatic loss of throughput with random + * I/O. Things do differ with HDDs, for which additional care is taken, as + * explained after completing the discussion for flash-based devices. * - * In more detail, the return value of this function is obtained by, - * first, computing a number of boolean variables that take into - * account throughput and service-guarantee issues, and, then, - * combining these variables in a logical expression. Most of the - * issues taken into account are not trivial. We discuss these issues - * while introducing the variables. + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. + * Therefore BFQ evaluates instead the following stronger sub-conditions, + * for which it is much easier to maintain the needed state: + * 1) all active queues have the same weight, + * 2) all active groups have the same weight, + * 3) all active groups have at most one active child each. + * In particular, the last two conditions are always true if hierarchical + * support and the cgroups interface are not enabled, hence no state needs + * to be maintained in this case. + * + * According to the above considerations, the second component of the + * compound condition evaluates to true if any of the above symmetry + * sub-condition does not hold, or the device is not flash-based. Therefore, + * if also the first component is true, then idling is allowed for a sync + * queue. These are the only sub-conditions considered if the device is + * flash-based, as, for such a device, it is sensible to force idling only + * for service-guarantee issues. In fact, as for throughput, idling + * NCQ-capable flash-based devices would not boost the throughput even + * with sequential I/O; rather it would lower the throughput in proportion + * to how fast the device is. In the end, (only) if all the three + * sub-conditions hold and the device is flash-based, the compound + * condition evaluates to false and therefore no idling is performed. + * + * As already said, things change with a rotational device, where idling + * boosts the throughput with sequential I/O (even with NCQ). Hence, for + * such a device the second component of the compound condition evaluates + * to true also if the following additional sub-condition does not hold: + * the queue is constantly seeky. Unfortunately, this different behavior + * with respect to flash-based devices causes an additional asymmetry: if + * some sync queues enjoy idling and some other sync queues do not, then + * the latter get a low share of the device throughput, simply because the + * former get many requests served after being set as in service, whereas + * the latter do not. As a consequence, to guarantee the desired throughput + * distribution, on HDDs the compound expression evaluates to true (and + * hence device idling is performed) also if the following last symmetry + * condition does not hold: no other queue is benefiting from idling. Also + * this last condition is actually replaced with a simpler-to-maintain and + * stronger condition: there is no busy queue which is not constantly seeky + * (and hence may also benefit from idling). + * + * To sum up, when all the required symmetry and throughput-boosting + * sub-conditions hold, the second component of the compound condition + * evaluates to false, and hence no idling is performed. This helps to + * keep the drives' internal queues full on NCQ-capable devices, and hence + * to boost the throughput, without causing 'almost' any loss of service + * guarantees. The 'almost' follows from the fact that, if the internal + * queue of one such device is filled while all the sub-conditions hold, + * but at some point in time some sub-condition stops to hold, then it may + * become impossible to let requests be served in the new desired order + * until all the requests already queued in the device have been served. */ -static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) +static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; - bool idling_boosts_thr, idling_boosts_thr_without_issues, - all_queues_seeky, on_hdd_and_not_all_queues_seeky, - idling_needed_for_service_guarantees, - asymmetric_scenario; - - /* - * The next variable takes into account the cases where idling - * boosts the throughput. - * - * The value of the variable is computed considering, first, that - * idling is virtually always beneficial for the throughput if: - * (a) the device is not NCQ-capable, or - * (b) regardless of the presence of NCQ, the device is rotational - * and the request pattern for bfqq is I/O-bound and sequential. - * - * Secondly, and in contrast to the above item (b), idling an - * NCQ-capable flash-based device would not boost the - * throughput even with sequential I/O; rather it would lower - * the throughput in proportion to how fast the device - * is. Accordingly, the next variable is true if any of the - * above conditions (a) and (b) is true, and, in particular, - * happens to be false if bfqd is an NCQ-capable flash-based - * device. - */ - idling_boosts_thr = !bfqd->hw_tag || - (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && - bfq_bfqq_idle_window(bfqq)) ; +#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \ + bfqd->busy_in_flight_queues == \ + bfqd->const_seeky_busy_in_flight_queues) - /* - * The value of the next variable, - * idling_boosts_thr_without_issues, is equal to that of - * idling_boosts_thr, unless a special case holds. In this - * special case, described below, idling may cause problems to - * weight-raised queues. - * - * When the request pool is saturated (e.g., in the presence - * of write hogs), if the processes associated with - * non-weight-raised queues ask for requests at a lower rate, - * then processes associated with weight-raised queues have a - * higher probability to get a request from the pool - * immediately (or at least soon) when they need one. Thus - * they have a higher probability to actually get a fraction - * of the device throughput proportional to their high - * weight. This is especially true with NCQ-capable drives, - * which enqueue several requests in advance, and further - * reorder internally-queued requests. - * - * For this reason, we force to false the value of - * idling_boosts_thr_without_issues if there are weight-raised - * busy queues. In this case, and if bfqq is not weight-raised, - * this guarantees that the device is not idled for bfqq (if, - * instead, bfqq is weight-raised, then idling will be - * guaranteed by another variable, see below). Combined with - * the timestamping rules of BFQ (see [1] for details), this - * behavior causes bfqq, and hence any sync non-weight-raised - * queue, to get a lower number of requests served, and thus - * to ask for a lower number of requests from the request - * pool, before the busy weight-raised queues get served - * again. This often mitigates starvation problems in the - * presence of heavy write workloads and NCQ, thereby - * guaranteeing a higher application and system responsiveness - * in these hostile scenarios. - */ - idling_boosts_thr_without_issues = idling_boosts_thr && - bfqd->wr_busy_queues == 0; - - /* - * There are then two cases where idling must be performed not - * for throughput concerns, but to preserve service - * guarantees. In the description of these cases, we say, for - * short, that a queue is sequential/random if the process - * associated to the queue issues sequential/random requests - * (in the second case the queue may be tagged as seeky or - * even constantly_seeky). - * - * To introduce the first case, we note that, since - * bfq_bfqq_idle_window(bfqq) is false if the device is - * NCQ-capable and bfqq is random (see - * bfq_update_idle_window()), then, from the above two - * assignments it follows that - * idling_boosts_thr_without_issues is false if the device is - * NCQ-capable and bfqq is random. Therefore, for this case, - * device idling would never be allowed if we used just - * idling_boosts_thr_without_issues to decide whether to allow - * it. And, beneficially, this would imply that throughput - * would always be boosted also with random I/O on NCQ-capable - * HDDs. - * - * But we must be careful on this point, to avoid an unfair - * treatment for bfqq. In fact, because of the same above - * assignments, idling_boosts_thr_without_issues is, on the - * other hand, true if 1) the device is an HDD and bfqq is - * sequential, and 2) there are no busy weight-raised - * queues. As a consequence, if we used just - * idling_boosts_thr_without_issues to decide whether to idle - * the device, then with an HDD we might easily bump into a - * scenario where queues that are sequential and I/O-bound - * would enjoy idling, whereas random queues would not. The - * latter might then get a low share of the device throughput, - * simply because the former would get many requests served - * after being set as in service, while the latter would not. - * - * To address this issue, we start by setting to true a - * sentinel variable, on_hdd_and_not_all_queues_seeky, if the - * device is rotational and not all queues with pending or - * in-flight requests are constantly seeky (i.e., there are - * active sequential queues, and bfqq might then be mistreated - * if it does not enjoy idling because it is random). - */ - all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && - bfqd->busy_in_flight_queues == - bfqd->const_seeky_busy_in_flight_queues; - - on_hdd_and_not_all_queues_seeky = - !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; - - /* - * To introduce the second case where idling needs to be - * performed to preserve service guarantees, we can note that - * allowing the drive to enqueue more than one request at a - * time, and hence delegating de facto final scheduling - * decisions to the drive's internal scheduler, causes loss of - * control on the actual request service order. In particular, - * the critical situation is when requests from different - * processes happens to be present, at the same time, in the - * internal queue(s) of the drive. In such a situation, the - * drive, by deciding the service order of the - * internally-queued requests, does determine also the actual - * throughput distribution among these processes. But the - * drive typically has no notion or concern about per-process - * throughput distribution, and makes its decisions only on a - * per-request basis. Therefore, the service distribution - * enforced by the drive's internal scheduler is likely to - * coincide with the desired device-throughput distribution - * only in a completely symmetric scenario where: - * (i) each of these processes must get the same throughput as - * the others; - * (ii) all these processes have the same I/O pattern - (either sequential or random). - * In fact, in such a scenario, the drive will tend to treat - * the requests of each of these processes in about the same - * way as the requests of the others, and thus to provide - * each of these processes with about the same throughput - * (which is exactly the desired throughput distribution). In - * contrast, in any asymmetric scenario, device idling is - * certainly needed to guarantee that bfqq receives its - * assigned fraction of the device throughput (see [1] for - * details). - * - * We address this issue by controlling, actually, only the - * symmetry sub-condition (i), i.e., provided that - * sub-condition (i) holds, idling is not performed, - * regardless of whether sub-condition (ii) holds. In other - * words, only if sub-condition (i) holds, then idling is - * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that, first, - * in the case of an HDD, the asymmetry in terms of types of - * I/O patterns is already taken in to account in the above - * sentinel variable - * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a - * flash-based device, we prefer however to privilege - * throughput (and idling lowers throughput for this type of - * devices), for the following reasons: - * 1) differently from HDDs, the service time of random - * requests is not orders of magnitudes lower than the service - * time of sequential requests; thus, even if processes doing - * sequential I/O get a preferential treatment with respect to - * others doing random I/O, the consequences are not as - * dramatic as with HDDs; - * 2) if a process doing random I/O does need strong - * throughput guarantees, it is hopefully already being - * weight-raised, or the user is likely to have assigned it a - * higher weight than the other processes (and thus - * sub-condition (i) is likely to be false, which triggers - * idling). - * - * According to the above considerations, the next variable is - * true (only) if sub-condition (i) holds. To compute the - * value of this variable, we not only use the return value of - * the function bfq_symmetric_scenario(), but also check - * whether bfqq is being weight-raised, because - * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments to - * bfq_weights_tree_add()). - * - * As a side note, it is worth considering that the above - * device-idling countermeasures may however fail in the - * following unlucky scenario: if idling is (correctly) - * disabled in a time period during which all symmetry - * sub-conditions hold, and hence the device is allowed to - * enqueue many requests, but at some later point in time some - * sub-condition stops to hold, then it may become impossible - * to let requests be served in the desired order until all - * the requests already queued in the device have been served. - */ - asymmetric_scenario = bfqq->wr_coeff > 1 || - !bfq_symmetric_scenario(bfqd); +#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \ + bfqd->hw_tag && \ + (blk_queue_nonrot(bfqd->queue) || \ + bfq_bfqq_constantly_seeky(bfqq))) - /* - * Finally, there is a case where maximizing throughput is the - * best choice even if it may cause unfairness toward - * bfqq. Such a case is when bfqq became active in a burst of - * queue activations. Queues that became active during a large - * burst benefit only from throughput, as discussed in the - * comments to bfq_handle_burst. Thus, if bfqq became active - * in a burst and not idling the device maximizes throughput, - * then the device must no be idled, because not idling the - * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the two cases above, we - * can now establish when idling is actually needed to - * preserve service guarantees. - */ - idling_needed_for_service_guarantees = - (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && - !bfq_bfqq_in_large_burst(bfqq); +/* + * Condition for expiring a non-weight-raised queue (and hence not idling + * the device). + */ +#define cond_for_expiring_non_wr (bfqd->hw_tag && \ + (bfqd->wr_busy_queues > 0 || \ + (blk_queue_nonrot(bfqd->queue) || \ + cond_for_seeky_on_ncq_hdd))) - /* - * We have now all the components we need to compute the return - * value of the function, which is true only if both the following - * conditions hold: - * 1) bfqq is sync, because idling make sense only for sync queues; - * 2) idling either boosts the throughput (without issues), or - * is necessary to preserve service guarantees. - */ return bfq_bfqq_sync(bfqq) && - (idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees); + !cond_for_expiring_in_burst && + (bfqq->wr_coeff > 1 || !symmetric_scenario || + (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) && + !cond_for_expiring_non_wr) + ); } /* - * If the in-service queue is empty but the function bfq_bfqq_may_idle - * returns true, then: + * If the in-service queue is empty but sync, and the function + * bfq_bfqq_must_not_expire returns true, then: * 1) the queue must remain in service and cannot be expired, and - * 2) the device must be idled to wait for the possible arrival of a new + * 2) the disk must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments to the function bfq_bfqq_may_idle for the reasons + * See the comments to the function bfq_bfqq_must_not_expire for the reasons * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself + * and preserve service guarantees when bfq_bfqq_must_not_expire itself * returns true. */ -static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && - bfq_bfqq_may_idle(bfqq); + bfq_bfqq_must_not_expire(bfqq); } /* @@ -2656,7 +2533,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; bfqq = bfqd->in_service_queue; - if (!bfqq) + if (bfqq == NULL) goto new_queue; bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); @@ -2671,7 +2548,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * If bfqq has requests queued and it has enough budget left to * serve them, keep the queue, otherwise expire it. */ - if (next_rq) { + if (next_rq != NULL) { if (bfq_serv_to_charge(next_rq, bfqq) > bfq_bfqq_budget_left(bfqq)) { reason = BFQ_BFQQ_BUDGET_EXHAUSTED; @@ -2698,9 +2575,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) */ bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif } goto keep_queue; } @@ -2712,18 +2586,18 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * may idle after their completion, then keep it anyway. */ if (timer_pending(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { bfqq = NULL; goto keep_queue; } reason = BFQ_BFQQ_NO_MORE_REQUESTS; expire: - bfq_bfqq_expire(bfqd, bfqq, false, reason); + bfq_bfqq_expire(bfqd, bfqq, 0, reason); new_queue: bfqq = bfq_set_in_service_queue(bfqd); bfq_log(bfqd, "select_queue: new queue %d returned", - bfqq ? bfqq->pid : 0); + bfqq != NULL ? bfqq->pid : 0); keep_queue: return bfqq; } @@ -2741,7 +2615,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != entity->orig_weight * bfqq->wr_coeff); - if (entity->prio_changed) + if (entity->ioprio_changed) bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); /* @@ -2785,7 +2659,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, /* Follow expired path, else get first next available. */ rq = bfq_check_fifo(bfqq); - if (!rq) + if (rq == NULL) rq = bfqq->next_rq; service_to_charge = bfq_serv_to_charge(rq, bfqq); @@ -2821,14 +2695,14 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, - "dispatched %u sec req (%llu), budg left %d", + "dispatched %u sec req (%llu), budg left %lu", blk_rq_sectors(rq), (long long unsigned)blk_rq_pos(rq), bfq_bfqq_budget_left(bfqq)); dispatched++; - if (!bfqd->in_service_bic) { + if (bfqd->in_service_bic == NULL) { atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); bfqd->in_service_bic = RQ_BIC(rq); } @@ -2841,7 +2715,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, return dispatched; expire: - bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); return dispatched; } @@ -2849,7 +2723,7 @@ static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) { int dispatched = 0; - while (bfqq->next_rq) { + while (bfqq->next_rq != NULL) { bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); dispatched++; } @@ -2869,7 +2743,7 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) int dispatched = 0; bfqq = bfqd->in_service_queue; - if (bfqq) + if (bfqq != NULL) __bfq_bfqq_expire(bfqd, bfqq); /* @@ -2905,7 +2779,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) return bfq_forced_dispatch(bfqd); bfqq = bfq_select_queue(bfqd); - if (!bfqq) + if (bfqq == NULL) return 0; if (bfq_class_idle(bfqq)) @@ -2945,9 +2819,6 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) static void bfq_put_queue(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_group *bfqg = bfqq_group(bfqq); -#endif BUG_ON(atomic_read(&bfqq->ref) <= 0); @@ -2956,9 +2827,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq) if (!atomic_dec_and_test(&bfqq->ref)) return; - BUG_ON(rb_first(&bfqq->sort_list)); + BUG_ON(rb_first(&bfqq->sort_list) != NULL); BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); + BUG_ON(bfqq->entity.tree != NULL); BUG_ON(bfq_bfqq_busy(bfqq)); BUG_ON(bfqd->in_service_queue == bfqq); @@ -2976,9 +2847,6 @@ static void bfq_put_queue(struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_put(bfqg); -#endif } static void bfq_put_cooperator(struct bfq_queue *bfqq) @@ -3015,7 +2883,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_init_icq(struct io_cq *icq) +static inline void bfq_init_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); @@ -3082,38 +2950,40 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b /* * No prio set, inherit CPU scheduling settings. */ - bfqq->new_ioprio = task_nice_ioprio(tsk); - bfqq->new_ioprio_class = task_nice_ioclass(tsk); + bfqq->entity.new_ioprio = task_nice_ioprio(tsk); + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->new_ioprio_class = IOPRIO_CLASS_RT; + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->new_ioprio_class = IOPRIO_CLASS_BE; + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: - bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->entity.new_ioprio = 7; bfq_clear_bfqq_idle_window(bfqq); break; } - if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->entity.new_ioprio < 0 || + bfqq->entity.new_ioprio >= IOPRIO_BE_NR) { printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", - bfqq->new_ioprio); + bfqq->entity.new_ioprio); BUG(); } - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio); + bfqq->entity.ioprio_changed = 1; } -static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) +static void bfq_check_ioprio_change(struct bfq_io_cq *bic) { struct bfq_data *bfqd; struct bfq_queue *bfqq, *new_bfqq; + struct bfq_group *bfqg; unsigned long uninitialized_var(flags); int ioprio = bic->icq.ioc->ioprio; @@ -3123,16 +2993,18 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) * This condition may trigger on a newly created bic, be sure to * drop the lock before returning. */ - if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) + if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) goto out; bic->ioprio = ioprio; bfqq = bic->bfqq[BLK_RW_ASYNC]; - if (bfqq) { - new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, + if (bfqq != NULL) { + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, + sched_data); + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, GFP_ATOMIC); - if (new_bfqq) { + if (new_bfqq != NULL) { bic->bfqq[BLK_RW_ASYNC] = new_bfqq; bfq_log_bfqq(bfqd, bfqq, "check_ioprio_change: bfqq %p %d", @@ -3142,7 +3014,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) } bfqq = bic->bfqq[BLK_RW_SYNC]; - if (bfqq) + if (bfqq != NULL) bfq_set_next_ioprio_data(bfqq, bic); out: @@ -3166,8 +3038,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfq_class_idle(bfqq)) bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); + } bfq_mark_bfqq_IO_bound(bfqq); /* Tentative initial value to trade off between thr and lat */ @@ -3184,19 +3055,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, } static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, + struct bfq_group *bfqg, + int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask) { - struct bfq_group *bfqg; struct bfq_queue *bfqq, *new_bfqq = NULL; - struct blkcg *blkcg; retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - bfqg = bfq_find_alloc_group(bfqd, blkcg); /* bic always exists here */ bfqq = bic_to_bfqq(bic, is_sync); @@ -3204,19 +3070,18 @@ retry: * Always try a new alloc if we fall back to the OOM bfqq * originally, since it should just be a temporary situation. */ - if (!bfqq || bfqq == &bfqd->oom_bfqq) { + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { bfqq = NULL; - if (new_bfqq) { + if (new_bfqq != NULL) { bfqq = new_bfqq; new_bfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); spin_unlock_irq(bfqd->queue->queue_lock); new_bfqq = kmem_cache_alloc_node(bfq_pool, gfp_mask | __GFP_ZERO, bfqd->queue->node); spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq) + if (new_bfqq != NULL) goto retry; } else { bfqq = kmem_cache_alloc_node(bfq_pool, @@ -3224,7 +3089,7 @@ retry: bfqd->queue->node); } - if (bfqq) { + if (bfqq != NULL) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, is_sync); bfq_init_entity(&bfqq->entity, bfqg); @@ -3235,11 +3100,9 @@ retry: } } - if (new_bfqq) + if (new_bfqq != NULL) kmem_cache_free(bfq_pool, new_bfqq); - rcu_read_unlock(); - return bfqq; } @@ -3263,7 +3126,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, + struct bfq_group *bfqg, int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); @@ -3272,26 +3135,19 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq = NULL; if (!is_sync) { - struct blkcg *blkcg; - struct bfq_group *bfqg; - - rcu_read_lock(); - blkcg = bio_blkcg(bio); - rcu_read_unlock(); - bfqg = bfq_find_alloc_group(bfqd, blkcg); async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; } - if (!bfqq) - bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); + if (bfqq == NULL) + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); /* * Pin the queue now that it's allocated, scheduler exit will * prune it. */ - if (!is_sync && !(*async_bfqq)) { + if (!is_sync && *async_bfqq == NULL) { atomic_inc(&bfqq->ref); bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", bfqq, atomic_read(&bfqq->ref)); @@ -3424,9 +3280,9 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { - bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && - blk_rq_sectors(rq) < 32; - bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); + int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + int budget_timeout = bfq_bfqq_budget_timeout(bfqq); /* * There is just this request queued: if the request @@ -3453,9 +3309,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif /* * The queue is not empty, because a new request just @@ -3465,8 +3318,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * See [1] for more details. */ if (budget_timeout) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); /* * Let the request rip immediately, or let a new queue be @@ -3490,7 +3342,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) */ if (!in_interrupt()) { new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); - if (new_bfqq) { + if (new_bfqq != NULL) { if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); /* @@ -3518,7 +3370,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) * from assigning it a full weight-raising period. See the detailed * comments about this field in bfq_init_icq(). */ - if (bfqq->bic) + if (bfqq->bic != NULL) bfqq->bic->wr_time_left = 0; rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); @@ -3566,11 +3418,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_completion(bfqq_group(bfqq), - rq_start_time_ns(rq), - rq_io_start_time_ns(rq), rq->cmd_flags); -#endif if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { bfq_weights_tree_remove(bfqd, &bfqq->entity, @@ -3615,12 +3462,11 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) bfq_arm_slice_timer(bfqd); goto out; } else if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); else if (RB_EMPTY_ROOT(&bfqq->sort_list) && (bfqq->dispatched == 0 || - !bfq_bfqq_may_idle(bfqq))) - bfq_bfqq_expire(bfqd, bfqq, false, + !bfq_bfqq_must_not_expire(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_NO_MORE_REQUESTS); } @@ -3631,7 +3477,7 @@ out: return; } -static int __bfq_may_queue(struct bfq_queue *bfqq) +static inline int __bfq_may_queue(struct bfq_queue *bfqq) { if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { bfq_clear_bfqq_must_alloc(bfqq); @@ -3655,11 +3501,11 @@ static int bfq_may_queue(struct request_queue *q, int rw) * 'may queue' if that fails. */ bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) + if (bic == NULL) return ELV_MQUEUE_MAY; bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); - if (bfqq) + if (bfqq != NULL) return __bfq_may_queue(bfqq); return ELV_MQUEUE_MAY; @@ -3672,7 +3518,7 @@ static void bfq_put_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); - if (bfqq) { + if (bfqq != NULL) { const int rw = rq_data_dir(rq); BUG_ON(!bfqq->allocated[rw]); @@ -3724,24 +3570,25 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, const int rw = rq_data_dir(rq); const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; + struct bfq_group *bfqg; unsigned long flags; bool split = false; might_sleep_if(gfp_mask & __GFP_WAIT); - bfq_check_ioprio_change(bic, bio); + bfq_check_ioprio_change(bic); spin_lock_irqsave(q->queue_lock, flags); - if (!bic) + if (bic == NULL) goto queue_fail; - bfq_bic_update_cgroup(bic, bio); + bfqg = bfq_bic_update_cgroup(bic); new_queue: bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { if ((bic->was_in_burst_list && bfqd->large_burst) || @@ -3837,7 +3684,7 @@ static void bfq_idle_slice_timer(unsigned long data) * the in-service queue. This can hardly happen, but in the worst * case we just expire a queue too early. */ - if (bfqq) { + if (bfqq != NULL) { bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); if (bfq_bfqq_budget_timeout(bfqq)) /* @@ -3857,7 +3704,7 @@ static void bfq_idle_slice_timer(unsigned long data) else goto schedule_dispatch; - bfq_bfqq_expire(bfqd, bfqq, true, reason); + bfq_bfqq_expire(bfqd, bfqq, 1, reason); } schedule_dispatch: @@ -3872,14 +3719,14 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) cancel_work_sync(&bfqd->unplug_work); } -static void __bfq_put_async_bfqq(struct bfq_data *bfqd, +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { struct bfq_group *root_group = bfqd->root_group; struct bfq_queue *bfqq = *bfqq_ptr; bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq) { + if (bfqq != NULL) { bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", bfqq, atomic_read(&bfqq->ref)); @@ -3915,7 +3762,7 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_lock_irq(q->queue_lock); - BUG_ON(bfqd->in_service_queue); + BUG_ON(bfqd->in_service_queue != NULL); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) bfq_deactivate_bfqq(bfqd, bfqq, 0); @@ -3928,39 +3775,22 @@ static void bfq_exit_queue(struct elevator_queue *e) BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); -#endif - + bfq_free_root_group(bfqd); kfree(bfqd); } -static void bfq_init_root_group(struct bfq_group *root_group, - struct bfq_data *bfqd) -{ - int i; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -#endif - root_group->rq_pos_tree = RB_ROOT; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -} - static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { + struct bfq_group *bfqg; struct bfq_data *bfqd; struct elevator_queue *eq; eq = elevator_alloc(q, e); - if (!eq) + if (eq == NULL) return -ENOMEM; bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); - if (!bfqd) { + if (bfqd == NULL) { kobject_put(&eq->kobj); return -ENOMEM; } @@ -3973,16 +3803,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) */ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); atomic_inc(&bfqd->oom_bfqq.ref); - bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE; bfqd->oom_bfqq.entity.new_weight = - bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio); /* * Trigger weight initialization, according to ioprio, at the * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio * class won't be changed any more. */ - bfqd->oom_bfqq.entity.prio_changed = 1; + bfqd->oom_bfqq.entity.ioprio_changed = 1; bfqd->queue = q; @@ -3990,12 +3820,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) q->elevator = eq; spin_unlock_irq(q->queue_lock); - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); - if (!bfqd->root_group) - goto out_free; - bfq_init_root_group(bfqd->root_group, bfqd); + bfqg = bfq_alloc_root_group(bfqd, q->node); + if (bfqg == NULL) { + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; + } + + bfqd->root_group = bfqg; bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO bfqd->active_numerous_groups = 0; #endif @@ -4003,6 +3837,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->idle_slice_timer.function = bfq_idle_slice_timer; bfqd->idle_slice_timer.data = (unsigned long)bfqd; + bfqd->rq_pos_tree = RB_ROOT; bfqd->queue_weights_tree = RB_ROOT; bfqd->group_weights_tree = RB_ROOT; @@ -4060,23 +3895,18 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->device_speed = BFQ_BFQD_FAST; return 0; - -out_free: - kfree(bfqd); - kobject_put(&eq->kobj); - return -ENOMEM; } static void bfq_slab_kill(void) { - if (bfq_pool) + if (bfq_pool != NULL) kmem_cache_destroy(bfq_pool); } static int __init bfq_slab_setup(void) { bfq_pool = KMEM_CACHE(bfq_queue, 0); - if (!bfq_pool) + if (bfq_pool == NULL) return -ENOMEM; return 0; } @@ -4221,7 +4051,7 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, return count; } -static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) { u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); @@ -4315,9 +4145,6 @@ static struct elevator_type iosched_bfq = { .elevator_merge_fn = bfq_merge, .elevator_merged_fn = bfq_merged_request, .elevator_merge_req_fn = bfq_merged_requests, -#ifdef CONFIG_BFQ_GROUP_IOSCHED - .elevator_bio_merged_fn = bfq_bio_merged, -#endif .elevator_allow_merge_fn = bfq_allow_merge, .elevator_dispatch_fn = bfq_dispatch_requests, .elevator_add_req_fn = bfq_insert_request, @@ -4343,8 +4170,6 @@ static struct elevator_type iosched_bfq = { static int __init bfq_init(void) { - int ret; - /* * Can be 0 on HZ < 1000 setups. */ @@ -4354,15 +4179,8 @@ static int __init bfq_init(void) if (bfq_timeout_async == 0) bfq_timeout_async = 1; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -#endif - - ret = -ENOMEM; if (bfq_slab_setup()) - goto err_pol_unreg; + return -ENOMEM; /* * Times to load large popular applications for the typical systems @@ -4381,27 +4199,15 @@ static int __init bfq_init(void) device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; - ret = elv_register(&iosched_bfq); - if (ret) - goto err_pol_unreg; - - pr_info("BFQ I/O-scheduler: v7r9"); + elv_register(&iosched_bfq); + pr_info("BFQ I/O-scheduler: v7r8"); return 0; - -err_pol_unreg: -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_bfq); -#endif - return ret; } static void __exit bfq_exit(void) { elv_unregister(&iosched_bfq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_bfq); -#endif bfq_slab_kill(); } diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 9328a1f09..d0890c6d4 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -10,27 +10,24 @@ * Copyright (C) 2010 Paolo Valente */ -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO #define for_each_entity(entity) \ - for (; entity ; entity = entity->parent) + for (; entity != NULL; entity = entity->parent) #define for_each_entity_safe(entity, parent) \ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) - static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, int extract, struct bfq_data *bfqd); -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - -static void bfq_update_budget(struct bfq_entity *next_in_service) +static inline void bfq_update_budget(struct bfq_entity *next_in_service) { struct bfq_entity *bfqg_entity; struct bfq_group *bfqg; struct bfq_sched_data *group_sd; - BUG_ON(!next_in_service); + BUG_ON(next_in_service == NULL); group_sd = next_in_service->sched_data; @@ -41,7 +38,7 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) * as it must never become an in-service entity. */ bfqg_entity = bfqg->my_entity; - if (bfqg_entity) + if (bfqg_entity != NULL) bfqg_entity->budget = next_in_service->budget; } @@ -49,7 +46,7 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) { struct bfq_entity *next_in_service; - if (sd->in_service_entity) + if (sd->in_service_entity != NULL) /* will update/requeue at the end of service */ return 0; @@ -63,35 +60,35 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) next_in_service = bfq_lookup_next_entity(sd, 0, NULL); sd->next_in_service = next_in_service; - if (next_in_service) + if (next_in_service != NULL) bfq_update_budget(next_in_service); return 1; } -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) { BUG_ON(sd->next_in_service != entity); } #else #define for_each_entity(entity) \ - for (; entity ; entity = NULL) + for (; entity != NULL; entity = NULL) #define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity ; entity = parent) + for (parent = NULL; entity != NULL; entity = parent) -static int bfq_update_next_in_service(struct bfq_sched_data *sd) +static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) { return 0; } -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) { } -static void bfq_update_budget(struct bfq_entity *next_in_service) +static inline void bfq_update_budget(struct bfq_entity *next_in_service) { } #endif @@ -112,18 +109,18 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) * * Return @a > @b, dealing with wrapping correctly. */ -static int bfq_gt(u64 a, u64 b) +static inline int bfq_gt(u64 a, u64 b) { return (s64)(a - b) > 0; } -static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) { struct bfq_queue *bfqq = NULL; - BUG_ON(!entity); + BUG_ON(entity == NULL); - if (!entity->my_sched_data) + if (entity->my_sched_data == NULL) bfqq = container_of(entity, struct bfq_queue, entity); return bfqq; @@ -135,7 +132,8 @@ static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) * @service: amount of service. * @weight: scale factor (weight of an entity or weight sum). */ -static u64 bfq_delta(unsigned long service, unsigned long weight) +static inline u64 bfq_delta(unsigned long service, + unsigned long weight) { u64 d = (u64)service << WFQ_SERVICE_SHIFT; @@ -148,7 +146,8 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) * @entity: the entity to act upon. * @service: the service to be charged to the entity. */ -static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) +static inline void bfq_calc_finish(struct bfq_entity *entity, + unsigned long service) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); @@ -157,7 +156,7 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) entity->finish = entity->start + bfq_delta(service, entity->weight); - if (bfqq) { + if (bfqq != NULL) { bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: serv %lu, w %d", service, entity->weight); @@ -177,11 +176,11 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) * conversion mechanism because, e.g., in the tree walking functions, * the check for a %NULL value would be redundant. */ -static struct bfq_entity *bfq_entity_of(struct rb_node *node) +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) { struct bfq_entity *entity = NULL; - if (node) + if (node != NULL) entity = rb_entry(node, struct bfq_entity, rb_node); return entity; @@ -192,7 +191,8 @@ static struct bfq_entity *bfq_entity_of(struct rb_node *node) * @root: the tree root. * @entity: the entity to remove. */ -static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) +static inline void bfq_extract(struct rb_root *root, + struct bfq_entity *entity) { BUG_ON(entity->tree != root); @@ -225,7 +225,7 @@ static void bfq_idle_extract(struct bfq_service_tree *st, bfq_extract(&st->idle, entity); - if (bfqq) + if (bfqq != NULL) list_del(&bfqq->bfqq_list); } @@ -243,9 +243,9 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) struct rb_node **node = &root->rb_node; struct rb_node *parent = NULL; - BUG_ON(entity->tree); + BUG_ON(entity->tree != NULL); - while (*node) { + while (*node != NULL) { parent = *node; entry = rb_entry(parent, struct bfq_entity, rb_node); @@ -271,11 +271,12 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) * that the subtree rooted at @node (which may be its left or its right * child) has a valid min_start value. */ -static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) +static inline void bfq_update_min(struct bfq_entity *entity, + struct rb_node *node) { struct bfq_entity *child; - if (node) { + if (node != NULL) { child = rb_entry(node, struct bfq_entity, rb_node); if (bfq_gt(entity->min_start, child->min_start)) entity->min_start = child->min_start; @@ -290,7 +291,7 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) * this function updates its min_start value. The left and right subtrees * are assumed to hold a correct min_start value. */ -static void bfq_update_active_node(struct rb_node *node) +static inline void bfq_update_active_node(struct rb_node *node) { struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); @@ -317,12 +318,12 @@ up: bfq_update_active_node(node); parent = rb_parent(node); - if (!parent) + if (parent == NULL) return; - if (node == parent->rb_left && parent->rb_right) + if (node == parent->rb_left && parent->rb_right != NULL) bfq_update_active_node(parent->rb_right); - else if (parent->rb_left) + else if (parent->rb_left != NULL) bfq_update_active_node(parent->rb_left); node = parent; @@ -354,7 +355,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO struct bfq_sched_data *sd = NULL; struct bfq_group *bfqg = NULL; struct bfq_data *bfqd = NULL; @@ -362,22 +363,22 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_insert(&st->active, entity); - if (node->rb_left) + if (node->rb_left != NULL) node = node->rb_left; - else if (node->rb_right) + else if (node->rb_right != NULL) node = node->rb_right; bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO sd = entity->sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); BUG_ON(!bfqg); bfqd = (struct bfq_data *)bfqg->bfqd; #endif - if (bfqq) + if (bfqq != NULL) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO else { /* bfq_group */ BUG_ON(!bfqd); bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); @@ -396,32 +397,31 @@ static void bfq_active_insert(struct bfq_service_tree *st, * bfq_ioprio_to_weight - calc a weight from an ioprio. * @ioprio: the ioprio value to convert. */ -static unsigned short bfq_ioprio_to_weight(int ioprio) +static inline unsigned short bfq_ioprio_to_weight(int ioprio) { BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; + return IOPRIO_BE_NR - ioprio; } /** * bfq_weight_to_ioprio - calc an ioprio from a weight. * @weight: the weight value to convert. * - * To preserve as much as possible the old only-ioprio user interface, + * To preserve as mush as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_BE_NR */ -static unsigned short bfq_weight_to_ioprio(int weight) +static inline unsigned short bfq_weight_to_ioprio(int weight) { BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? - 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; + return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; } -static void bfq_get_entity(struct bfq_entity *entity) +static inline void bfq_get_entity(struct bfq_entity *entity) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) { + if (bfqq != NULL) { atomic_inc(&bfqq->ref); bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", bfqq, atomic_read(&bfqq->ref)); @@ -441,15 +441,15 @@ static struct rb_node *bfq_find_deepest(struct rb_node *node) { struct rb_node *deepest; - if (!node->rb_right && !node->rb_left) + if (node->rb_right == NULL && node->rb_left == NULL) deepest = rb_parent(node); - else if (!node->rb_right) + else if (node->rb_right == NULL) deepest = node->rb_left; - else if (!node->rb_left) + else if (node->rb_left == NULL) deepest = node->rb_right; else { deepest = rb_next(node); - if (deepest->rb_right) + if (deepest->rb_right != NULL) deepest = deepest->rb_right; else if (rb_parent(deepest) != node) deepest = rb_parent(deepest); @@ -468,7 +468,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO struct bfq_sched_data *sd = NULL; struct bfq_group *bfqg = NULL; struct bfq_data *bfqd = NULL; @@ -477,18 +477,18 @@ static void bfq_active_extract(struct bfq_service_tree *st, node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); - if (node) + if (node != NULL) bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO sd = entity->sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); BUG_ON(!bfqg); bfqd = (struct bfq_data *)bfqg->bfqd; #endif - if (bfqq) + if (bfqq != NULL) list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO else { /* bfq_group */ BUG_ON(!bfqd); bfq_weights_tree_remove(bfqd, entity, @@ -519,14 +519,14 @@ static void bfq_idle_insert(struct bfq_service_tree *st, struct bfq_entity *first_idle = st->first_idle; struct bfq_entity *last_idle = st->last_idle; - if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) st->first_idle = entity; - if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) st->last_idle = entity; bfq_insert(&st->idle, entity); - if (bfqq) + if (bfqq != NULL) list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); } @@ -549,7 +549,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, entity->on_st = 0; st->wsum -= entity->weight; - if (bfqq) { + if (bfqq != NULL) { sd = entity->sched_data; bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", bfqq, atomic_read(&bfqq->ref)); @@ -581,7 +581,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st) struct bfq_entity *first_idle = st->first_idle; struct bfq_entity *last_idle = st->last_idle; - if (RB_EMPTY_ROOT(&st->active) && last_idle && + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && !bfq_gt(last_idle->finish, st->vtime)) { /* * Forget the whole idle tree, increasing the vtime past @@ -590,7 +590,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st) st->vtime = last_idle->finish; } - if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) bfq_put_idle_entity(st, first_idle); } @@ -600,19 +600,19 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, { struct bfq_service_tree *new_st = old_st; - if (entity->prio_changed) { + if (entity->ioprio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned short prev_weight, new_weight; struct bfq_data *bfqd = NULL; struct rb_root *root; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO struct bfq_sched_data *sd; struct bfq_group *bfqg; #endif - if (bfqq) + if (bfqq != NULL) bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO else { sd = entity->my_sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); @@ -634,14 +634,12 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, BUG(); } entity->orig_weight = entity->new_weight; - if (bfqq) - bfqq->ioprio = - bfq_weight_to_ioprio(entity->orig_weight); + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); } - if (bfqq) - bfqq->ioprio_class = bfqq->new_ioprio_class; - entity->prio_changed = 0; + entity->ioprio_class = entity->new_ioprio_class; + entity->ioprio_changed = 0; /* * NOTE: here we may be changing the weight too early, @@ -654,7 +652,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, prev_weight = entity->weight; new_weight = entity->orig_weight * - (bfqq ? bfqq->wr_coeff : 1); + (bfqq != NULL ? bfqq->wr_coeff : 1); /* * If the weight of the entity changes, remove the entity * from its old weight counter (if there is a counter @@ -685,10 +683,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, return new_st; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -#endif - /** * bfq_bfqq_served - update the scheduler status after selection for * service. @@ -699,7 +693,7 @@ static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); * are synchronized every time a new bfqq is selected for service. By now, * we keep it to better check consistency. */ -static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) { struct bfq_entity *entity = &bfqq->entity; struct bfq_service_tree *st; @@ -714,10 +708,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st->vtime += bfq_delta(served, st->wsum); bfq_forget_idle(st); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -#endif - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); } /** @@ -730,7 +721,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) * budget. In this way we should obtain a sort of time-domain * fairness among all the seeky/slow queues. */ -static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; @@ -755,7 +746,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity) struct bfq_service_tree *st = bfq_entity_service_tree(entity); if (entity == sd->in_service_entity) { - BUG_ON(entity->tree); + BUG_ON(entity->tree != NULL); /* * If we are requeueing the current entity we have * to take care of not charging to it service it has @@ -846,7 +837,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) if (!entity->on_st) return 0; - BUG_ON(was_in_service && entity->tree); + BUG_ON(was_in_service && entity->tree != NULL); if (was_in_service) { bfq_calc_finish(entity, entity->service); @@ -855,7 +846,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) bfq_active_extract(st, entity); else if (entity->tree == &st->idle) bfq_idle_extract(st, entity); - else if (entity->tree) + else if (entity->tree != NULL) BUG(); if (was_in_service || sd->next_in_service == entity) @@ -893,7 +884,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) */ break; - if (sd->next_in_service) + if (sd->next_in_service != NULL) /* * The parent entity is still backlogged and * the budgets on the path towards the root @@ -962,7 +953,7 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) struct bfq_entity *entry, *first = NULL; struct rb_node *node = st->active.rb_node; - while (node) { + while (node != NULL) { entry = rb_entry(node, struct bfq_entity, rb_node); left: if (!bfq_gt(entry->start, st->vtime)) @@ -970,7 +961,7 @@ left: BUG_ON(bfq_gt(entry->min_start, st->vtime)); - if (node->rb_left) { + if (node->rb_left != NULL) { entry = rb_entry(node->rb_left, struct bfq_entity, rb_node); if (!bfq_gt(entry->min_start, st->vtime)) { @@ -978,12 +969,12 @@ left: goto left; } } - if (first) + if (first != NULL) break; node = node->rb_right; } - BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); return first; } @@ -1039,13 +1030,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, struct bfq_entity *entity; int i = 0; - BUG_ON(sd->in_service_entity); + BUG_ON(sd->in_service_entity != NULL); - if (bfqd && + if (bfqd != NULL && jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); - if (entity) { + if (entity != NULL) { i = BFQ_IOPRIO_CLASSES - 1; bfqd->bfq_class_idle_last_service = jiffies; sd->next_in_service = entity; @@ -1053,7 +1044,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, } for (; i < BFQ_IOPRIO_CLASSES; i++) { entity = __bfq_lookup_next_entity(st + i, false); - if (entity) { + if (entity != NULL) { if (extract) { bfq_check_next_in_service(sd, entity); bfq_active_extract(st + i, entity); @@ -1076,27 +1067,27 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) struct bfq_sched_data *sd; struct bfq_queue *bfqq; - BUG_ON(bfqd->in_service_queue); + BUG_ON(bfqd->in_service_queue != NULL); if (bfqd->busy_queues == 0) return NULL; sd = &bfqd->root_group->sched_data; - for (; sd ; sd = entity->my_sched_data) { + for (; sd != NULL; sd = entity->my_sched_data) { entity = bfq_lookup_next_entity(sd, 1, bfqd); - BUG_ON(!entity); + BUG_ON(entity == NULL); entity->service = 0; } bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(!bfqq); + BUG_ON(bfqq == NULL); return bfqq; } static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) { - if (bfqd->in_service_bic) { + if (bfqd->in_service_bic != NULL) { put_io_context(bfqd->in_service_bic->icq.ioc); bfqd->in_service_bic = NULL; } @@ -1123,10 +1114,6 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_activate_entity(entity); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -#endif - /* * Called when the bfqq no longer has requests pending, remove it from * the service tree. @@ -1160,10 +1147,6 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dequeue(bfqq_group(bfqq)); -#endif - bfq_deactivate_bfqq(bfqd, bfqq, requeue); } diff --git a/block/bfq.h b/block/bfq.h index 320c4389b..93d3f6e95 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v7r9 for 4.2.0: data structures and common functions prototypes. + * BFQ-v7r8 for 4.3.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -17,14 +17,12 @@ #include #include #include -#include #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) -#define BFQ_MIN_WEIGHT 1 -#define BFQ_MAX_WEIGHT 1000 -#define BFQ_WEIGHT_CONVERSION_COEFF 10 +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 #define BFQ_DEFAULT_QUEUE_IOPRIO 4 @@ -119,8 +117,12 @@ struct bfq_weight_counter { * @ioprio: the ioprio in use. * @new_weight: when a weight change is requested, the new weight value. * @orig_weight: original weight, used to implement weight boosting - * @prio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. * * A bfq_entity is used to represent either a bfq_queue (leaf node in the * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each @@ -132,7 +134,7 @@ struct bfq_weight_counter { * allow different weights on different devices, but this * functionality is not exported to userspace by now. Priorities and * weights are updated lazily, first storing the new values into the - * new_* fields, then setting the @prio_changed flag. As soon as + * new_* fields, then setting the @ioprio_changed flag. As soon as * there is a transition in the entity state that allows the priority * update to take place the effective and the requested priority * values are synchronized. @@ -159,7 +161,7 @@ struct bfq_entity { u64 min_start; - int service, budget; + unsigned long service, budget; unsigned short weight, new_weight; unsigned short orig_weight; @@ -168,7 +170,10 @@ struct bfq_entity { struct bfq_sched_data *my_sched_data; struct bfq_sched_data *sched_data; - int prio_changed; + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + + int ioprio_changed; }; struct bfq_group; @@ -177,14 +182,10 @@ struct bfq_group; * struct bfq_queue - leaf schedulable entity. * @ref: reference counter. * @bfqd: parent bfq_data. - * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. * @new_bfqq: shared bfq_queue if queue is cooperating with * one or more other queues. - * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). * @sort_list: sorted list of pending requests. * @next_rq: if fifo isn't expired, next request to serve. * @queued: nr of requests queued in @sort_list. @@ -238,9 +239,6 @@ struct bfq_queue { atomic_t ref; struct bfq_data *bfqd; - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; - /* fields for cooperating queues handling */ struct bfq_queue *new_bfqq; struct rb_node pos_node; @@ -255,7 +253,7 @@ struct bfq_queue { struct bfq_entity entity; - int max_budget; + unsigned long max_budget; unsigned long budget_timeout; int dispatched; @@ -304,8 +302,6 @@ struct bfq_ttime { * @icq: associated io_cq structure * @bfqq: array of two process queues, the sync and the async * @ttime: associated @bfq_ttime struct - * @ioprio: per (request_queue, blkcg) ioprio. - * @blkcg_id: id of the blkcg the related io_cq belongs to. * @wr_time_left: snapshot of the time left before weight raising ends * for the sync queue associated to this process; this * snapshot is taken to remember this value while the weight @@ -333,10 +329,6 @@ struct bfq_io_cq { struct bfq_ttime ttime; int ioprio; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ -#endif - unsigned int wr_time_left; bool saved_idle_window; bool saved_IO_bound; @@ -357,6 +349,9 @@ enum bfq_device_speed { * struct bfq_data - per device data structure. * @queue: request queue for the managed device. * @root_group: root bfq_group for the device. + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_close_cooperator()). * @active_numerous_groups: number of bfq_groups containing more than one * active @bfq_entity. * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by @@ -490,8 +485,9 @@ struct bfq_data { struct request_queue *queue; struct bfq_group *root_group; + struct rb_root rq_pos_tree; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO int active_numerous_groups; #endif @@ -524,7 +520,7 @@ struct bfq_data { ktime_t last_idling_start; int peak_rate_samples; u64 peak_rate; - int bfq_max_budget; + unsigned long bfq_max_budget; struct hlist_head group_list; struct list_head active_list; @@ -536,8 +532,8 @@ struct bfq_data { unsigned int bfq_slice_idle; u64 bfq_class_idle_last_service; - int bfq_user_max_budget; - int bfq_max_budget_async_rq; + unsigned int bfq_user_max_budget; + unsigned int bfq_max_budget_async_rq; unsigned int bfq_timeout[2]; unsigned int bfq_coop_thresh; @@ -597,15 +593,15 @@ enum bfqq_state_flags { }; #define BFQ_BFQQ_FNS(name) \ -static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ { \ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ } \ -static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ { \ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ } \ -static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ { \ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ } @@ -644,64 +640,14 @@ enum bfqq_expiration { BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ }; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -struct bfqg_stats { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ - struct blkg_rwstat service_time; - /* total time spent waiting in scheduler queue in ns */ - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ - struct blkg_stat avg_queue_size_samples; - /* how many times this group has been removed from service tree */ - struct blkg_stat dequeue; - /* total time spent waiting for it to be assigned a timeslice. */ - struct blkg_stat group_wait_time; - /* time spent idling for this blkcg_gq */ - struct blkg_stat idle_time; - /* total time with empty current active q with other requests queued */ - struct blkg_stat empty_time; - /* fields after this shouldn't be cleared on stat reset */ - uint64_t start_group_wait_time; - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; -}; - -/* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. - * - * @ps: @blkcg_policy_storage that this structure inherits - * @weight: weight of the bfq_group - */ -struct bfq_group_data { - /* must be the first member */ - struct blkcg_policy_data pd; - - unsigned short weight; -}; - +#ifdef CONFIG_CGROUP_BFQIO /** * struct bfq_group - per (device, cgroup) data structure. * @entity: schedulable entity to insert into the parent group sched_data. * @sched_data: own sched_data, to contain child entities (they may be * both bfq_queues and bfq_groups). + * @group_node: node to be inserted into the bfqio_cgroup->group_data + * list of the containing cgroup's bfqio_cgroup. * @bfqd_node: node to be inserted into the @bfqd->group_list list * of the groups active on the same device; used for cleanup. * @bfqd: the bfq_data for the device this group acts upon. @@ -717,26 +663,23 @@ struct bfq_group_data { * are groups with more than one active @bfq_entity * (see the comments to the function * bfq_bfqq_must_not_expire()). - * @rq_pos_tree: rbtree sorted by next_request position, used when - * determining if two or more queues have interleaving - * requests (see bfq_find_close_cooperator()). * * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup * there is a set of bfq_groups, each one collecting the lower-level * entities belonging to the group that are acting on the same device. * * Locking works as follows: + * o @group_node is protected by the bfqio_cgroup lock, and is accessed + * via RCU from its readers. * o @bfqd is protected by the queue lock, RCU is used to access it * from the readers. * o All the other fields are protected by the @bfqd queue lock. */ struct bfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - struct bfq_entity entity; struct bfq_sched_data sched_data; + struct hlist_node group_node; struct hlist_node bfqd_node; void *bfqd; @@ -747,33 +690,44 @@ struct bfq_group { struct bfq_entity *my_entity; int active_entities; +}; - struct rb_root rq_pos_tree; +/** + * struct bfqio_cgroup - bfq cgroup data structure. + * @css: subsystem state for bfq in the containing cgroup. + * @online: flag marked when the subsystem is inserted. + * @weight: cgroup weight. + * @ioprio: cgroup ioprio. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. + * @group_data: list containing the bfq_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @ioprio and @ioprio_class are protected by @lock. + */ +struct bfqio_cgroup { + struct cgroup_subsys_state css; + bool online; - struct bfqg_stats stats; - struct bfqg_stats dead_stats; /* stats pushed from dead children */ -}; + unsigned short weight, ioprio, ioprio_class; + spinlock_t lock; + struct hlist_head group_data; +}; #else struct bfq_group { struct bfq_sched_data sched_data; struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; struct bfq_queue *async_idle_bfqq; - - struct rb_root rq_pos_tree; }; #endif -static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - -static struct bfq_service_tree * +static inline struct bfq_service_tree * bfq_entity_service_tree(struct bfq_entity *entity) { struct bfq_sched_data *sched_data = entity->sched_data; - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS; + unsigned int idx = entity->ioprio_class - 1; BUG_ON(idx >= BFQ_IOPRIO_CLASSES); BUG_ON(sched_data == NULL); @@ -781,18 +735,19 @@ bfq_entity_service_tree(struct bfq_entity *entity) return sched_data->service_tree + idx; } -static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, + bool is_sync) { return bic->bfqq[is_sync]; } -static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, - bool is_sync) +static inline void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, bool is_sync) { bic->bfqq[is_sync] = bfqq; } -static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) { return bic->icq.q->elevator->elevator_data; } @@ -811,7 +766,8 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) * the function returns NULL, with the queue unlocked, otherwise it * returns the dereferenced pointer, with the queue locked. */ -static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, + unsigned long *flags) { struct bfq_data *bfqd; @@ -820,9 +776,7 @@ static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) if (bfqd != NULL) { spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (ptr == NULL) - printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); - else if (*ptr == bfqd) + if (*ptr == bfqd) goto out; spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); } @@ -833,37 +787,17 @@ out: return bfqd; } -static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, + unsigned long *flags) { spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - - return container_of(group_entity, struct bfq_group, entity); -} - -#else - -static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - return bfqq->bfqd->root_group; -} - -#endif - -static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); +static void bfq_check_ioprio_change(struct bfq_io_cq *bic); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, + struct bfq_group *bfqg, int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask); static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 719b7152a..14b8faf8b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -140,6 +140,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; + if (bip->bip_vcnt && + bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + &bip->bip_vec[bip->bip_vcnt - 1], offset)) + return 0; + iv->bv_page = page; iv->bv_len = len; iv->bv_offset = offset; @@ -355,13 +360,12 @@ static void bio_integrity_verify_fn(struct work_struct *work) container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - int error; - error = bio_integrity_process(bio, bi->verify_fn); + bio->bi_error = bio_integrity_process(bio, bi->verify_fn); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio(bio); } /** @@ -376,7 +380,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) * in process context. This function postpones completion * accordingly. */ -void bio_integrity_endio(struct bio *bio, int error) +void bio_integrity_endio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -386,9 +390,9 @@ void bio_integrity_endio(struct bio *bio, int error) * integrity metadata. Restore original bio end_io handler * and run it. */ - if (error) { + if (bio->bi_error) { bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio(bio); return; } diff --git a/block/bio.c b/block/bio.c index d6e5ba339..ad3f276d7 100644 --- a/block/bio.c +++ b/block/bio.c @@ -269,7 +269,6 @@ static void bio_free(struct bio *bio) void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); - bio->bi_flags = 1 << BIO_UPTODATE; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); } @@ -292,14 +291,17 @@ void bio_reset(struct bio *bio) __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags | (1 << BIO_UPTODATE); + bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); -static void bio_chain_endio(struct bio *bio, int error) +static void bio_chain_endio(struct bio *bio) { - bio_endio(bio->bi_private, error); + struct bio *parent = bio->bi_private; + + parent->bi_error = bio->bi_error; + bio_endio(parent); bio_put(bio); } @@ -309,7 +311,7 @@ static void bio_chain_endio(struct bio *bio, int error) */ static inline void bio_inc_remaining(struct bio *bio) { - bio->bi_flags |= (1 << BIO_CHAIN); + bio_set_flag(bio, BIO_CHAIN); smp_mb__before_atomic(); atomic_inc(&bio->__bi_remaining); } @@ -493,7 +495,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= 1 << BIO_OWNS_VEC; + bio_set_flag(bio, BIO_OWNS_VEC); } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } @@ -578,7 +580,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) * so we don't set nor calculate new physical/hw segment counts here */ bio->bi_bdev = bio_src->bi_bdev; - bio->bi_flags |= 1 << BIO_CLONED; + bio_set_flag(bio, BIO_CLONED); bio->bi_rw = bio_src->bi_rw; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; @@ -692,31 +694,22 @@ integrity_clone: EXPORT_SYMBOL(bio_clone_bioset); /** - * bio_get_nr_vecs - return approx number of vecs - * @bdev: I/O target + * bio_add_pc_page - attempt to add page to bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset * - * Return the approximate number of pages we can send to this target. - * There's no guarantee that you will be able to fit this number of pages - * into a bio, it does not account for dynamic restrictions that vary - * on offset. + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. */ -int bio_get_nr_vecs(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = min_t(unsigned, - queue_max_segments(q), - queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); - - return min_t(unsigned, nr_pages, BIO_MAX_PAGES); - -} -EXPORT_SYMBOL(bio_get_nr_vecs); - -static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset, - unsigned int max_sectors) +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page + *page, unsigned int len, unsigned int offset) { int retried_segments = 0; struct bio_vec *bvec; @@ -727,7 +720,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q)) return 0; /* @@ -740,28 +733,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { - unsigned int prev_bv_len = prev->bv_len; prev->bv_len += len; - - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - /* prev_bvec is already charged in - bi_size, discharge it in order to - simulate merging updated prev_bvec - as new bvec. */ - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size - - prev_bv_len, - .bi_rw = bio->bi_rw, - }; - - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { - prev->bv_len -= len; - return 0; - } - } - bio->bi_iter.bi_size += len; goto done; } @@ -770,8 +742,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ - if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && - bvec_gap_to_prev(prev, offset)) + if (bvec_gap_to_prev(q, prev, offset)) return 0; } @@ -804,30 +775,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page blk_recount_segments(q, bio); } - /* - * if queue has other restrictions (eg varying max sector size - * depending on offset), it can specify a merge_bvec_fn in the - * queue to get further control - */ - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size - len, - .bi_rw = bio->bi_rw, - }; - - /* - * merge_bvec_fn() returns number of bytes it can accept - * at this offset - */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) - goto failed; - } - /* If we may be able to merge these biovecs, force a recount */ if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) - bio->bi_flags &= ~(1 << BIO_SEG_VALID); + bio_clear_flag(bio, BIO_SEG_VALID); done: return len; @@ -841,28 +791,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page blk_recount_segments(q, bio); return 0; } - -/** - * bio_add_pc_page - attempt to add page to bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by REQ_PC bios. - */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return __bio_add_page(q, bio, page, len, offset, - queue_max_hw_sectors(q)); -} EXPORT_SYMBOL(bio_add_pc_page); /** @@ -872,22 +800,47 @@ EXPORT_SYMBOL(bio_add_pc_page); * @len: vec entry length * @offset: vec entry offset * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. + * Attempt to add a page to the bio_vec maplist. This will only fail + * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ -int bio_add_page(struct bio *bio, struct page *page, unsigned int len, - unsigned int offset) +int bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int max_sectors; + struct bio_vec *bv; + + /* + * cloned bio must not modify vec list + */ + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; - max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); - if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size) - max_sectors = len >> 9; + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. + */ + if (bio->bi_vcnt > 0) { + bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - return __bio_add_page(q, bio, page, len, offset, max_sectors); + if (page == bv->bv_page && + offset == bv->bv_offset + bv->bv_len) { + bv->bv_len += len; + goto done; + } + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + + bv = &bio->bi_io_vec[bio->bi_vcnt]; + bv->bv_page = page; + bv->bv_len = len; + bv->bv_offset = offset; + + bio->bi_vcnt++; +done: + bio->bi_iter.bi_size += len; + return len; } EXPORT_SYMBOL(bio_add_page); @@ -896,11 +849,11 @@ struct submit_bio_ret { int error; }; -static void submit_bio_wait_endio(struct bio *bio, int error) +static void submit_bio_wait_endio(struct bio *bio) { struct submit_bio_ret *ret = bio->bi_private; - ret->error = error; + ret->error = bio->bi_error; complete(&ret->event); } @@ -1388,7 +1341,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; - bio->bi_flags |= (1 << BIO_USER_MAPPED); + bio_set_flag(bio, BIO_USER_MAPPED); /* * subtle -- if __bio_map_user() ended up bouncing a bio, @@ -1445,7 +1398,7 @@ void bio_unmap_user(struct bio *bio) } EXPORT_SYMBOL(bio_unmap_user); -static void bio_map_kern_endio(struct bio *bio, int err) +static void bio_map_kern_endio(struct bio *bio) { bio_put(bio); } @@ -1501,13 +1454,13 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, } EXPORT_SYMBOL(bio_map_kern); -static void bio_copy_kern_endio(struct bio *bio, int err) +static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); bio_put(bio); } -static void bio_copy_kern_endio_read(struct bio *bio, int err) +static void bio_copy_kern_endio_read(struct bio *bio) { char *p = bio->bi_private; struct bio_vec *bvec; @@ -1518,7 +1471,7 @@ static void bio_copy_kern_endio_read(struct bio *bio, int err) p += bvec->bv_len; } - bio_copy_kern_endio(bio, err); + bio_copy_kern_endio(bio); } /** @@ -1768,7 +1721,7 @@ static inline bool bio_remaining_done(struct bio *bio) BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); if (atomic_dec_and_test(&bio->__bi_remaining)) { - clear_bit(BIO_CHAIN, &bio->bi_flags); + bio_clear_flag(bio, BIO_CHAIN); return true; } @@ -1778,25 +1731,15 @@ static inline bool bio_remaining_done(struct bio *bio) /** * bio_endio - end I/O on a bio * @bio: bio - * @error: error, if any * * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. + * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred + * way to end I/O on a bio. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io function. **/ -void bio_endio(struct bio *bio, int error) +void bio_endio(struct bio *bio) { while (bio) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - if (unlikely(!bio_remaining_done(bio))) break; @@ -1810,11 +1753,12 @@ void bio_endio(struct bio *bio, int error) */ if (bio->bi_end_io == bio_chain_endio) { struct bio *parent = bio->bi_private; + parent->bi_error = bio->bi_error; bio_put(bio); bio = parent; } else { if (bio->bi_end_io) - bio->bi_end_io(bio, error); + bio->bi_end_io(bio); bio = NULL; } } @@ -1882,7 +1826,7 @@ void bio_trim(struct bio *bio, int offset, int size) if (offset == 0 && size == bio->bi_iter.bi_size) return; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); + bio_clear_flag(bio, BIO_SEG_VALID); bio_advance(bio, offset << 9); @@ -2046,7 +1990,7 @@ int bio_associate_current(struct bio *bio) get_io_context_active(ioc); bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, blkio_cgrp_id); + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } EXPORT_SYMBOL_GPL(bio_associate_current); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9cc48d1d7..55512dd62 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "blk.h" @@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blk_exit_rl(&blkg->rl); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; @@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -129,26 +139,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, @@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { - ret = -EINVAL; + ret = -ENODEV; goto err_free_blkg; } wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_ATOMIC); + blkcg->css.id, GFP_NOWAIT); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; @@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; @@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - ret = -EINVAL; + ret = -ENODEV; goto err_put_congested; } blkg_get(blkg->parent); @@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; @@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); + } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -403,15 +386,6 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); @@ -475,12 +449,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -489,13 +465,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -584,9 +561,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -623,31 +601,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. + */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy. + */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg + * + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. */ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; + + if (!pos_blkg->online) + continue; + + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; - if (pos_blkg->online) - sum += blkg_stat_read(stat); + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -657,37 +726,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; - tmp = blkg_rwstat_read(rwstat); + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); @@ -703,29 +778,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; + body = input + key_len; + if (!isspace(*body)) + return -EINVAL; + body = skip_spaces(body); + disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) - return -EINVAL; + return -ENODEV; if (part) { put_disk(disk); - return -EINVAL; + return -ENODEV; } rcu_read_lock(); @@ -734,7 +814,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); @@ -756,7 +836,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); @@ -777,7 +857,54 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + struct cftype blkcg_files[] = { + { + .name = "stat", + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + +struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, @@ -825,18 +952,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + int i; mutex_lock(&blkcg_pol_mutex); + list_del(&blkcg->all_blkcgs_node); - mutex_unlock(&blkcg_pol_mutex); - if (blkcg != &blkcg_root) { - int i; + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->pd[i]); - kfree(blkcg); - } + mutex_unlock(&blkcg_pol_mutex); + + kfree(blkcg); } static struct cgroup_subsys_state * @@ -850,13 +978,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) { blkcg = &blkcg_root; - goto done; - } - - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); - goto free_blkcg; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } for (i = 0; i < BLKCG_MAX_POLS ; i++) { @@ -869,23 +996,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ - if (!pol || !pol->cpd_size) + if (!pol || !pol->cpd_alloc_fn) continue; - BUG_ON(blkcg->pd[i]); - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } - blkcg->pd[i] = cpd; + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(blkcg); + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); } -done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -897,7 +1024,8 @@ done: free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->pd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -941,7 +1069,7 @@ int blkcg_init_queue(struct request_queue *q) radix_tree_preload_end(); if (IS_ERR(blkg)) { - kfree(new_blkg); + blkg_free(new_blkg); return PTR_ERR(blkg); } @@ -1018,12 +1146,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .legacy_cftypes = blkcg_files, + .bind = blkcg_bind, + .dfl_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1033,7 +1184,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1054,65 +1205,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); + struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *nd; - int cnt = 0, ret; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - spin_unlock_irq(q->queue_lock); - - /* allocate per-blkg policy data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds and cpds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, nd, &pds, alloc_node) - kfree(pd); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1142,13 +1282,12 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1170,9 +1309,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); @@ -1189,36 +1325,42 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { mutex_unlock(&blkcg_pol_mutex); goto err_free_cpds; } - blkcg->pd[pol->plid] = cpd; + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; cpd->plid = pol->plid; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, - pol->cftypes)); + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + if (pol->legacy_cftypes) + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, + pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; @@ -1245,16 +1387,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; diff --git a/block/blk-core.c b/block/blk-core.c index d1a63defd..48a6cc8df 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -145,18 +145,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + bio->bi_error = error; if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); + bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -580,7 +578,7 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_destroy(&q->backing_dev_info); + bdi_unregister(&q->backing_dev_info); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); @@ -647,6 +645,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (q->id < 0) goto fail_q; + q->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!q->bio_split) + goto fail_id; + q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; @@ -655,7 +657,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) err = bdi_init(&q->backing_dev_info); if (err) - goto fail_id; + goto fail_split; setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); @@ -697,6 +699,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_bdi: bdi_destroy(&q->backing_dev_info); +fail_split: + bioset_free(q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: @@ -1614,6 +1618,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) struct request *req; unsigned int request_count = 0; + blk_queue_split(q, &bio, q->bio_split); + /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1622,7 +1628,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); + bio->bi_error = -EIO; + bio_endio(bio); return; } @@ -1675,7 +1682,8 @@ get_rq: */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { - bio_endio(bio, PTR_ERR(req)); /* @q is dead */ + bio->bi_error = PTR_ERR(req); + bio_endio(bio); goto out_unlock; } @@ -1834,15 +1842,6 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (likely(bio_is_rw(bio) && - nr_sectors > queue_max_hw_sectors(q))) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - part = bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, @@ -1891,14 +1890,15 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; end_io: - bio_endio(bio, err); + bio->bi_error = err; + bio_endio(bio); return false; } @@ -1991,7 +1991,7 @@ void submit_bio(int rw, struct bio *bio) bio->bi_rw |= rw; if (unlikely(trap_non_toi_io)) - BUG_ON(!(bio->bi_flags & (1 << BIO_TOI))); + BUG_ON(!bio_flagged(bio, BIO_TOI)); /* * If it's a regular read/write or a barrier with data attached, diff --git a/block/blk-integrity.c b/block/blk-integrity.c index f548b64be..75f29cf70 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -204,6 +204,9 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, q->limits.max_integrity_segments) return false; + if (integrity_req_gap_back_merge(req, next->bio)) + return false; + return true; } EXPORT_SYMBOL(blk_integrity_merge_rq); diff --git a/block/blk-lib.c b/block/blk-lib.c index 7688ee3f5..9ebf65379 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -11,16 +11,16 @@ struct bio_batch { atomic_t done; - unsigned long flags; + int error; struct completion *wait; }; -static void bio_batch_end_io(struct bio *bio, int err) +static void bio_batch_end_io(struct bio *bio) { struct bio_batch *bb = bio->bi_private; - if (err && (err != -EOPNOTSUPP)) - clear_bit(BIO_UPTODATE, &bb->flags); + if (bio->bi_error && bio->bi_error != -EOPNOTSUPP) + bb->error = bio->bi_error; if (atomic_dec_and_test(&bb->done)) complete(bb->wait); bio_put(bio); @@ -43,7 +43,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - unsigned int max_discard_sectors, granularity; + unsigned int granularity; int alignment; struct bio_batch bb; struct bio *bio; @@ -60,17 +60,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; - /* - * Ensure that max_discard_sectors is of the proper - * granularity, so that requests stay aligned after a split. - */ - max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - max_discard_sectors -= max_discard_sectors % granularity; - if (unlikely(!max_discard_sectors)) { - /* Avoid infinite loop below. Being cautious never hurts. */ - return -EOPNOTSUPP; - } - if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; @@ -78,7 +67,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; blk_start_plug(&plug); @@ -92,7 +81,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, break; } - req_sects = min_t(sector_t, nr_sects, max_discard_sectors); + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); /* * If splitting a request, and the next starting sector would be @@ -134,9 +124,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - ret = -EIO; - + if (bb.error) + return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); @@ -166,13 +155,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; - max_write_same_sectors = q->limits.max_write_same_sectors; - - if (max_write_same_sectors == 0) - return -EOPNOTSUPP; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ + max_write_same_sectors = UINT_MAX >> 9; atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; while (nr_sects) { @@ -208,9 +195,8 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - ret = -ENOTSUPP; - + if (bb.error) + return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); @@ -236,7 +222,7 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; ret = 0; @@ -270,10 +256,8 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - /* One of bios in the batch was completed with error.*/ - ret = -EIO; - + if (bb.error) + return bb.error; return ret; } diff --git a/block/blk-map.c b/block/blk-map.c index da310a105..f565e11f4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,6 +9,24 @@ #include "blk.h" +static bool iovec_gap_to_prv(struct request_queue *q, + struct iovec *prv, struct iovec *cur) +{ + unsigned long prev_end; + + if (!queue_virt_boundary(q)) + return false; + + if (prv->iov_base == NULL && prv->iov_len == 0) + /* prv is not set - don't check */ + return false; + + prev_end = (unsigned long)(prv->iov_base + prv->iov_len); + + return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || + prev_end & queue_virt_boundary(q)); +} + int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -67,7 +85,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct bio *bio; int unaligned = 0; struct iov_iter i; - struct iovec iov; + struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; if (!iter || !iter->count) return -EINVAL; @@ -81,8 +99,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, /* * Keep going so we check length of all segments */ - if (uaddr & queue_dma_alignment(q)) + if ((uaddr & queue_dma_alignment(q)) || + iovec_gap_to_prv(q, &prv, &iov)) unaligned = 1; + + prv.iov_base = iov.iov_base; + prv.iov_len = iov.iov_len; } if (unaligned || (q->dma_pad_mask & iter->count) || map_data) @@ -94,7 +116,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, return PTR_ERR(bio); if (map_data && map_data->null_mapped) - bio->bi_flags |= (1 << BIO_NULL_MAPPED); + bio_set_flag(bio, BIO_NULL_MAPPED); if (bio->bi_iter.bi_size != iter->count) { /* @@ -103,7 +125,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, * normal IO completion path */ bio_get(bio); - bio_endio(bio, 0); + bio_endio(bio); __blk_rq_unmap_user(bio); return -EINVAL; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 30a0d9f89..c4e9c37f3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -9,12 +9,134 @@ #include "blk.h" +static struct bio *blk_bio_discard_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs) +{ + unsigned int max_discard_sectors, granularity; + int alignment; + sector_t tmp; + unsigned split_sectors; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + + max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); + max_discard_sectors -= max_discard_sectors % granularity; + + if (unlikely(!max_discard_sectors)) { + /* XXX: warn */ + return NULL; + } + + if (bio_sectors(bio) <= max_discard_sectors) + return NULL; + + split_sectors = max_discard_sectors; + + /* + * If the next starting sector would be misaligned, stop the discard at + * the previous aligned sector. + */ + alignment = (q->limits.discard_alignment >> 9) % granularity; + + tmp = bio->bi_iter.bi_sector + split_sectors - alignment; + tmp = sector_div(tmp, granularity); + + if (split_sectors > tmp) + split_sectors -= tmp; + + return bio_split(bio, split_sectors, GFP_NOIO, bs); +} + +static struct bio *blk_bio_write_same_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs) +{ + if (!q->limits.max_write_same_sectors) + return NULL; + + if (bio_sectors(bio) <= q->limits.max_write_same_sectors) + return NULL; + + return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); +} + +static struct bio *blk_bio_segment_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs) +{ + struct bio_vec bv, bvprv, *bvprvp = NULL; + struct bvec_iter iter; + unsigned seg_size = 0, nsegs = 0, sectors = 0; + + bio_for_each_segment(bv, bio, iter) { + if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) + goto split; + + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. + */ + if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) + goto split; + + if (bvprvp && blk_queue_cluster(q)) { + if (seg_size + bv.bv_len > queue_max_segment_size(q)) + goto new_segment; + if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) + goto new_segment; + + seg_size += bv.bv_len; + bvprv = bv; + bvprvp = &bv; + sectors += bv.bv_len >> 9; + continue; + } +new_segment: + if (nsegs == queue_max_segments(q)) + goto split; + + nsegs++; + bvprv = bv; + bvprvp = &bv; + seg_size = bv.bv_len; + sectors += bv.bv_len >> 9; + } + + return NULL; +split: + return bio_split(bio, sectors, GFP_NOIO, bs); +} + +void blk_queue_split(struct request_queue *q, struct bio **bio, + struct bio_set *bs) +{ + struct bio *split; + + if ((*bio)->bi_rw & REQ_DISCARD) + split = blk_bio_discard_split(q, *bio, bs); + else if ((*bio)->bi_rw & REQ_WRITE_SAME) + split = blk_bio_write_same_split(q, *bio, bs); + else + split = blk_bio_segment_split(q, *bio, q->bio_split); + + if (split) { + bio_chain(split, *bio); + generic_make_request(*bio); + *bio = split; + } +} +EXPORT_SYMBOL(blk_queue_split); + static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio, bool no_sg_merge) { struct bio_vec bv, bvprv = { NULL }; - int cluster, high, highprv = 1; + int cluster, prev = 0; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; struct bvec_iter iter; @@ -36,7 +158,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; - high = 0; for_each_bio(bio) { bio_for_each_segment(bv, bio, iter) { /* @@ -46,13 +167,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (no_sg_merge) goto new_segment; - /* - * the trick here is making sure that a high page is - * never considered part of another segment, since - * that might change with the bounce page. - */ - high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); - if (!high && !highprv && cluster) { + if (prev && cluster) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; @@ -72,8 +187,8 @@ new_segment: nr_phys_segs++; bvprv = bv; + prev = 1; seg_size = bv.bv_len; - highprv = high; } bbio = bio; } @@ -116,7 +231,7 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) bio->bi_next = nxt; } - bio->bi_flags |= (1 << BIO_SEG_VALID); + bio_set_flag(bio, BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); @@ -266,7 +381,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->cmd_flags & REQ_WRITE) memset(q->dma_drain_buffer, 0, q->dma_drain_size); - sg->page_link &= ~0x02; + sg_unmark_end(sg); sg = sg_next(sg); sg_set_page(sg, virt_to_page(q->dma_drain_buffer), q->dma_drain_size, @@ -312,6 +427,11 @@ no_merge: int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + if (req_gap_back_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_back_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -330,6 +450,12 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + + if (req_gap_front_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_front_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -356,14 +482,6 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } -static int req_gap_to_prev(struct request *req, struct request *next) -{ - struct bio *prev = req->biotail; - - return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1], - next->bio->bi_io_vec[0].bv_offset); -} - static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -378,8 +496,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; - if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) && - req_gap_to_prev(req, next)) + if (req_gap_back_merge(req, next->bio)) return 0; /* @@ -564,8 +681,6 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, bool blk_rq_merge_ok(struct request *rq, struct bio *bio) { - struct request_queue *q = rq->q; - if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; @@ -589,15 +704,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; - /* Only check gaps if the bio carries data */ - if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) { - struct bio_vec *bprev; - - bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1]; - if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) - return false; - } - return true; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 1e28ddb65..8764c241e 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, + const struct cpumask *online_mask) { unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; cpumask_var_t cpus; @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) cpumask_clear(cpus); nr_cpus = nr_uniq_cpus = 0; - for_each_online_cpu(i) { + for_each_cpu(i, online_mask) { nr_cpus++; first_sibling = get_first_sibling(i); if (!cpumask_test_cpu(first_sibling, cpus)) @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) queue = 0; for_each_possible_cpu(i) { - if (!cpu_online(i)) { + if (!cpumask_test_cpu(i, online_mask)) { map[i] = 0; continue; } @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) if (!map) return NULL; - if (!blk_mq_update_queue_map(map, set->nr_hw_queues)) + if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask)) return map; kfree(map); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 279c5d674..788fffd9b 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -229,8 +229,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) unsigned int i, first = 1; ssize_t ret = 0; - blk_mq_disable_hotplug(); - for_each_cpu(i, hctx->cpumask) { if (first) ret += sprintf(ret + page, "%u", i); @@ -240,8 +238,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) first = 0; } - blk_mq_enable_hotplug(); - ret += sprintf(ret + page, "\n"); return ret; } @@ -343,7 +339,7 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return; hctx_for_each_ctx(hctx, ctx, i) @@ -358,7 +354,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i, ret; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return 0; ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); @@ -381,6 +377,8 @@ void blk_mq_unregister_disk(struct gendisk *disk) struct blk_mq_ctx *ctx; int i, j; + blk_mq_disable_hotplug(); + queue_for_each_hw_ctx(q, hctx, i) { blk_mq_unregister_hctx(hctx); @@ -395,6 +393,9 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&q->mq_kobj); kobject_put(&disk_to_dev(disk)->kobj); + + q->mq_sysfs_init_done = false; + blk_mq_enable_hotplug(); } static void blk_mq_sysfs_init(struct request_queue *q) @@ -425,27 +426,30 @@ int blk_mq_register_disk(struct gendisk *disk) struct blk_mq_hw_ctx *hctx; int ret, i; + blk_mq_disable_hotplug(); + blk_mq_sysfs_init(q); ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) - return ret; + goto out; kobject_uevent(&q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { - hctx->flags |= BLK_MQ_F_SYSFS_UP; ret = blk_mq_register_hctx(hctx); if (ret) break; } - if (ret) { + if (ret) blk_mq_unregister_disk(disk); - return ret; - } + else + q->mq_sysfs_init_done = true; +out: + blk_mq_enable_hotplug(); - return 0; + return ret; } EXPORT_SYMBOL_GPL(blk_mq_register_disk); @@ -454,6 +458,9 @@ void blk_mq_sysfs_unregister(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + if (!q->mq_sysfs_init_done) + return; + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -463,6 +470,9 @@ int blk_mq_sysfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i, ret = 0; + if (!q->mq_sysfs_init_done) + return ret; + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9115c6d59..ec2d11915 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -471,17 +471,30 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, } EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); -void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { - struct blk_mq_tags *tags = hctx->tags; + struct blk_mq_hw_ctx *hctx; + int i; + + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); + } - if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); } -EXPORT_SYMBOL(blk_mq_tag_busy_iter); static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) { @@ -628,6 +641,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) { bt_free(&tags->bitmap_tags); bt_free(&tags->breserved_tags); + free_cpumask_var(tags->cpumask); kfree(tags); } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 9eb2cf4f0..d468a79f2 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -58,6 +58,8 @@ extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv); enum { BLK_MQ_TAG_CACHE_MIN = 1, diff --git a/block/blk-mq.c b/block/blk-mq.c index c69902695..85f014327 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -393,14 +393,16 @@ void __blk_mq_complete_request(struct request *rq) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +void blk_mq_complete_request(struct request *rq, int error) { struct request_queue *q = rq->q; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + if (!blk_mark_rq_complete(rq)) { + rq->errors = error; __blk_mq_complete_request(rq); + } } EXPORT_SYMBOL(blk_mq_complete_request); @@ -616,10 +618,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) { - rq->errors = -EIO; - blk_mq_complete_request(rq); - } + if (unlikely(blk_queue_dying(rq->q))) + blk_mq_complete_request(rq, -EIO); return; } if (rq->cmd_flags & REQ_NO_TIMEOUT) @@ -641,24 +641,16 @@ static void blk_mq_rq_timer(unsigned long priv) .next = 0, .next_set = 0, }; - struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - /* - * If not software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; - - blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); - } + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { + struct blk_mq_hw_ctx *hctx; + queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1185,7 +1177,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, struct blk_mq_alloc_data alloc_data; if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) { - bio_endio(bio, -EIO); + bio_io_error(bio); return NULL; } @@ -1269,10 +1261,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); + bio_io_error(bio); return; } + blk_queue_split(q, &bio, q->bio_split); + if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return; @@ -1354,10 +1348,12 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); + bio_io_error(bio); return; } + blk_queue_split(q, &bio, q->bio_split); + if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; @@ -1785,13 +1781,19 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } -static void blk_mq_map_swqueue(struct request_queue *q) +static void blk_mq_map_swqueue(struct request_queue *q, + const struct cpumask *online_mask) { unsigned int i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + /* + * Avoid others reading imcomplete hctx->cpumask through sysfs + */ + mutex_lock(&q->sysfs_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -1802,7 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ queue_for_each_ctx(q, ctx, i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ - if (!cpu_online(i)) + if (!cpumask_test_cpu(i, online_mask)) continue; hctx = q->mq_ops->map_queue(q, i); @@ -1811,6 +1813,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->ctxs[hctx->nr_ctx++] = ctx; } + mutex_unlock(&q->sysfs_lock); + queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_ctxmap *map = &hctx->ctx_map; @@ -1848,7 +1852,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) } queue_for_each_ctx(q, ctx, i) { - if (!cpu_online(i)) + if (!cpumask_test_cpu(i, online_mask)) continue; hctx = q->mq_ops->map_queue(q, i); @@ -1921,6 +1925,9 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } + kfree(q->mq_map); + q->mq_map = NULL; + kfree(q->queue_hw_ctx); /* ctx kobj stays in queue_ctx */ @@ -2030,13 +2037,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (blk_mq_init_hw_queues(q, set)) goto err_hctxs; + get_online_cpus(); mutex_lock(&all_q_mutex); - list_add_tail(&q->all_q_node, &all_q_list); - mutex_unlock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); blk_mq_add_queue_tag_set(set, q); + blk_mq_map_swqueue(q, cpu_online_mask); - blk_mq_map_swqueue(q); + mutex_unlock(&all_q_mutex); + put_online_cpus(); return q; @@ -2060,30 +2069,27 @@ void blk_mq_free_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&all_q_mutex); + list_del_init(&q->all_q_node); + mutex_unlock(&all_q_mutex); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); blk_mq_free_hw_queues(q, set); percpu_ref_exit(&q->mq_usage_counter); - - kfree(q->mq_map); - - q->mq_map = NULL; - - mutex_lock(&all_q_mutex); - list_del_init(&q->all_q_node); - mutex_unlock(&all_q_mutex); } /* Basically redo blk_mq_init_queue with queue frozen */ -static void blk_mq_queue_reinit(struct request_queue *q) +static void blk_mq_queue_reinit(struct request_queue *q, + const struct cpumask *online_mask) { WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe @@ -2091,7 +2097,7 @@ static void blk_mq_queue_reinit(struct request_queue *q) * involves free and re-allocate memory, worthy doing?) */ - blk_mq_map_swqueue(q); + blk_mq_map_swqueue(q, online_mask); blk_mq_sysfs_register(q); } @@ -2100,16 +2106,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, unsigned long action, void *hcpu) { struct request_queue *q; + int cpu = (unsigned long)hcpu; + /* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ + static struct cpumask online_new; /* - * Before new mappings are established, hotadded cpu might already - * start handling requests. This doesn't break anything as we map - * offline CPUs to first hardware queue. We will re-init the queue - * below to get optimal settings. + * Before hotadded cpu starts handling requests, new mappings must + * be established. Otherwise, these requests in hw queue might + * never be dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues + * (ctx0 for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into + * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is + * still zero. + * + * And then while running hw queue, flush_busy_ctxs() finds bit0 is + * set in pending bitmap and tries to retrieve requests in + * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, + * so the request in ctx1->rq_list is ignored. */ - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && - action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DEAD: + case CPU_UP_CANCELED: + cpumask_copy(&online_new, cpu_online_mask); + break; + case CPU_UP_PREPARE: + cpumask_copy(&online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &online_new); + break; + default: return NOTIFY_OK; + } mutex_lock(&all_q_mutex); @@ -2133,7 +2166,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, } list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q); + blk_mq_queue_reinit(q, &online_new); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_unfreeze_queue(q); @@ -2263,10 +2296,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) int i; for (i = 0; i < set->nr_hw_queues; i++) { - if (set->tags[i]) { + if (set->tags[i]) blk_mq_free_rq_map(set, set->tags[i], i); - free_cpumask_var(set->tags[i]->cpumask); - } } kfree(set->tags); diff --git a/block/blk-mq.h b/block/blk-mq.h index 6a48c4c0d..f4fea7964 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void); * CPU -> queue mappings */ extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, + const struct cpumask *online_mask); extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); /* diff --git a/block/blk-settings.c b/block/blk-settings.c index e0057d035..7d8f129a1 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -53,28 +53,6 @@ void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) } EXPORT_SYMBOL(blk_queue_unprep_rq); -/** - * blk_queue_merge_bvec - set a merge_bvec function for queue - * @q: queue - * @mbfn: merge_bvec_fn - * - * Usually queues have static limitations on the max sectors or segments that - * we can put in a request. Stacking drivers may have some settings that - * are dynamic, and thus we have to query the queue whether it is ok to - * add a new bio_vec to a bio at a given offset or not. If the block device - * has such limitations, it needs to register a merge_bvec_fn to control - * the size of bio's sent to it. Note that a block device *must* allow a - * single page to be added to an empty bio. The block device driver may want - * to use the bio_split() function to deal with these bio's. By default - * no merge_bvec_fn is defined for a queue, and only the fixed limits are - * honored. - */ -void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) -{ - q->merge_bvec_fn = mbfn; -} -EXPORT_SYMBOL(blk_queue_merge_bvec); - void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) { q->softirq_done_fn = fn; @@ -111,11 +89,13 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segments = BLK_MAX_SEGMENTS; lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; + lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; @@ -257,7 +237,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ __func__, max_hw_sectors); } - limits->max_sectors = limits->max_hw_sectors = max_hw_sectors; + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); } EXPORT_SYMBOL(blk_limits_max_hw_sectors); @@ -303,6 +285,7 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors); void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { + q->limits.max_hw_discard_sectors = max_discard_sectors; q->limits.max_discard_sectors = max_discard_sectors; } EXPORT_SYMBOL(blk_queue_max_discard_sectors); @@ -550,6 +533,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); + t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask, + b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, @@ -641,6 +626,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); + t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, + b->max_hw_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % @@ -787,6 +774,17 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) } EXPORT_SYMBOL(blk_queue_segment_boundary); +/** + * blk_queue_virt_boundary - set boundary rules for bio merging + * @q: the request queue for the device + * @mask: the memory boundary mask + **/ +void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) +{ + q->limits.virt_boundary_mask = mask; +} +EXPORT_SYMBOL(blk_queue_virt_boundary); + /** * blk_queue_dma_alignment - set dma length and memory alignment * @q: the request queue for the device diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6264b382d..07b42f5ad 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -145,12 +145,43 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag return queue_var_show(q->limits.discard_granularity, page); } +static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) +{ + unsigned long long val; + + val = q->limits.max_hw_discard_sectors << 9; + return sprintf(page, "%llu\n", val); +} + static ssize_t queue_discard_max_show(struct request_queue *q, char *page) { return sprintf(page, "%llu\n", (unsigned long long)q->limits.max_discard_sectors << 9); } +static ssize_t queue_discard_max_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long max_discard; + ssize_t ret = queue_var_store(&max_discard, page, count); + + if (ret < 0) + return ret; + + if (max_discard & (q->limits.discard_granularity - 1)) + return -EINVAL; + + max_discard >>= 9; + if (max_discard > UINT_MAX) + return -EINVAL; + + if (max_discard > q->limits.max_hw_discard_sectors) + max_discard = q->limits.max_hw_discard_sectors; + + q->limits.max_discard_sectors = max_discard; + return ret; +} + static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) { return queue_var_show(queue_discard_zeroes_data(q), page); @@ -360,9 +391,15 @@ static struct queue_sysfs_entry queue_discard_granularity_entry = { .show = queue_discard_granularity_show, }; +static struct queue_sysfs_entry queue_discard_max_hw_entry = { + .attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO }, + .show = queue_discard_max_hw_show, +}; + static struct queue_sysfs_entry queue_discard_max_entry = { - .attr = {.name = "discard_max_bytes", .mode = S_IRUGO }, + .attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR }, .show = queue_discard_max_show, + .store = queue_discard_max_store, }; static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { @@ -421,6 +458,7 @@ static struct attribute *default_attrs[] = { &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, + &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_nonrot_entry.attr, @@ -502,6 +540,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); + bdi_exit(&q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { @@ -523,6 +562,9 @@ static void blk_release_queue(struct kobject *kobj) blk_trace_shutdown(q); + if (q->bio_split) + bioset_free(q->bio_split); + ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b23193518..c75a2636d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -141,12 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; }; struct throtl_data @@ -168,13 +154,6 @@ struct throtl_data struct work_struct dispatch_work; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - static void throtl_pending_timer_fn(unsigned long arg); static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) @@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest @@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) } \ } while (0) -static void tg_stats_init(struct tg_stats_cpu *tg_stats) -{ - blkg_rwstat_init(&tg_stats->service_bytes); - blkg_rwstat_init(&tg_stats->serviced); -} - -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) -{ - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - int cpu; - - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - schedule_delayed_work(dwork, msecs_to_jiffies(10)); - return; - } - for_each_possible_cpu(cpu) - tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); - } - - spin_lock_irq(&tg_stats_alloc_lock); - - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } - - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; -} - static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { - del_timer_sync(&sq->pending_timer); + struct throtl_grp *tg; + int rw; + + tg = kzalloc_node(sizeof(*tg), gfp, node); + if (!tg) + return NULL; + + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; + + return &tg->pd; } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_pd_init(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - unsigned long flags; - int rw; + struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical @@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ - parent_sq = &td->service_queue; - + sq->parent_sq = &td->service_queue; if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - - throtl_service_queue_init(&tg->service_queue, parent_sq); - - for (rw = READ; rw <= WRITE; rw++) { - throtl_qnode_init(&tg->qnode_on_self[rw], tg); - throtl_qnode_init(&tg->qnode_on_parent[rw], tg); - } - - RB_CLEAR_NODE(&tg->rb_node); + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; - - /* - * Ugh... We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. - */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - schedule_delayed_work(&tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } /* @@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg) (tg->bps[rw] != -1 || tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); -} - -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - - free_percpu(tg->stats_cpu); - - throtl_service_queue_exit(&tg->service_queue); -} - -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - int cpu; - - if (tg->stats_cpu == NULL) - return; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } -} - -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); + tg_update_has_rules(pd_to_tg(pd)); } -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) +static void throtl_pd_free(struct blkg_policy_data *pd) { - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else if (!blk_queue_dying(q)) - tg = td_root_tg(td); - } + struct throtl_grp *tg = pd_to_tg(pd); - return tg; + del_timer_sync(&tg->service_queue.pending_timer); + kfree(tg); } static struct throtl_grp * @@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; - unsigned long flags; - - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. - */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - if (tg->stats_cpu == NULL) - return 0; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static ssize_t tg_set_conf(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, bool is_u64) +static void tg_conf_updated(struct throtl_grp *tg) { - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_service_queue *sq; - struct blkcg_gq *blkg; + struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; - int ret; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - tg = blkg_to_tg(ctx.blkg); - sq = &tg->service_queue; - - if (!ctx.v) - ctx.v = -1; - - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; - else - *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; + struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } +} + +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + + tg_conf_updated(tg); + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return nbytes; + return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, @@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), @@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, + }, + { } /* terminate */ +}; + +static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, }, { } /* terminate */ }; @@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), - .cftypes = throtl_files, + .dfl_cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, - .pd_reset_stats_fn = throtl_pd_reset_stats, + .pd_free_fn = throtl_pd_free, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. If a group has no rules, - * just update the dispatch stats in lockless manner and return. - */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); - if (unlikely(!tg)) + + if (unlikely(blk_queue_bypass(q))) goto out_unlock; sq = &tg->service_queue; @@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we @@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/block/blk.h b/block/blk.h index 838188b35..98614ad37 100644 --- a/block/blk.h +++ b/block/blk.h @@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/block/bounce.c b/block/bounce.c index b17311227..1cb5dd3a5 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -123,17 +123,19 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) } } -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +static void bounce_end_io(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, *org_vec; int i; + int start = bio_orig->bi_iter.bi_idx; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; + org_vec = bio_orig->bi_io_vec + i + start; + if (bvec->bv_page == org_vec->bv_page) continue; @@ -141,61 +143,44 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) mempool_free(bvec->bv_page, pool); } - bio_endio(bio_orig, err); + bio_orig->bi_error = bio->bi_error; + bio_endio(bio_orig); bio_put(bio); } -static void bounce_end_io_write(struct bio *bio, int err) +static void bounce_end_io_write(struct bio *bio) { - bounce_end_io(bio, page_pool, err); + bounce_end_io(bio, page_pool); } -static void bounce_end_io_write_isa(struct bio *bio, int err) +static void bounce_end_io_write_isa(struct bio *bio) { - bounce_end_io(bio, isa_page_pool, err); + bounce_end_io(bio, isa_page_pool); } -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!bio->bi_error) copy_to_high_bio_irq(bio_orig, bio); - bounce_end_io(bio, pool, err); -} - -static void bounce_end_io_read(struct bio *bio, int err) -{ - __bounce_end_io_read(bio, page_pool, err); + bounce_end_io(bio, pool); } -static void bounce_end_io_read_isa(struct bio *bio, int err) +static void bounce_end_io_read(struct bio *bio) { - __bounce_end_io_read(bio, isa_page_pool, err); + __bounce_end_io_read(bio, page_pool); } -#ifdef CONFIG_NEED_BOUNCE_POOL -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - if (bio_data_dir(bio) != WRITE) - return 0; - - if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) - return 0; - - return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); -} -#else -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +static void bounce_end_io_read_isa(struct bio *bio) { - return 0; + __bounce_end_io_read(bio, isa_page_pool); } -#endif /* CONFIG_NEED_BOUNCE_POOL */ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool, int force) + mempool_t *pool) { struct bio *bio; int rw = bio_data_dir(*bio_orig); @@ -203,8 +188,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, struct bvec_iter iter; unsigned i; - if (force) - goto bounce; bio_for_each_segment(from, *bio_orig, iter) if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) goto bounce; @@ -216,7 +199,7 @@ bounce: bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; - if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) + if (page_to_pfn(page) <= queue_bounce_pfn(q)) continue; to->bv_page = mempool_alloc(pool, q->bounce_gfp); @@ -254,7 +237,6 @@ bounce: void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { - int must_bounce; mempool_t *pool; /* @@ -263,15 +245,13 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) if (!bio_has_data(*bio_orig)) return; - must_bounce = must_snapshot_stable_pages(q, *bio_orig); - /* * for non-isa bounce case, just check if the bounce pfn is equal * to or bigger than the highest pfn in the system -- in that case, * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) + if (queue_bounce_pfn(q) >= blk_max_pfn) return; pool = page_pool; } else { @@ -282,7 +262,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * slow path */ - __blk_queue_bounce(q, bio_orig, pool, must_bounce); + __blk_queue_bounce(q, bio_orig, pool); } EXPORT_SYMBOL(blk_queue_bounce); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c62bb2e65..04de88463 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool; #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* blkio-related constants */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 +#define CFQ_WEIGHT_LEGACY_MIN 10 +#define CFQ_WEIGHT_LEGACY_DFL 500 +#define CFQ_WEIGHT_LEGACY_MAX 1000 struct cfq_ttime { unsigned long last_end_request; @@ -177,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -189,8 +185,6 @@ struct cfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -220,7 +214,7 @@ struct cfqg_stats { /* Per-cgroup data */ struct cfq_group_data { /* must be the first member */ - struct blkcg_policy_data pd; + struct blkcg_policy_data cpd; unsigned int weight; unsigned int leaf_weight; @@ -304,7 +298,11 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ + + /* async queue for each priority case */ + struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; + struct cfq_queue *async_idle_cfqq; + }; struct cfq_io_cq { @@ -370,12 +368,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -401,6 +393,7 @@ struct cfq_data { }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); +static void cfq_put_queue(struct cfq_queue *cfqq); static struct cfq_rb_root *st_for(struct cfq_group *cfqg, enum wl_class_t class, @@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) static struct cfq_group_data *cpd_to_cfqgd(struct blkcg_policy_data *cpd) { - return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; + return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; } static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) @@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) blkg_rwstat_add(&cfqg->stats.merged, rw, 1); } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); -} - static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. */ @@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, unsigned long time, unsigned long unaccounted_time) { } static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { } @@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); + struct cfq_io_cq *cic, struct bio *bio); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; } -static void cfq_cpd_init(const struct blkcg *blkcg) +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) { - struct cfq_group_data *cgd = - cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); + struct cfq_group_data *cgd; - if (blkcg == &blkcg_root) { - cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; - } else { - cgd->weight = CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; - } + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + +static void cfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; } -static void cfq_pd_init(struct blkcg_gq *blkg) +static void cfq_cpd_free(struct blkcg_policy_data *cpd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); + kfree(cpd_to_cfqgd(cpd)); +} + +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); + unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); +} + +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) +{ + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; cfq_init_cfqg_base(cfqg); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } + + return &cfqg->pd; +} + +static void cfq_pd_init(struct blkg_policy_data *pd) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); + cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { + struct cfq_group *cfqg = pd_to_cfqg(pd); + int i; + + for (i = 0; i < IOPRIO_BE_NR; i++) { + if (cfqg->async_cfqq[0][i]) + cfq_put_queue(cfqg->async_cfqq[0][i]); + if (cfqg->async_cfqq[1][i]) + cfq_put_queue(cfqg->async_cfqq[1][i]); + } + + if (cfqg->async_idle_cfqq) + cfq_put_queue(cfqg->async_idle_cfqq); + /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ - cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); + cfqg_stats_xfer_dead(cfqg); } -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +static void cfq_pd_free(struct blkg_policy_data *pd) { - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; -} - -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat a, b; + struct cfq_group *cfqg = pd_to_cfqg(pd); - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. - */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } + struct blkcg_gq *blkg; - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - /* Currently, all async queues are mapped to root group */ - if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; - cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ cfqg_get(cfqg); @@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v) static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, - bool is_leaf_weight) + bool on_dfl, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; - ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out_finish; + } + cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); - if (!cfqg || !cfqgd) - goto err; - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + ret = -ERANGE; + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: cfqgd->weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - -err: +out_finish: blkg_conf_finish(&ctx); return ret ?: nbytes; } @@ -1776,25 +1805,27 @@ err: static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); } static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; struct cfq_group_data *cfqgd; int ret = 0; - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) cfqg->new_leaf_weight = cfqgd->leaf_weight; } @@ -1831,13 +1866,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); + return 0; +} + +static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, + false); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", @@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat, + .seq_show = cfqg_print_stat_sectors, }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors_recursive", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_sectors_recursive, }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", @@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, true, false, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* @@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq) struct cfq_io_cq *cic = icq_to_cic(icq); struct cfq_data *cfqd = cic_to_cfqd(cic); - if (cic->cfqq[BLK_RW_ASYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; + if (cic_to_cfqq(cic, false)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); + cic_set_cfqq(cic, NULL, false); } - if (cic->cfqq[BLK_RW_SYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; + if (cic_to_cfqq(cic, true)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); + cic_set_cfqq(cic, NULL, true); } } @@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - cfqq = cic->cfqq[BLK_RW_ASYNC]; + cfqq = cic_to_cfqq(cic, false); if (cfqq) { - struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); + cic_set_cfqq(cic, cfqq, false); } - cfqq = cic->cfqq[BLK_RW_SYNC]; + cfqq = cic_to_cfqq(cic, true); if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); @@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; + struct cfq_queue *cfqq; uint64_t serial_nr; rcu_read_lock(); @@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; - sync_cfqq = cic_to_cfqq(cic, 1); - if (sync_cfqq) { - /* - * Drop reference to sync queue. A new sync queue will be - * assigned in new group upon arrival of a fresh request. - */ - cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, 1); - cfq_put_queue(sync_cfqq); + /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. + */ + cfqq = cic_to_cfqq(cic, false); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, false); + cfq_put_queue(cfqq); + } + + cfqq = cic_to_cfqq(cic, true); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, true); + cfq_put_queue(cfqq); } cic->blkcg_serial_nr = serial_nr; @@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ -static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) -{ - struct blkcg *blkcg; - struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_group *cfqg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - cfqq = cic_to_cfqq(cic, is_sync); - - /* - * Always try a new alloc if we fell back to the OOM cfqq - * originally, since it should just be a temporary situation. - */ - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = NULL; - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - spin_lock_irq(cfqd->queue->queue_lock); - if (new_cfqq) - goto retry; - else - return &cfqd->oom_cfqq; - } else { - cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - } - - if (cfqq) { - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else - cfqq = &cfqd->oom_cfqq; - } -out: - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); - - rcu_read_unlock(); - return cfqq; -} - static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfqg->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfqg->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfqg->async_idle_cfqq; default: BUG(); } @@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq = NULL; + struct cfq_queue *cfqq; + struct cfq_group *cfqg; + + rcu_read_lock(); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, ioprio = task_nice_ioprio(tsk); ioprio_class = task_nice_ioclass(tsk); } - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); cfqq = *async_cfqq; + if (cfqq) + goto out; } - if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); + if (!cfqq) { + cfqq = &cfqd->oom_cfqq; + goto out; + } - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && !(*async_cfqq)) { + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, cic); + cfq_link_cfqq_cfqg(cfqq, cfqg); + cfq_log_cfqq(cfqd, cfqq, "alloced"); + + if (async_cfqq) { + /* a new async queue is created, pin and remember */ cfqq->ref++; *async_cfqq = cfqq; } - +out: cfqq->ref++; + rcu_read_unlock(); return cfqq; } @@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - might_sleep_if(gfp_mask & __GFP_WAIT); - spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); @@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); + if (cfqq) + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) cancel_work_sync(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); @@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); + cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; #endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfqd->prio_trees[i] = RB_ROOT; /* - * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. + * Our fallback cfqq if cfq_get_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. oom_cfqq is linked to root_group * but shouldn't hold a reference as it'll never be unlinked. Lose @@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), - .cpd_size = sizeof(struct cfq_group_data), - .cftypes = cfq_blkcg_files, + .dfl_cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, + .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, + + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/block/genhd.c b/block/genhd.c index 6d1003f0d..398dab06d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1112,8 +1112,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); - free_part_stats(&disk->part0); - free_part_info(&disk->part0); + hd_free_part(&disk->part0); if (disk->queue) blk_put_queue(disk->queue); kfree(disk); @@ -1287,7 +1286,11 @@ struct gendisk *alloc_disk_node(int minors, int node_id) * converted to make use of bd_mutex and sequence counters. */ seqcount_init(&disk->part0.nr_sects_seq); - hd_ref_init(&disk->part0); + if (hd_ref_init(&disk->part0)) { + hd_free_part(&disk->part0); + kfree(disk); + return NULL; + } disk->minors = minors; rand_initialize_disk(disk); diff --git a/block/partition-generic.c b/block/partition-generic.c index 0d9e5f97f..e77111332 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -212,8 +212,7 @@ static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); blk_free_devt(dev->devt); - free_part_stats(p); - free_part_info(p); + hd_free_part(p); kfree(p); } @@ -233,8 +232,9 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } -void __delete_partition(struct hd_struct *part) +void __delete_partition(struct percpu_ref *ref) { + struct hd_struct *part = container_of(ref, struct hd_struct, ref); call_rcu(&part->rcu_head, delete_partition_rcu_cb); } @@ -255,7 +255,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - hd_struct_put(part); + hd_struct_kill(part); } static ssize_t whole_disk_show(struct device *dev, @@ -356,8 +356,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - hd_ref_init(p); - return p; + if (!hd_ref_init(p)) + return p; out_free_info: free_part_info(p); diff --git a/block/uuid.c b/block/uuid.c index 722d53b63..4610d7b8f 100644 --- a/block/uuid.c +++ b/block/uuid.c @@ -142,12 +142,12 @@ static int null_uuid(const char *uuid) } -static void uuid_end_bio(struct bio *bio, int err) +static void uuid_end_bio(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if(!test_bit(BIO_UPTODATE, &bio->bi_flags)) - SetPageError(page); + if (bio->bi_error) + SetPageError(page); unlock_page(page); bio_put(bio); -- cgit v1.2.3