author    | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-01-20 14:01:31 -0300
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-01-20 14:01:31 -0300
commit    | b4b7ff4b08e691656c9d77c758fc355833128ac0 (patch)
tree      | 82fcb00e6b918026dc9f2d1f05ed8eee83874cc0 /block
parent    | 35acfa0fc609f2a2cd95cef4a6a9c3a5c38f1778 (diff)
Linux-libre 4.4-gnupck-4.4-gnu
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig.iosched     |    6
-rw-r--r-- | block/bfq-cgroup.c        | 1363
-rw-r--r-- | block/bfq-ioc.c           |    6
-rw-r--r-- | block/bfq-iosched.c       | 1096
-rw-r--r-- | block/bfq-sched.c         |  209
-rw-r--r-- | block/bfq.h               |  206
-rw-r--r-- | block/bio-integrity.c     |   17
-rw-r--r-- | block/bio.c               |   26
-rw-r--r-- | block/blk-cgroup.c        |    7
-rw-r--r-- | block/blk-core.c          |  241
-rw-r--r-- | block/blk-integrity.c     |  192
-rw-r--r-- | block/blk-ioc.c           |    2
-rw-r--r-- | block/blk-merge.c         |   61
-rw-r--r-- | block/blk-mq-sysfs.c      |   16
-rw-r--r-- | block/blk-mq-tag.c        |    6
-rw-r--r-- | block/blk-mq.c            |  248
-rw-r--r-- | block/blk-mq.h            |    3
-rw-r--r-- | block/blk-settings.c      |   36
-rw-r--r-- | block/blk-sysfs.c         |   41
-rw-r--r-- | block/blk-throttle.c      |    2
-rw-r--r-- | block/blk-timeout.c       |    8
-rw-r--r-- | block/blk.h               |   21
-rw-r--r-- | block/cfq-iosched.c       |    4
-rw-r--r-- | block/elevator.c          |    2
-rw-r--r-- | block/genhd.c             |    2
-rw-r--r-- | block/ioctl.c             |  330
-rw-r--r-- | block/ioprio.c            |    6
-rw-r--r-- | block/noop-iosched.c      |   10
-rw-r--r-- | block/partition-generic.c |    3
-rw-r--r-- | block/partitions/mac.c    |   10
-rw-r--r-- | block/scsi_ioctl.c        |    6
-rw-r--r-- | block/t10-pi.c            |   16
32 files changed, 2571 insertions(+), 1631 deletions(-)
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 01da733dc..1fc1a4dc5 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -51,14 +51,12 @@ config IOSCHED_BFQ applications. If compiled built-in (saying Y here), BFQ can be configured to support hierarchical scheduling. -config CGROUP_BFQIO +config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" depends on CGROUPS && IOSCHED_BFQ=y default n ---help--- - Enable hierarchical scheduling in BFQ, using the cgroups - filesystem interface. The name of the subsystem will be - bfqio. + Enable hierarchical scheduling in BFQ, using the blkio controller. choice prompt "Default I/O scheduler" diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 11e2f1d4e..7a6192007 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -13,254 +13,522 @@ * file. */ -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED -static DEFINE_MUTEX(bfqio_mutex); +/* bfqg stats flags */ +enum bfqg_stats_flags { + BFQG_stats_waiting = 0, + BFQG_stats_idling, + BFQG_stats_empty, +}; -static bool bfqio_is_removed(struct bfqio_cgroup *bgrp) -{ - return bgrp ? !bgrp->online : false; -} +#define BFQG_FLAG_FNS(name) \ +static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags |= (1 << BFQG_stats_##name); \ +} \ +static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags &= ~(1 << BFQG_stats_##name); \ +} \ +static int bfqg_stats_##name(struct bfqg_stats *stats) \ +{ \ + return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ +} \ -static struct bfqio_cgroup bfqio_root_cgroup = { - .weight = BFQ_DEFAULT_GRP_WEIGHT, - .ioprio = BFQ_DEFAULT_GRP_IOPRIO, - .ioprio_class = BFQ_DEFAULT_GRP_CLASS, -}; +BFQG_FLAG_FNS(waiting) +BFQG_FLAG_FNS(idling) +BFQG_FLAG_FNS(empty) +#undef BFQG_FLAG_FNS -static inline void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) +/* This should be called with the queue_lock held. */ +static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) { - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - entity->ioprio = entity->new_ioprio; - entity->ioprio_class = entity->new_ioprio_class; - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; + unsigned long long now; + + if (!bfqg_stats_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + blkg_stat_add(&stats->group_wait_time, + now - stats->start_group_wait_time); + bfqg_stats_clear_waiting(stats); } -static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css) +/* This should be called with the queue_lock held. */ +static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) { - return css ? container_of(css, struct bfqio_cgroup, css) : NULL; + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_waiting(stats)) + return; + if (bfqg == curr_bfqg) + return; + stats->start_group_wait_time = sched_clock(); + bfqg_stats_mark_waiting(stats); } -/* - * Search the bfq_group for bfqd into the hash table (by now only a list) - * of bgrp. Must be called under rcu_read_lock(). - */ -static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, - struct bfq_data *bfqd) +/* This should be called with the queue_lock held. 
*/ +static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { - struct bfq_group *bfqg; - void *key; + unsigned long long now; - hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) { - key = rcu_dereference(bfqg->bfqd); - if (key == bfqd) - return bfqg; - } + if (!bfqg_stats_empty(stats)) + return; - return NULL; + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + blkg_stat_add(&stats->empty_time, + now - stats->start_empty_time); + bfqg_stats_clear_empty(stats); } -static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, - struct bfq_group *bfqg) +static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { - struct bfq_entity *entity = &bfqg->entity; + blkg_stat_add(&bfqg->stats.dequeue, 1); +} + +static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (blkg_rwstat_total(&stats->queued)) + return; /* - * If the weight of the entity has never been set via the sysfs - * interface, then bgrp->weight == 0. In this case we initialize - * the weight from the current ioprio value. Otherwise, the group - * weight, if set, has priority over the ioprio value. + * group is already marked empty. This can happen if bfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. */ - if (bgrp->weight == 0) { - entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); - entity->new_ioprio = bgrp->ioprio; - } else { - if (bgrp->weight < BFQ_MIN_WEIGHT || - bgrp->weight > BFQ_MAX_WEIGHT) { - printk(KERN_CRIT "bfq_group_init_entity: " - "bgrp->weight %d\n", bgrp->weight); - BUG(); - } - entity->new_weight = bgrp->weight; - entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); + if (bfqg_stats_empty(stats)) + return; + + stats->start_empty_time = sched_clock(); + bfqg_stats_mark_empty(stats); +} + +static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_idling(stats)) { + unsigned long long now = sched_clock(); + + if (time_after64(now, stats->start_idle_time)) + blkg_stat_add(&stats->idle_time, + now - stats->start_idle_time); + bfqg_stats_clear_idling(stats); } - entity->orig_weight = entity->weight = entity->new_weight; - entity->ioprio = entity->new_ioprio; - entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; - entity->my_sched_data = &bfqg->sched_data; - bfqg->active_entities = 0; } -static inline void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) +static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { - struct bfq_entity *entity; + struct bfqg_stats *stats = &bfqg->stats; - BUG_ON(parent == NULL); - BUG_ON(bfqg == NULL); + stats->start_idle_time = sched_clock(); + bfqg_stats_mark_idling(stats); +} - entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; +static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + blkg_stat_add(&stats->avg_queue_size_sum, + blkg_rwstat_total(&stats->queued)); + blkg_stat_add(&stats->avg_queue_size_samples, 1); + bfqg_stats_update_group_wait_time(stats); } -/** - * bfq_group_chain_alloc - allocate a chain of groups. - * @bfqd: queue descriptor. - * @css: the leaf cgroup_subsys_state this chain starts from. - * - * Allocate a chain of groups starting from the one belonging to - * @cgroup up to the root cgroup. 
Stop if a cgroup on the chain - * to the root has already an allocated group on @bfqd. +static struct blkcg_policy blkcg_policy_bfq; + +/* + * blk-cgroup policy-related handlers + * The following functions help in converting between blk-cgroup + * internal structures and BFQ-specific structures. */ -static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, - struct cgroup_subsys_state *css) + +static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) { - struct bfqio_cgroup *bgrp; - struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; + return pd ? container_of(pd, struct bfq_group, pd) : NULL; +} - for (; css != NULL; css = css->parent) { - bgrp = css_to_bfqio(css); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) +{ + return pd_to_blkg(&bfqg->pd); +} - bfqg = bfqio_lookup_group(bgrp, bfqd); - if (bfqg != NULL) { - /* - * All the cgroups in the path from there to the - * root must have a bfq_group for bfqd, so we don't - * need any more allocations. - */ - break; - } +static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) +{ + return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); +} - bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); - if (bfqg == NULL) - goto cleanup; +/* + * bfq_group handlers + * The following functions help in navigating the bfq_group hierarchy + * by allowing to find the parent of a bfq_group or the bfq_group + * associated to a bfq_queue. + */ - bfq_group_init_entity(bgrp, bfqg); - bfqg->my_entity = &bfqg->entity; +static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) +{ + struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; - if (leaf == NULL) { - leaf = bfqg; - prev = leaf; - } else { - bfq_group_set_parent(prev, bfqg); - /* - * Build a list of allocated nodes using the bfqd - * filed, that is still unused and will be - * initialized only after the node will be - * connected. - */ - prev->bfqd = bfqg; - prev = bfqg; - } - } + return pblkg ? blkg_to_bfqg(pblkg) : NULL; +} - return leaf; +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; -cleanup: - while (leaf != NULL) { - prev = leaf; - leaf = leaf->bfqd; - kfree(prev); - } + return group_entity ? container_of(group_entity, struct bfq_group, + entity) : + bfqq->bfqd->root_group; +} + +/* + * The following two functions handle get and put of a bfq_group by + * wrapping the related blk-cgroup hooks. + */ - return NULL; +static void bfqg_get(struct bfq_group *bfqg) +{ + return blkg_get(bfqg_to_blkg(bfqg)); } -/** - * bfq_group_chain_link - link an allocated group chain to a cgroup - * hierarchy. - * @bfqd: the queue descriptor. - * @css: the leaf cgroup_subsys_state to start from. - * @leaf: the leaf group (to be associated to @cgroup). - * - * Try to link a chain of groups to a cgroup hierarchy, connecting the - * nodes bottom-up, so we can be sure that when we find a cgroup in the - * hierarchy that already as a group associated to @bfqd all the nodes - * in the path to the root cgroup have one too. - * - * On locking: the queue lock protects the hierarchy (there is a hierarchy - * per device) while the bfqio_cgroup lock protects the list of groups - * belonging to the same cgroup. 
+static void bfqg_put(struct bfq_group *bfqg) +{ + return blkg_put(bfqg_to_blkg(bfqg)); +} + +static void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, + int rw) +{ + blkg_rwstat_add(&bfqg->stats.queued, rw, 1); + bfqg_stats_end_empty_time(&bfqg->stats); + if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); +} + +static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) +{ + blkg_rwstat_add(&bfqg->stats.queued, rw, -1); +} + +static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) +{ + blkg_rwstat_add(&bfqg->stats.merged, rw, 1); +} + +static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, + uint64_t bytes, int rw) +{ + blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); + blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); + blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); +} + +static void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, int rw) +{ + struct bfqg_stats *stats = &bfqg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, rw, + io_start_time - start_time); +} + +/* @stats = 0 */ +static void bfqg_stats_reset(struct bfqg_stats *stats) +{ + if (!stats) + return; + + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->service_bytes); + blkg_rwstat_reset(&stats->serviced); + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); + blkg_rwstat_reset(&stats->wait_time); + blkg_stat_reset(&stats->time); + blkg_stat_reset(&stats->unaccounted_time); + blkg_stat_reset(&stats->avg_queue_size_sum); + blkg_stat_reset(&stats->avg_queue_size_samples); + blkg_stat_reset(&stats->dequeue); + blkg_stat_reset(&stats->group_wait_time); + blkg_stat_reset(&stats->idle_time); + blkg_stat_reset(&stats->empty_time); +} + +/* @to += @from */ +static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) +{ + if (!to || !from) + return; + + /* queued stats shouldn't be cleared */ + blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); + blkg_rwstat_add_aux(&to->serviced, &from->serviced); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); +} + +/* + * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' + * recursive stats can still account for the amount used by this bfqg after + * it's gone. 
*/ -static void bfq_group_chain_link(struct bfq_data *bfqd, - struct cgroup_subsys_state *css, - struct bfq_group *leaf) +static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) { - struct bfqio_cgroup *bgrp; - struct bfq_group *bfqg, *next, *prev = NULL; - unsigned long flags; + struct bfq_group *parent; - assert_spin_locked(bfqd->queue->queue_lock); + if (!bfqg) /* root_group */ + return; - for (; css != NULL && leaf != NULL; css = css->parent) { - bgrp = css_to_bfqio(css); - next = leaf->bfqd; + parent = bfqg_parent(bfqg); - bfqg = bfqio_lookup_group(bgrp, bfqd); - BUG_ON(bfqg != NULL); + lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); - spin_lock_irqsave(&bgrp->lock, flags); + if (unlikely(!parent)) + return; - rcu_assign_pointer(leaf->bfqd, bfqd); - hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); - hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); + bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); + bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); + bfqg_stats_reset(&bfqg->stats); + bfqg_stats_reset(&bfqg->dead_stats); +} - spin_unlock_irqrestore(&bgrp->lock, flags); +static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - prev = leaf; - leaf = next; + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + bfqg_get(bfqg); } + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; +} - BUG_ON(css == NULL && leaf != NULL); - if (css != NULL && prev != NULL) { - bgrp = css_to_bfqio(css); - bfqg = bfqio_lookup_group(bgrp, bfqd); - bfq_group_set_parent(prev, bfqg); +static void bfqg_stats_exit(struct bfqg_stats *stats) +{ + blkg_rwstat_exit(&stats->service_bytes); + blkg_rwstat_exit(&stats->serviced); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->sectors); + blkg_stat_exit(&stats->time); + blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +} + +static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->service_bytes, gfp) || + blkg_rwstat_init(&stats->serviced, gfp) || + blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->sectors, gfp) || + blkg_stat_init(&stats->time, gfp) || + blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) { + bfqg_stats_exit(stats); + return -ENOMEM; } + + return 0; } -/** - * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. - * @bfqd: queue descriptor. - * @cgroup: cgroup being searched for. - * - * Return a group associated to @bfqd in @cgroup, allocating one if - * necessary. 
When a group is returned all the cgroups in the path - * to the root have a group associated to @bfqd. - * - * If the allocation fails, return the root group: this breaks guarantees - * but is a safe fallback. If this loss becomes a problem it can be - * mitigated using the equivalent weight (given by the product of the - * weights of the groups in the path from @group to the root) in the - * root scheduler. - * - * We allocate all the missing nodes in the path from the leaf cgroup - * to the root and we connect the nodes only after all the allocations - * have been successful. - */ -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct cgroup_subsys_state *css) +static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) + { + return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; + } + +static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) +{ + return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); +} + +static void bfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct bfq_group_data *d = cpd_to_bfqgd(cpd); + + d->weight = BFQ_DEFAULT_GRP_WEIGHT; +} + +static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) { - struct bfqio_cgroup *bgrp = css_to_bfqio(css); struct bfq_group *bfqg; - bfqg = bfqio_lookup_group(bgrp, bfqd); - if (bfqg != NULL) - return bfqg; + bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); + if (!bfqg) + return NULL; + + if (bfqg_stats_init(&bfqg->stats, gfp)) { + kfree(bfqg); + return NULL; + } + + return &bfqg->pd; +} + +static void bfq_pd_init(struct blkg_policy_data *pd) +{ + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + struct bfq_data *bfqd = blkg->q->elevator->elevator_data; + struct bfq_entity *entity = &bfqg->entity; + struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); + + entity->orig_weight = entity->weight = entity->new_weight = d->weight; + entity->my_sched_data = &bfqg->sched_data; + bfqg->my_entity = entity; /* + * the root_group's will be set to NULL + * in bfq_init_queue() + */ + bfqg->bfqd = bfqd; + bfqg->active_entities = 0; + bfqg->rq_pos_tree = RB_ROOT; + + /* if the root_group does not exist, we are handling it right now */ + if (bfqd->root_group && bfqg != bfqd->root_group) + hlist_add_head(&bfqg->bfqd_node, &bfqd->group_list); +} + +static void bfq_pd_free(struct blkg_policy_data *pd) +{ + return kfree(pd_to_bfqg(pd)); +} + +/* offset delta from bfqg->stats to bfqg->dead_stats */ +static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - + offsetof(struct bfq_group, stats); + +/* to be used by recursive prfill, sums live and dead stats recursively */ +static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +{ + u64 sum = 0; + + sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); + sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, + off + dead_stats_off_delta); + return sum; +} + +/* to be used by recursive prfill, sums live and dead rwstats recursively */ +static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat a, b; + + a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); + b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, + off + dead_stats_off_delta); + blkg_rwstat_add_aux(&a, &b); + return a; +} + +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_reset(&bfqg->stats); + 
bfqg_stats_reset(&bfqg->dead_stats); +} - bfqg = bfq_group_chain_alloc(bfqd, css); - if (bfqg != NULL) - bfq_group_chain_link(bfqd, css, bfqg); - else +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + BUG_ON(!parent); + BUG_ON(!bfqg); + BUG_ON(bfqg == parent); + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + struct request_queue *q = bfqd->queue; + struct bfq_group *bfqg = NULL, *parent; + struct bfq_entity *entity = NULL; + + assert_spin_locked(bfqd->queue->queue_lock); + + /* avoid lookup for the common case where there's no blkcg */ + if (blkcg == &blkcg_root) { bfqg = bfqd->root_group; + } else { + struct blkcg_gq *blkg; + + blkg = blkg_lookup_create(blkcg, q); + if (!IS_ERR(blkg)) + bfqg = blkg_to_bfqg(blkg); + else /* fallback to root_group */ + bfqg = bfqd->root_group; + } + + BUG_ON(!bfqg); + + /* + * Update chain of bfq_groups as we might be handling a leaf group + * which, along with some of its relatives, has not been hooked yet + * to the private hierarchy of BFQ. + */ + entity = &bfqg->entity; + for_each_entity(entity) { + bfqg = container_of(entity, struct bfq_group, entity); + BUG_ON(!bfqg); + if (bfqg != bfqd->root_group) { + parent = bfqg_parent(bfqg); + if (!parent) + parent = bfqd->root_group; + BUG_ON(!parent); + bfq_group_set_parent(bfqg, parent); + } + } return bfqg; } +static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); + /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. @@ -296,6 +564,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, 0); } else if (entity->on_st) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + bfqg_put(bfqq_group(bfqq)); /* * Here we use a reference to bfqg. We don't need a refcounter @@ -304,11 +573,15 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; + bfqg_get(bfqg); - if (busy && resume) - bfq_activate_bfqq(bfqd, bfqq); + if (busy) { + bfq_pos_tree_add_move(bfqd, bfqq); + if (resume) + bfq_activate_bfqq(bfqd, bfqq); + } - if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver) + if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); } @@ -316,9 +589,9 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * __bfq_bic_change_cgroup - move @bic to @cgroup. * @bfqd: the queue descriptor. * @bic: the bic to move. - * @cgroup: the cgroup to move to. + * @blkcg: the blk-cgroup to move to. * - * Move bic to cgroup, assuming that bfqd->queue is locked; the caller + * Move bic to blkcg, assuming that bfqd->queue is locked; the caller * has to make sure that the reference to cgroup is valid across the call. 
* * NOTE: an alternative approach might have been to store the current @@ -327,18 +600,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct cgroup_subsys_state *css) + struct blkcg *blkcg) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_entity *entity; struct bfq_group *bfqg; - struct bfqio_cgroup *bgrp; + struct bfq_entity *entity; - bgrp = css_to_bfqio(css); + lockdep_assert_held(bfqd->queue->queue_lock); - bfqg = bfq_find_alloc_group(bfqd, css); - if (async_bfqq != NULL) { + bfqg = bfq_find_alloc_group(bfqd, blkcg); + if (async_bfqq) { entity = &async_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) { @@ -350,7 +622,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, } } - if (sync_bfqq != NULL) { + if (sync_bfqq) { entity = &sync_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); @@ -359,74 +631,39 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, return bfqg; } -/** - * bfq_bic_change_cgroup - move @bic to @cgroup. - * @bic: the bic being migrated. - * @cgroup: the destination cgroup. - * - * When the task owning @bic is moved to @cgroup, @bic is immediately - * moved into its new parent group. - */ -static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, - struct cgroup_subsys_state *css) -{ - struct bfq_data *bfqd; - unsigned long uninitialized_var(flags); - - bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), - &flags); - if (bfqd != NULL) { - __bfq_bic_change_cgroup(bfqd, bic, css); - bfq_put_bfqd_unlock(bfqd, &flags); - } -} - -/** - * bfq_bic_update_cgroup - update the cgroup of @bic. - * @bic: the @bic to update. - * - * Make sure that @bic is enqueued in the cgroup of the current task. - * We need this in addition to moving bics during the cgroup attach - * phase because the task owning @bic could be at its first disk - * access or we may end up in the root cgroup as the result of a - * memory allocation failure and here we try to move to the right - * group. - * - * Must be called under the queue lock. It is safe to use the returned - * value even after the rcu_read_unlock() as the migration/destruction - * paths act under the queue lock too. IOW it is impossible to race with - * group migration/destruction and end up with an invalid group as: - * a) here cgroup has not yet been destroyed, nor its destroy callback - * has started execution, as current holds a reference to it, - * b) if it is destroyed after rcu_read_unlock() [after current is - * migrated to a different cgroup] its attach() callback will have - * taken care of remove all the references to the old cgroup data. - */ -static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) +static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_group *bfqg; - struct cgroup_subsys_state *css; - - BUG_ON(bfqd == NULL); + struct blkcg *blkcg; + struct bfq_group *bfqg = NULL; + uint64_t id; rcu_read_lock(); - css = task_css(current, bfqio_cgrp_id); - bfqg = __bfq_bic_change_cgroup(bfqd, bic, css); + blkcg = bio_blkcg(bio); + id = blkcg->css.serial_nr; rcu_read_unlock(); - return bfqg; + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. 
+ */ + if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) + return; + + bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); + BUG_ON(!bfqg); + bic->blkcg_id = id; } /** * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. * @st: the service tree being flushed. */ -static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) +static void bfq_flush_idle_tree(struct bfq_service_tree *st) { struct bfq_entity *entity = st->first_idle; - for (; entity != NULL; entity = st->first_idle) + for (; entity ; entity = st->first_idle) __bfq_deactivate_entity(entity, 0); } @@ -435,12 +672,12 @@ static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) * @bfqd: the device data structure with the root group. * @entity: the entity to move. */ -static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) +static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(bfqq == NULL); + BUG_ON(!bfqq); bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); return; } @@ -454,9 +691,9 @@ static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, * * Needs queue_lock to be taken and reference to be valid over the call. */ -static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) +static void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) { struct rb_root *active = &st->active; struct bfq_entity *entity = NULL; @@ -464,10 +701,10 @@ static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, if (!RB_EMPTY_ROOT(&st->active)) entity = bfq_entity_of(rb_first(active)); - for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) + for (; entity ; entity = bfq_entity_of(rb_first(active))) bfq_reparent_leaf_entity(bfqd, entity); - if (bfqg->sched_data.in_service_entity != NULL) + if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.in_service_entity); @@ -476,20 +713,21 @@ static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, /** * bfq_destroy_group - destroy @bfqg. - * @bgrp: the bfqio_cgroup containing @bfqg. * @bfqg: the group being destroyed. * * Destroy @bfqg, making sure that it is not referenced from its parent. + * blkio already grabs the queue_lock for us, so no need to use RCU-based magic */ -static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) +static void bfq_pd_offline(struct blkg_policy_data *pd) { - struct bfq_data *bfqd; struct bfq_service_tree *st; + struct bfq_group *bfqg = pd_to_bfqg(pd); + struct bfq_data *bfqd = bfqg->bfqd; struct bfq_entity *entity = bfqg->my_entity; - unsigned long uninitialized_var(flags); int i; - hlist_del(&bfqg->group_node); + if (!entity) /* root group */ + return; /* * Empty all service_trees belonging to this group before @@ -518,37 +756,19 @@ static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) * There is no need to put the sync queues, as the * scheduler has taken no reference. 
*/ - bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); - if (bfqd != NULL) { - bfq_reparent_active_entities(bfqd, bfqg, st); - bfq_put_bfqd_unlock(bfqd, &flags); - } + bfq_reparent_active_entities(bfqd, bfqg, st); BUG_ON(!RB_EMPTY_ROOT(&st->active)); BUG_ON(!RB_EMPTY_ROOT(&st->idle)); } - BUG_ON(bfqg->sched_data.next_in_service != NULL); - BUG_ON(bfqg->sched_data.in_service_entity != NULL); + BUG_ON(bfqg->sched_data.next_in_service); + BUG_ON(bfqg->sched_data.in_service_entity); - /* - * We may race with device destruction, take extra care when - * dereferencing bfqg->bfqd. - */ - bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); - if (bfqd != NULL) { - hlist_del(&bfqg->bfqd_node); - __bfq_deactivate_entity(entity, 0); - bfq_put_async_queues(bfqd, bfqg); - bfq_put_bfqd_unlock(bfqd, &flags); - } - BUG_ON(entity->tree != NULL); + hlist_del(&bfqg->bfqd_node); + __bfq_deactivate_entity(entity, 0); + bfq_put_async_queues(bfqd, bfqg); + BUG_ON(entity->tree); - /* - * No need to defer the kfree() to the end of the RCU grace - * period: we are called from the destroy() callback of our - * cgroup, so we can be sure that no one is a) still using - * this cgroup or b) doing lookups in it. - */ - kfree(bfqg); + bfqg_stats_xfer_dead(bfqg); } static void bfq_end_wr_async(struct bfq_data *bfqd) @@ -595,312 +815,362 @@ static void bfq_disconnect_groups(struct bfq_data *bfqd) } } -static inline void bfq_free_root_group(struct bfq_data *bfqd) +static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, + struct cftype *cftype) { - struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; - struct bfq_group *bfqg = bfqd->root_group; + struct blkcg *blkcg = css_to_blkcg(css); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + int ret = -EINVAL; - bfq_put_async_queues(bfqd, bfqg); - - spin_lock_irq(&bgrp->lock); - hlist_del_rcu(&bfqg->group_node); - spin_unlock_irq(&bgrp->lock); + spin_lock_irq(&blkcg->lock); + ret = bfqgd->weight; + spin_unlock_irq(&blkcg->lock); - /* - * No need to synchronize_rcu() here: since the device is gone - * there cannot be any read-side access to its root_group. 
- */ - kfree(bfqg); + return ret; } -static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) { - struct bfq_group *bfqg; - struct bfqio_cgroup *bgrp; - int i; + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); - if (bfqg == NULL) - return NULL; + spin_lock_irq(&blkcg->lock); + seq_printf(sf, "%u\n", bfqgd->weight); + spin_unlock_irq(&blkcg->lock); - bfqg->entity.parent = NULL; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + return 0; +} - bgrp = &bfqio_root_cgroup; - spin_lock_irq(&bgrp->lock); - rcu_assign_pointer(bfqg->bfqd, bfqd); - hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); - spin_unlock_irq(&bgrp->lock); +static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + struct blkcg_gq *blkg; + int ret = -EINVAL; + + if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) + return ret; + + ret = 0; + spin_lock_irq(&blkcg->lock); + bfqgd->weight = (unsigned short)val; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + if (!bfqg) + continue; + /* + * Setting the prio_changed flag of the entity + * to 1 with new_weight == weight would re-set + * the value of the weight to its ioprio mapping. + * Set the flag only if necessary. + */ + if ((unsigned short)val != bfqg->entity.new_weight) { + bfqg->entity.new_weight = (unsigned short)val; + /* + * Make sure that the above new value has been + * stored in bfqg->entity.new_weight before + * setting the prio_changed flag. In fact, + * this flag may be read asynchronously (in + * critical sections protected by a different + * lock than that held here), and finding this + * flag set may cause the execution of the code + * for updating parameters whose value may + * depend also on bfqg->entity.new_weight (in + * __bfq_entity_update_weight_prio). + * This barrier makes sure that the new value + * of bfqg->entity.new_weight is correctly + * seen in that code. 
+ */ + smp_wmb(); + bfqg->entity.prio_changed = 1; + } + } + spin_unlock_irq(&blkcg->lock); - return bfqg; + return ret; } -#define SHOW_FUNCTION(__VAR) \ -static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \ - struct cftype *cftype) \ -{ \ - struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ - u64 ret = -ENODEV; \ - \ - mutex_lock(&bfqio_mutex); \ - if (bfqio_is_removed(bgrp)) \ - goto out_unlock; \ - \ - spin_lock_irq(&bgrp->lock); \ - ret = bgrp->__VAR; \ - spin_unlock_irq(&bgrp->lock); \ - \ -out_unlock: \ - mutex_unlock(&bfqio_mutex); \ - return ret; \ -} - -SHOW_FUNCTION(weight); -SHOW_FUNCTION(ioprio); -SHOW_FUNCTION(ioprio_class); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ -static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\ - struct cftype *cftype, \ - u64 val) \ -{ \ - struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ - struct bfq_group *bfqg; \ - int ret = -EINVAL; \ - \ - if (val < (__MIN) || val > (__MAX)) \ - return ret; \ - \ - ret = -ENODEV; \ - mutex_lock(&bfqio_mutex); \ - if (bfqio_is_removed(bgrp)) \ - goto out_unlock; \ - ret = 0; \ - \ - spin_lock_irq(&bgrp->lock); \ - bgrp->__VAR = (unsigned short)val; \ - hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ - /* \ - * Setting the ioprio_changed flag of the entity \ - * to 1 with new_##__VAR == ##__VAR would re-set \ - * the value of the weight to its ioprio mapping. \ - * Set the flag only if necessary. \ - */ \ - if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ - bfqg->entity.new_##__VAR = (unsigned short)val; \ - /* \ - * Make sure that the above new value has been \ - * stored in bfqg->entity.new_##__VAR before \ - * setting the ioprio_changed flag. In fact, \ - * this flag may be read asynchronously (in \ - * critical sections protected by a different \ - * lock than that held here), and finding this \ - * flag set may cause the execution of the code \ - * for updating parameters whose value may \ - * depend also on bfqg->entity.new_##__VAR (in \ - * __bfq_entity_update_weight_prio). \ - * This barrier makes sure that the new value \ - * of bfqg->entity.new_##__VAR is correctly \ - * seen in that code. 
\ - */ \ - smp_wmb(); \ - bfqg->entity.ioprio_changed = 1; \ - } \ - } \ - spin_unlock_irq(&bgrp->lock); \ - \ -out_unlock: \ - mutex_unlock(&bfqio_mutex); \ - return ret; \ -} - -STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); -STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); -STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); -#undef STORE_FUNCTION - -static struct cftype bfqio_files[] = { - { - .name = "weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - { - .name = "ioprio", - .read_u64 = bfqio_cgroup_ioprio_read, - .write_u64 = bfqio_cgroup_ioprio_write, - }, - { - .name = "ioprio_class", - .read_u64 = bfqio_cgroup_ioprio_class_read, - .write_u64 = bfqio_cgroup_ioprio_class_write, - }, - { }, /* terminate */ -}; +static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + /* First unsigned long found in the file is used */ + return bfqio_cgroup_weight_write(of_css(of), NULL, + simple_strtoull(strim(buf), NULL, 0)); +} -static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state - *parent_css) +static int bfqg_print_stat(struct seq_file *sf, void *v) { - struct bfqio_cgroup *bgrp; + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); + return 0; +} - if (parent_css != NULL) { - bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); - if (bgrp == NULL) - return ERR_PTR(-ENOMEM); - } else - bgrp = &bfqio_root_cgroup; +static int bfqg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_bfq, seq_cft(sf)->private, true); + return 0; +} - spin_lock_init(&bgrp->lock); - INIT_HLIST_HEAD(&bgrp->group_data); - bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; - bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; +static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + u64 sum = bfqg_stat_pd_recursive_sum(pd, off); - return &bgrp->css; + return __blkg_prfill_u64(sf, pd, sum); } -/* - * We cannot support shared io contexts, as we have no means to support - * two tasks with the same ioc in two different groups without major rework - * of the main bic/bfqq data structures. By now we allow a task to change - * its cgroup only if it's the only owner of its ioc; the drawback of this - * behavior is that a group containing a task that forked using CLONE_IO - * will not be destroyed until the tasks sharing the ioc die. - */ -static int bfqio_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - struct task_struct *task; - struct io_context *ioc; - int ret = 0; + struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); - cgroup_taskset_for_each(task, tset) { - /* - * task_lock() is needed to avoid races with - * exit_io_context() - */ - task_lock(task); - ioc = task->io_context; - if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) - /* - * ioc == NULL means that the task is either too - * young or exiting: if it has still no ioc the - * ioc can't be shared, if the task is exiting the - * attach will fail anyway, no matter what we - * return here. 
- */ - ret = -EINVAL; - task_unlock(task); - if (ret) - break; - } + return __blkg_prfill_rwstat(sf, pd, &sum); +} - return ret; +static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_stat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, false); + return 0; } -static void bfqio_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) { - struct task_struct *task; - struct io_context *ioc; - struct io_cq *icq; + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; +} - /* - * IMPORTANT NOTE: The move of more than one process at a time to a - * new group has not yet been tested. - */ - cgroup_taskset_for_each(task, tset) { - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - /* - * Handle cgroup change here. - */ - rcu_read_lock(); - hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) - if (!strncmp( - icq->q->elevator->type->elevator_name, - "bfq", ELV_NAME_MAX)) - bfq_bic_change_cgroup(icq_to_bic(icq), - css); - rcu_read_unlock(); - put_io_context(ioc); - } +static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); + u64 v = 0; + + if (samples) { + v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); + v = div64_u64(v, samples); } + __blkg_prfill_u64(sf, pd, v); + return 0; } -static void bfqio_destroy(struct cgroup_subsys_state *css) +/* print avg_queue_size */ +static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) { - struct bfqio_cgroup *bgrp = css_to_bfqio(css); - struct hlist_node *tmp; - struct bfq_group *bfqg; + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, + 0, false); + return 0; +} - /* - * Since we are destroying the cgroup, there are no more tasks - * referencing it, and all the RCU grace periods that may have - * referenced it are ended (as the destruction of the parent - * cgroup is RCU-safe); bgrp->group_data will not be accessed by - * anything else and we don't need any synchronization. 
- */ - hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node) - bfq_destroy_group(bgrp, bfqg); +static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +{ + int ret; - BUG_ON(!hlist_empty(&bgrp->group_data)); + ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); + if (ret) + return NULL; - kfree(bgrp); + return blkg_to_bfqg(bfqd->queue->root_blkg); } -static int bfqio_css_online(struct cgroup_subsys_state *css) +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) { - struct bfqio_cgroup *bgrp = css_to_bfqio(css); - - mutex_lock(&bfqio_mutex); - bgrp->online = true; - mutex_unlock(&bfqio_mutex); + struct bfq_group_data *bgd; - return 0; + bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); + if (!bgd) + return NULL; + return &bgd->pd; } -static void bfqio_css_offline(struct cgroup_subsys_state *css) +static void bfq_cpd_free(struct blkcg_policy_data *cpd) { - struct bfqio_cgroup *bgrp = css_to_bfqio(css); - - mutex_lock(&bfqio_mutex); - bgrp->online = false; - mutex_unlock(&bfqio_mutex); + kfree(cpd_to_bfqgd(cpd)); } -struct cgroup_subsys bfqio_cgrp_subsys = { - .css_alloc = bfqio_create, - .css_online = bfqio_css_online, - .css_offline = bfqio_css_offline, - .can_attach = bfqio_can_attach, - .attach = bfqio_attach, - .css_free = bfqio_destroy, - .legacy_cftypes = bfqio_files, +static struct cftype bfqio_files_dfl[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfqio_cgroup_weight_read_dfl, + .write = bfqio_cgroup_weight_write_dfl, + }, + {} /* terminate */ }; + +static struct cftype bfqio_files[] = { + { + .name = "bfq.weight", + .read_u64 = bfqio_cgroup_weight_read, + .write_u64 = bfqio_cgroup_weight_write, + }, + /* statistics, cover only the tasks in the bfqg */ + { + .name = "bfq.time", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.sectors", + .private = offsetof(struct bfq_group, stats.sectors), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.io_service_bytes", + .private = offsetof(struct bfq_group, stats.service_bytes), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_serviced", + .private = offsetof(struct bfq_group, stats.serviced), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_service_time", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_wait_time", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_merged", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_queued", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat, + }, + + /* the same statictics which cover the bfqg and its descendants */ + { + .name = "bfq.time_recursive", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = "bfq.sectors_recursive", + .private = offsetof(struct bfq_group, stats.sectors), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = "bfq.io_service_bytes_recursive", + .private = offsetof(struct bfq_group, stats.service_bytes), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_serviced_recursive", + .private = offsetof(struct bfq_group, stats.serviced), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_service_time_recursive", + .private = offsetof(struct bfq_group, stats.service_time), + 
.seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_wait_time_recursive", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_merged_recursive", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_queued_recursive", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.avg_queue_size", + .seq_show = bfqg_print_avg_queue_size, + }, + { + .name = "bfq.group_wait_time", + .private = offsetof(struct bfq_group, stats.group_wait_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.idle_time", + .private = offsetof(struct bfq_group, stats.idle_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.empty_time", + .private = offsetof(struct bfq_group, stats.empty_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.dequeue", + .private = offsetof(struct bfq_group, stats.dequeue), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.unaccounted_time", + .private = offsetof(struct bfq_group, stats.unaccounted_time), + .seq_show = bfqg_print_stat, + }, + { } /* terminate */ +}; + +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfqio_files_dfl, + .legacy_cftypes = bfqio_files, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + +}; + #else -static inline void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) + +static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; - entity->ioprio = entity->new_ioprio; - entity->ioprio_class = entity->new_ioprio_class; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + } entity->sched_data = &bfqg->sched_data; } -static inline struct bfq_group * -bfq_bic_update_cgroup(struct bfq_io_cq *bic) +static struct bfq_group * +bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); return bfqd->root_group; } -static inline void bfq_bfqq_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) +static void bfq_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_entity *entity, + struct bfq_group *bfqg) { } @@ -909,23 +1179,24 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static inline void bfq_disconnect_groups(struct bfq_data *bfqd) +static void bfq_disconnect_groups(struct bfq_data *bfqd) { bfq_put_async_queues(bfqd, bfqd->root_group); } -static inline void bfq_free_root_group(struct bfq_data *bfqd) +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - kfree(bfqd->root_group); + return bfqd->root_group; } -static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { struct bfq_group *bfqg; int i; bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (bfqg == NULL) + if (!bfqg) return NULL; for 
(i = 0; i < BFQ_IOPRIO_CLASSES; i++) diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c index 7f6b0004c..fb7bb8f08 100644 --- a/block/bfq-ioc.c +++ b/block/bfq-ioc.c @@ -14,7 +14,7 @@ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. * @icq: the iocontext queue. */ -static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) { /* bic->icq is the first member, %NULL will convert to %NULL */ return container_of(icq, struct bfq_io_cq, icq); @@ -27,8 +27,8 @@ static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) * * Queue lock must be held. */ -static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc) +static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) { if (ioc) return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 71b51c1b4..dbce1f83f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -82,6 +82,9 @@ static const int bfq_back_penalty = 2; /* Idling period duration, in jiffies. */ static int bfq_slice_idle = HZ / 125; +/* Minimum number of assigned budgets for which stats are safe to compute. */ +static const int bfq_stats_min_budgets = 194; + /* Default maximum budget values, in sectors and number of requests. */ static const int bfq_default_max_budget = 16 * 1024; static const int bfq_max_budget_async_rq = 4; @@ -163,38 +166,22 @@ static int device_speed_thresh[2]; #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); +static void bfq_schedule_dispatch(struct bfq_data *bfqd); #include "bfq-ioc.c" #include "bfq-sched.c" #include "bfq-cgroup.c" -#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ - IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ - IOPRIO_CLASS_RT) +#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) #define bfq_sample_valid(samples) ((samples) > 80) /* - * The following macro groups conditions that need to be evaluated when - * checking if existing queues and groups form a symmetric scenario - * and therefore idling can be reduced or disabled for some of the - * queues. See the comment to the function bfq_bfqq_must_not_expire() - * for further details. - */ -#ifdef CONFIG_CGROUP_BFQIO -#define symmetric_scenario (!bfqd->active_numerous_groups && \ - !bfq_differentiated_weights(bfqd)) -#else -#define symmetric_scenario (!bfq_differentiated_weights(bfqd)) -#endif - -/* * We regard a request as SYNC, if either it's a read or has the SYNC bit * set (in which case it could also be a direct WRITE). */ -static inline int bfq_bio_sync(struct bio *bio) +static int bfq_bio_sync(struct bio *bio) { if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) return 1; @@ -206,7 +193,7 @@ static inline int bfq_bio_sync(struct bio *bio) * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. */ -static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) +static void bfq_schedule_dispatch(struct bfq_data *bfqd) { if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); @@ -230,9 +217,9 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ unsigned wrap = 0; /* bit mask: requests behind the disk head? 
*/ - if (rq1 == NULL || rq1 == rq2) + if (!rq1 || rq1 == rq2) return rq2; - if (rq2 == NULL) + if (!rq2) return rq1; if (rq_is_sync(rq1) && !rq_is_sync(rq2)) @@ -345,17 +332,17 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", (long long unsigned)sector, - bfqq != NULL ? bfqq->pid : 0); + bfqq ? bfqq->pid : 0); return bfqq; } -static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; struct bfq_queue *__bfqq; - if (bfqq->pos_root != NULL) { + if (bfqq->pos_root) { rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } @@ -365,10 +352,10 @@ static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (!bfqq->next_rq) return; - bfqq->pos_root = &bfqd->rq_pos_tree; + bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); - if (__bfqq == NULL) { + if (!__bfqq) { rb_link_node(&bfqq->pos_node, parent, p); rb_insert_color(&bfqq->pos_node, bfqq->pos_root); } else @@ -378,7 +365,7 @@ static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * Tell whether there are active queues or groups with differentiated weights. */ -static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) +static bool bfq_differentiated_weights(struct bfq_data *bfqd) { /* * For weights to differ, at least one of the trees must contain @@ -387,7 +374,7 @@ static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && (bfqd->queue_weights_tree.rb_node->rb_left || bfqd->queue_weights_tree.rb_node->rb_right) -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED ) || (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && (bfqd->group_weights_tree.rb_node->rb_left || @@ -397,6 +384,40 @@ static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) } /* + * The following function returns true if every queue must receive the + * same share of the throughput (this condition is used when deciding + * whether idling may be disabled, see the comments in the function + * bfq_bfqq_may_idle()). + * + * Such a scenario occurs when: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. + * Therefore this function evaluates, instead, the following stronger + * sub-conditions, for which it is much easier to maintain the needed + * state: + * 1) all active queues have the same weight, + * 2) all active groups have the same weight, + * 3) all active groups have at most one active child each. + * In particular, the last two conditions are always true if hierarchical + * support and the cgroups interface are not enabled, thus no state needs + * to be maintained in this case. 
+ */ +static bool bfq_symmetric_scenario(struct bfq_data *bfqd) +{ + return +#ifdef CONFIG_BFQ_GROUP_IOSCHED + !bfqd->active_numerous_groups && +#endif + !bfq_differentiated_weights(bfqd); +} + +/* * If the weight-counter tree passed as input contains no counter for * the weight of the input entity, then add that counter; otherwise just * increment the existing counter. @@ -495,10 +516,10 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - if (rbprev != NULL) + if (rbprev) prev = rb_entry_rq(rbprev); - if (rbnext != NULL) + if (rbnext) next = rb_entry_rq(rbnext); else { rbnext = rb_first(&bfqq->sort_list); @@ -510,8 +531,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, } /* see the definition of bfq_async_charge_factor for details */ -static inline unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) +static unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) { return blk_rq_sectors(rq) * (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * @@ -537,7 +558,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, struct request *next_rq = bfqq->next_rq; unsigned long new_budget; - if (next_rq == NULL) + if (!next_rq) return; if (bfqq == bfqd->in_service_queue) @@ -560,7 +581,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } } -static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) { u64 dur; @@ -573,13 +594,12 @@ static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } -static inline unsigned -bfq_bfqq_cooperations(struct bfq_queue *bfqq) +static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) { return bfqq->bic ? bfqq->bic->cooperations : 0; } -static inline void +static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { if (bic->saved_idle_window) @@ -603,7 +623,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bic->wr_time_left; bfqq->last_wr_start_finish = jiffies; - bfqq->entity.ioprio_changed = 1; + bfqq->entity.prio_changed = 1; } /* * Clear wr_time_left to prevent bfq_bfqq_save_state() from @@ -613,11 +633,12 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bic->wr_time_left = 0; } -/* Must be called with the queue_lock held. */ static int bfqq_process_refs(struct bfq_queue *bfqq) { int process_refs, io_refs; + lockdep_assert_held(bfqq->bfqd->queue->queue_lock); + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); @@ -625,8 +646,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) } /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -static inline void bfq_reset_burst_list(struct bfq_data *bfqd, - struct bfq_queue *bfqq) +static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_queue *item; struct hlist_node *n; @@ -858,14 +878,14 @@ static void bfq_add_request(struct request *rq) */ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(next_rq == NULL); + BUG_ON(!next_rq); bfqq->next_rq = next_rq; /* * Adjust priority tree position, if next_rq changes. 
*/ if (prev != bfqq->next_rq) - bfq_rq_pos_tree_add(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); if (!bfq_bfqq_busy(bfqq)) { bool soft_rt, coop_or_in_burst, @@ -873,6 +893,10 @@ static void bfq_add_request(struct request *rq) bfqq->budget_timeout + bfqd->bfq_wr_min_idle_time); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, + rq->cmd_flags); +#endif if (bfq_bfqq_sync(bfqq)) { bool already_in_burst = !hlist_unhashed(&bfqq->burst_list_node) || @@ -917,7 +941,7 @@ static void bfq_add_request(struct request *rq) goto add_bfqq_busy; if (bfq_bfqq_just_split(bfqq)) - goto set_ioprio_changed; + goto set_prio_changed; /* * If the queue: @@ -929,7 +953,7 @@ static void bfq_add_request(struct request *rq) * start a weight-raising period. */ if (old_wr_coeff == 1 && (interactive || soft_rt) && - (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { + (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { bfqq->wr_coeff = bfqd->bfq_wr_coeff; if (interactive) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); @@ -1008,9 +1032,9 @@ static void bfq_add_request(struct request *rq) bfqd->bfq_wr_rt_max_time; } } -set_ioprio_changed: +set_prio_changed: if (old_wr_coeff != bfqq->wr_coeff) - entity->ioprio_changed = 1; + entity->prio_changed = 1; add_bfqq_busy: bfqq->last_idle_bklogged = jiffies; bfqq->service_from_backlogged = 0; @@ -1025,7 +1049,7 @@ add_bfqq_busy: bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqd->wr_busy_queues++; - entity->ioprio_changed = 1; + entity->prio_changed = 1; bfq_log_bfqq(bfqd, bfqq, "non-idle wrais starting at %lu, rais_max_time %u", jiffies, @@ -1048,11 +1072,11 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, struct bfq_queue *bfqq; bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (bic == NULL) + if (!bic) return NULL; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); - if (bfqq != NULL) + if (bfqq) return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); return NULL; @@ -1068,8 +1092,7 @@ static void bfq_activate_request(struct request_queue *q, struct request *rq) (long long unsigned)bfqd->last_position); } -static inline void bfq_deactivate_request(struct request_queue *q, - struct request *rq) +static void bfq_deactivate_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; @@ -1101,7 +1124,7 @@ static void bfq_remove_request(struct request *rq) /* * Remove queue from request-position tree as it is empty. 
*/ - if (bfqq->pos_root != NULL) { + if (bfqq->pos_root) { rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } @@ -1111,6 +1134,9 @@ static void bfq_remove_request(struct request *rq) BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); +#endif } static int bfq_merge(struct request_queue *q, struct request **req, @@ -1120,7 +1146,7 @@ static int bfq_merge(struct request_queue *q, struct request **req, struct request *__rq; __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { + if (__rq && elv_rq_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; } @@ -1147,7 +1173,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, bfqd->last_position); - BUG_ON(next_rq == NULL); + BUG_ON(!next_rq); bfqq->next_rq = next_rq; /* * If next_rq changes, update both the queue's budget to @@ -1156,11 +1182,19 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, */ if (prev != bfqq->next_rq) { bfq_updated_next_req(bfqd, bfqq); - bfq_rq_pos_tree_add(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); } } } +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static void bfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) +{ + bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); +} +#endif + static void bfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { @@ -1187,18 +1221,21 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(next); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +#endif } /* Must be called with bfqq != NULL */ -static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { - BUG_ON(bfqq == NULL); + BUG_ON(!bfqq); if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; /* Trigger a weight change on the next activation of the queue */ - bfqq->entity.ioprio_changed = 1; + bfqq->entity.prio_changed = 1; } static void bfq_end_wr_async_queues(struct bfq_data *bfqd, @@ -1208,9 +1245,9 @@ static void bfq_end_wr_async_queues(struct bfq_data *bfqd, for (i = 0; i < 2; i++) for (j = 0; j < IOPRIO_BE_NR; j++) - if (bfqg->async_bfqq[i][j] != NULL) + if (bfqg->async_bfqq[i][j]) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq != NULL) + if (bfqg->async_idle_bfqq) bfq_bfqq_end_wr(bfqg->async_idle_bfqq); } @@ -1229,7 +1266,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) spin_unlock_irq(bfqd->queue->queue_lock); } -static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) +static sector_t bfq_io_struct_pos(void *io_struct, bool request) { if (request) return blk_rq_pos(io_struct); @@ -1237,25 +1274,18 @@ static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) return ((struct bio *)io_struct)->bi_iter.bi_sector; } -static inline sector_t bfq_dist_from(sector_t pos1, - sector_t pos2) -{ - if (pos1 >= pos2) - return pos1 - pos2; - else - return pos2 - pos1; -} - -static inline int bfq_rq_close_to_sector(void *io_struct, bool request, - sector_t sector) +static int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) { - return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) 
<= + return abs(bfq_io_struct_pos(io_struct, request) - sector) <= BFQQ_SEEK_THR; } -static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) +static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + sector_t sector) { - struct rb_root *root = &bfqd->rq_pos_tree; + struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; struct rb_node *parent, *node; struct bfq_queue *__bfqq; @@ -1267,7 +1297,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) * request, choose it. */ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); - if (__bfqq != NULL) + if (__bfqq) return __bfqq; /* @@ -1283,7 +1313,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) node = rb_next(&__bfqq->pos_node); else node = rb_prev(&__bfqq->pos_node); - if (node == NULL) + if (!node) return NULL; __bfqq = rb_entry(node, struct bfq_queue, pos_node); @@ -1293,56 +1323,21 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) return NULL; } -/* - * bfqd - obvious - * cur_bfqq - passed in so that we don't decide that the current queue - * is closely cooperating with itself - * sector - used as a reference point to search for a close queue - */ -static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, - struct bfq_queue *cur_bfqq, - sector_t sector) +static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) { struct bfq_queue *bfqq; - if (bfq_class_idle(cur_bfqq)) - return NULL; - if (!bfq_bfqq_sync(cur_bfqq)) - return NULL; - if (BFQQ_SEEKY(cur_bfqq)) - return NULL; - - /* If device has only one backlogged bfq_queue, don't search. */ - if (bfqd->busy_queues == 1) - return NULL; - - /* - * We should notice if some of the queues are cooperating, e.g. - * working closely on the same area of the disk. In that case, - * we can group them together and don't waste time idling. - */ - bfqq = bfqq_close(bfqd, sector); - if (bfqq == NULL || bfqq == cur_bfqq) - return NULL; - - /* - * Do not merge queues from different bfq_groups. - */ - if (bfqq->entity.parent != cur_bfqq->entity.parent) - return NULL; - /* - * It only makes sense to merge sync queues. + * We shall notice if some of the queues are cooperating, + * e.g., working closely on the same area of the device. In + * that case, we can group them together and: 1) don't waste + * time idling, and 2) serve the union of their requests in + * the best possible order for throughput. */ - if (!bfq_bfqq_sync(bfqq)) - return NULL; - if (BFQQ_SEEKY(bfqq)) - return NULL; - - /* - * Do not merge queues of different priority classes. - */ - if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) + bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); + if (!bfqq || bfqq == cur_bfqq) return NULL; return bfqq; @@ -1409,6 +1404,32 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) return new_bfqq; } +static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + struct bfq_queue *new_bfqq) +{ + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || + (bfqq->ioprio_class != new_bfqq->ioprio_class)) + return false; + + /* + * If either of the queues has already been detected as seeky, + * then merging it with the other queue is unlikely to lead to + * sequential I/O. 
+ */ + if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) + return false; + + /* + * Interleaved I/O is known to be done by (some) applications + * only for reads, so it does not make sense to merge async + * queues. + */ + if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) + return false; + + return true; +} + /* * Attempt to schedule a merge of bfqq with the currently in-service queue * or with a close queue among the scheduled queues. @@ -1430,56 +1451,52 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->new_bfqq) return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; + /* If device has only one backlogged bfq_queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; in_service_bfqq = bfqd->in_service_queue; - if (in_service_bfqq == NULL || in_service_bfqq == bfqq || + if (!in_service_bfqq || in_service_bfqq == bfqq || !bfqd->in_service_bic || unlikely(in_service_bfqq == &bfqd->oom_bfqq)) goto check_scheduled; - if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) - goto check_scheduled; - - if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) - goto check_scheduled; - - if (in_service_bfqq->entity.parent != bfqq->entity.parent) - goto check_scheduled; - if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { + bfqq->entity.parent == in_service_bfqq->entity.parent && + bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); - if (new_bfqq != NULL) - return new_bfqq; /* Merge with in-service queue */ + if (new_bfqq) + return new_bfqq; } - /* * Check whether there is a cooperator among currently scheduled * queues. The only thing we need is that the bio/request is not * NULL, as we need it to establish whether a cooperator exists. */ check_scheduled: - new_bfqq = bfq_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq)) + new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + + BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); + + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); return NULL; } -static inline void -bfq_bfqq_save_state(struct bfq_queue *bfqq) +static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { /* - * If bfqq->bic == NULL, the queue is already shared or its requests + * If !bfqq->bic, the queue is already shared or its requests * have already been redirected to a shared queue; both idle window * and weight raising state have already been saved. Do nothing. 
*/ - if (bfqq->bic == NULL) + if (!bfqq->bic) return; if (bfqq->bic->wr_time_left) /* @@ -1523,8 +1540,7 @@ bfq_bfqq_save_state(struct bfq_queue *bfqq) bfqq->bic->failed_cooperations = 0; } -static inline void -bfq_get_bic_reference(struct bfq_queue *bfqq) +static void bfq_get_bic_reference(struct bfq_queue *bfqq) { /* * If bfqq->bic has a non-NULL value, the bic to which it belongs @@ -1572,7 +1588,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } -static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) { struct bfq_io_cq *bic = bfqq->bic; struct bfq_data *bfqd = bfqq->bfqd; @@ -1603,7 +1619,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * Queue lock is held here. */ bic = bfq_bic_lookup(bfqd, current->io_context); - if (bic == NULL) + if (!bic) return 0; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); @@ -1611,9 +1627,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. */ - if (bfqq != NULL) { + if (bfqq) { new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); - if (new_bfqq != NULL) { + if (new_bfqq) { bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); /* * If we get here, the bio will be queued in the @@ -1631,7 +1647,10 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - if (bfqq != NULL) { + if (bfqq) { +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); +#endif bfq_mark_bfqq_must_alloc(bfqq); bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); @@ -1639,7 +1658,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; bfq_log_bfqq(bfqd, bfqq, - "set_in_service_queue, cur-budget = %lu", + "set_in_service_queue, cur-budget = %d", bfqq->entity.budget); } @@ -1662,9 +1681,9 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) * stored in bfqd, which is dynamically updated according to the * estimated disk peak rate; otherwise return the default max budget */ -static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) +static int bfq_max_budget(struct bfq_data *bfqd) { - if (bfqd->budgets_assigned < 194) + if (bfqd->budgets_assigned < bfq_stats_min_budgets) return bfq_default_max_budget; else return bfqd->bfq_max_budget; @@ -1674,9 +1693,9 @@ static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) * Return min budget, which is a fraction of the current or default * max budget (trying with 1/32) */ -static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) +static int bfq_min_budget(struct bfq_data *bfqd) { - if (bfqd->budgets_assigned < 194) + if (bfqd->budgets_assigned < bfq_stats_min_budgets) return bfq_default_max_budget / 32; else return bfqd->bfq_max_budget / 32; @@ -1692,7 +1711,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) /* Processes have exited, don't wait. 
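The context shown above includes the per-activation update bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8 in __bfq_set_in_service_queue(), and the bfq_max_budget()/bfq_min_budget() hunks replace the bare 194 with the new bfq_stats_min_budgets constant. As a rough worked example (not part of the patch, and assuming the counter starts at zero), that update is a 1/8-weight moving average that climbs toward roughly 256 and first reaches 194 on the 11th in-service assignment, after which the measured maximum budget is trusted instead of the default. The small standalone program below just iterates the recurrence to check that.

#include <stdio.h>

/*
 * Standalone check of the budgets_assigned moving average against the
 * bfq_stats_min_budgets (194) threshold used by bfq_max_budget() and
 * bfq_min_budget(). The starting value of 0 is an assumption.
 */
int main(void)
{
        int budgets_assigned = 0;
        const int stats_min_budgets = 194;
        int assignments = 0;

        while (budgets_assigned < stats_min_budgets) {
                budgets_assigned = (budgets_assigned * 7 + 256) / 8;
                assignments++;
        }
        /* With a zero start this prints: stats trusted after 11 assignments (194) */
        printf("stats trusted after %d assignments (%d)\n",
               assignments, budgets_assigned);
        return 0;
}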
*/ bic = bfqd->in_service_bic; - if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) + if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) return; bfq_mark_bfqq_wait_request(bfqq); @@ -1718,12 +1737,15 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > bfq_max_budget(bfqq->bfqd) / 8) || bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && - symmetric_scenario) + bfq_symmetric_scenario(bfqd)) sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); else if (bfqq->wr_coeff > 1) sl = sl * 3; bfqd->last_idling_start = ktime_get(); mod_timer(&bfqd->idle_slice_timer, jiffies + sl); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); +#endif bfq_log(bfqd, "arm idle: %u/%u ms", jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); } @@ -1777,6 +1799,10 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) if (bfq_bfqq_sync(bfqq)) bfqd->sync_flight++; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), + rq->cmd_flags); +#endif } /* @@ -1802,7 +1828,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) return rq; } -static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; return entity->budget - entity->service; @@ -1836,7 +1862,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * Resort priority tree of potential close cooperators. */ - bfq_rq_pos_tree_add(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); } } @@ -1846,24 +1872,24 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) * @bfqq: queue to update. * @reason: reason for expiration. * - * Handle the feedback on @bfqq budget. See the body for detailed - * comments. + * Handle the feedback on @bfqq budget at queue expiration. + * See the body for detailed comments. */ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct bfq_queue *bfqq, enum bfqq_expiration reason) { struct request *next_rq; - unsigned long budget, min_budget; + int budget, min_budget; budget = bfqq->max_budget; min_budget = bfq_min_budget(bfqd); BUG_ON(bfqq != bfqd->in_service_queue); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", budget, bfq_min_budget(bfqd)); bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); @@ -1940,18 +1966,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, default: return; } - } else /* async queue */ - /* async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). - */ + } else + /* + * Async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). 
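Besides the new bfqg_stats_set_start_idle_time() hook, the functional change in the bfq_arm_slice_timer() hunk above is that the shortened idle slice for seeky, non-weight-raised queues is now gated on bfq_symmetric_scenario(bfqd) rather than the removed symmetric_scenario macro, while weight-raised queues keep a tripled slice. The sketch below is a standalone paraphrase of that choice, not kernel code: the seekiness tests (and any leading conditions cut off by the hunk context) are collapsed into one boolean, and the base slice and BFQ_MIN_TT are parameters because their values are defined outside this hunk (bfq_slice_idle defaults to HZ / 125).

#include <stdbool.h>

/*
 * Paraphrase of the idle-slice length chosen by bfq_arm_slice_timer().
 * slice_idle and min_tt are passed in; "seekyish" stands for the
 * seekiness tests visible in the hunk.
 */
unsigned long idle_slice(unsigned long slice_idle, unsigned long min_tt,
                         bool seekyish, bool weight_raised, bool symmetric)
{
        unsigned long sl = slice_idle;

        if (seekyish && !weight_raised && symmetric)
                sl = sl < min_tt ? sl : min_tt; /* min(sl, BFQ_MIN_TT) */
        else if (weight_raised)
                sl *= 3;                        /* be patient with wr queues */
        return sl;
}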
+ */ budget = bfqd->bfq_max_budget; bfqq->max_budget = budget; - if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && - bfqq->max_budget > bfqd->bfq_max_budget) - bfqq->max_budget = bfqd->bfq_max_budget; + if (bfqd->budgets_assigned >= bfq_stats_min_budgets && + !bfqd->bfq_user_max_budget) + bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); /* * Make sure that we have enough budget for the next request. @@ -1960,14 +1987,14 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, * update. */ next_rq = bfqq->next_rq; - if (next_rq != NULL) + if (next_rq) bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); else bfqq->entity.budget = bfqq->max_budget; - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", - next_rq != NULL ? blk_rq_sectors(next_rq) : 0, + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", + next_rq ? blk_rq_sectors(next_rq) : 0, bfqq->entity.budget); } @@ -1993,15 +2020,15 @@ static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) * seeky processes, and hence reduce their chances to lower the * throughput. See the code for more details. */ -static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int compensate, enum bfqq_expiration reason) +static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate, enum bfqq_expiration reason) { u64 bw, usecs, expected, timeout; ktime_t delta; int update = 0; if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) - return 0; + return false; if (compensate) delta = bfqd->last_idling_start; @@ -2012,7 +2039,7 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* Don't trust short/unrealistic values. */ if (usecs < 100 || usecs >= LONG_MAX) - return 0; + return false; /* * Calculate the bandwidth for the last slice. We use a 64 bit @@ -2061,7 +2088,7 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd->peak_rate, timeout); - bfq_log(bfqd, "new max_budget=%lu", + bfq_log(bfqd, "new max_budget=%d", bfqd->bfq_max_budget); } if (bfqd->device_speed == BFQ_BFQD_FAST && @@ -2086,7 +2113,7 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * and for the moment return false. */ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return 0; + return false; /* * A process is considered ``slow'' (i.e., seeky, so that we @@ -2161,8 +2188,8 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * seems to be quite precise also in embedded systems and KVM/QEMU virtual * machines. */ -static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) +static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) { return max(bfqq->last_idle_bklogged + HZ * bfqq->service_from_backlogged / @@ -2175,7 +2202,7 @@ static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, * the current time will be lower than this time instant according to the macro * time_is_before_jiffies(). 
*/ -static inline unsigned long bfq_infinity_from_now(unsigned long now) +static unsigned long bfq_infinity_from_now(unsigned long now) { return now + ULONG_MAX / 2; } @@ -2212,13 +2239,14 @@ static inline unsigned long bfq_infinity_from_now(unsigned long now) */ static void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int compensate, + bool compensate, enum bfqq_expiration reason) { - int slow; + bool slow; BUG_ON(bfqq != bfqd->in_service_queue); - /* Update disk peak rate for autotuning and check whether the + /* + * Update disk peak rate for autotuning and check whether the * process is slow (see bfq_update_peak_rate). */ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); @@ -2312,12 +2340,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * just checked on request arrivals and completions, as well as on * idle timer expirations. */ -static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { if (bfq_bfqq_budget_new(bfqq) || time_before(jiffies, bfqq->budget_timeout)) - return 0; - return 1; + return false; + return true; } /* @@ -2328,7 +2356,7 @@ static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) * does not hold, or if the queue is slow enough to deserve only to be * kicked off for preserving a high throughput. */ -static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "may_budget_timeout: wait_request %d left %d timeout %d", @@ -2343,183 +2371,278 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) } /* - * Device idling is allowed only for the queues for which this function - * returns true. For this reason, the return value of this function plays a - * critical role for both throughput boosting and service guarantees. The - * return value is computed through a logical expression. In this rather - * long comment, we try to briefly describe all the details and motivations - * behind the components of this logical expression. - * - * First, the expression is false if bfqq is not sync, or if: bfqq happened - * to become active during a large burst of queue activations, and the - * pattern of requests bfqq contains boosts the throughput if bfqq is - * expired. In fact, queues that became active during a large burst benefit - * only from throughput, as discussed in the comments to bfq_handle_burst. - * In this respect, expiring bfqq certainly boosts the throughput on NCQ- - * capable flash-based devices, whereas, on rotational devices, it boosts - * the throughput only if bfqq contains random requests. - * - * On the opposite end, if (a) bfqq is sync, (b) the above burst-related - * condition does not hold, and (c) bfqq is being weight-raised, then the - * expression always evaluates to true, as device idling is instrumental - * for preserving low-latency guarantees (see [1]). If, instead, conditions - * (a) and (b) do hold, but (c) does not, then the expression evaluates to - * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and - * (2) at least one of the following two conditions holds. - * The first condition is that the device is not performing NCQ, because - * idling the device most certainly boosts the throughput if this condition - * holds and bfqq is I/O-bound and has been granted a non-null idle window. - * The second compound condition is made of the logical AND of two components. 
- * - * The first component is true only if there is no weight-raised busy - * queue. This guarantees that the device is not idled for a sync non- - * weight-raised queue when there are busy weight-raised queues. The former - * is then expired immediately if empty. Combined with the timestamping - * rules of BFQ (see [1] for details), this causes sync non-weight-raised - * queues to get a lower number of requests served, and hence to ask for a - * lower number of requests from the request pool, before the busy weight- - * raised queues get served again. - * - * This is beneficial for the processes associated with weight-raised - * queues, when the request pool is saturated (e.g., in the presence of - * write hogs). In fact, if the processes associated with the other queues - * ask for requests at a lower rate, then weight-raised processes have a - * higher probability to get a request from the pool immediately (or at - * least soon) when they need one. Hence they have a higher probability to - * actually get a fraction of the disk throughput proportional to their - * high weight. This is especially true with NCQ-capable drives, which - * enqueue several requests in advance and further reorder internally- - * queued requests. - * - * In the end, mistreating non-weight-raised queues when there are busy - * weight-raised queues seems to mitigate starvation problems in the - * presence of heavy write workloads and NCQ, and hence to guarantee a - * higher application and system responsiveness in these hostile scenarios. - * - * If the first component of the compound condition is instead true, i.e., - * there is no weight-raised busy queue, then the second component of the - * compound condition takes into account service-guarantee and throughput - * issues related to NCQ (recall that the compound condition is evaluated - * only if the device is detected as supporting NCQ). + * For a queue that becomes empty, device idling is allowed only if + * this function returns true for that queue. As a consequence, since + * device idling plays a critical role for both throughput boosting + * and service guarantees, the return value of this function plays a + * critical role as well. * - * As for service guarantees, allowing the drive to enqueue more than one - * request at a time, and hence delegating de facto final scheduling - * decisions to the drive's internal scheduler, causes loss of control on - * the actual request service order. In this respect, when the drive is - * allowed to enqueue more than one request at a time, the service - * distribution enforced by the drive's internal scheduler is likely to - * coincide with the desired device-throughput distribution only in the - * following, perfectly symmetric, scenario: - * 1) all active queues have the same weight, - * 2) all active groups at the same level in the groups tree have the same - * weight, - * 3) all active groups at the same level in the groups tree have the same - * number of children. - * - * Even in such a scenario, sequential I/O may still receive a preferential - * treatment, but this is not likely to be a big issue with flash-based - * devices, because of their non-dramatic loss of throughput with random - * I/O. Things do differ with HDDs, for which additional care is taken, as - * explained after completing the discussion for flash-based devices. 
+ * In a nutshell, this function returns true only if idling is + * beneficial for throughput or, even if detrimental for throughput, + * idling is however necessary to preserve service guarantees (low + * latency, desired throughput distribution, ...). In particular, on + * NCQ-capable devices, this function tries to return false, so as to + * help keep the drives' internal queues full, whenever this helps the + * device boost the throughput without causing any service-guarantee + * issue. * - * Unfortunately, keeping the necessary state for evaluating exactly the - * above symmetry conditions would be quite complex and time-consuming. - * Therefore BFQ evaluates instead the following stronger sub-conditions, - * for which it is much easier to maintain the needed state: - * 1) all active queues have the same weight, - * 2) all active groups have the same weight, - * 3) all active groups have at most one active child each. - * In particular, the last two conditions are always true if hierarchical - * support and the cgroups interface are not enabled, hence no state needs - * to be maintained in this case. - * - * According to the above considerations, the second component of the - * compound condition evaluates to true if any of the above symmetry - * sub-condition does not hold, or the device is not flash-based. Therefore, - * if also the first component is true, then idling is allowed for a sync - * queue. These are the only sub-conditions considered if the device is - * flash-based, as, for such a device, it is sensible to force idling only - * for service-guarantee issues. In fact, as for throughput, idling - * NCQ-capable flash-based devices would not boost the throughput even - * with sequential I/O; rather it would lower the throughput in proportion - * to how fast the device is. In the end, (only) if all the three - * sub-conditions hold and the device is flash-based, the compound - * condition evaluates to false and therefore no idling is performed. - * - * As already said, things change with a rotational device, where idling - * boosts the throughput with sequential I/O (even with NCQ). Hence, for - * such a device the second component of the compound condition evaluates - * to true also if the following additional sub-condition does not hold: - * the queue is constantly seeky. Unfortunately, this different behavior - * with respect to flash-based devices causes an additional asymmetry: if - * some sync queues enjoy idling and some other sync queues do not, then - * the latter get a low share of the device throughput, simply because the - * former get many requests served after being set as in service, whereas - * the latter do not. As a consequence, to guarantee the desired throughput - * distribution, on HDDs the compound expression evaluates to true (and - * hence device idling is performed) also if the following last symmetry - * condition does not hold: no other queue is benefiting from idling. Also - * this last condition is actually replaced with a simpler-to-maintain and - * stronger condition: there is no busy queue which is not constantly seeky - * (and hence may also benefit from idling). - * - * To sum up, when all the required symmetry and throughput-boosting - * sub-conditions hold, the second component of the compound condition - * evaluates to false, and hence no idling is performed. This helps to - * keep the drives' internal queues full on NCQ-capable devices, and hence - * to boost the throughput, without causing 'almost' any loss of service - * guarantees. 
The 'almost' follows from the fact that, if the internal - * queue of one such device is filled while all the sub-conditions hold, - * but at some point in time some sub-condition stops to hold, then it may - * become impossible to let requests be served in the new desired order - * until all the requests already queued in the device have been served. + * In more detail, the return value of this function is obtained by, + * first, computing a number of boolean variables that take into + * account throughput and service-guarantee issues, and, then, + * combining these variables in a logical expression. Most of the + * issues taken into account are not trivial. We discuss these issues + * while introducing the variables. */ -static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) +static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; -#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \ - bfqd->busy_in_flight_queues == \ - bfqd->const_seeky_busy_in_flight_queues) + bool idling_boosts_thr, idling_boosts_thr_without_issues, + all_queues_seeky, on_hdd_and_not_all_queues_seeky, + idling_needed_for_service_guarantees, + asymmetric_scenario; -#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \ - bfqd->hw_tag && \ - (blk_queue_nonrot(bfqd->queue) || \ - bfq_bfqq_constantly_seeky(bfqq))) + /* + * The next variable takes into account the cases where idling + * boosts the throughput. + * + * The value of the variable is computed considering, first, that + * idling is virtually always beneficial for the throughput if: + * (a) the device is not NCQ-capable, or + * (b) regardless of the presence of NCQ, the device is rotational + * and the request pattern for bfqq is I/O-bound and sequential. + * + * Secondly, and in contrast to the above item (b), idling an + * NCQ-capable flash-based device would not boost the + * throughput even with sequential I/O; rather it would lower + * the throughput in proportion to how fast the device + * is. Accordingly, the next variable is true if any of the + * above conditions (a) and (b) is true, and, in particular, + * happens to be false if bfqd is an NCQ-capable flash-based + * device. + */ + idling_boosts_thr = !bfqd->hw_tag || + (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && + bfq_bfqq_idle_window(bfqq)) ; -/* - * Condition for expiring a non-weight-raised queue (and hence not idling - * the device). - */ -#define cond_for_expiring_non_wr (bfqd->hw_tag && \ - (bfqd->wr_busy_queues > 0 || \ - (blk_queue_nonrot(bfqd->queue) || \ - cond_for_seeky_on_ncq_hdd))) + /* + * The value of the next variable, + * idling_boosts_thr_without_issues, is equal to that of + * idling_boosts_thr, unless a special case holds. In this + * special case, described below, idling may cause problems to + * weight-raised queues. + * + * When the request pool is saturated (e.g., in the presence + * of write hogs), if the processes associated with + * non-weight-raised queues ask for requests at a lower rate, + * then processes associated with weight-raised queues have a + * higher probability to get a request from the pool + * immediately (or at least soon) when they need one. Thus + * they have a higher probability to actually get a fraction + * of the device throughput proportional to their high + * weight. This is especially true with NCQ-capable drives, + * which enqueue several requests in advance, and further + * reorder internally-queued requests. 
+ * + * For this reason, we force to false the value of + * idling_boosts_thr_without_issues if there are weight-raised + * busy queues. In this case, and if bfqq is not weight-raised, + * this guarantees that the device is not idled for bfqq (if, + * instead, bfqq is weight-raised, then idling will be + * guaranteed by another variable, see below). Combined with + * the timestamping rules of BFQ (see [1] for details), this + * behavior causes bfqq, and hence any sync non-weight-raised + * queue, to get a lower number of requests served, and thus + * to ask for a lower number of requests from the request + * pool, before the busy weight-raised queues get served + * again. This often mitigates starvation problems in the + * presence of heavy write workloads and NCQ, thereby + * guaranteeing a higher application and system responsiveness + * in these hostile scenarios. + */ + idling_boosts_thr_without_issues = idling_boosts_thr && + bfqd->wr_busy_queues == 0; + + /* + * There are then two cases where idling must be performed not + * for throughput concerns, but to preserve service + * guarantees. In the description of these cases, we say, for + * short, that a queue is sequential/random if the process + * associated to the queue issues sequential/random requests + * (in the second case the queue may be tagged as seeky or + * even constantly_seeky). + * + * To introduce the first case, we note that, since + * bfq_bfqq_idle_window(bfqq) is false if the device is + * NCQ-capable and bfqq is random (see + * bfq_update_idle_window()), then, from the above two + * assignments it follows that + * idling_boosts_thr_without_issues is false if the device is + * NCQ-capable and bfqq is random. Therefore, for this case, + * device idling would never be allowed if we used just + * idling_boosts_thr_without_issues to decide whether to allow + * it. And, beneficially, this would imply that throughput + * would always be boosted also with random I/O on NCQ-capable + * HDDs. + * + * But we must be careful on this point, to avoid an unfair + * treatment for bfqq. In fact, because of the same above + * assignments, idling_boosts_thr_without_issues is, on the + * other hand, true if 1) the device is an HDD and bfqq is + * sequential, and 2) there are no busy weight-raised + * queues. As a consequence, if we used just + * idling_boosts_thr_without_issues to decide whether to idle + * the device, then with an HDD we might easily bump into a + * scenario where queues that are sequential and I/O-bound + * would enjoy idling, whereas random queues would not. The + * latter might then get a low share of the device throughput, + * simply because the former would get many requests served + * after being set as in service, while the latter would not. + * + * To address this issue, we start by setting to true a + * sentinel variable, on_hdd_and_not_all_queues_seeky, if the + * device is rotational and not all queues with pending or + * in-flight requests are constantly seeky (i.e., there are + * active sequential queues, and bfqq might then be mistreated + * if it does not enjoy idling because it is random). 
+ */ + all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && + bfqd->busy_in_flight_queues == + bfqd->const_seeky_busy_in_flight_queues; + + on_hdd_and_not_all_queues_seeky = + !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; + + /* + * To introduce the second case where idling needs to be + * performed to preserve service guarantees, we can note that + * allowing the drive to enqueue more than one request at a + * time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of + * control on the actual request service order. In particular, + * the critical situation is when requests from different + * processes happens to be present, at the same time, in the + * internal queue(s) of the drive. In such a situation, the + * drive, by deciding the service order of the + * internally-queued requests, does determine also the actual + * throughput distribution among these processes. But the + * drive typically has no notion or concern about per-process + * throughput distribution, and makes its decisions only on a + * per-request basis. Therefore, the service distribution + * enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution + * only in a completely symmetric scenario where: + * (i) each of these processes must get the same throughput as + * the others; + * (ii) all these processes have the same I/O pattern + (either sequential or random). + * In fact, in such a scenario, the drive will tend to treat + * the requests of each of these processes in about the same + * way as the requests of the others, and thus to provide + * each of these processes with about the same throughput + * (which is exactly the desired throughput distribution). In + * contrast, in any asymmetric scenario, device idling is + * certainly needed to guarantee that bfqq receives its + * assigned fraction of the device throughput (see [1] for + * details). + * + * We address this issue by controlling, actually, only the + * symmetry sub-condition (i), i.e., provided that + * sub-condition (i) holds, idling is not performed, + * regardless of whether sub-condition (ii) holds. In other + * words, only if sub-condition (i) holds, then idling is + * allowed, and the device tends to be prevented from queueing + * many requests, possibly of several processes. The reason + * for not controlling also sub-condition (ii) is that, first, + * in the case of an HDD, the asymmetry in terms of types of + * I/O patterns is already taken in to account in the above + * sentinel variable + * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a + * flash-based device, we prefer however to privilege + * throughput (and idling lowers throughput for this type of + * devices), for the following reasons: + * 1) differently from HDDs, the service time of random + * requests is not orders of magnitudes lower than the service + * time of sequential requests; thus, even if processes doing + * sequential I/O get a preferential treatment with respect to + * others doing random I/O, the consequences are not as + * dramatic as with HDDs; + * 2) if a process doing random I/O does need strong + * throughput guarantees, it is hopefully already being + * weight-raised, or the user is likely to have assigned it a + * higher weight than the other processes (and thus + * sub-condition (i) is likely to be false, which triggers + * idling). 
+ * + * According to the above considerations, the next variable is + * true (only) if sub-condition (i) holds. To compute the + * value of this variable, we not only use the return value of + * the function bfq_symmetric_scenario(), but also check + * whether bfqq is being weight-raised, because + * bfq_symmetric_scenario() does not take into account also + * weight-raised queues (see comments to + * bfq_weights_tree_add()). + * + * As a side note, it is worth considering that the above + * device-idling countermeasures may however fail in the + * following unlucky scenario: if idling is (correctly) + * disabled in a time period during which all symmetry + * sub-conditions hold, and hence the device is allowed to + * enqueue many requests, but at some later point in time some + * sub-condition stops to hold, then it may become impossible + * to let requests be served in the desired order until all + * the requests already queued in the device have been served. + */ + asymmetric_scenario = bfqq->wr_coeff > 1 || + !bfq_symmetric_scenario(bfqd); + + /* + * Finally, there is a case where maximizing throughput is the + * best choice even if it may cause unfairness toward + * bfqq. Such a case is when bfqq became active in a burst of + * queue activations. Queues that became active during a large + * burst benefit only from throughput, as discussed in the + * comments to bfq_handle_burst. Thus, if bfqq became active + * in a burst and not idling the device maximizes throughput, + * then the device must no be idled, because not idling the + * device provides bfqq and all other queues in the burst with + * maximum benefit. Combining this and the two cases above, we + * can now establish when idling is actually needed to + * preserve service guarantees. + */ + idling_needed_for_service_guarantees = + (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && + !bfq_bfqq_in_large_burst(bfqq); + /* + * We have now all the components we need to compute the return + * value of the function, which is true only if both the following + * conditions hold: + * 1) bfqq is sync, because idling make sense only for sync queues; + * 2) idling either boosts the throughput (without issues), or + * is necessary to preserve service guarantees. + */ return bfq_bfqq_sync(bfqq) && - !cond_for_expiring_in_burst && - (bfqq->wr_coeff > 1 || !symmetric_scenario || - (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) && - !cond_for_expiring_non_wr) - ); + (idling_boosts_thr_without_issues || + idling_needed_for_service_guarantees); } /* - * If the in-service queue is empty but sync, and the function - * bfq_bfqq_must_not_expire returns true, then: + * If the in-service queue is empty but the function bfq_bfqq_may_idle + * returns true, then: * 1) the queue must remain in service and cannot be expired, and - * 2) the disk must be idled to wait for the possible arrival of a new + * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments to the function bfq_bfqq_must_not_expire for the reasons + * See the comments to the function bfq_bfqq_may_idle for the reasons * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_must_not_expire itself + * and preserve service guarantees when bfq_bfqq_may_idle itself * returns true. 
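The comments above walk through several intermediate booleans before the final return of bfq_bfqq_may_idle(). As a compact summary, the standalone sketch below paraphrases the decision with the queue and device flags replaced by plain parameters (wr_busy stands for bfqd->wr_busy_queues > 0, weight_raised for bfqq->wr_coeff > 1, and all_queues_seeky for the busy_in_flight_queues comparison); it is a reading aid, not code from the patch.

#include <stdbool.h>

/* Condensed paraphrase of the idling decision in bfq_bfqq_may_idle(). */
bool may_idle(bool sync, bool ncq, bool nonrot, bool io_bound,
              bool idle_window, bool weight_raised, bool wr_busy,
              bool all_queues_seeky, bool symmetric, bool in_large_burst)
{
        /* Idling boosts throughput on non-NCQ devices, or on rotational
         * devices for I/O-bound queues with a non-null idle window... */
        bool idling_boosts_thr = !ncq ||
                (!nonrot && io_bound && idle_window);

        /* ...but not while weight-raised queues are busy. */
        bool idling_boosts_thr_without_issues =
                idling_boosts_thr && !wr_busy;

        /* Idling is needed for service guarantees on an HDD with some
         * sequential queues, or in any asymmetric scenario, unless the
         * queue was activated in a large burst. */
        bool on_hdd_and_not_all_queues_seeky = !nonrot && !all_queues_seeky;
        bool asymmetric = weight_raised || !symmetric;
        bool needed_for_guarantees =
                (on_hdd_and_not_all_queues_seeky || asymmetric) &&
                !in_large_burst;

        return sync &&
               (idling_boosts_thr_without_issues || needed_for_guarantees);
}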
*/ -static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && - bfq_bfqq_must_not_expire(bfqq); + bfq_bfqq_may_idle(bfqq); } /* @@ -2533,7 +2656,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; bfqq = bfqd->in_service_queue; - if (bfqq == NULL) + if (!bfqq) goto new_queue; bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); @@ -2548,7 +2671,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * If bfqq has requests queued and it has enough budget left to * serve them, keep the queue, otherwise expire it. */ - if (next_rq != NULL) { + if (next_rq) { if (bfq_serv_to_charge(next_rq, bfqq) > bfq_bfqq_budget_left(bfqq)) { reason = BFQ_BFQQ_BUDGET_EXHAUSTED; @@ -2575,6 +2698,9 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) */ bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_idle_time(bfqq_group(bfqq)); +#endif } goto keep_queue; } @@ -2586,18 +2712,18 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * may idle after their completion, then keep it anyway. */ if (timer_pending(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { + (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { bfqq = NULL; goto keep_queue; } reason = BFQ_BFQQ_NO_MORE_REQUESTS; expire: - bfq_bfqq_expire(bfqd, bfqq, 0, reason); + bfq_bfqq_expire(bfqd, bfqq, false, reason); new_queue: bfqq = bfq_set_in_service_queue(bfqd); bfq_log(bfqd, "select_queue: new queue %d returned", - bfqq != NULL ? bfqq->pid : 0); + bfqq ? bfqq->pid : 0); keep_queue: return bfqq; } @@ -2615,7 +2741,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != entity->orig_weight * bfqq->wr_coeff); - if (entity->ioprio_changed) + if (entity->prio_changed) bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); /* @@ -2659,7 +2785,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, /* Follow expired path, else get first next available. 
*/ rq = bfq_check_fifo(bfqq); - if (rq == NULL) + if (!rq) rq = bfqq->next_rq; service_to_charge = bfq_serv_to_charge(rq, bfqq); @@ -2695,14 +2821,14 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, - "dispatched %u sec req (%llu), budg left %lu", + "dispatched %u sec req (%llu), budg left %d", blk_rq_sectors(rq), (long long unsigned)blk_rq_pos(rq), bfq_bfqq_budget_left(bfqq)); dispatched++; - if (bfqd->in_service_bic == NULL) { + if (!bfqd->in_service_bic) { atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); bfqd->in_service_bic = RQ_BIC(rq); } @@ -2715,7 +2841,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, return dispatched; expire: - bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); + bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); return dispatched; } @@ -2723,7 +2849,7 @@ static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) { int dispatched = 0; - while (bfqq->next_rq != NULL) { + while (bfqq->next_rq) { bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); dispatched++; } @@ -2743,7 +2869,7 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) int dispatched = 0; bfqq = bfqd->in_service_queue; - if (bfqq != NULL) + if (bfqq) __bfq_bfqq_expire(bfqd, bfqq); /* @@ -2779,7 +2905,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) return bfq_forced_dispatch(bfqd); bfqq = bfq_select_queue(bfqd); - if (bfqq == NULL) + if (!bfqq) return 0; if (bfq_class_idle(bfqq)) @@ -2819,6 +2945,9 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) static void bfq_put_queue(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_group *bfqg = bfqq_group(bfqq); +#endif BUG_ON(atomic_read(&bfqq->ref) <= 0); @@ -2827,9 +2956,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq) if (!atomic_dec_and_test(&bfqq->ref)) return; - BUG_ON(rb_first(&bfqq->sort_list) != NULL); + BUG_ON(rb_first(&bfqq->sort_list)); BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree != NULL); + BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); BUG_ON(bfqd->in_service_queue == bfqq); @@ -2847,6 +2976,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_put(bfqg); +#endif } static void bfq_put_cooperator(struct bfq_queue *bfqq) @@ -2883,7 +3015,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static inline void bfq_init_icq(struct io_cq *icq) +static void bfq_init_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); @@ -2950,40 +3082,38 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b /* * No prio set, inherit CPU scheduling settings. 
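One detail of the bfq_put_queue() hunk above is easy to miss: with BFQ_GROUP_IOSCHED enabled, the owning group is looked up with bfqq_group() before the last reference is dropped, and bfqg_put() is called only after kmem_cache_free(), so the group cannot disappear while the queue is being torn down. The fragment below is a minimal, self-contained model of that ordering with made-up types and simple reference counts; it is not the kernel code.

#include <stdio.h>
#include <stdlib.h>

struct group { int refcnt; };
struct queue { struct group *parent; int refcnt; };

static void group_put(struct group *g)
{
        if (--g->refcnt == 0)
                free(g);
}

static void queue_put(struct queue *q)
{
        struct group *g = q->parent;    /* read before the queue is freed */

        if (--q->refcnt > 0)
                return;
        free(q);                        /* q->parent is dead from here on */
        group_put(g);                   /* now drop the reference it held */
}

int main(void)
{
        struct group *g = calloc(1, sizeof(*g));
        struct queue *q = calloc(1, sizeof(*q));

        g->refcnt = 1;                  /* reference held by the queue */
        q->refcnt = 1;
        q->parent = g;
        queue_put(q);                   /* frees the queue, then the group */
        return 0;
}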
*/ - bfqq->entity.new_ioprio = task_nice_ioprio(tsk); - bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); + bfqq->new_ioprio = task_nice_ioprio(tsk); + bfqq->new_ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->entity.new_ioprio = 7; + bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->new_ioprio = 7; bfq_clear_bfqq_idle_window(bfqq); break; } - if (bfqq->entity.new_ioprio < 0 || - bfqq->entity.new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", - bfqq->entity.new_ioprio); + bfqq->new_ioprio); BUG(); } - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio); - bfqq->entity.ioprio_changed = 1; + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfqq->entity.prio_changed = 1; } -static void bfq_check_ioprio_change(struct bfq_io_cq *bic) +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd; struct bfq_queue *bfqq, *new_bfqq; - struct bfq_group *bfqg; unsigned long uninitialized_var(flags); int ioprio = bic->icq.ioc->ioprio; @@ -2993,18 +3123,16 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic) * This condition may trigger on a newly created bic, be sure to * drop the lock before returning. 
*/ - if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) + if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) goto out; bic->ioprio = ioprio; bfqq = bic->bfqq[BLK_RW_ASYNC]; - if (bfqq != NULL) { - bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, - sched_data); - new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, + if (bfqq) { + new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, GFP_ATOMIC); - if (new_bfqq != NULL) { + if (new_bfqq) { bic->bfqq[BLK_RW_ASYNC] = new_bfqq; bfq_log_bfqq(bfqd, bfqq, "check_ioprio_change: bfqq %p %d", @@ -3014,7 +3142,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic) } bfqq = bic->bfqq[BLK_RW_SYNC]; - if (bfqq != NULL) + if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); out: @@ -3038,7 +3166,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfq_class_idle(bfqq)) bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); - } + } else + bfq_clear_bfqq_sync(bfqq); bfq_mark_bfqq_IO_bound(bfqq); /* Tentative initial value to trade off between thr and lat */ @@ -3055,14 +3184,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, } static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, - int is_sync, + struct bio *bio, int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask) { + struct bfq_group *bfqg; struct bfq_queue *bfqq, *new_bfqq = NULL; + struct blkcg *blkcg; retry: + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + bfqg = bfq_find_alloc_group(bfqd, blkcg); /* bic always exists here */ bfqq = bic_to_bfqq(bic, is_sync); @@ -3070,18 +3204,19 @@ retry: * Always try a new alloc if we fall back to the OOM bfqq * originally, since it should just be a temporary situation. */ - if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + if (!bfqq || bfqq == &bfqd->oom_bfqq) { bfqq = NULL; - if (new_bfqq != NULL) { + if (new_bfqq) { bfqq = new_bfqq; new_bfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { + } else if (gfpflags_allow_blocking(gfp_mask)) { + rcu_read_unlock(); spin_unlock_irq(bfqd->queue->queue_lock); new_bfqq = kmem_cache_alloc_node(bfq_pool, gfp_mask | __GFP_ZERO, bfqd->queue->node); spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq != NULL) + if (new_bfqq) goto retry; } else { bfqq = kmem_cache_alloc_node(bfq_pool, @@ -3089,7 +3224,7 @@ retry: bfqd->queue->node); } - if (bfqq != NULL) { + if (bfqq) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, is_sync); bfq_init_entity(&bfqq->entity, bfqg); @@ -3100,9 +3235,11 @@ retry: } } - if (new_bfqq != NULL) + if (new_bfqq) kmem_cache_free(bfq_pool, new_bfqq); + rcu_read_unlock(); + return bfqq; } @@ -3126,7 +3263,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, int is_sync, + struct bio *bio, int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); @@ -3135,19 +3272,26 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq = NULL; if (!is_sync) { + struct blkcg *blkcg; + struct bfq_group *bfqg; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + rcu_read_unlock(); + bfqg = bfq_find_alloc_group(bfqd, blkcg); async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; } - if (bfqq == NULL) - bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); + if (!bfqq) + bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); /* * Pin the queue now that it's 
allocated, scheduler exit will * prune it. */ - if (!is_sync && *async_bfqq == NULL) { + if (!is_sync && !(*async_bfqq)) { atomic_inc(&bfqq->ref); bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", bfqq, atomic_read(&bfqq->ref)); @@ -3280,9 +3424,9 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { - int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && - blk_rq_sectors(rq) < 32; - int budget_timeout = bfq_bfqq_budget_timeout(bfqq); + bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); /* * There is just this request queued: if the request @@ -3309,6 +3453,9 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_idle_time(bfqq_group(bfqq)); +#endif /* * The queue is not empty, because a new request just @@ -3318,7 +3465,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * See [1] for more details. */ if (budget_timeout) - bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_BUDGET_TIMEOUT); /* * Let the request rip immediately, or let a new queue be @@ -3342,7 +3490,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) */ if (!in_interrupt()) { new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); - if (new_bfqq != NULL) { + if (new_bfqq) { if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); /* @@ -3370,7 +3518,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) * from assigning it a full weight-raising period. See the detailed * comments about this field in bfq_init_icq(). */ - if (bfqq->bic != NULL) + if (bfqq->bic) bfqq->bic->wr_time_left = 0; rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); @@ -3418,6 +3566,11 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_completion(bfqq_group(bfqq), + rq_start_time_ns(rq), + rq_io_start_time_ns(rq), rq->cmd_flags); +#endif if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { bfq_weights_tree_remove(bfqd, &bfqq->entity, @@ -3462,11 +3615,12 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) bfq_arm_slice_timer(bfqd); goto out; } else if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_BUDGET_TIMEOUT); else if (RB_EMPTY_ROOT(&bfqq->sort_list) && (bfqq->dispatched == 0 || - !bfq_bfqq_must_not_expire(bfqq))) - bfq_bfqq_expire(bfqd, bfqq, 0, + !bfq_bfqq_may_idle(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_NO_MORE_REQUESTS); } @@ -3477,7 +3631,7 @@ out: return; } -static inline int __bfq_may_queue(struct bfq_queue *bfqq) +static int __bfq_may_queue(struct bfq_queue *bfqq) { if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { bfq_clear_bfqq_must_alloc(bfqq); @@ -3501,11 +3655,11 @@ static int bfq_may_queue(struct request_queue *q, int rw) * 'may queue' if that fails. 
*/ bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (bic == NULL) + if (!bic) return ELV_MQUEUE_MAY; bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); - if (bfqq != NULL) + if (bfqq) return __bfq_may_queue(bfqq); return ELV_MQUEUE_MAY; @@ -3518,7 +3672,7 @@ static void bfq_put_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); - if (bfqq != NULL) { + if (bfqq) { const int rw = rq_data_dir(rq); BUG_ON(!bfqq->allocated[rw]); @@ -3570,25 +3724,24 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, const int rw = rq_data_dir(rq); const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; - struct bfq_group *bfqg; unsigned long flags; bool split = false; - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - bfq_check_ioprio_change(bic); + bfq_check_ioprio_change(bic, bio); spin_lock_irqsave(q->queue_lock, flags); - if (bic == NULL) + if (!bic) goto queue_fail; - bfqg = bfq_bic_update_cgroup(bic); + bfq_bic_update_cgroup(bic, bio); new_queue: bfqq = bic_to_bfqq(bic, is_sync); - if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); + if (!bfqq || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { if ((bic->was_in_burst_list && bfqd->large_burst) || @@ -3684,7 +3837,7 @@ static void bfq_idle_slice_timer(unsigned long data) * the in-service queue. This can hardly happen, but in the worst * case we just expire a queue too early. */ - if (bfqq != NULL) { + if (bfqq) { bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); if (bfq_bfqq_budget_timeout(bfqq)) /* @@ -3704,7 +3857,7 @@ static void bfq_idle_slice_timer(unsigned long data) else goto schedule_dispatch; - bfq_bfqq_expire(bfqd, bfqq, 1, reason); + bfq_bfqq_expire(bfqd, bfqq, true, reason); } schedule_dispatch: @@ -3719,14 +3872,14 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) cancel_work_sync(&bfqd->unplug_work); } -static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, +static void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { struct bfq_group *root_group = bfqd->root_group; struct bfq_queue *bfqq = *bfqq_ptr; bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq != NULL) { + if (bfqq) { bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", bfqq, atomic_read(&bfqq->ref)); @@ -3762,7 +3915,7 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_lock_irq(q->queue_lock); - BUG_ON(bfqd->in_service_queue != NULL); + BUG_ON(bfqd->in_service_queue); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) bfq_deactivate_bfqq(bfqd, bfqq, 0); @@ -3775,22 +3928,39 @@ static void bfq_exit_queue(struct elevator_queue *e) BUG_ON(timer_pending(&bfqd->idle_slice_timer)); - bfq_free_root_group(bfqd); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_deactivate_policy(q, &blkcg_policy_bfq); +#endif + kfree(bfqd); } +static void bfq_init_root_group(struct bfq_group *root_group, + struct bfq_data *bfqd) +{ + int i; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + root_group->entity.parent = NULL; + root_group->my_entity = NULL; + root_group->bfqd = bfqd; +#endif + root_group->rq_pos_tree = RB_ROOT; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; +} + static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { - struct bfq_group *bfqg; struct bfq_data *bfqd; struct 
elevator_queue *eq; eq = elevator_alloc(q, e); - if (eq == NULL) + if (!eq) return -ENOMEM; bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); - if (bfqd == NULL) { + if (!bfqd) { kobject_put(&eq->kobj); return -ENOMEM; } @@ -3803,16 +3973,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) */ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); atomic_inc(&bfqd->oom_bfqq.ref); - bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; bfqd->oom_bfqq.entity.new_weight = - bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio); + bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); /* * Trigger weight initialization, according to ioprio, at the * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio * class won't be changed any more. */ - bfqd->oom_bfqq.entity.ioprio_changed = 1; + bfqd->oom_bfqq.entity.prio_changed = 1; bfqd->queue = q; @@ -3820,16 +3990,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) q->elevator = eq; spin_unlock_irq(q->queue_lock); - bfqg = bfq_alloc_root_group(bfqd, q->node); - if (bfqg == NULL) { - kfree(bfqd); - kobject_put(&eq->kobj); - return -ENOMEM; - } - - bfqd->root_group = bfqg; + bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); + if (!bfqd->root_group) + goto out_free; + bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqd->active_numerous_groups = 0; #endif @@ -3837,7 +4003,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->idle_slice_timer.function = bfq_idle_slice_timer; bfqd->idle_slice_timer.data = (unsigned long)bfqd; - bfqd->rq_pos_tree = RB_ROOT; bfqd->queue_weights_tree = RB_ROOT; bfqd->group_weights_tree = RB_ROOT; @@ -3895,18 +4060,23 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->device_speed = BFQ_BFQD_FAST; return 0; + +out_free: + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; } static void bfq_slab_kill(void) { - if (bfq_pool != NULL) + if (bfq_pool) kmem_cache_destroy(bfq_pool); } static int __init bfq_slab_setup(void) { bfq_pool = KMEM_CACHE(bfq_queue, 0); - if (bfq_pool == NULL) + if (!bfq_pool) return -ENOMEM; return 0; } @@ -4051,7 +4221,7 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, return count; } -static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) { u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); @@ -4145,6 +4315,9 @@ static struct elevator_type iosched_bfq = { .elevator_merge_fn = bfq_merge, .elevator_merged_fn = bfq_merged_request, .elevator_merge_req_fn = bfq_merged_requests, +#ifdef CONFIG_BFQ_GROUP_IOSCHED + .elevator_bio_merged_fn = bfq_bio_merged, +#endif .elevator_allow_merge_fn = bfq_allow_merge, .elevator_dispatch_fn = bfq_dispatch_requests, .elevator_add_req_fn = bfq_insert_request, @@ -4170,6 +4343,8 @@ static struct elevator_type iosched_bfq = { static int __init bfq_init(void) { + int ret; + /* * Can be 0 on HZ < 1000 setups. 
*/ @@ -4179,8 +4354,15 @@ static int __init bfq_init(void) if (bfq_timeout_async == 0) bfq_timeout_async = 1; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + ret = blkcg_policy_register(&blkcg_policy_bfq); + if (ret) + return ret; +#endif + + ret = -ENOMEM; if (bfq_slab_setup()) - return -ENOMEM; + goto err_pol_unreg; /* * Times to load large popular applications for the typical systems @@ -4199,15 +4381,27 @@ static int __init bfq_init(void) device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; - elv_register(&iosched_bfq); - pr_info("BFQ I/O-scheduler: v7r8"); + ret = elv_register(&iosched_bfq); + if (ret) + goto err_pol_unreg; + + pr_info("BFQ I/O-scheduler: v7r10"); return 0; + +err_pol_unreg: +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif + return ret; } static void __exit bfq_exit(void) { elv_unregister(&iosched_bfq); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif bfq_slab_kill(); } diff --git a/block/bfq-sched.c b/block/bfq-sched.c index d0890c6d4..9328a1f09 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -10,24 +10,27 @@ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> */ -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED #define for_each_entity(entity) \ - for (; entity != NULL; entity = entity->parent) + for (; entity ; entity = entity->parent) #define for_each_entity_safe(entity, parent) \ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, int extract, struct bfq_data *bfqd); -static inline void bfq_update_budget(struct bfq_entity *next_in_service) +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + +static void bfq_update_budget(struct bfq_entity *next_in_service) { struct bfq_entity *bfqg_entity; struct bfq_group *bfqg; struct bfq_sched_data *group_sd; - BUG_ON(next_in_service == NULL); + BUG_ON(!next_in_service); group_sd = next_in_service->sched_data; @@ -38,7 +41,7 @@ static inline void bfq_update_budget(struct bfq_entity *next_in_service) * as it must never become an in-service entity. 
*/ bfqg_entity = bfqg->my_entity; - if (bfqg_entity != NULL) + if (bfqg_entity) bfqg_entity->budget = next_in_service->budget; } @@ -46,7 +49,7 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) { struct bfq_entity *next_in_service; - if (sd->in_service_entity != NULL) + if (sd->in_service_entity) /* will update/requeue at the end of service */ return 0; @@ -60,35 +63,35 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) next_in_service = bfq_lookup_next_entity(sd, 0, NULL); sd->next_in_service = next_in_service; - if (next_in_service != NULL) + if (next_in_service) bfq_update_budget(next_in_service); return 1; } -static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +static void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) { BUG_ON(sd->next_in_service != entity); } #else #define for_each_entity(entity) \ - for (; entity != NULL; entity = NULL) + for (; entity ; entity = NULL) #define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity != NULL; entity = parent) + for (parent = NULL; entity ; entity = parent) -static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) +static int bfq_update_next_in_service(struct bfq_sched_data *sd) { return 0; } -static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +static void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) { } -static inline void bfq_update_budget(struct bfq_entity *next_in_service) +static void bfq_update_budget(struct bfq_entity *next_in_service) { } #endif @@ -109,18 +112,18 @@ static inline void bfq_update_budget(struct bfq_entity *next_in_service) * * Return @a > @b, dealing with wrapping correctly. */ -static inline int bfq_gt(u64 a, u64 b) +static int bfq_gt(u64 a, u64 b) { return (s64)(a - b) > 0; } -static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) { struct bfq_queue *bfqq = NULL; - BUG_ON(entity == NULL); + BUG_ON(!entity); - if (entity->my_sched_data == NULL) + if (!entity->my_sched_data) bfqq = container_of(entity, struct bfq_queue, entity); return bfqq; @@ -132,8 +135,7 @@ static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) * @service: amount of service. * @weight: scale factor (weight of an entity or weight sum). */ -static inline u64 bfq_delta(unsigned long service, - unsigned long weight) +static u64 bfq_delta(unsigned long service, unsigned long weight) { u64 d = (u64)service << WFQ_SERVICE_SHIFT; @@ -146,8 +148,7 @@ static inline u64 bfq_delta(unsigned long service, * @entity: the entity to act upon. * @service: the service to be charged to the entity. */ -static inline void bfq_calc_finish(struct bfq_entity *entity, - unsigned long service) +static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); @@ -156,7 +157,7 @@ static inline void bfq_calc_finish(struct bfq_entity *entity, entity->finish = entity->start + bfq_delta(service, entity->weight); - if (bfqq != NULL) { + if (bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: serv %lu, w %d", service, entity->weight); @@ -176,11 +177,11 @@ static inline void bfq_calc_finish(struct bfq_entity *entity, * conversion mechanism because, e.g., in the tree walking functions, * the check for a %NULL value would be redundant. 
*/ -static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) +static struct bfq_entity *bfq_entity_of(struct rb_node *node) { struct bfq_entity *entity = NULL; - if (node != NULL) + if (node) entity = rb_entry(node, struct bfq_entity, rb_node); return entity; @@ -191,8 +192,7 @@ static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) * @root: the tree root. * @entity: the entity to remove. */ -static inline void bfq_extract(struct rb_root *root, - struct bfq_entity *entity) +static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) { BUG_ON(entity->tree != root); @@ -225,7 +225,7 @@ static void bfq_idle_extract(struct bfq_service_tree *st, bfq_extract(&st->idle, entity); - if (bfqq != NULL) + if (bfqq) list_del(&bfqq->bfqq_list); } @@ -243,9 +243,9 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) struct rb_node **node = &root->rb_node; struct rb_node *parent = NULL; - BUG_ON(entity->tree != NULL); + BUG_ON(entity->tree); - while (*node != NULL) { + while (*node) { parent = *node; entry = rb_entry(parent, struct bfq_entity, rb_node); @@ -271,12 +271,11 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) * that the subtree rooted at @node (which may be its left or its right * child) has a valid min_start value. */ -static inline void bfq_update_min(struct bfq_entity *entity, - struct rb_node *node) +static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) { struct bfq_entity *child; - if (node != NULL) { + if (node) { child = rb_entry(node, struct bfq_entity, rb_node); if (bfq_gt(entity->min_start, child->min_start)) entity->min_start = child->min_start; @@ -291,7 +290,7 @@ static inline void bfq_update_min(struct bfq_entity *entity, * this function updates its min_start value. The left and right subtrees * are assumed to hold a correct min_start value. 
*/ -static inline void bfq_update_active_node(struct rb_node *node) +static void bfq_update_active_node(struct rb_node *node) { struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); @@ -318,12 +317,12 @@ up: bfq_update_active_node(node); parent = rb_parent(node); - if (parent == NULL) + if (!parent) return; - if (node == parent->rb_left && parent->rb_right != NULL) + if (node == parent->rb_left && parent->rb_right) bfq_update_active_node(parent->rb_right); - else if (parent->rb_left != NULL) + else if (parent->rb_left) bfq_update_active_node(parent->rb_left); node = parent; @@ -355,7 +354,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_sched_data *sd = NULL; struct bfq_group *bfqg = NULL; struct bfq_data *bfqd = NULL; @@ -363,22 +362,22 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_insert(&st->active, entity); - if (node->rb_left != NULL) + if (node->rb_left) node = node->rb_left; - else if (node->rb_right != NULL) + else if (node->rb_right) node = node->rb_right; bfq_update_active_tree(node); -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED sd = entity->sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); BUG_ON(!bfqg); bfqd = (struct bfq_data *)bfqg->bfqd; #endif - if (bfqq != NULL) + if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED else { /* bfq_group */ BUG_ON(!bfqd); bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); @@ -397,31 +396,32 @@ static void bfq_active_insert(struct bfq_service_tree *st, * bfq_ioprio_to_weight - calc a weight from an ioprio. * @ioprio: the ioprio value to convert. */ -static inline unsigned short bfq_ioprio_to_weight(int ioprio) +static unsigned short bfq_ioprio_to_weight(int ioprio) { BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR - ioprio; + return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; } /** * bfq_weight_to_ioprio - calc an ioprio from a weight. * @weight: the weight value to convert. * - * To preserve as mush as possible the old only-ioprio user interface, + * To preserve as much as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR + * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. */ -static inline unsigned short bfq_weight_to_ioprio(int weight) +static unsigned short bfq_weight_to_ioprio(int weight) { BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); - return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; + return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? 
+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; } -static inline void bfq_get_entity(struct bfq_entity *entity) +static void bfq_get_entity(struct bfq_entity *entity) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - if (bfqq != NULL) { + if (bfqq) { atomic_inc(&bfqq->ref); bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", bfqq, atomic_read(&bfqq->ref)); @@ -441,15 +441,15 @@ static struct rb_node *bfq_find_deepest(struct rb_node *node) { struct rb_node *deepest; - if (node->rb_right == NULL && node->rb_left == NULL) + if (!node->rb_right && !node->rb_left) deepest = rb_parent(node); - else if (node->rb_right == NULL) + else if (!node->rb_right) deepest = node->rb_left; - else if (node->rb_left == NULL) + else if (!node->rb_left) deepest = node->rb_right; else { deepest = rb_next(node); - if (deepest->rb_right != NULL) + if (deepest->rb_right) deepest = deepest->rb_right; else if (rb_parent(deepest) != node) deepest = rb_parent(deepest); @@ -468,7 +468,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_sched_data *sd = NULL; struct bfq_group *bfqg = NULL; struct bfq_data *bfqd = NULL; @@ -477,18 +477,18 @@ static void bfq_active_extract(struct bfq_service_tree *st, node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); - if (node != NULL) + if (node) bfq_update_active_tree(node); -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED sd = entity->sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); BUG_ON(!bfqg); bfqd = (struct bfq_data *)bfqg->bfqd; #endif - if (bfqq != NULL) + if (bfqq) list_del(&bfqq->bfqq_list); -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED else { /* bfq_group */ BUG_ON(!bfqd); bfq_weights_tree_remove(bfqd, entity, @@ -519,14 +519,14 @@ static void bfq_idle_insert(struct bfq_service_tree *st, struct bfq_entity *first_idle = st->first_idle; struct bfq_entity *last_idle = st->last_idle; - if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) + if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) st->first_idle = entity; - if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) + if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) st->last_idle = entity; bfq_insert(&st->idle, entity); - if (bfqq != NULL) + if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); } @@ -549,7 +549,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, entity->on_st = 0; st->wsum -= entity->weight; - if (bfqq != NULL) { + if (bfqq) { sd = entity->sched_data; bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", bfqq, atomic_read(&bfqq->ref)); @@ -581,7 +581,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st) struct bfq_entity *first_idle = st->first_idle; struct bfq_entity *last_idle = st->last_idle; - if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && + if (RB_EMPTY_ROOT(&st->active) && last_idle && !bfq_gt(last_idle->finish, st->vtime)) { /* * Forget the whole idle tree, increasing the vtime past @@ -590,7 +590,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st) st->vtime = last_idle->finish; } - if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) + if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) bfq_put_idle_entity(st, first_idle); } @@ -600,19 +600,19 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, { struct bfq_service_tree 
*new_st = old_st; - if (entity->ioprio_changed) { + if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned short prev_weight, new_weight; struct bfq_data *bfqd = NULL; struct rb_root *root; -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_sched_data *sd; struct bfq_group *bfqg; #endif - if (bfqq != NULL) + if (bfqq) bfqd = bfqq->bfqd; -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED else { sd = entity->my_sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); @@ -634,12 +634,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, BUG(); } entity->orig_weight = entity->new_weight; - entity->ioprio = - bfq_weight_to_ioprio(entity->orig_weight); + if (bfqq) + bfqq->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); } - entity->ioprio_class = entity->new_ioprio_class; - entity->ioprio_changed = 0; + if (bfqq) + bfqq->ioprio_class = bfqq->new_ioprio_class; + entity->prio_changed = 0; /* * NOTE: here we may be changing the weight too early, @@ -652,7 +654,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, prev_weight = entity->weight; new_weight = entity->orig_weight * - (bfqq != NULL ? bfqq->wr_coeff : 1); + (bfqq ? bfqq->wr_coeff : 1); /* * If the weight of the entity changes, remove the entity * from its old weight counter (if there is a counter @@ -683,6 +685,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, return new_st; } +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +#endif + /** * bfq_bfqq_served - update the scheduler status after selection for * service. @@ -693,7 +699,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * are synchronized every time a new bfqq is selected for service. By now, * we keep it to better check consistency. */ -static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) +static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) { struct bfq_entity *entity = &bfqq->entity; struct bfq_service_tree *st; @@ -708,7 +714,10 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) st->vtime += bfq_delta(served, st->wsum); bfq_forget_idle(st); } - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); +#endif + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); } /** @@ -721,7 +730,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) * budget. In this way we should obtain a sort of time-domain * fairness among all the seeky/slow queues. 
*/ -static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; @@ -746,7 +755,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity) struct bfq_service_tree *st = bfq_entity_service_tree(entity); if (entity == sd->in_service_entity) { - BUG_ON(entity->tree != NULL); + BUG_ON(entity->tree); /* * If we are requeueing the current entity we have * to take care of not charging to it service it has @@ -837,7 +846,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) if (!entity->on_st) return 0; - BUG_ON(was_in_service && entity->tree != NULL); + BUG_ON(was_in_service && entity->tree); if (was_in_service) { bfq_calc_finish(entity, entity->service); @@ -846,7 +855,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) bfq_active_extract(st, entity); else if (entity->tree == &st->idle) bfq_idle_extract(st, entity); - else if (entity->tree != NULL) + else if (entity->tree) BUG(); if (was_in_service || sd->next_in_service == entity) @@ -884,7 +893,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) */ break; - if (sd->next_in_service != NULL) + if (sd->next_in_service) /* * The parent entity is still backlogged and * the budgets on the path towards the root @@ -953,7 +962,7 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) struct bfq_entity *entry, *first = NULL; struct rb_node *node = st->active.rb_node; - while (node != NULL) { + while (node) { entry = rb_entry(node, struct bfq_entity, rb_node); left: if (!bfq_gt(entry->start, st->vtime)) @@ -961,7 +970,7 @@ left: BUG_ON(bfq_gt(entry->min_start, st->vtime)); - if (node->rb_left != NULL) { + if (node->rb_left) { entry = rb_entry(node->rb_left, struct bfq_entity, rb_node); if (!bfq_gt(entry->min_start, st->vtime)) { @@ -969,12 +978,12 @@ left: goto left; } } - if (first != NULL) + if (first) break; node = node->rb_right; } - BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); + BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); return first; } @@ -1030,13 +1039,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, struct bfq_entity *entity; int i = 0; - BUG_ON(sd->in_service_entity != NULL); + BUG_ON(sd->in_service_entity); - if (bfqd != NULL && + if (bfqd && jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); - if (entity != NULL) { + if (entity) { i = BFQ_IOPRIO_CLASSES - 1; bfqd->bfq_class_idle_last_service = jiffies; sd->next_in_service = entity; @@ -1044,7 +1053,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, } for (; i < BFQ_IOPRIO_CLASSES; i++) { entity = __bfq_lookup_next_entity(st + i, false); - if (entity != NULL) { + if (entity) { if (extract) { bfq_check_next_in_service(sd, entity); bfq_active_extract(st + i, entity); @@ -1067,27 +1076,27 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) struct bfq_sched_data *sd; struct bfq_queue *bfqq; - BUG_ON(bfqd->in_service_queue != NULL); + BUG_ON(bfqd->in_service_queue); if (bfqd->busy_queues == 0) return NULL; sd = &bfqd->root_group->sched_data; - for (; sd != NULL; sd = entity->my_sched_data) { + for (; sd ; sd = entity->my_sched_data) { entity = bfq_lookup_next_entity(sd, 1, bfqd); - BUG_ON(entity == NULL); + BUG_ON(!entity); entity->service = 0; } bfqq = bfq_entity_to_bfqq(entity); - 
BUG_ON(bfqq == NULL); + BUG_ON(!bfqq); return bfqq; } static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) { - if (bfqd->in_service_bic != NULL) { + if (bfqd->in_service_bic) { put_io_context(bfqd->in_service_bic->icq.ioc); bfqd->in_service_bic = NULL; } @@ -1114,6 +1123,10 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_activate_entity(entity); } +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); +#endif + /* * Called when the bfqq no longer has requests pending, remove it from * the service tree. @@ -1147,6 +1160,10 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_dequeue(bfqq_group(bfqq)); +#endif + bfq_deactivate_bfqq(bfqd, bfqq, requeue); } diff --git a/block/bfq.h b/block/bfq.h index 93d3f6e95..97a677f8c 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v7r8 for 4.3.0: data structures and common functions prototypes. + * BFQ-v7r10 for 4.4.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> @@ -17,12 +17,14 @@ #include <linux/hrtimer.h> #include <linux/ioprio.h> #include <linux/rbtree.h> +#include <linux/blk-cgroup.h> #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) -#define BFQ_MIN_WEIGHT 1 -#define BFQ_MAX_WEIGHT 1000 +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 +#define BFQ_WEIGHT_CONVERSION_COEFF 10 #define BFQ_DEFAULT_QUEUE_IOPRIO 4 @@ -117,12 +119,8 @@ struct bfq_weight_counter { * @ioprio: the ioprio in use. * @new_weight: when a weight change is requested, the new weight value. * @orig_weight: original weight, used to implement weight boosting - * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. - * @ioprio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. + * @prio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. * * A bfq_entity is used to represent either a bfq_queue (leaf node in the * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each @@ -134,7 +132,7 @@ struct bfq_weight_counter { * allow different weights on different devices, but this * functionality is not exported to userspace by now. Priorities and * weights are updated lazily, first storing the new values into the - * new_* fields, then setting the @ioprio_changed flag. As soon as + * new_* fields, then setting the @prio_changed flag. As soon as * there is a transition in the entity state that allows the priority * update to take place the effective and the requested priority * values are synchronized. @@ -161,7 +159,7 @@ struct bfq_entity { u64 min_start; - unsigned long service, budget; + int service, budget; unsigned short weight, new_weight; unsigned short orig_weight; @@ -170,10 +168,7 @@ struct bfq_entity { struct bfq_sched_data *my_sched_data; struct bfq_sched_data *sched_data; - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; - - int ioprio_changed; + int prio_changed; }; struct bfq_group; @@ -182,10 +177,14 @@ struct bfq_group; * struct bfq_queue - leaf schedulable entity. * @ref: reference counter. * @bfqd: parent bfq_data. 
+ * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. * @new_bfqq: shared bfq_queue if queue is cooperating with * one or more other queues. - * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). + * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). * @sort_list: sorted list of pending requests. * @next_rq: if fifo isn't expired, next request to serve. * @queued: nr of requests queued in @sort_list. @@ -239,6 +238,9 @@ struct bfq_queue { atomic_t ref; struct bfq_data *bfqd; + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + /* fields for cooperating queues handling */ struct bfq_queue *new_bfqq; struct rb_node pos_node; @@ -253,7 +255,7 @@ struct bfq_queue { struct bfq_entity entity; - unsigned long max_budget; + int max_budget; unsigned long budget_timeout; int dispatched; @@ -302,6 +304,8 @@ struct bfq_ttime { * @icq: associated io_cq structure * @bfqq: array of two process queues, the sync and the async * @ttime: associated @bfq_ttime struct + * @ioprio: per (request_queue, blkcg) ioprio. + * @blkcg_id: id of the blkcg the related io_cq belongs to. * @wr_time_left: snapshot of the time left before weight raising ends * for the sync queue associated to this process; this * snapshot is taken to remember this value while the weight @@ -329,6 +333,10 @@ struct bfq_io_cq { struct bfq_ttime ttime; int ioprio; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + uint64_t blkcg_id; /* the current blkcg ID */ +#endif + unsigned int wr_time_left; bool saved_idle_window; bool saved_IO_bound; @@ -349,9 +357,6 @@ enum bfq_device_speed { * struct bfq_data - per device data structure. * @queue: request queue for the managed device. * @root_group: root bfq_group for the device. - * @rq_pos_tree: rbtree sorted by next_request position, used when - * determining if two or more queues have interleaving - * requests (see bfq_close_cooperator()). * @active_numerous_groups: number of bfq_groups containing more than one * active @bfq_entity. 
* @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by @@ -485,9 +490,8 @@ struct bfq_data { struct request_queue *queue; struct bfq_group *root_group; - struct rb_root rq_pos_tree; -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED int active_numerous_groups; #endif @@ -520,7 +524,7 @@ struct bfq_data { ktime_t last_idling_start; int peak_rate_samples; u64 peak_rate; - unsigned long bfq_max_budget; + int bfq_max_budget; struct hlist_head group_list; struct list_head active_list; @@ -532,8 +536,8 @@ struct bfq_data { unsigned int bfq_slice_idle; u64 bfq_class_idle_last_service; - unsigned int bfq_user_max_budget; - unsigned int bfq_max_budget_async_rq; + int bfq_user_max_budget; + int bfq_max_budget_async_rq; unsigned int bfq_timeout[2]; unsigned int bfq_coop_thresh; @@ -593,15 +597,15 @@ enum bfqq_state_flags { }; #define BFQ_BFQQ_FNS(name) \ -static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ { \ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ } \ -static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ { \ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ } \ -static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ { \ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ } @@ -640,14 +644,64 @@ enum bfqq_expiration { BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ }; -#ifdef CONFIG_CGROUP_BFQIO +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +struct bfqg_stats { + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total sectors transferred */ + struct blkg_stat sectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; + /* time not charged to this cgroup */ + struct blkg_stat unaccounted_time; + /* sum of number of ios queued across all samples */ + struct blkg_stat avg_queue_size_sum; + /* count of samples taken for average */ + struct blkg_stat avg_queue_size_samples; + /* how many times this group has been removed from service tree */ + struct blkg_stat dequeue; + /* total time spent waiting for it to be assigned a timeslice. */ + struct blkg_stat group_wait_time; + /* time spent idling for this blkcg_gq */ + struct blkg_stat idle_time; + /* total time with empty current active q with other requests queued */ + struct blkg_stat empty_time; + /* fields after this shouldn't be cleared on stat reset */ + uint64_t start_group_wait_time; + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +}; + +/* + * struct bfq_group_data - per-blkcg storage for the blkio subsystem. + * + * @ps: @blkcg_policy_storage that this structure inherits + * @weight: weight of the bfq_group + */ +struct bfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + + unsigned short weight; +}; + /** * struct bfq_group - per (device, cgroup) data structure. * @entity: schedulable entity to insert into the parent group sched_data. 
* @sched_data: own sched_data, to contain child entities (they may be * both bfq_queues and bfq_groups). - * @group_node: node to be inserted into the bfqio_cgroup->group_data - * list of the containing cgroup's bfqio_cgroup. * @bfqd_node: node to be inserted into the @bfqd->group_list list * of the groups active on the same device; used for cleanup. * @bfqd: the bfq_data for the device this group acts upon. @@ -663,23 +717,26 @@ enum bfqq_expiration { * are groups with more than one active @bfq_entity * (see the comments to the function * bfq_bfqq_must_not_expire()). + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_find_close_cooperator()). * * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup * there is a set of bfq_groups, each one collecting the lower-level * entities belonging to the group that are acting on the same device. * * Locking works as follows: - * o @group_node is protected by the bfqio_cgroup lock, and is accessed - * via RCU from its readers. * o @bfqd is protected by the queue lock, RCU is used to access it * from the readers. * o All the other fields are protected by the @bfqd queue lock. */ struct bfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + struct bfq_entity entity; struct bfq_sched_data sched_data; - struct hlist_node group_node; struct hlist_node bfqd_node; void *bfqd; @@ -690,44 +747,33 @@ struct bfq_group { struct bfq_entity *my_entity; int active_entities; -}; -/** - * struct bfqio_cgroup - bfq cgroup data structure. - * @css: subsystem state for bfq in the containing cgroup. - * @online: flag marked when the subsystem is inserted. - * @weight: cgroup weight. - * @ioprio: cgroup ioprio. - * @ioprio_class: cgroup ioprio_class. - * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. - * @group_data: list containing the bfq_group belonging to this cgroup. - * - * @group_data is accessed using RCU, with @lock protecting the updates, - * @ioprio and @ioprio_class are protected by @lock. - */ -struct bfqio_cgroup { - struct cgroup_subsys_state css; - bool online; - - unsigned short weight, ioprio, ioprio_class; + struct rb_root rq_pos_tree; - spinlock_t lock; - struct hlist_head group_data; + struct bfqg_stats stats; + struct bfqg_stats dead_stats; /* stats pushed from dead children */ }; + #else struct bfq_group { struct bfq_sched_data sched_data; struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; struct bfq_queue *async_idle_bfqq; + + struct rb_root rq_pos_tree; }; #endif -static inline struct bfq_service_tree * +static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); + +static struct bfq_service_tree * bfq_entity_service_tree(struct bfq_entity *entity) { struct bfq_sched_data *sched_data = entity->sched_data; - unsigned int idx = entity->ioprio_class - 1; + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned int idx = bfqq ? 
bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS; BUG_ON(idx >= BFQ_IOPRIO_CLASSES); BUG_ON(sched_data == NULL); @@ -735,19 +781,18 @@ bfq_entity_service_tree(struct bfq_entity *entity) return sched_data->service_tree + idx; } -static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, - bool is_sync) +static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) { return bic->bfqq[is_sync]; } -static inline void bic_set_bfqq(struct bfq_io_cq *bic, - struct bfq_queue *bfqq, bool is_sync) +static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, + bool is_sync) { bic->bfqq[is_sync] = bfqq; } -static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) { return bic->icq.q->elevator->elevator_data; } @@ -766,8 +811,7 @@ static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) * the function returns NULL, with the queue unlocked, otherwise it * returns the dereferenced pointer, with the queue locked. */ -static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, - unsigned long *flags) +static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) { struct bfq_data *bfqd; @@ -776,7 +820,9 @@ static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, if (bfqd != NULL) { spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (*ptr == bfqd) + if (ptr == NULL) + printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); + else if (*ptr == bfqd) goto out; spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); } @@ -787,17 +833,37 @@ out: return bfqd; } -static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, - unsigned long *flags) +static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) { spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); } -static void bfq_check_ioprio_change(struct bfq_io_cq *bic); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + if (!group_entity) + group_entity = &bfqq->bfqd->root_group->entity; + + return container_of(group_entity, struct bfq_group, entity); +} + +#else + +static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} + +#endif + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, int is_sync, + struct bio *bio, int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask); static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 14b8faf8b..f6325d573 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -32,6 +32,11 @@ static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -177,11 +182,11 @@ bool bio_integrity_enabled(struct bio *bio) if (bi == NULL) return false; - if (bio_data_dir(bio) == READ && bi->verify_fn != NULL && + if (bio_data_dir(bio) == READ && bi->profile->verify_fn != NULL && (bi->flags & BLK_INTEGRITY_VERIFY)) return true; - if (bio_data_dir(bio) == WRITE && bi->generate_fn 
!= NULL && + if (bio_data_dir(bio) == WRITE && bi->profile->generate_fn != NULL && (bi->flags & BLK_INTEGRITY_GENERATE)) return true; @@ -202,7 +207,7 @@ EXPORT_SYMBOL(bio_integrity_enabled); static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { - return sectors >> (ilog2(bi->interval) - 9); + return sectors >> (bi->interval_exp - 9); } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, @@ -229,7 +234,7 @@ static int bio_integrity_process(struct bio *bio, bip->bip_vec->bv_offset; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = bi->interval; + iter.interval = 1 << bi->interval_exp; iter.seed = bip_get_seed(bip); iter.prot_buf = prot_buf; @@ -340,7 +345,7 @@ int bio_integrity_prep(struct bio *bio) /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE) - bio_integrity_process(bio, bi->generate_fn); + bio_integrity_process(bio, bi->profile->generate_fn); return 0; } @@ -361,7 +366,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio *bio = bip->bip_bio; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - bio->bi_error = bio_integrity_process(bio, bi->verify_fn); + bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; diff --git a/block/bio.c b/block/bio.c index ad3f276d7..4f184d938 100644 --- a/block/bio.c +++ b/block/bio.c @@ -211,7 +211,7 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); /* * Make this allocation restricted and don't dump info on @@ -221,11 +221,11 @@ fallback: __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * Try a slab allocation. If this fails and __GFP_WAIT + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BIOVEC_MAX_IDX; goto fallback; } @@ -395,12 +395,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is * backed by the @bs's mempool. * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. * * Note that when running under generic_make_request() (i.e. 
any block * driver), bios are not submitted until after you return - see the code in @@ -459,13 +459,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. */ if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 55512dd62..5a37188b5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -899,6 +899,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) struct cftype blkcg_files[] = { { .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = blkcg_print_stat, }, { } /* terminate */ @@ -1126,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/block/blk-core.c b/block/blk-core.c index 48a6cc8df..e8e229c70 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -209,6 +209,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs) EXPORT_SYMBOL(blk_delay_queue); /** + * blk_start_queue_async - asynchronously restart a previously stopped queue + * @q: The &struct request_queue in question + * + * Description: + * blk_start_queue_async() will clear the stop flag on the queue, and + * ensure that the request_fn for the queue is run from an async + * context. + **/ +void blk_start_queue_async(struct request_queue *q) +{ + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + blk_run_queue_async(q); +} +EXPORT_SYMBOL(blk_start_queue_async); + +/** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question * @@ -556,22 +572,23 @@ void blk_cleanup_queue(struct request_queue *q) * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that q->request_fn() gets invoked after draining finished. 
*/ - if (q->mq_ops) { - blk_mq_freeze_queue(q); - spin_lock_irq(lock); - } else { - spin_lock_irq(lock); + blk_freeze_queue(q); + spin_lock_irq(lock); + if (!q->mq_ops) __blk_drain_queue(q, true); - } queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); + /* for synchronous bio-based driver finish in-flight integrity i/o */ + blk_flush_integrity(); + /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); if (q->mq_ops) blk_mq_free_queue(q); + percpu_ref_exit(&q->q_usage_counter); spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) @@ -631,6 +648,40 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); +int blk_queue_enter(struct request_queue *q, gfp_t gfp) +{ + while (true) { + int ret; + + if (percpu_ref_tryget_live(&q->q_usage_counter)) + return 0; + + if (!gfpflags_allow_blocking(gfp)) + return -EBUSY; + + ret = wait_event_interruptible(q->mq_freeze_wq, + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); + if (blk_queue_dying(q)) + return -ENODEV; + if (ret) + return ret; + } +} + +void blk_queue_exit(struct request_queue *q) +{ + percpu_ref_put(&q->q_usage_counter); +} + +static void blk_queue_usage_counter_release(struct percpu_ref *ref) +{ + struct request_queue *q = + container_of(ref, struct request_queue, q_usage_counter); + + wake_up_all(&q->mq_freeze_wq); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -692,11 +743,22 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_waitqueue_head(&q->mq_freeze_wq); - if (blkcg_init_queue(q)) + /* + * Init percpu_ref in atomic mode so that it's faster to shutdown. + * See blk_register_queue() for details. + */ + if (percpu_ref_init(&q->q_usage_counter, + blk_queue_usage_counter_release, + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; + if (blkcg_init_queue(q)) + goto fail_ref; + return q; +fail_ref: + percpu_ref_exit(&q->q_usage_counter); fail_bdi: bdi_destroy(&q->backing_dev_info); fail_split: @@ -765,7 +827,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) } EXPORT_SYMBOL(blk_init_queue_node); -static void blk_queue_bio(struct request_queue *q, struct bio *bio); +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, @@ -1162,8 +1224,8 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. @@ -1183,7 +1245,7 @@ retry: if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1261,11 +1323,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. 
Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ @@ -1531,6 +1593,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests + * @same_queue_rq: pointer to &struct request that gets filled in when + * another request associated with @q is found on the plug list + * (optional, may be %NULL) * * Determine whether @bio being queued on @q can be merged with a request * on %current's plugged list. Returns %true if merge was successful, @@ -1596,6 +1661,30 @@ out: return ret; } +unsigned int blk_plug_queued_count(struct request_queue *q) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + unsigned int ret = 0; + + plug = current->plug; + if (!plug) + goto out; + + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry(rq, plug_list, queuelist) { + if (rq->q == q) + ret++; + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -1610,7 +1699,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -static void blk_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1618,8 +1707,6 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) struct request *req; unsigned int request_count = 0; - blk_queue_split(q, &bio, q->bio_split); - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1627,10 +1714,12 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); + blk_queue_split(q, &bio, q->bio_split); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio->bi_error = -EIO; bio_endio(bio); - return; + return BLK_QC_T_NONE; } if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { @@ -1643,9 +1732,11 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. 
*/ - if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return; + if (!blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); spin_lock_irq(q->queue_lock); @@ -1721,6 +1812,8 @@ get_rq: out_unlock: spin_unlock_irq(q->queue_lock); } + + return BLK_QC_T_NONE; } /* @@ -1926,12 +2019,13 @@ end_io: * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. */ -void generic_make_request(struct bio *bio) +blk_qc_t generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) - return; + goto out; /* * We only want one ->make_request_fn to be active at a time, else @@ -1945,7 +2039,7 @@ void generic_make_request(struct bio *bio) */ if (current->bio_list) { bio_list_add(current->bio_list, bio); - return; + goto out; } /* following loop may be a bit non-obvious, and so deserves some @@ -1968,11 +2062,24 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - q->make_request_fn(q, bio); + if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { + + ret = q->make_request_fn(q, bio); + + blk_queue_exit(q); - bio = bio_list_pop(current->bio_list); + bio = bio_list_pop(current->bio_list); + } else { + struct bio *bio_next = bio_list_pop(current->bio_list); + + bio_io_error(bio); + bio = bio_next; + } } while (bio); current->bio_list = NULL; /* deactivate */ + +out: + return ret; } EXPORT_SYMBOL(generic_make_request); @@ -1986,7 +2093,7 @@ EXPORT_SYMBOL(generic_make_request); * interfaces; @bio must be presetup and ready for I/O. * */ -void submit_bio(int rw, struct bio *bio) +blk_qc_t submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; @@ -2023,12 +2130,13 @@ void submit_bio(int rw, struct bio *bio) } } - generic_make_request(bio); + return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); /** - * blk_rq_check_limits - Helper function to check a request for the queue limit + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for new the queue limits * @q: the queue * @rq: the request being checked * @@ -2039,20 +2147,13 @@ EXPORT_SYMBOL(submit_bio); * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * - * This function should also be useful for request stacking drivers - * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests against - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. 
*/ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) +static int blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) { - if (!rq_mergeable(rq)) - return 0; - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -2072,7 +2173,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) return 0; } -EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request @@ -2084,7 +2184,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) unsigned long flags; int where = ELEVATOR_INSERT_BACK; - if (blk_rq_check_limits(q, rq)) + if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && @@ -3229,6 +3329,47 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_plug *plug; + long state; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + state = current->state; + while (!need_resched()) { + unsigned int queue_num = blk_qc_t_to_queue_num(cookie); + struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine @@ -3285,6 +3426,9 @@ int blk_pre_runtime_suspend(struct request_queue *q) { int ret = 0; + if (!q->dev) + return ret; + spin_lock_irq(q->queue_lock); if (q->nr_pending) { ret = -EBUSY; @@ -3312,6 +3456,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend); */ void blk_post_runtime_suspend(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; @@ -3336,6 +3483,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend); */ void blk_pre_runtime_resume(struct request_queue *q) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); q->rpm_status = RPM_RESUMING; spin_unlock_irq(q->queue_lock); @@ -3358,6 +3508,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); */ void blk_post_runtime_resume(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 75f29cf70..d69c5c79f 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -30,10 +30,6 @@ #include "blk.h" -static struct kmem_cache *integrity_cachep; - -static const char *bi_unsupported_name = "unsupported"; - /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue @@ -146,40 +142,40 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); */ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) { - struct blk_integrity *b1 = gd1->integrity; - struct blk_integrity *b2 = gd2->integrity; + struct blk_integrity *b1 = &gd1->queue->integrity; + struct blk_integrity *b2 = &gd2->queue->integrity; - if (!b1 && !b2) + if (!b1->profile && !b2->profile) 
return 0; - if (!b1 || !b2) + if (!b1->profile || !b2->profile) return -1; - if (b1->interval != b2->interval) { + if (b1->interval_exp != b2->interval_exp) { pr_err("%s: %s/%s protection interval %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, - b1->interval, b2->interval); + 1 << b1->interval_exp, 1 << b2->interval_exp); return -1; } if (b1->tuple_size != b2->tuple_size) { - printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, b1->tuple_size, b2->tuple_size); return -1; } if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { - printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + pr_err("%s: %s/%s tag sz %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, b1->tag_size, b2->tag_size); return -1; } - if (strcmp(b1->name, b2->name)) { - printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + if (b1->profile != b2->profile) { + pr_err("%s: %s/%s type %s != %s\n", __func__, gd1->disk_name, gd2->disk_name, - b1->name, b2->name); + b1->profile->name, b2->profile->name); return -1; } @@ -249,8 +245,8 @@ struct integrity_sysfs_entry { static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); + struct blk_integrity *bi = &disk->queue->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); @@ -261,8 +257,8 @@ static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); + struct blk_integrity *bi = &disk->queue->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); ssize_t ret = 0; @@ -275,18 +271,21 @@ static ssize_t integrity_attr_store(struct kobject *kobj, static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) { - if (bi != NULL && bi->name != NULL) - return sprintf(page, "%s\n", bi->name); + if (bi->profile && bi->profile->name) + return sprintf(page, "%s\n", bi->profile->name); else return sprintf(page, "none\n"); } static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) { - if (bi != NULL) - return sprintf(page, "%u\n", bi->tag_size); - else - return sprintf(page, "0\n"); + return sprintf(page, "%u\n", bi->tag_size); +} + +static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%u\n", + bi->interval_exp ? 
1 << bi->interval_exp : 0); } static ssize_t integrity_verify_store(struct blk_integrity *bi, @@ -343,6 +342,11 @@ static struct integrity_sysfs_entry integrity_tag_size_entry = { .show = integrity_tag_size_show, }; +static struct integrity_sysfs_entry integrity_interval_entry = { + .attr = { .name = "protection_interval_bytes", .mode = S_IRUGO }, + .show = integrity_interval_show, +}; + static struct integrity_sysfs_entry integrity_verify_entry = { .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, .show = integrity_verify_show, @@ -363,6 +367,7 @@ static struct integrity_sysfs_entry integrity_device_entry = { static struct attribute *integrity_attrs[] = { &integrity_format_entry.attr, &integrity_tag_size_entry.attr, + &integrity_interval_entry.attr, &integrity_verify_entry.attr, &integrity_generate_entry.attr, &integrity_device_entry.attr, @@ -374,114 +379,89 @@ static const struct sysfs_ops integrity_ops = { .store = &integrity_attr_store, }; -static int __init blk_dev_integrity_init(void) -{ - integrity_cachep = kmem_cache_create("blkdev_integrity", - sizeof(struct blk_integrity), - 0, SLAB_PANIC, NULL); - return 0; -} -subsys_initcall(blk_dev_integrity_init); - -static void blk_integrity_release(struct kobject *kobj) -{ - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); - - kmem_cache_free(integrity_cachep, bi); -} - static struct kobj_type integrity_ktype = { .default_attrs = integrity_attrs, .sysfs_ops = &integrity_ops, - .release = blk_integrity_release, }; -bool blk_integrity_is_initialized(struct gendisk *disk) +static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) { - struct blk_integrity *bi = blk_get_integrity(disk); - - return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); + return 0; } -EXPORT_SYMBOL(blk_integrity_is_initialized); + +static struct blk_integrity_profile nop_profile = { + .name = "nop", + .generate_fn = blk_integrity_nop_fn, + .verify_fn = blk_integrity_nop_fn, +}; /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware - * @template: optional integrity profile to register + * @template: block integrity profile to register * - * Description: When a device needs to advertise itself as being able - * to send/receive integrity metadata it must use this function to - * register the capability with the block layer. The template is a - * blk_integrity struct with values appropriate for the underlying - * hardware. If template is NULL the new profile is allocated but - * not filled out. See Documentation/block/data-integrity.txt. + * Description: When a device needs to advertise itself as being able to + * send/receive integrity metadata it must use this function to register + * the capability with the block layer. The template is a blk_integrity + * struct with values appropriate for the underlying hardware. See + * Documentation/block/data-integrity.txt. */ -int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) { - struct blk_integrity *bi; + struct blk_integrity *bi = &disk->queue->integrity; - BUG_ON(disk == NULL); + bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | + template->flags; + bi->interval_exp = ilog2(queue_logical_block_size(disk->queue)); + bi->profile = template->profile ? 
template->profile : &nop_profile; + bi->tuple_size = template->tuple_size; + bi->tag_size = template->tag_size; - if (disk->integrity == NULL) { - bi = kmem_cache_alloc(integrity_cachep, - GFP_KERNEL | __GFP_ZERO); - if (!bi) - return -1; - - if (kobject_init_and_add(&bi->kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, - "%s", "integrity")) { - kmem_cache_free(integrity_cachep, bi); - return -1; - } - - kobject_uevent(&bi->kobj, KOBJ_ADD); - - bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE; - bi->interval = queue_logical_block_size(disk->queue); - disk->integrity = bi; - } else - bi = disk->integrity; - - /* Use the provided profile as template */ - if (template != NULL) { - bi->name = template->name; - bi->generate_fn = template->generate_fn; - bi->verify_fn = template->verify_fn; - bi->tuple_size = template->tuple_size; - bi->tag_size = template->tag_size; - bi->flags |= template->flags; - } else - bi->name = bi_unsupported_name; - - disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; - - return 0; + blk_integrity_revalidate(disk); } EXPORT_SYMBOL(blk_integrity_register); /** - * blk_integrity_unregister - Remove block integrity profile - * @disk: disk whose integrity profile to deallocate + * blk_integrity_unregister - Unregister block integrity profile + * @disk: disk whose integrity profile to unregister * - * Description: This function frees all memory used by the block - * integrity profile. To be called at device teardown. + * Description: This function unregisters the integrity capability from + * a block device. */ void blk_integrity_unregister(struct gendisk *disk) { - struct blk_integrity *bi; + blk_integrity_revalidate(disk); + memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); +} +EXPORT_SYMBOL(blk_integrity_unregister); + +void blk_integrity_revalidate(struct gendisk *disk) +{ + struct blk_integrity *bi = &disk->queue->integrity; - if (!disk || !disk->integrity) + if (!(disk->flags & GENHD_FL_UP)) return; - disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES; + if (bi->profile) + disk->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + disk->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; +} + +void blk_integrity_add(struct gendisk *disk) +{ + if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity")) + return; - bi = disk->integrity; + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); +} - kobject_uevent(&bi->kobj, KOBJ_REMOVE); - kobject_del(&bi->kobj); - kobject_put(&bi->kobj); - disk->integrity = NULL; +void blk_integrity_del(struct gendisk *disk) +{ + kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE); + kobject_del(&disk->integrity_kobj); + kobject_put(&disk->integrity_kobj); } -EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1a27f45ec..381cb50a6 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task, { struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); diff --git a/block/blk-merge.c b/block/blk-merge.c index 0e5f4fc12..e01405a3e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -11,13 +11,16 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *nsegs) { unsigned int max_discard_sectors, granularity; int 
alignment; sector_t tmp; unsigned split_sectors; + *nsegs = 1; + /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); @@ -51,8 +54,11 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, static struct bio *blk_bio_write_same_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *nsegs) { + *nsegs = 1; + if (!q->limits.max_write_same_sectors) return NULL; @@ -64,11 +70,15 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *segs) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned seg_size = 0, nsegs = 0, sectors = 0; + unsigned front_seg_size = bio->bi_seg_front_size; + bool do_split = true; + struct bio *new = NULL; bio_for_each_segment(bv, bio, iter) { if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) @@ -93,6 +103,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bvprv = bv; bvprvp = &bvprv; sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; continue; } new_segment: @@ -104,26 +117,50 @@ new_segment: bvprvp = &bvprv; seg_size = bv.bv_len; sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; } - return NULL; + do_split = false; split: - return bio_split(bio, sectors, GFP_NOIO, bs); + *segs = nsegs; + + if (do_split) { + new = bio_split(bio, sectors, GFP_NOIO, bs); + if (new) + bio = new; + } + + bio->bi_seg_front_size = front_seg_size; + if (seg_size > bio->bi_seg_back_size) + bio->bi_seg_back_size = seg_size; + + return do_split ? new : NULL; } void blk_queue_split(struct request_queue *q, struct bio **bio, struct bio_set *bs) { - struct bio *split; + struct bio *split, *res; + unsigned nsegs; if ((*bio)->bi_rw & REQ_DISCARD) - split = blk_bio_discard_split(q, *bio, bs); + split = blk_bio_discard_split(q, *bio, bs, &nsegs); else if ((*bio)->bi_rw & REQ_WRITE_SAME) - split = blk_bio_write_same_split(q, *bio, bs); + split = blk_bio_write_same_split(q, *bio, bs, &nsegs); else - split = blk_bio_segment_split(q, *bio, q->bio_split); + split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); + + /* physical segments can be figured out during splitting */ + res = split ? 
split : *bio; + res->bi_phys_segments = nsegs; + bio_set_flag(res, BIO_SEG_VALID); if (split) { + /* there isn't chance to merge the splitted bio */ + split->bi_rw |= REQ_NOMERGE; + bio_chain(split, *bio); generic_make_request(*bio); *bio = split; @@ -394,6 +431,12 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (sg) sg_mark_end(sg); + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > rq->nr_phys_segments); + return nsegs; } EXPORT_SYMBOL(blk_rq_map_sg); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 788fffd9b..1cf18784c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -174,6 +174,11 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) return ret; } +static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); +} + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -295,6 +300,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { + .attr = {.name = "io_poll", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_poll_show, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -304,6 +313,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, + &blk_mq_hw_sysfs_poll.attr, NULL, }; @@ -413,12 +423,6 @@ static void blk_mq_sysfs_init(struct request_queue *q) kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } -/* see blk_register_queue() */ -void blk_mq_finish_init(struct request_queue *q) -{ - percpu_ref_switch_to_percpu(&q->mq_usage_counter); -} - int blk_mq_register_disk(struct gendisk *disk) { struct device *dev = disk_to_dev(disk); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ec2d11915..a07ca3488 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -75,6 +75,10 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) struct blk_mq_bitmap_tags *bt; int i, wake_index; + /* + * Make sure all changes prior to this are visible from other CPUs. 
+ */ + smp_mb(); bt = &tags->bitmap_tags; wake_index = atomic_read(&bt->wake_index); for (i = 0; i < BT_WAIT_QUEUES; i++) { @@ -264,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!(data->gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(data->gfp)) return -1; bs = bt_wait_ptr(bt, hctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 85f014327..6d6f8feb4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -9,6 +9,7 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> @@ -77,47 +78,13 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) -{ - while (true) { - int ret; - - if (percpu_ref_tryget_live(&q->mq_usage_counter)) - return 0; - - if (!(gfp & __GFP_WAIT)) - return -EBUSY; - - ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || - blk_queue_dying(q)); - if (blk_queue_dying(q)) - return -ENODEV; - if (ret) - return ret; - } -} - -static void blk_mq_queue_exit(struct request_queue *q) -{ - percpu_ref_put(&q->mq_usage_counter); -} - -static void blk_mq_usage_counter_release(struct percpu_ref *ref) -{ - struct request_queue *q = - container_of(ref, struct request_queue, mq_usage_counter); - - wake_up_all(&q->mq_freeze_wq); -} - void blk_mq_freeze_queue_start(struct request_queue *q) { int freeze_depth; freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { - percpu_ref_kill(&q->mq_usage_counter); + percpu_ref_kill(&q->q_usage_counter); blk_mq_run_hw_queues(q, false); } } @@ -125,18 +92,34 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); static void blk_mq_freeze_queue_wait(struct request_queue *q) { - wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); + wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } /* * Guarantee no request is in use, so we can change any data structure of * the queue afterward. */ -void blk_mq_freeze_queue(struct request_queue *q) +void blk_freeze_queue(struct request_queue *q) { + /* + * In the !blk_mq case we are only calling this to kill the + * q_usage_counter, otherwise this increases the freeze depth + * and waits for it to return to zero. For this reason there is + * no blk_unfreeze_queue(), and blk_freeze_queue() is not + * exported to drivers as the only user for unfreeze is blk_mq. 
+ */ blk_mq_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } + +void blk_mq_freeze_queue(struct request_queue *q) +{ + /* + * ...just an alias to keep freeze and unfreeze actions balanced + * in the blk_mq_* namespace + */ + blk_freeze_queue(q); +} EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) @@ -146,7 +129,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q) freeze_depth = atomic_dec_return(&q->mq_freeze_depth); WARN_ON_ONCE(freeze_depth < 0); if (!freeze_depth) { - percpu_ref_reinit(&q->mq_usage_counter); + percpu_ref_reinit(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } } @@ -255,17 +238,17 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_mq_queue_enter(q, gfp); + ret = blk_queue_enter(q, gfp); if (ret) return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, + blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, reserved, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_WAIT)) { + if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); @@ -278,7 +261,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, } blk_mq_put_ctx(ctx); if (!rq) { - blk_mq_queue_exit(q); + blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } return rq; @@ -297,7 +280,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); blk_mq_put_tag(hctx, tag, &ctx->last_tag); - blk_mq_queue_exit(q); + blk_queue_exit(q); } void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) @@ -375,7 +358,7 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } -void __blk_mq_complete_request(struct request *rq) +static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; @@ -989,18 +972,25 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_queue); -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, + bool at_head) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - trace_block_rq_insert(hctx->queue, rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); +} + +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + __blk_mq_insert_req_list(hctx, ctx, rq, at_head); blk_mq_hctx_mark_pending(hctx, ctx); } @@ -1056,8 +1046,9 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq, false); + __blk_mq_insert_req_list(hctx, ctx, rq, false); } + blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); blk_mq_run_hw_queue(hctx, from_schedule); @@ -1139,7 +1130,7 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - if (!hctx_allow_merges(hctx)) { + if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: @@ 
-1176,11 +1167,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, int rw = bio_data_dir(bio); struct blk_mq_alloc_data alloc_data; - if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) { - bio_io_error(bio); - return NULL; - } - + blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -1199,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; @@ -1211,7 +1198,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } -static int blk_mq_direct_issue_request(struct request *rq) +static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) { int ret; struct request_queue *q = rq->q; @@ -1222,6 +1209,7 @@ static int blk_mq_direct_issue_request(struct request *rq) .list = NULL, .last = 1 }; + blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); /* * For OK queue, we are done. For error, kill it. Any other @@ -1229,18 +1217,21 @@ static int blk_mq_direct_issue_request(struct request *rq) * would have done */ ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) + if (ret == BLK_MQ_RQ_QUEUE_OK) { + *cookie = new_cookie; return 0; - else { - __blk_mq_requeue_request(rq); + } - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - return 0; - } - return -1; + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + *cookie = BLK_QC_T_NONE; + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; } + + return -1; } /* @@ -1248,7 +1239,7 @@ static int blk_mq_direct_issue_request(struct request *rq) * but will attempt to bypass the hctx queueing if we can go straight to * hardware for SYNC IO. */ -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); @@ -1257,23 +1248,29 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) unsigned int request_count = 0; struct blk_plug *plug; struct request *same_queue_rq = NULL; + blk_qc_t cookie; blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } blk_queue_split(q, &bio, q->bio_split); - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) - return; + if (!is_flush_fua && !blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, + &same_queue_rq)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1294,15 +1291,16 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); /* - * we do limited pluging. If bio can be merged, do merge. + * We do limited pluging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be * issued. 
So the plug list will have one request at most */ if (plug) { /* * The plug list might get flushed before this. If that - * happens, same_queue_rq is invalid and plug list is empty - **/ + * happens, same_queue_rq is invalid and plug list is + * empty + */ if (same_queue_rq && !list_empty(&plug->mq_list)) { old_rq = same_queue_rq; list_del_init(&old_rq->queuelist); @@ -1312,11 +1310,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) old_rq = rq; blk_mq_put_ctx(data.ctx); if (!old_rq) - return; - if (!blk_mq_direct_issue_request(old_rq)) - return; + goto done; + if (!blk_mq_direct_issue_request(old_rq, &cookie)) + goto done; blk_mq_insert_request(old_rq, false, true, true); - return; + goto done; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1330,13 +1328,15 @@ run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } blk_mq_put_ctx(data.ctx); +done: + return cookie; } /* * Single hardware queue variant. This will attempt to use any per-process * plug for merging and IO deferral. */ -static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); @@ -1344,23 +1344,26 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; + blk_qc_t cookie; blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } blk_queue_split(q, &bio, q->bio_split); if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return; + return BLK_QC_T_NONE; rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1376,15 +1379,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) plug = current->plug; if (plug) { blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) + if (!request_count) trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { + + blk_mq_put_ctx(data.ctx); + + if (request_count >= BLK_MAX_REQUEST_COUNT) { blk_flush_plug_list(plug, false); trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + return cookie; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1399,6 +1405,7 @@ run_queue: } blk_mq_put_ctx(data.ctx); + return cookie; } /* @@ -1430,6 +1437,11 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_init_rq_map(). + */ + kmemleak_free(page_address(page)); __free_pages(page, page->private); } @@ -1502,6 +1514,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, list_add_tail(&page->lru, &tags->page_list); p = page_address(page); + /* + * Allow kmemleak to scan these pages as they contain pointers + * to additional allocations like via ops->init_request(). 
+ */ + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, set->queue_depth - i); left -= to_do * rq_size; @@ -1673,7 +1690,7 @@ static int blk_mq_init_hctx(struct request_queue *q, INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->queue_num = hctx_idx; - hctx->flags = set->flags; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1860,27 +1877,26 @@ static void blk_mq_map_swqueue(struct request_queue *q, } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) +static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - struct request_queue *q; - bool shared; int i; - if (set->tag_list.next == set->tag_list.prev) - shared = false; - else - shared = true; + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } +} + +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) +{ + struct request_queue *q; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_freeze_queue(q); - - queue_for_each_hw_ctx(q, hctx, i) { - if (shared) - hctx->flags |= BLK_MQ_F_TAG_SHARED; - else - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } + queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q); } } @@ -1891,7 +1907,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); - blk_mq_update_tag_set_depth(set); + if (list_is_singular(&set->tag_list)) { + /* just transitioned to unshared */ + set->flags &= ~BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, false); + } mutex_unlock(&set->tag_list_lock); } @@ -1901,8 +1922,17 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, q->tag_set = set; mutex_lock(&set->tag_list_lock); + + /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ + if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + set->flags |= BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, true); + } + if (set->flags & BLK_MQ_F_TAG_SHARED) + queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); - blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); } @@ -1989,14 +2019,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - /* - * Init percpu_ref in atomic mode so that it's faster to shutdown. - * See blk_register_queue() for details. - */ - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto err_hctxs; - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, set->timeout ? 
set->timeout : 30 * HZ); @@ -2077,8 +2099,6 @@ void blk_mq_free_queue(struct request_queue *q) blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); blk_mq_free_hw_queues(q, set); - - percpu_ref_exit(&q->mq_usage_counter); } /* Basically redo blk_mq_init_queue with queue frozen */ diff --git a/block/blk-mq.h b/block/blk-mq.h index f4fea7964..713820b47 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -25,12 +25,9 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); -void blk_mq_clone_flush_request(struct request *flush_rq, - struct request *orig_rq); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); diff --git a/block/blk-settings.c b/block/blk-settings.c index 7d8f129a1..dd4973583 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,7 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors = + BLK_SAFE_MAX_SECTORS; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; @@ -127,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; + lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -214,8 +216,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request - * @limits: the queue limits + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -224,13 +226,19 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * the device driver based upon the capabilities of the I/O * controller. * + * max_dev_sectors is a hard limit imposed by the storage device for + * READ/WRITE requests. It is set by the disk driver. + * * max_sectors is a soft limit imposed by the block layer for * filesystem type requests. This value can be overridden on a * per-device basis in /sys/block/<device>/queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. 
**/ -void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { + struct queue_limits *limits = &q->limits; + unsigned int max_sectors; + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", @@ -238,22 +246,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ } limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); -} -EXPORT_SYMBOL(blk_limits_max_hw_sectors); - -/** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_hw_sectors: max hardware sectors in the usual 512b unit - * - * Description: - * See description for blk_limits_max_hw_sectors(). - **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) -{ - blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); + max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + limits->max_sectors = max_sectors; } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -527,6 +522,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 07b42f5ad..e140cc487 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -205,6 +205,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + q->limits.max_dev_sectors >> 1); + if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; @@ -317,6 +320,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_show(struct request_queue *q, char *page) +{ + return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); +} + +static ssize_t queue_poll_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long poll_on; + ssize_t ret; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + ret = queue_var_store(&poll_on, page, count); + if (ret < 0) + return ret; + + spin_lock_irq(q->queue_lock); + if (poll_on) + queue_flag_set(QUEUE_FLAG_POLL, q); + else + queue_flag_clear(QUEUE_FLAG_POLL, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -442,6 +473,12 @@ static struct queue_sysfs_entry queue_random_entry = { .store = queue_store_random, }; +static struct queue_sysfs_entry queue_poll_entry = { + .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_show, + .store = queue_poll_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -466,6 +503,7 @@ static struct attribute *default_attrs[] = { &queue_rq_affinity_entry.attr, 
&queue_iostats_entry.attr, &queue_random_entry.attr, + &queue_poll_entry.attr, NULL, }; @@ -600,9 +638,8 @@ int blk_register_queue(struct gendisk *disk) */ if (!blk_queue_init_done(q)) { queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); blk_queue_bypass_end(q); - if (q->mq_ops) - blk_mq_finish_init(q); } ret = blk_trace_init_sysfs(dev); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c75a2636d..2149a1ddb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -369,7 +369,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd) * regardless of the position of the group in the hierarchy. */ sq->parent_sq = &td->service_queue; - if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 246dfb16c..aa40aa933 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -158,11 +158,13 @@ void blk_abort_request(struct request *req) { if (blk_mark_rq_complete(req)) return; - blk_delete_timer(req); - if (req->q->mq_ops) + + if (req->q->mq_ops) { blk_mq_rq_timed_out(req, false); - else + } else { + blk_delete_timer(req); blk_rq_timed_out(req); + } } EXPORT_SYMBOL_GPL(blk_abort_request); diff --git a/block/blk.h b/block/blk.h index 98614ad37..c43926d3d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -72,6 +72,26 @@ void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); +void blk_freeze_queue(struct request_queue *q); + +static inline void blk_queue_enter_live(struct request_queue *q) +{ + /* + * Given that running in generic_make_request() context + * guarantees that a live reference against q_usage_counter has + * been established, further references under that same context + * need not check that the queue has been frozen (marked dead). + */ + percpu_ref_get(&q->q_usage_counter); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +void blk_flush_integrity(void); +#else +static inline void blk_flush_integrity(void) +{ +} +#endif void blk_rq_timed_out_timer(unsigned long data); unsigned long blk_rq_timeout(unsigned long timeout); @@ -86,6 +106,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count, struct request **same_queue_rq); +unsigned int blk_plug_queued_count(struct request_queue *q); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 04de88463..1f9093e90 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1581,7 +1581,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) static void cfq_cpd_init(struct blkcg_policy_data *cpd) { struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); - unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? + unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; if (cpd_to_blkcg(cpd) == &blkcg_root) @@ -1599,7 +1599,7 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd) static void cfq_cpd_bind(struct blkcg_policy_data *cpd) { struct blkcg *blkcg = cpd_to_blkcg(cpd); - bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); + bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; if (blkcg == &blkcg_root)
diff --git a/block/elevator.c b/block/elevator.c index 84d63943f..c3555c9c6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -420,7 +420,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * noxmerges: Only simple one-hit cache try * merges: All merge tries attempted */ - if (blk_queue_nomerges(q)) + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return ELEVATOR_NO_MERGE; /*
diff --git a/block/genhd.c b/block/genhd.c index 398dab06d..c06731166 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -632,6 +632,7 @@ void add_disk(struct gendisk *disk) WARN_ON(retval); disk_add_events(disk); + blk_integrity_add(disk); } EXPORT_SYMBOL(add_disk); @@ -640,6 +641,7 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + blk_integrity_del(disk); disk_del_events(disk); /* invalidate stuff */
diff --git a/block/ioctl.c b/block/ioctl.c index 8061eba42..0918aed2d 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -7,6 +7,7 @@ #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> +#include <linux/pr.h> #include <asm/uaccess.h> static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) @@ -193,10 +194,20 @@ int blkdev_reread_part(struct block_device *bdev) } EXPORT_SYMBOL(blkdev_reread_part); -static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, - uint64_t len, int secure) +static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, + unsigned long arg, unsigned long flags) { - unsigned long flags = 0; + uint64_t range[2]; + uint64_t start, len; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; if (start & 511) return -EINVAL; @@ -207,14 +218,24 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; - if (secure) - flags |= BLKDEV_DISCARD_SECURE; return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } -static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, - uint64_t len) +static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, + unsigned long arg) { + uint64_t range[2]; + uint64_t start, len; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + if (start & 511) return -EINVAL; if (len & 511) @@ -275,6 +296,96 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, */ EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); +static int blkdev_pr_register(struct block_device *bdev, + struct pr_registration __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_registration reg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_register) + return -EOPNOTSUPP; + if (copy_from_user(&reg, arg, sizeof(reg))) + return -EFAULT; + + if (reg.flags & ~PR_FL_IGNORE_KEY) + return
-EOPNOTSUPP; + return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); +} + +static int blkdev_pr_reserve(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_reserve) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); +} + +static int blkdev_pr_release(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_release) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags) + return -EOPNOTSUPP; + return ops->pr_release(bdev, rsv.key, rsv.type); +} + +static int blkdev_pr_preempt(struct block_device *bdev, + struct pr_preempt __user *arg, bool abort) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_preempt p; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_preempt) + return -EOPNOTSUPP; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + + if (p.flags) + return -EOPNOTSUPP; + return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); +} + +static int blkdev_pr_clear(struct block_device *bdev, + struct pr_clear __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_clear c; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_clear) + return -EOPNOTSUPP; + if (copy_from_user(&c, arg, sizeof(c))) + return -EFAULT; + + if (c.flags) + return -EOPNOTSUPP; + return ops->pr_clear(bdev, c.key); +} + /* * Is it an unrecognized ioctl? 
The correct returns are either * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a @@ -295,89 +406,115 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } -/* - * always keep this in sync with compat_blkdev_ioctl() - */ -int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, - unsigned long arg) +static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) { - struct gendisk *disk = bdev->bd_disk; - struct backing_dev_info *bdi; - loff_t size; - int ret, n; - unsigned int max_sectors; + int ret; - switch(cmd) { - case BLKFLSBUF: - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; - fsync_bdev(bdev); - invalidate_bdev(bdev); - return 0; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!is_unrecognized_ioctl(ret)) + return ret; - case BLKROSET: - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (get_user(n, (int __user *)(arg))) - return -EFAULT; - set_device_ro(bdev, n); - return 0; + fsync_bdev(bdev); + invalidate_bdev(bdev); + return 0; +} - case BLKDISCARD: - case BLKSECDISCARD: { - uint64_t range[2]; +static int blkdev_roset(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + int ret, n; - if (!(mode & FMODE_WRITE)) - return -EBADF; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!is_unrecognized_ioctl(ret)) + return ret; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (get_user(n, (int __user *)arg)) + return -EFAULT; + set_device_ro(bdev, n); + return 0; +} - if (copy_from_user(range, (void __user *)arg, sizeof(range))) - return -EFAULT; +static int blkdev_getgeo(struct block_device *bdev, + struct hd_geometry __user *argp) +{ + struct gendisk *disk = bdev->bd_disk; + struct hd_geometry geo; + int ret; - return blk_ioctl_discard(bdev, range[0], range[1], - cmd == BLKSECDISCARD); - } - case BLKZEROOUT: { - uint64_t range[2]; + if (!argp) + return -EINVAL; + if (!disk->fops->getgeo) + return -ENOTTY; + + /* + * We need to set the startsect first, the driver may + * want to override it. + */ + memset(&geo, 0, sizeof(geo)); + geo.start = get_start_sect(bdev); + ret = disk->fops->getgeo(bdev, &geo); + if (ret) + return ret; + if (copy_to_user(argp, &geo, sizeof(geo))) + return -EFAULT; + return 0; +} - if (!(mode & FMODE_WRITE)) - return -EBADF; +/* set the logical block size */ +static int blkdev_bszset(struct block_device *bdev, fmode_t mode, + int __user *argp) +{ + int ret, n; - if (copy_from_user(range, (void __user *)arg, sizeof(range))) - return -EFAULT; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!argp) + return -EINVAL; + if (get_user(n, argp)) + return -EFAULT; - return blk_ioctl_zeroout(bdev, range[0], range[1]); + if (!(mode & FMODE_EXCL)) { + bdgrab(bdev); + if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) + return -EBUSY; } - case HDIO_GETGEO: { - struct hd_geometry geo; + ret = set_blocksize(bdev, n); + if (!(mode & FMODE_EXCL)) + blkdev_put(bdev, mode | FMODE_EXCL); + return ret; +} - if (!arg) - return -EINVAL; - if (!disk->fops->getgeo) - return -ENOTTY; - - /* - * We need to set the startsect first, the driver may - * want to override it. 
- */ - memset(&geo, 0, sizeof(geo)); - geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); - if (ret) - return ret; - if (copy_to_user((struct hd_geometry __user *)arg, &geo, - sizeof(geo))) - return -EFAULT; - return 0; - } +/* + * always keep this in sync with compat_blkdev_ioctl() + */ +int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, + unsigned long arg) +{ + struct backing_dev_info *bdi; + void __user *argp = (void __user *)arg; + loff_t size; + unsigned int max_sectors; + + switch (cmd) { + case BLKFLSBUF: + return blkdev_flushbuf(bdev, mode, cmd, arg); + case BLKROSET: + return blkdev_roset(bdev, mode, cmd, arg); + case BLKDISCARD: + return blk_ioctl_discard(bdev, mode, arg, 0); + case BLKSECDISCARD: + return blk_ioctl_discard(bdev, mode, arg, + BLKDEV_DISCARD_SECURE); + case BLKZEROOUT: + return blk_ioctl_zeroout(bdev, mode, arg); + case HDIO_GETGEO: + return blkdev_getgeo(bdev, argp); case BLKRAGET: case BLKFRAGET: if (!arg) @@ -414,28 +551,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKBSZSET: - /* set the logical block size */ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!arg) - return -EINVAL; - if (get_user(n, (int __user *) arg)) - return -EFAULT; - if (!(mode & FMODE_EXCL)) { - bdgrab(bdev); - if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) - return -EBUSY; - } - ret = set_blocksize(bdev, n); - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); - return ret; + return blkdev_bszset(bdev, mode, argp); case BLKPG: - ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); - break; + return blkpg_ioctl(bdev, argp); case BLKRRPART: - ret = blkdev_reread_part(bdev); - break; + return blkdev_reread_part(bdev); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) @@ -447,11 +567,21 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESTOP: case BLKTRACESETUP: case BLKTRACETEARDOWN: - ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg); - break; + return blk_trace_ioctl(bdev, cmd, argp); + case IOC_PR_REGISTER: + return blkdev_pr_register(bdev, argp); + case IOC_PR_RESERVE: + return blkdev_pr_reserve(bdev, argp); + case IOC_PR_RELEASE: + return blkdev_pr_release(bdev, argp); + case IOC_PR_PREEMPT: + return blkdev_pr_preempt(bdev, argp, false); + case IOC_PR_PREEMPT_ABORT: + return blkdev_pr_preempt(bdev, argp, true); + case IOC_PR_CLEAR: + return blkdev_pr_clear(bdev, argp); default: - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + return __blkdev_driver_ioctl(bdev, mode, cmd, arg); } - return ret; } EXPORT_SYMBOL_GPL(blkdev_ioctl); diff --git a/block/ioprio.c b/block/ioprio.c index 31666c92b..cc7800e9e 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) + if (!uid_eq(task_uid(p), uid) || + !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) + if (!uid_eq(task_uid(p), user->uid) || + !task_pid_vnr(p)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3de89d469..a163c487c 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -21,10 +21,10 @@ static void 
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 3de89d469..a163c487c 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -21,10 +21,10 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq,
 static int noop_dispatch(struct request_queue *q, int force)
 {
 	struct noop_data *nd = q->elevator->elevator_data;
+	struct request *rq;
 
-	if (!list_empty(&nd->queue)) {
-		struct request *rq;
-		rq = list_entry(nd->queue.next, struct request, queuelist);
+	rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
+	if (rq) {
 		list_del_init(&rq->queuelist);
 		elv_dispatch_sort(q, rq);
 		return 1;
@@ -46,7 +46,7 @@ noop_former_request(struct request_queue *q, struct request *rq)
 	if (rq->queuelist.prev == &nd->queue)
 		return NULL;
 
-	return list_entry(rq->queuelist.prev, struct request, queuelist);
+	return list_prev_entry(rq, queuelist);
 }
 
 static struct request *
@@ -56,7 +56,7 @@ noop_latter_request(struct request_queue *q, struct request *rq)
 	if (rq->queuelist.next == &nd->queue)
 		return NULL;
 
-	return list_entry(rq->queuelist.next, struct request, queuelist);
+	return list_next_entry(rq, queuelist);
 }
 
 static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
diff --git a/block/partition-generic.c b/block/partition-generic.c
index e77111332..746935a59 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -397,7 +397,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
 	struct hd_struct *part;
 	int res;
 
-	if (bdev->bd_part_count)
+	if (bdev->bd_part_count || bdev->bd_super)
 		return -EBUSY;
 	res = invalidate_partition(disk, 0);
 	if (res)
@@ -428,6 +428,7 @@ rescan:
 
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
+	blk_integrity_revalidate(disk);
 	check_disk_size_change(disk, bdev);
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index c2c48ec64..621317ac4 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -32,7 +32,7 @@ int mac_partition(struct parsed_partitions *state)
 	Sector sect;
 	unsigned char *data;
 	int slot, blocks_in_map;
-	unsigned secsize;
+	unsigned secsize, datasize, partoffset;
 #ifdef CONFIG_PPC_PMAC
 	int found_root = 0;
 	int found_root_goodness = 0;
@@ -50,10 +50,14 @@ int mac_partition(struct parsed_partitions *state)
 	}
 	secsize = be16_to_cpu(md->block_size);
 	put_dev_sector(sect);
-	data = read_part_sector(state, secsize/512, &sect);
+	datasize = round_down(secsize, 512);
+	data = read_part_sector(state, datasize / 512, &sect);
 	if (!data)
 		return -1;
-	part = (struct mac_partition *) (data + secsize%512);
+	partoffset = secsize % 512;
+	if (partoffset + sizeof(*part) > datasize)
+		return -1;
+	part = (struct mac_partition *) (data + partoffset);
 	if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
 		put_dev_sector(sect);
 		return 0;		/* not a MacOS disk */
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index dda653ce7..077479994 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	}
 
-	rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
+	rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto error_free_buffer;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		break;
 	}
 
-	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
+	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) {
 		err = DRIVER_ERROR << 24;
 		goto error;
 	}
@@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	struct request *rq;
 	int err;
 
-	rq = blk_get_request(q, WRITE, __GFP_WAIT);
+	rq = blk_get_request(q, WRITE, __GFP_RECLAIM);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	blk_rq_set_block_pc(rq);
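block/scsi_ioctl.c, whose request allocations switch from __GFP_WAIT to __GFP_RECLAIM above, is the same file that services SG_IO for block devices. As a hedged userspace sketch (not derived from this patch), the snippet below sends a 6-byte TEST UNIT READY CDB through <scsi/sg.h>; the device path is an assumption and real code would decode the returned sense data.

/*
 * Illustrative sketch: issue TEST UNIT READY via SG_IO on a block device.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

int main(int argc, char **argv)
{
	unsigned char cdb[6] = { 0 };		/* TEST UNIT READY */
	unsigned char sense[32] = { 0 };
	struct sg_io_hdr hdr;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s /dev/sdX\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY | O_NONBLOCK);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';
	hdr.cmdp = cdb;
	hdr.cmd_len = sizeof(cdb);
	hdr.sbp = sense;
	hdr.mx_sb_len = sizeof(sense);
	hdr.dxfer_direction = SG_DXFER_NONE;	/* no data phase */
	hdr.timeout = 5000;			/* milliseconds */

	if (ioctl(fd, SG_IO, &hdr) < 0)
		perror("SG_IO");
	else
		printf("status 0x%x, host 0x%x, driver 0x%x\n",
		       hdr.status, hdr.host_status, hdr.driver_status);
	close(fd);
	return 0;
}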
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 24d6e9715..2c9791233 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -160,38 +160,30 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
 	return t10_pi_verify(iter, t10_pi_ip_fn, 3);
 }
 
-struct blk_integrity t10_pi_type1_crc = {
+struct blk_integrity_profile t10_pi_type1_crc = {
 	.name			= "T10-DIF-TYPE1-CRC",
 	.generate_fn		= t10_pi_type1_generate_crc,
 	.verify_fn		= t10_pi_type1_verify_crc,
-	.tuple_size		= sizeof(struct t10_pi_tuple),
-	.tag_size		= 0,
 };
 EXPORT_SYMBOL(t10_pi_type1_crc);
 
-struct blk_integrity t10_pi_type1_ip = {
+struct blk_integrity_profile t10_pi_type1_ip = {
 	.name			= "T10-DIF-TYPE1-IP",
 	.generate_fn		= t10_pi_type1_generate_ip,
 	.verify_fn		= t10_pi_type1_verify_ip,
-	.tuple_size		= sizeof(struct t10_pi_tuple),
-	.tag_size		= 0,
 };
 EXPORT_SYMBOL(t10_pi_type1_ip);
 
-struct blk_integrity t10_pi_type3_crc = {
+struct blk_integrity_profile t10_pi_type3_crc = {
 	.name			= "T10-DIF-TYPE3-CRC",
 	.generate_fn		= t10_pi_type3_generate_crc,
 	.verify_fn		= t10_pi_type3_verify_crc,
-	.tuple_size		= sizeof(struct t10_pi_tuple),
-	.tag_size		= 0,
 };
 EXPORT_SYMBOL(t10_pi_type3_crc);
 
-struct blk_integrity t10_pi_type3_ip = {
+struct blk_integrity_profile t10_pi_type3_ip = {
 	.name			= "T10-DIF-TYPE3-IP",
 	.generate_fn		= t10_pi_type3_generate_ip,
 	.verify_fn		= t10_pi_type3_verify_ip,
-	.tuple_size		= sizeof(struct t10_pi_tuple),
-	.tag_size		= 0,
 };
 EXPORT_SYMBOL(t10_pi_type3_ip);
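The profiles above keep their generate_fn/verify_fn callbacks, which compute the T10 DIF guard tag: a 16-bit CRC over each data interval. For reference, a standalone sketch of that CRC follows (polynomial 0x8BB7, initial value 0, no reflection, matching what the kernel's crc_t10dif() is documented to compute); the parameters are quoted from memory of the T10 spec, so treat this as an assumption-laden reference rather than a drop-in replacement for the kernel helper.

/*
 * Reference sketch: bitwise CRC16/T10-DIF over a 512-byte data interval,
 * the value the CRC-based profiles above store in the guard tag.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t crc16_t10dif(const uint8_t *buf, size_t len)
{
	uint16_t crc = 0;
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= (uint16_t)buf[i] << 8;
		for (bit = 0; bit < 8; bit++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7 : crc << 1;
	}
	return crc;
}

int main(void)
{
	uint8_t sector[512];

	memset(sector, 0xa5, sizeof(sector));	/* arbitrary test pattern */
	printf("guard tag: 0x%04x\n", crc16_t10dif(sector, sizeof(sector)));
	return 0;
}

The IP-checksum profiles (*-IP) use a 16-bit IP-style checksum of the interval instead of this CRC; only the guard computation differs between the two variants.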