Diffstat (limited to 'block')
-rw-r--r--block/Kconfig.iosched6
-rw-r--r--block/bfq-cgroup.c1282
-rw-r--r--block/bfq-ioc.c6
-rw-r--r--block/bfq-iosched.c1092
-rw-r--r--block/bfq-sched.c209
-rw-r--r--block/bfq.h206
-rw-r--r--block/bio-integrity.c16
-rw-r--r--block/bio.c214
-rw-r--r--block/blk-cgroup.c524
-rw-r--r--block/blk-core.c44
-rw-r--r--block/blk-integrity.c3
-rw-r--r--block/blk-lib.c52
-rw-r--r--block/blk-map.c30
-rw-r--r--block/blk-merge.c172
-rw-r--r--block/blk-mq-cpumap.c9
-rw-r--r--block/blk-mq-sysfs.c34
-rw-r--r--block/blk-mq-tag.c28
-rw-r--r--block/blk-mq-tag.h2
-rw-r--r--block/blk-mq.c125
-rw-r--r--block/blk-mq.h3
-rw-r--r--block/blk-settings.c44
-rw-r--r--block/blk-sysfs.c44
-rw-r--r--block/blk-throttle.c505
-rw-r--r--block/blk.h5
-rw-r--r--block/bounce.c62
-rw-r--r--block/cfq-iosched.c651
-rw-r--r--block/genhd.c9
-rw-r--r--block/partition-generic.c12
-rw-r--r--block/uuid.c6
29 files changed, 2590 insertions, 2805 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 1fc1a4dc5..01da733dc 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -51,12 +51,14 @@ config IOSCHED_BFQ
applications. If compiled built-in (saying Y here), BFQ can
be configured to support hierarchical scheduling.
-config BFQ_GROUP_IOSCHED
+config CGROUP_BFQIO
bool "BFQ hierarchical scheduling support"
depends on CGROUPS && IOSCHED_BFQ=y
default n
---help---
- Enable hierarchical scheduling in BFQ, using the blkio controller.
+ Enable hierarchical scheduling in BFQ, using the cgroups
+ filesystem interface. The name of the subsystem will be
+ bfqio.
choice
prompt "Default I/O scheduler"
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index bc34d7a2b..11e2f1d4e 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -13,480 +13,254 @@
* file.
*/
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
-/* bfqg stats flags */
-enum bfqg_stats_flags {
- BFQG_stats_waiting = 0,
- BFQG_stats_idling,
- BFQG_stats_empty,
-};
-
-#define BFQG_FLAG_FNS(name) \
-static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
-{ \
- stats->flags |= (1 << BFQG_stats_##name); \
-} \
-static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
-{ \
- stats->flags &= ~(1 << BFQG_stats_##name); \
-} \
-static int bfqg_stats_##name(struct bfqg_stats *stats) \
-{ \
- return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
-} \
-
-BFQG_FLAG_FNS(waiting)
-BFQG_FLAG_FNS(idling)
-BFQG_FLAG_FNS(empty)
-#undef BFQG_FLAG_FNS
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
-{
- unsigned long long now;
-
- if (!bfqg_stats_waiting(stats))
- return;
-
- now = sched_clock();
- if (time_after64(now, stats->start_group_wait_time))
- blkg_stat_add(&stats->group_wait_time,
- now - stats->start_group_wait_time);
- bfqg_stats_clear_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
- struct bfq_group *curr_bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- if (bfqg_stats_waiting(stats))
- return;
- if (bfqg == curr_bfqg)
- return;
- stats->start_group_wait_time = sched_clock();
- bfqg_stats_mark_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
-{
- unsigned long long now;
-
- if (!bfqg_stats_empty(stats))
- return;
-
- now = sched_clock();
- if (time_after64(now, stats->start_empty_time))
- blkg_stat_add(&stats->empty_time,
- now - stats->start_empty_time);
- bfqg_stats_clear_empty(stats);
-}
-
-static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
-{
- blkg_stat_add(&bfqg->stats.dequeue, 1);
-}
+static DEFINE_MUTEX(bfqio_mutex);
-static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
{
- struct bfqg_stats *stats = &bfqg->stats;
-
- if (blkg_rwstat_total(&stats->queued))
- return;
-
- /*
- * group is already marked empty. This can happen if bfqq got new
- * request in parent group and moved to this group while being added
- * to service tree. Just ignore the event and move on.
- */
- if (bfqg_stats_empty(stats))
- return;
-
- stats->start_empty_time = sched_clock();
- bfqg_stats_mark_empty(stats);
+ return bgrp ? !bgrp->online : false;
}
-static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- if (bfqg_stats_idling(stats)) {
- unsigned long long now = sched_clock();
-
- if (time_after64(now, stats->start_idle_time))
- blkg_stat_add(&stats->idle_time,
- now - stats->start_idle_time);
- bfqg_stats_clear_idling(stats);
- }
-}
-
-static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- stats->start_idle_time = sched_clock();
- bfqg_stats_mark_idling(stats);
-}
-
-static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- blkg_stat_add(&stats->avg_queue_size_sum,
- blkg_rwstat_total(&stats->queued));
- blkg_stat_add(&stats->avg_queue_size_samples, 1);
- bfqg_stats_update_group_wait_time(stats);
-}
-
-static struct blkcg_policy blkcg_policy_bfq;
-
-/*
- * blk-cgroup policy-related handlers
- * The following functions help in converting between blk-cgroup
- * internal structures and BFQ-specific structures.
- */
-
-static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
-{
- return pd ? container_of(pd, struct bfq_group, pd) : NULL;
-}
+static struct bfqio_cgroup bfqio_root_cgroup = {
+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
+};
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+static inline void bfq_init_entity(struct bfq_entity *entity,
+ struct bfq_group *bfqg)
{
- return pd_to_blkg(&bfqg->pd);
+ entity->weight = entity->new_weight;
+ entity->orig_weight = entity->new_weight;
+ entity->ioprio = entity->new_ioprio;
+ entity->ioprio_class = entity->new_ioprio_class;
+ entity->parent = bfqg->my_entity;
+ entity->sched_data = &bfqg->sched_data;
}
-static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
{
- return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
}
/*
- * bfq_group handlers
- * The following functions help in navigating the bfq_group hierarchy
- * by allowing to find the parent of a bfq_group or the bfq_group
- * associated to a bfq_queue.
+ * Search the hash table (for now just a list) of bgrp for the bfq_group
+ * associated with bfqd. Must be called under rcu_read_lock().
*/
-
-static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
+ struct bfq_data *bfqd)
{
- struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
-
- return pblkg ? blkg_to_bfqg(pblkg) : NULL;
-}
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- return group_entity ? container_of(group_entity, struct bfq_group,
- entity) :
- bfqq->bfqd->root_group;
-}
-
-/*
- * The following two functions handle get and put of a bfq_group by
- * wrapping the related blk-cgroup hooks.
- */
-
-static void bfqg_get(struct bfq_group *bfqg)
-{
- return blkg_get(bfqg_to_blkg(bfqg));
-}
-
-static void bfqg_put(struct bfq_group *bfqg)
-{
- return blkg_put(bfqg_to_blkg(bfqg));
-}
+ struct bfq_group *bfqg;
+ void *key;
-static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
- struct bfq_queue *bfqq,
- int rw)
-{
- blkg_rwstat_add(&bfqg->stats.queued, rw, 1);
- bfqg_stats_end_empty_time(&bfqg->stats);
- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
- bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
-}
+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
+ key = rcu_dereference(bfqg->bfqd);
+ if (key == bfqd)
+ return bfqg;
+ }
-static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw)
-{
- blkg_rwstat_add(&bfqg->stats.queued, rw, -1);
+ return NULL;
}
-static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)
+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
+ struct bfq_group *bfqg)
{
- blkg_rwstat_add(&bfqg->stats.merged, rw, 1);
-}
+ struct bfq_entity *entity = &bfqg->entity;
-static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,
- uint64_t bytes, int rw)
-{
- blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);
- blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);
- blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);
+ /*
+ * If the weight of the entity has never been set via the cgroup
+ * filesystem interface, then bgrp->weight == 0. In this case we initialize
+ * the weight from the current ioprio value. Otherwise, the group
+ * weight, if set, has priority over the ioprio value.
+ */
+ if (bgrp->weight == 0) {
+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
+ entity->new_ioprio = bgrp->ioprio;
+ } else {
+ if (bgrp->weight < BFQ_MIN_WEIGHT ||
+ bgrp->weight > BFQ_MAX_WEIGHT) {
+ printk(KERN_CRIT "bfq_group_init_entity: "
+ "bgrp->weight %d\n", bgrp->weight);
+ BUG();
+ }
+ entity->new_weight = bgrp->weight;
+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
+ }
+ entity->orig_weight = entity->weight = entity->new_weight;
+ entity->ioprio = entity->new_ioprio;
+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
+ entity->my_sched_data = &bfqg->sched_data;
+ bfqg->active_entities = 0;
}
-static void bfqg_stats_update_completion(struct bfq_group *bfqg,
- uint64_t start_time, uint64_t io_start_time, int rw)
+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
+ struct bfq_group *parent)
{
- struct bfqg_stats *stats = &bfqg->stats;
- unsigned long long now = sched_clock();
-
- if (time_after64(now, io_start_time))
- blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
- if (time_after64(io_start_time, start_time))
- blkg_rwstat_add(&stats->wait_time, rw,
- io_start_time - start_time);
-}
+ struct bfq_entity *entity;
-/* @stats = 0 */
-static void bfqg_stats_reset(struct bfqg_stats *stats)
-{
- if (!stats)
- return;
-
- /* queued stats shouldn't be cleared */
- blkg_rwstat_reset(&stats->service_bytes);
- blkg_rwstat_reset(&stats->serviced);
- blkg_rwstat_reset(&stats->merged);
- blkg_rwstat_reset(&stats->service_time);
- blkg_rwstat_reset(&stats->wait_time);
- blkg_stat_reset(&stats->time);
- blkg_stat_reset(&stats->unaccounted_time);
- blkg_stat_reset(&stats->avg_queue_size_sum);
- blkg_stat_reset(&stats->avg_queue_size_samples);
- blkg_stat_reset(&stats->dequeue);
- blkg_stat_reset(&stats->group_wait_time);
- blkg_stat_reset(&stats->idle_time);
- blkg_stat_reset(&stats->empty_time);
-}
+ BUG_ON(parent == NULL);
+ BUG_ON(bfqg == NULL);
-/* @to += @from */
-static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)
-{
- if (!to || !from)
- return;
-
- /* queued stats shouldn't be cleared */
- blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
- blkg_rwstat_merge(&to->serviced, &from->serviced);
- blkg_rwstat_merge(&to->merged, &from->merged);
- blkg_rwstat_merge(&to->service_time, &from->service_time);
- blkg_rwstat_merge(&to->wait_time, &from->wait_time);
- blkg_stat_merge(&from->time, &from->time);
- blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
- blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
- blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
- blkg_stat_merge(&to->dequeue, &from->dequeue);
- blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
- blkg_stat_merge(&to->idle_time, &from->idle_time);
- blkg_stat_merge(&to->empty_time, &from->empty_time);
+ entity = &bfqg->entity;
+ entity->parent = parent->my_entity;
+ entity->sched_data = &parent->sched_data;
}
-/*
- * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
- * recursive stats can still account for the amount used by this bfqg after
- * it's gone.
+/**
+ * bfq_group_chain_alloc - allocate a chain of groups.
+ * @bfqd: queue descriptor.
+ * @css: the leaf cgroup_subsys_state this chain starts from.
+ *
+ * Allocate a chain of groups starting from the one belonging to
+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
+ * to the root already has a group allocated on @bfqd.
*/
-static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
+ struct cgroup_subsys_state *css)
{
- struct bfq_group *parent;
-
- if (!bfqg) /* root_group */
- return;
+ struct bfqio_cgroup *bgrp;
+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
- parent = bfqg_parent(bfqg);
+ for (; css != NULL; css = css->parent) {
+ bgrp = css_to_bfqio(css);
- lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
-
- if (unlikely(!parent))
- return;
+ bfqg = bfqio_lookup_group(bgrp, bfqd);
+ if (bfqg != NULL) {
+ /*
+ * All the cgroups in the path from there to the
+ * root must have a bfq_group for bfqd, so we don't
+ * need any more allocations.
+ */
+ break;
+ }
- bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);
- bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);
- bfqg_stats_reset(&bfqg->stats);
- bfqg_stats_reset(&bfqg->dead_stats);
-}
+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
+ if (bfqg == NULL)
+ goto cleanup;
-static void bfq_init_entity(struct bfq_entity *entity,
- struct bfq_group *bfqg)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ bfq_group_init_entity(bgrp, bfqg);
+ bfqg->my_entity = &bfqg->entity;
- entity->weight = entity->new_weight;
- entity->orig_weight = entity->new_weight;
- if (bfqq) {
- bfqq->ioprio = bfqq->new_ioprio;
- bfqq->ioprio_class = bfqq->new_ioprio_class;
- bfqg_get(bfqg);
+ if (leaf == NULL) {
+ leaf = bfqg;
+ prev = leaf;
+ } else {
+ bfq_group_set_parent(prev, bfqg);
+ /*
+ * Build a list of the allocated nodes using the bfqd
+ * field, which is still unused and will be
+ * initialized only after the node is
+ * connected.
+ */
+ prev->bfqd = bfqg;
+ prev = bfqg;
+ }
}
- entity->parent = bfqg->my_entity;
- entity->sched_data = &bfqg->sched_data;
-}
-
-static void bfqg_stats_init(struct bfqg_stats *stats)
-{
- blkg_rwstat_init(&stats->service_bytes);
- blkg_rwstat_init(&stats->serviced);
- blkg_rwstat_init(&stats->merged);
- blkg_rwstat_init(&stats->service_time);
- blkg_rwstat_init(&stats->wait_time);
- blkg_rwstat_init(&stats->queued);
-
- blkg_stat_init(&stats->sectors);
- blkg_stat_init(&stats->time);
-
- blkg_stat_init(&stats->unaccounted_time);
- blkg_stat_init(&stats->avg_queue_size_sum);
- blkg_stat_init(&stats->avg_queue_size_samples);
- blkg_stat_init(&stats->dequeue);
- blkg_stat_init(&stats->group_wait_time);
- blkg_stat_init(&stats->idle_time);
- blkg_stat_init(&stats->empty_time);
-}
-
-static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
- {
- return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
- }
-
-static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
-{
- return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
-}
-
-static void bfq_cpd_init(const struct blkcg *blkcg)
-{
- struct bfq_group_data *d =
- cpd_to_bfqgd(blkcg->pd[blkcg_policy_bfq.plid]);
-
- d->weight = BFQ_DEFAULT_GRP_WEIGHT;
-}
-
-static void bfq_pd_init(struct blkcg_gq *blkg)
-{
- struct bfq_group *bfqg = blkg_to_bfqg(blkg);
- struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
- struct bfq_entity *entity = &bfqg->entity;
- struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
- entity->orig_weight = entity->weight = entity->new_weight = d->weight;
- entity->my_sched_data = &bfqg->sched_data;
- bfqg->my_entity = entity; /*
- * the root_group's will be set to NULL
- * in bfq_init_queue()
- */
- bfqg->bfqd = bfqd;
- bfqg->active_entities = 0;
- bfqg->rq_pos_tree = RB_ROOT;
+ return leaf;
- /* if the root_group does not exist, we are handling it right now */
- if (bfqd->root_group && bfqg != bfqd->root_group)
- hlist_add_head(&bfqg->bfqd_node, &bfqd->group_list);
+cleanup:
+ while (leaf != NULL) {
+ prev = leaf;
+ leaf = leaf->bfqd;
+ kfree(prev);
+ }
- bfqg_stats_init(&bfqg->stats);
- bfqg_stats_init(&bfqg->dead_stats);
+ return NULL;
}
-/* offset delta from bfqg->stats to bfqg->dead_stats */
-static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -
- offsetof(struct bfq_group, stats);
-
-/* to be used by recursive prfill, sums live and dead stats recursively */
-static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+/**
+ * bfq_group_chain_link - link an allocated group chain to a cgroup
+ * hierarchy.
+ * @bfqd: the queue descriptor.
+ * @css: the leaf cgroup_subsys_state to start from.
+ * @leaf: the leaf group (to be associated to @cgroup).
+ *
+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
+ * hierarchy that already has a group associated with @bfqd, all the nodes
+ * in the path to the root cgroup have one too.
+ *
+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
+ * per device) while the bfqio_cgroup lock protects the list of groups
+ * belonging to the same cgroup.
+ */
+static void bfq_group_chain_link(struct bfq_data *bfqd,
+ struct cgroup_subsys_state *css,
+ struct bfq_group *leaf)
{
- u64 sum = 0;
+ struct bfqio_cgroup *bgrp;
+ struct bfq_group *bfqg, *next, *prev = NULL;
+ unsigned long flags;
- sum += blkg_stat_recursive_sum(pd, off);
- sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
- return sum;
-}
+ assert_spin_locked(bfqd->queue->queue_lock);
-/* to be used by recursive prfill, sums live and dead rwstats recursively */
-static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
- int off)
-{
- struct blkg_rwstat a, b;
+ for (; css != NULL && leaf != NULL; css = css->parent) {
+ bgrp = css_to_bfqio(css);
+ next = leaf->bfqd;
- a = blkg_rwstat_recursive_sum(pd, off);
- b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
- blkg_rwstat_merge(&a, &b);
- return a;
-}
+ bfqg = bfqio_lookup_group(bgrp, bfqd);
+ BUG_ON(bfqg != NULL);
-static void bfq_pd_reset_stats(struct blkcg_gq *blkg)
-{
- struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+ spin_lock_irqsave(&bgrp->lock, flags);
- bfqg_stats_reset(&bfqg->stats);
- bfqg_stats_reset(&bfqg->dead_stats);
-}
+ rcu_assign_pointer(leaf->bfqd, bfqd);
+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
-static void bfq_group_set_parent(struct bfq_group *bfqg,
- struct bfq_group *parent)
-{
- struct bfq_entity *entity;
+ spin_unlock_irqrestore(&bgrp->lock, flags);
- BUG_ON(!parent);
- BUG_ON(!bfqg);
- BUG_ON(bfqg == parent);
+ prev = leaf;
+ leaf = next;
+ }
- entity = &bfqg->entity;
- entity->parent = parent->my_entity;
- entity->sched_data = &parent->sched_data;
+ BUG_ON(css == NULL && leaf != NULL);
+ if (css != NULL && prev != NULL) {
+ bgrp = css_to_bfqio(css);
+ bfqg = bfqio_lookup_group(bgrp, bfqd);
+ bfq_group_set_parent(prev, bfqg);
+ }
}
+/**
+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
+ * @bfqd: queue descriptor.
+ * @cgroup: cgroup being searched for.
+ *
+ * Return a group associated with @bfqd in @cgroup, allocating one if
+ * necessary. When a group is returned, all the cgroups on the path
+ * to the root have a group associated with @bfqd.
+ *
+ * If the allocation fails, return the root group: this breaks guarantees
+ * but is a safe fallback. If this loss becomes a problem it can be
+ * mitigated using the equivalent weight (given by the product of the
+ * weights of the groups in the path from @group to the root) in the
+ * root scheduler.
+ *
+ * We allocate all the missing nodes in the path from the leaf cgroup
+ * to the root and we connect the nodes only after all the allocations
+ * have been successful.
+ */
static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
- struct blkcg *blkcg)
+ struct cgroup_subsys_state *css)
{
- struct request_queue *q = bfqd->queue;
- struct bfq_group *bfqg = NULL, *parent;
- struct bfq_entity *entity = NULL;
+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
+ struct bfq_group *bfqg;
- assert_spin_locked(bfqd->queue->queue_lock);
+ bfqg = bfqio_lookup_group(bgrp, bfqd);
+ if (bfqg != NULL)
+ return bfqg;
- /* avoid lookup for the common case where there's no blkcg */
- if (blkcg == &blkcg_root) {
+ bfqg = bfq_group_chain_alloc(bfqd, css);
+ if (bfqg != NULL)
+ bfq_group_chain_link(bfqd, css, bfqg);
+ else
bfqg = bfqd->root_group;
- } else {
- struct blkcg_gq *blkg;
-
- blkg = blkg_lookup_create(blkcg, q);
- if (!IS_ERR(blkg))
- bfqg = blkg_to_bfqg(blkg);
- else /* fallback to root_group */
- bfqg = bfqd->root_group;
- }
-
- BUG_ON(!bfqg);
-
- /*
- * Update chain of bfq_groups as we might be handling a leaf group
- * which, along with some of its relatives, has not been hooked yet
- * to the private hierarchy of BFQ.
- */
- entity = &bfqg->entity;
- for_each_entity(entity) {
- bfqg = container_of(entity, struct bfq_group, entity);
- BUG_ON(!bfqg);
- if (bfqg != bfqd->root_group) {
- parent = bfqg_parent(bfqg);
- if (!parent)
- parent = bfqd->root_group;
- BUG_ON(!parent);
- bfq_group_set_parent(bfqg, parent);
- }
- }
return bfqg;
}
-static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-
/**
* bfq_bfqq_move - migrate @bfqq to @bfqg.
* @bfqd: queue descriptor.
@@ -522,7 +296,6 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, 0);
} else if (entity->on_st)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- bfqg_put(bfqq_group(bfqq));
/*
* Here we use a reference to bfqg. We don't need a refcounter
@@ -531,15 +304,11 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
entity->parent = bfqg->my_entity;
entity->sched_data = &bfqg->sched_data;
- bfqg_get(bfqg);
- if (busy) {
- bfq_pos_tree_add_move(bfqd, bfqq);
- if (resume)
- bfq_activate_bfqq(bfqd, bfqq);
- }
+ if (busy && resume)
+ bfq_activate_bfqq(bfqd, bfqq);
- if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
bfq_schedule_dispatch(bfqd);
}
@@ -547,9 +316,9 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* __bfq_bic_change_cgroup - move @bic to @cgroup.
* @bfqd: the queue descriptor.
* @bic: the bic to move.
- * @blkcg: the blk-cgroup to move to.
+ * @cgroup: the cgroup to move to.
*
- * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
* has to make sure that the reference to cgroup is valid across the call.
*
* NOTE: an alternative approach might have been to store the current
@@ -558,17 +327,18 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
struct bfq_io_cq *bic,
- struct blkcg *blkcg)
+ struct cgroup_subsys_state *css)
{
struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
- struct bfq_group *bfqg;
struct bfq_entity *entity;
+ struct bfq_group *bfqg;
+ struct bfqio_cgroup *bgrp;
- lockdep_assert_held(bfqd->queue->queue_lock);
+ bgrp = css_to_bfqio(css);
- bfqg = bfq_find_alloc_group(bfqd, blkcg);
- if (async_bfqq) {
+ bfqg = bfq_find_alloc_group(bfqd, css);
+ if (async_bfqq != NULL) {
entity = &async_bfqq->entity;
if (entity->sched_data != &bfqg->sched_data) {
@@ -580,7 +350,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
}
}
- if (sync_bfqq) {
+ if (sync_bfqq != NULL) {
entity = &sync_bfqq->entity;
if (entity->sched_data != &bfqg->sched_data)
bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
@@ -589,39 +359,74 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
return bfqg;
}
-static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+/**
+ * bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bic: the bic being migrated.
+ * @cgroup: the destination cgroup.
+ *
+ * When the task owning @bic is moved to @cgroup, @bic is immediately
+ * moved into its new parent group.
+ */
+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
+ struct cgroup_subsys_state *css)
+{
+ struct bfq_data *bfqd;
+ unsigned long uninitialized_var(flags);
+
+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
+ &flags);
+ if (bfqd != NULL) {
+ __bfq_bic_change_cgroup(bfqd, bic, css);
+ bfq_put_bfqd_unlock(bfqd, &flags);
+ }
+}
+
+/**
+ * bfq_bic_update_cgroup - update the cgroup of @bic.
+ * @bic: the @bic to update.
+ *
+ * Make sure that @bic is enqueued in the cgroup of the current task.
+ * We need this in addition to moving bics during the cgroup attach
+ * phase because the task owning @bic could be at its first disk
+ * access, or it may have ended up in the root cgroup as the result of a
+ * memory allocation failure; here we try to move it to the right
+ * group.
+ *
+ * Must be called under the queue lock. It is safe to use the returned
+ * value even after the rcu_read_unlock() as the migration/destruction
+ * paths act under the queue lock too. IOW it is impossible to race with
+ * group migration/destruction and end up with an invalid group as:
+ * a) here the cgroup has not yet been destroyed, nor has its destroy
+ * callback started execution, since current holds a reference to it,
+ * b) if it is destroyed after rcu_read_unlock() [after current is
+ * migrated to a different cgroup] its attach() callback will have
+ * taken care of removing all the references to the old cgroup data.
+ */
+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
{
struct bfq_data *bfqd = bic_to_bfqd(bic);
- struct blkcg *blkcg;
- struct bfq_group *bfqg = NULL;
- uint64_t id;
+ struct bfq_group *bfqg;
+ struct cgroup_subsys_state *css;
+
+ BUG_ON(bfqd == NULL);
rcu_read_lock();
- blkcg = bio_blkcg(bio);
- id = blkcg->css.serial_nr;
+ css = task_css(current, bfqio_cgrp_id);
+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
rcu_read_unlock();
- /*
- * Check whether blkcg has changed. The condition may trigger
- * spuriously on a newly created cic but there's no harm.
- */
- if (unlikely(!bfqd) || likely(bic->blkcg_id == id))
- return;
-
- bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);
- BUG_ON(!bfqg);
- bic->blkcg_id = id;
+ return bfqg;
}
/**
* bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
* @st: the service tree being flushed.
*/
-static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
{
struct bfq_entity *entity = st->first_idle;
- for (; entity ; entity = st->first_idle)
+ for (; entity != NULL; entity = st->first_idle)
__bfq_deactivate_entity(entity, 0);
}
@@ -630,12 +435,12 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st)
* @bfqd: the device data structure with the root group.
* @entity: the entity to move.
*/
-static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
- struct bfq_entity *entity)
+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+ struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- BUG_ON(!bfqq);
+ BUG_ON(bfqq == NULL);
bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
return;
}
@@ -649,9 +454,9 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
*
* Needs queue_lock to be taken and reference to be valid over the call.
*/
-static void bfq_reparent_active_entities(struct bfq_data *bfqd,
- struct bfq_group *bfqg,
- struct bfq_service_tree *st)
+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
+ struct bfq_group *bfqg,
+ struct bfq_service_tree *st)
{
struct rb_root *active = &st->active;
struct bfq_entity *entity = NULL;
@@ -659,10 +464,10 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd,
if (!RB_EMPTY_ROOT(&st->active))
entity = bfq_entity_of(rb_first(active));
- for (; entity ; entity = bfq_entity_of(rb_first(active)))
+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
bfq_reparent_leaf_entity(bfqd, entity);
- if (bfqg->sched_data.in_service_entity)
+ if (bfqg->sched_data.in_service_entity != NULL)
bfq_reparent_leaf_entity(bfqd,
bfqg->sched_data.in_service_entity);
@@ -671,21 +476,20 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd,
/**
* bfq_destroy_group - destroy @bfqg.
+ * @bgrp: the bfqio_cgroup containing @bfqg.
* @bfqg: the group being destroyed.
*
* Destroy @bfqg, making sure that it is not referenced from its parent.
- * blkio already grabs the queue_lock for us, so no need to use RCU-based magic
*/
-static void bfq_pd_offline(struct blkcg_gq *blkg)
+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
{
+ struct bfq_data *bfqd;
struct bfq_service_tree *st;
- struct bfq_group *bfqg = blkg_to_bfqg(blkg);
- struct bfq_data *bfqd = bfqg->bfqd;
struct bfq_entity *entity = bfqg->my_entity;
+ unsigned long uninitialized_var(flags);
int i;
- if (!entity) /* root group */
- return;
+ hlist_del(&bfqg->group_node);
/*
* Empty all service_trees belonging to this group before
@@ -714,19 +518,37 @@ static void bfq_pd_offline(struct blkcg_gq *blkg)
* There is no need to put the sync queues, as the
* scheduler has taken no reference.
*/
- bfq_reparent_active_entities(bfqd, bfqg, st);
+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
+ if (bfqd != NULL) {
+ bfq_reparent_active_entities(bfqd, bfqg, st);
+ bfq_put_bfqd_unlock(bfqd, &flags);
+ }
BUG_ON(!RB_EMPTY_ROOT(&st->active));
BUG_ON(!RB_EMPTY_ROOT(&st->idle));
}
- BUG_ON(bfqg->sched_data.next_in_service);
- BUG_ON(bfqg->sched_data.in_service_entity);
+ BUG_ON(bfqg->sched_data.next_in_service != NULL);
+ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
- hlist_del(&bfqg->bfqd_node);
- __bfq_deactivate_entity(entity, 0);
- bfq_put_async_queues(bfqd, bfqg);
- BUG_ON(entity->tree);
+ /*
+ * We may race with device destruction, take extra care when
+ * dereferencing bfqg->bfqd.
+ */
+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
+ if (bfqd != NULL) {
+ hlist_del(&bfqg->bfqd_node);
+ __bfq_deactivate_entity(entity, 0);
+ bfq_put_async_queues(bfqd, bfqg);
+ bfq_put_bfqd_unlock(bfqd, &flags);
+ }
+ BUG_ON(entity->tree != NULL);
- bfqg_stats_xfer_dead(bfqg);
+ /*
+ * No need to defer the kfree() to the end of the RCU grace
+ * period: we are called from the destroy() callback of our
+ * cgroup, so we can be sure that no one is a) still using
+ * this cgroup or b) doing lookups in it.
+ */
+ kfree(bfqg);
}
static void bfq_end_wr_async(struct bfq_data *bfqd)
@@ -773,309 +595,312 @@ static void bfq_disconnect_groups(struct bfq_data *bfqd)
}
}
-static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
- struct cftype *cftype)
+static inline void bfq_free_root_group(struct bfq_data *bfqd)
{
- struct blkcg *blkcg = css_to_blkcg(css);
- struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
- int ret = -EINVAL;
+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
+ struct bfq_group *bfqg = bfqd->root_group;
+
+ bfq_put_async_queues(bfqd, bfqg);
- spin_lock_irq(&blkcg->lock);
- ret = bfqgd->weight;
- spin_unlock_irq(&blkcg->lock);
+ spin_lock_irq(&bgrp->lock);
+ hlist_del_rcu(&bfqg->group_node);
+ spin_unlock_irq(&bgrp->lock);
- return ret;
+ /*
+ * No need to synchronize_rcu() here: since the device is gone
+ * there cannot be any read-side access to its root_group.
+ */
+ kfree(bfqg);
}
-static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
- struct cftype *cftype,
- u64 val)
+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
{
- struct blkcg *blkcg = css_to_blkcg(css);
- struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
- struct blkcg_gq *blkg;
- int ret = -EINVAL;
-
- if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
- return ret;
-
- ret = 0;
- spin_lock_irq(&blkcg->lock);
- bfqgd->weight = (unsigned short)val;
- hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
- struct bfq_group *bfqg = blkg_to_bfqg(blkg);
- if (!bfqg)
- continue;
- /*
- * Setting the prio_changed flag of the entity
- * to 1 with new_weight == weight would re-set
- * the value of the weight to its ioprio mapping.
- * Set the flag only if necessary.
- */
- if ((unsigned short)val != bfqg->entity.new_weight) {
- bfqg->entity.new_weight = (unsigned short)val;
- /*
- * Make sure that the above new value has been
- * stored in bfqg->entity.new_weight before
- * setting the prio_changed flag. In fact,
- * this flag may be read asynchronously (in
- * critical sections protected by a different
- * lock than that held here), and finding this
- * flag set may cause the execution of the code
- * for updating parameters whose value may
- * depend also on bfqg->entity.new_weight (in
- * __bfq_entity_update_weight_prio).
- * This barrier makes sure that the new value
- * of bfqg->entity.new_weight is correctly
- * seen in that code.
- */
- smp_wmb();
- bfqg->entity.prio_changed = 1;
- }
- }
- spin_unlock_irq(&blkcg->lock);
+ struct bfq_group *bfqg;
+ struct bfqio_cgroup *bgrp;
+ int i;
- return ret;
-}
+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
+ if (bfqg == NULL)
+ return NULL;
-static int bfqg_print_stat(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
- &blkcg_policy_bfq, seq_cft(sf)->private, false);
- return 0;
-}
+ bfqg->entity.parent = NULL;
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-static int bfqg_print_rwstat(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
- &blkcg_policy_bfq, seq_cft(sf)->private, true);
- return 0;
+ bgrp = &bfqio_root_cgroup;
+ spin_lock_irq(&bgrp->lock);
+ rcu_assign_pointer(bfqg->bfqd, bfqd);
+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
+ spin_unlock_irq(&bgrp->lock);
+
+ return bfqg;
}
-static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- u64 sum = bfqg_stat_pd_recursive_sum(pd, off);
+#define SHOW_FUNCTION(__VAR) \
+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
+ struct cftype *cftype) \
+{ \
+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
+ u64 ret = -ENODEV; \
+ \
+ mutex_lock(&bfqio_mutex); \
+ if (bfqio_is_removed(bgrp)) \
+ goto out_unlock; \
+ \
+ spin_lock_irq(&bgrp->lock); \
+ ret = bgrp->__VAR; \
+ spin_unlock_irq(&bgrp->lock); \
+ \
+out_unlock: \
+ mutex_unlock(&bfqio_mutex); \
+ return ret; \
+}
+
+SHOW_FUNCTION(weight);
+SHOW_FUNCTION(ioprio);
+SHOW_FUNCTION(ioprio_class);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
+ struct cftype *cftype, \
+ u64 val) \
+{ \
+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
+ struct bfq_group *bfqg; \
+ int ret = -EINVAL; \
+ \
+ if (val < (__MIN) || val > (__MAX)) \
+ return ret; \
+ \
+ ret = -ENODEV; \
+ mutex_lock(&bfqio_mutex); \
+ if (bfqio_is_removed(bgrp)) \
+ goto out_unlock; \
+ ret = 0; \
+ \
+ spin_lock_irq(&bgrp->lock); \
+ bgrp->__VAR = (unsigned short)val; \
+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
+ /* \
+ * Setting the ioprio_changed flag of the entity \
+ * to 1 with new_##__VAR == ##__VAR would re-set \
+ * the value of the weight to its ioprio mapping. \
+ * Set the flag only if necessary. \
+ */ \
+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
+ bfqg->entity.new_##__VAR = (unsigned short)val; \
+ /* \
+ * Make sure that the above new value has been \
+ * stored in bfqg->entity.new_##__VAR before \
+ * setting the ioprio_changed flag. In fact, \
+ * this flag may be read asynchronously (in \
+ * critical sections protected by a different \
+ * lock than that held here), and finding this \
+ * flag set may cause the execution of the code \
+ * for updating parameters whose value may \
+ * depend also on bfqg->entity.new_##__VAR (in \
+ * __bfq_entity_update_weight_prio). \
+ * This barrier makes sure that the new value \
+ * of bfqg->entity.new_##__VAR is correctly \
+ * seen in that code. \
+ */ \
+ smp_wmb(); \
+ bfqg->entity.ioprio_changed = 1; \
+ } \
+ } \
+ spin_unlock_irq(&bgrp->lock); \
+ \
+out_unlock: \
+ mutex_unlock(&bfqio_mutex); \
+ return ret; \
+}
+
+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
+#undef STORE_FUNCTION
- return __blkg_prfill_u64(sf, pd, sum);
-}
+static struct cftype bfqio_files[] = {
+ {
+ .name = "weight",
+ .read_u64 = bfqio_cgroup_weight_read,
+ .write_u64 = bfqio_cgroup_weight_write,
+ },
+ {
+ .name = "ioprio",
+ .read_u64 = bfqio_cgroup_ioprio_read,
+ .write_u64 = bfqio_cgroup_ioprio_write,
+ },
+ {
+ .name = "ioprio_class",
+ .read_u64 = bfqio_cgroup_ioprio_class_read,
+ .write_u64 = bfqio_cgroup_ioprio_class_write,
+ },
+ { }, /* terminate */
+};
-static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
+ *parent_css)
{
- struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);
+ struct bfqio_cgroup *bgrp;
- return __blkg_prfill_rwstat(sf, pd, &sum);
-}
+ if (parent_css != NULL) {
+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
+ if (bgrp == NULL)
+ return ERR_PTR(-ENOMEM);
+ } else
+ bgrp = &bfqio_root_cgroup;
-static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, false);
- return 0;
+ spin_lock_init(&bgrp->lock);
+ INIT_HLIST_HEAD(&bgrp->group_data);
+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
+
+ return &bgrp->css;
}
-static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+/*
+ * We cannot support shared io contexts, as we have no means to support
+ * two tasks with the same ioc in two different groups without major rework
+ * of the main bic/bfqq data structures. For now we allow a task to change
+ * its cgroup only if it's the only owner of its ioc; the drawback of this
+ * behavior is that a group containing a task that forked using CLONE_IO
+ * will not be destroyed until the tasks sharing the ioc die.
+ */
+static int bfqio_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, true);
- return 0;
+ struct task_struct *task;
+ struct io_context *ioc;
+ int ret = 0;
+
+ cgroup_taskset_for_each(task, tset) {
+ /*
+ * task_lock() is needed to avoid races with
+ * exit_io_context()
+ */
+ task_lock(task);
+ ioc = task->io_context;
+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
+ /*
+ * ioc == NULL means that the task is either too
+ * young or exiting: if it has still no ioc the
+ * ioc can't be shared, if the task is exiting the
+ * attach will fail anyway, no matter what we
+ * return here.
+ */
+ ret = -EINVAL;
+ task_unlock(task);
+ if (ret)
+ break;
+ }
+
+ return ret;
}
-static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
+static void bfqio_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
- struct bfq_group *bfqg = pd_to_bfqg(pd);
- u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
- u64 v = 0;
+ struct task_struct *task;
+ struct io_context *ioc;
+ struct io_cq *icq;
- if (samples) {
- v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
- v = div64_u64(v, samples);
+ /*
+ * IMPORTANT NOTE: The move of more than one process at a time to a
+ * new group has not yet been tested.
+ */
+ cgroup_taskset_for_each(task, tset) {
+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+ if (ioc) {
+ /*
+ * Handle cgroup change here.
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
+ if (!strncmp(
+ icq->q->elevator->type->elevator_name,
+ "bfq", ELV_NAME_MAX))
+ bfq_bic_change_cgroup(icq_to_bic(icq),
+ css);
+ rcu_read_unlock();
+ put_io_context(ioc);
+ }
}
- __blkg_prfill_u64(sf, pd, v);
- return 0;
}
-/* print avg_queue_size */
-static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+static void bfqio_destroy(struct cgroup_subsys_state *css)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
- 0, false);
- return 0;
+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
+ struct hlist_node *tmp;
+ struct bfq_group *bfqg;
+
+ /*
+ * Since we are destroying the cgroup, there are no more tasks
+ * referencing it, and all the RCU grace periods that may have
+ * referenced it are ended (as the destruction of the parent
+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
+ * anything else and we don't need any synchronization.
+ */
+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
+ bfq_destroy_group(bgrp, bfqg);
+
+ BUG_ON(!hlist_empty(&bgrp->group_data));
+
+ kfree(bgrp);
}
-static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+static int bfqio_css_online(struct cgroup_subsys_state *css)
{
- int ret;
+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
- ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
- if (ret)
- return NULL;
+ mutex_lock(&bfqio_mutex);
+ bgrp->online = true;
+ mutex_unlock(&bfqio_mutex);
- return blkg_to_bfqg(bfqd->queue->root_blkg);
+ return 0;
}
-static struct cftype bfqio_files[] = {
- {
- .name = "bfq.weight",
- .read_u64 = bfqio_cgroup_weight_read,
- .write_u64 = bfqio_cgroup_weight_write,
- },
- /* statistics, cover only the tasks in the bfqg */
- {
- .name = "bfq.time",
- .private = offsetof(struct bfq_group, stats.time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.sectors",
- .private = offsetof(struct bfq_group, stats.sectors),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.io_service_bytes",
- .private = offsetof(struct bfq_group, stats.service_bytes),
- .seq_show = bfqg_print_rwstat,
- },
- {
- .name = "bfq.io_serviced",
- .private = offsetof(struct bfq_group, stats.serviced),
- .seq_show = bfqg_print_rwstat,
- },
- {
- .name = "bfq.io_service_time",
- .private = offsetof(struct bfq_group, stats.service_time),
- .seq_show = bfqg_print_rwstat,
- },
- {
- .name = "bfq.io_wait_time",
- .private = offsetof(struct bfq_group, stats.wait_time),
- .seq_show = bfqg_print_rwstat,
- },
- {
- .name = "bfq.io_merged",
- .private = offsetof(struct bfq_group, stats.merged),
- .seq_show = bfqg_print_rwstat,
- },
- {
- .name = "bfq.io_queued",
- .private = offsetof(struct bfq_group, stats.queued),
- .seq_show = bfqg_print_rwstat,
- },
+static void bfqio_css_offline(struct cgroup_subsys_state *css)
+{
+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
- /* the same statictics which cover the bfqg and its descendants */
- {
- .name = "bfq.time_recursive",
- .private = offsetof(struct bfq_group, stats.time),
- .seq_show = bfqg_print_stat_recursive,
- },
- {
- .name = "bfq.sectors_recursive",
- .private = offsetof(struct bfq_group, stats.sectors),
- .seq_show = bfqg_print_stat_recursive,
- },
- {
- .name = "bfq.io_service_bytes_recursive",
- .private = offsetof(struct bfq_group, stats.service_bytes),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.io_serviced_recursive",
- .private = offsetof(struct bfq_group, stats.serviced),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.io_service_time_recursive",
- .private = offsetof(struct bfq_group, stats.service_time),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.io_wait_time_recursive",
- .private = offsetof(struct bfq_group, stats.wait_time),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.io_merged_recursive",
- .private = offsetof(struct bfq_group, stats.merged),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.io_queued_recursive",
- .private = offsetof(struct bfq_group, stats.queued),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.avg_queue_size",
- .seq_show = bfqg_print_avg_queue_size,
- },
- {
- .name = "bfq.group_wait_time",
- .private = offsetof(struct bfq_group, stats.group_wait_time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.idle_time",
- .private = offsetof(struct bfq_group, stats.idle_time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.empty_time",
- .private = offsetof(struct bfq_group, stats.empty_time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.dequeue",
- .private = offsetof(struct bfq_group, stats.dequeue),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.unaccounted_time",
- .private = offsetof(struct bfq_group, stats.unaccounted_time),
- .seq_show = bfqg_print_stat,
- },
- { } /* terminate */
-};
+ mutex_lock(&bfqio_mutex);
+ bgrp->online = false;
+ mutex_unlock(&bfqio_mutex);
+}
-static struct blkcg_policy blkcg_policy_bfq = {
- .pd_size = sizeof(struct bfq_group),
- .cpd_size = sizeof(struct bfq_group_data),
- .cftypes = bfqio_files,
- .pd_init_fn = bfq_pd_init,
- .cpd_init_fn = bfq_cpd_init,
- .pd_offline_fn = bfq_pd_offline,
- .pd_reset_stats_fn = bfq_pd_reset_stats,
+struct cgroup_subsys bfqio_cgrp_subsys = {
+ .css_alloc = bfqio_create,
+ .css_online = bfqio_css_online,
+ .css_offline = bfqio_css_offline,
+ .can_attach = bfqio_can_attach,
+ .attach = bfqio_attach,
+ .css_free = bfqio_destroy,
+ .legacy_cftypes = bfqio_files,
};
-
#else
-
-static void bfq_init_entity(struct bfq_entity *entity,
- struct bfq_group *bfqg)
+static inline void bfq_init_entity(struct bfq_entity *entity,
+ struct bfq_group *bfqg)
{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
entity->weight = entity->new_weight;
entity->orig_weight = entity->new_weight;
- if (bfqq) {
- bfqq->ioprio = bfqq->new_ioprio;
- bfqq->ioprio_class = bfqq->new_ioprio_class;
- }
+ entity->ioprio = entity->new_ioprio;
+ entity->ioprio_class = entity->new_ioprio_class;
entity->sched_data = &bfqg->sched_data;
}
-static struct bfq_group *
-bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+static inline struct bfq_group *
+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
{
struct bfq_data *bfqd = bic_to_bfqd(bic);
return bfqd->root_group;
}
-static void bfq_bfqq_move(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct bfq_entity *entity,
- struct bfq_group *bfqg)
+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ struct bfq_entity *entity,
+ struct bfq_group *bfqg)
{
}
@@ -1084,24 +909,23 @@ static void bfq_end_wr_async(struct bfq_data *bfqd)
bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
-static void bfq_disconnect_groups(struct bfq_data *bfqd)
+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
{
bfq_put_async_queues(bfqd, bfqd->root_group);
}
-static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
- struct blkcg *blkcg)
+static inline void bfq_free_root_group(struct bfq_data *bfqd)
{
- return bfqd->root_group;
+ kfree(bfqd->root_group);
}
-static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
{
struct bfq_group *bfqg;
int i;
bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
- if (!bfqg)
+ if (bfqg == NULL)
return NULL;
for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
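For readability, this is roughly what SHOW_FUNCTION(weight) in bfq-cgroup.c above expands to (a hand expansion for illustration only; the STORE_FUNCTION() instances follow the same mutex-plus-spinlock pattern):

static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
				    struct cftype *cftype)
{
	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
	u64 ret = -ENODEV;	/* returned if the cgroup has gone away */

	mutex_lock(&bfqio_mutex);
	if (bfqio_is_removed(bgrp))	/* css already taken offline */
		goto out_unlock;

	spin_lock_irq(&bgrp->lock);
	ret = bgrp->weight;
	spin_unlock_irq(&bgrp->lock);

out_unlock:
	mutex_unlock(&bfqio_mutex);
	return ret;
}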
diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
index fb7bb8f08..7f6b0004c 100644
--- a/block/bfq-ioc.c
+++ b/block/bfq-ioc.c
@@ -14,7 +14,7 @@
* icq_to_bic - convert iocontext queue structure to bfq_io_cq.
* @icq: the iocontext queue.
*/
-static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
/* bic->icq is the first member, %NULL will convert to %NULL */
return container_of(icq, struct bfq_io_cq, icq);
@@ -27,8 +27,8 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
*
* Queue lock must be held.
*/
-static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
- struct io_context *ioc)
+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+ struct io_context *ioc)
{
if (ioc)
return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index cf487f926..71b51c1b4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -82,9 +82,6 @@ static const int bfq_back_penalty = 2;
/* Idling period duration, in jiffies. */
static int bfq_slice_idle = HZ / 125;
-/* Minimum number of assigned budgets for which stats are safe to compute. */
-static const int bfq_stats_min_budgets = 194;
-
/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;
static const int bfq_max_budget_async_rq = 4;
@@ -166,22 +163,38 @@ static int device_speed_thresh[2];
#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
-static void bfq_schedule_dispatch(struct bfq_data *bfqd);
+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
#include "bfq-ioc.c"
#include "bfq-sched.c"
#include "bfq-cgroup.c"
-#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
-#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
+ IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
+ IOPRIO_CLASS_RT)
#define bfq_sample_valid(samples) ((samples) > 80)
/*
+ * The following macro groups conditions that need to be evaluated when
+ * checking if existing queues and groups form a symmetric scenario
+ * and therefore idling can be reduced or disabled for some of the
+ * queues. See the comment to the function bfq_bfqq_must_not_expire()
+ * for further details.
+ */
+#ifdef CONFIG_CGROUP_BFQIO
+#define symmetric_scenario (!bfqd->active_numerous_groups && \
+ !bfq_differentiated_weights(bfqd))
+#else
+#define symmetric_scenario (!bfq_differentiated_weights(bfqd))
+#endif
+
+/*
* We regard a request as SYNC, if either it's a read or has the SYNC bit
* set (in which case it could also be a direct WRITE).
*/
-static int bfq_bio_sync(struct bio *bio)
+static inline int bfq_bio_sync(struct bio *bio)
{
if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
return 1;
@@ -193,7 +206,7 @@ static int bfq_bio_sync(struct bio *bio)
* Scheduler run of queue, if there are requests pending and no one in the
* driver that will restart queueing.
*/
-static void bfq_schedule_dispatch(struct bfq_data *bfqd)
+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
if (bfqd->queued != 0) {
bfq_log(bfqd, "schedule dispatch");
@@ -217,9 +230,9 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
unsigned wrap = 0; /* bit mask: requests behind the disk head? */
- if (!rq1 || rq1 == rq2)
+ if (rq1 == NULL || rq1 == rq2)
return rq2;
- if (!rq2)
+ if (rq2 == NULL)
return rq1;
if (rq_is_sync(rq1) && !rq_is_sync(rq2))
@@ -332,17 +345,17 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
(long long unsigned)sector,
- bfqq ? bfqq->pid : 0);
+ bfqq != NULL ? bfqq->pid : 0);
return bfqq;
}
-static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
struct rb_node **p, *parent;
struct bfq_queue *__bfqq;
- if (bfqq->pos_root) {
+ if (bfqq->pos_root != NULL) {
rb_erase(&bfqq->pos_node, bfqq->pos_root);
bfqq->pos_root = NULL;
}
@@ -352,10 +365,10 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (!bfqq->next_rq)
return;
- bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ bfqq->pos_root = &bfqd->rq_pos_tree;
__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
blk_rq_pos(bfqq->next_rq), &parent, &p);
- if (!__bfqq) {
+ if (__bfqq == NULL) {
rb_link_node(&bfqq->pos_node, parent, p);
rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
} else
@@ -365,7 +378,7 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
/*
* Tell whether there are active queues or groups with differentiated weights.
*/
-static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
{
/*
* For weights to differ, at least one of the trees must contain
@@ -374,7 +387,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
(bfqd->queue_weights_tree.rb_node->rb_left ||
bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
) ||
(!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
(bfqd->group_weights_tree.rb_node->rb_left ||
@@ -384,40 +397,6 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
}
/*
- * The following function returns true if every queue must receive the
- * same share of the throughput (this condition is used when deciding
- * whether idling may be disabled, see the comments in the function
- * bfq_bfqq_may_idle()).
- *
- * Such a scenario occurs when:
- * 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- * weight,
- * 3) all active groups at the same level in the groups tree have the same
- * number of children.
- *
- * Unfortunately, keeping the necessary state for evaluating exactly the
- * above symmetry conditions would be quite complex and time-consuming.
- * Therefore this function evaluates, instead, the following stronger
- * sub-conditions, for which it is much easier to maintain the needed
- * state:
- * 1) all active queues have the same weight,
- * 2) all active groups have the same weight,
- * 3) all active groups have at most one active child each.
- * In particular, the last two conditions are always true if hierarchical
- * support and the cgroups interface are not enabled, thus no state needs
- * to be maintained in this case.
- */
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
-{
- return
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- !bfqd->active_numerous_groups &&
-#endif
- !bfq_differentiated_weights(bfqd);
-}
-
-/*
* If the weight-counter tree passed as input contains no counter for
* the weight of the input entity, then add that counter; otherwise just
* increment the existing counter.
@@ -516,10 +495,10 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
BUG_ON(RB_EMPTY_NODE(&last->rb_node));
- if (rbprev)
+ if (rbprev != NULL)
prev = rb_entry_rq(rbprev);
- if (rbnext)
+ if (rbnext != NULL)
next = rb_entry_rq(rbnext);
else {
rbnext = rb_first(&bfqq->sort_list);
@@ -531,8 +510,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
}
/* see the definition of bfq_async_charge_factor for details */
-static unsigned long bfq_serv_to_charge(struct request *rq,
- struct bfq_queue *bfqq)
+static inline unsigned long bfq_serv_to_charge(struct request *rq,
+ struct bfq_queue *bfqq)
{
return blk_rq_sectors(rq) *
(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
@@ -558,7 +537,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
struct request *next_rq = bfqq->next_rq;
unsigned long new_budget;
- if (!next_rq)
+ if (next_rq == NULL)
return;
if (bfqq == bfqd->in_service_queue)
@@ -581,7 +560,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
}
}
-static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
{
u64 dur;
@@ -594,12 +573,13 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
return dur;
}
-static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq)
+static inline unsigned
+bfq_bfqq_cooperations(struct bfq_queue *bfqq)
{
return bfqq->bic ? bfqq->bic->cooperations : 0;
}
-static void
+static inline void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
if (bic->saved_idle_window)
@@ -623,7 +603,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
bfqq->wr_cur_max_time = bic->wr_time_left;
bfqq->last_wr_start_finish = jiffies;
- bfqq->entity.prio_changed = 1;
+ bfqq->entity.ioprio_changed = 1;
}
/*
* Clear wr_time_left to prevent bfq_bfqq_save_state() from
@@ -633,12 +613,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
bic->wr_time_left = 0;
}
+/* Must be called with the queue_lock held. */
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
int process_refs, io_refs;
- lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
-
io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
BUG_ON(process_refs < 0);
@@ -646,7 +625,8 @@ static int bfqq_process_refs(struct bfq_queue *bfqq)
}
/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
-static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
{
struct bfq_queue *item;
struct hlist_node *n;
@@ -878,14 +858,14 @@ static void bfq_add_request(struct request *rq)
*/
prev = bfqq->next_rq;
next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
- BUG_ON(!next_rq);
+ BUG_ON(next_rq == NULL);
bfqq->next_rq = next_rq;
/*
* Adjust priority tree position, if next_rq changes.
*/
if (prev != bfqq->next_rq)
- bfq_pos_tree_add_move(bfqd, bfqq);
+ bfq_rq_pos_tree_add(bfqd, bfqq);
if (!bfq_bfqq_busy(bfqq)) {
bool soft_rt, coop_or_in_burst,
@@ -893,10 +873,6 @@ static void bfq_add_request(struct request *rq)
bfqq->budget_timeout +
bfqd->bfq_wr_min_idle_time);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
- rq->cmd_flags);
-#endif
if (bfq_bfqq_sync(bfqq)) {
bool already_in_burst =
!hlist_unhashed(&bfqq->burst_list_node) ||
@@ -941,7 +917,7 @@ static void bfq_add_request(struct request *rq)
goto add_bfqq_busy;
if (bfq_bfqq_just_split(bfqq))
- goto set_prio_changed;
+ goto set_ioprio_changed;
/*
* If the queue:
@@ -953,7 +929,7 @@ static void bfq_add_request(struct request *rq)
* start a weight-raising period.
*/
if (old_wr_coeff == 1 && (interactive || soft_rt) &&
- (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {
+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
if (interactive)
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
@@ -1032,9 +1008,9 @@ static void bfq_add_request(struct request *rq)
bfqd->bfq_wr_rt_max_time;
}
}
-set_prio_changed:
+set_ioprio_changed:
if (old_wr_coeff != bfqq->wr_coeff)
- entity->prio_changed = 1;
+ entity->ioprio_changed = 1;
add_bfqq_busy:
bfqq->last_idle_bklogged = jiffies;
bfqq->service_from_backlogged = 0;
@@ -1049,7 +1025,7 @@ add_bfqq_busy:
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
bfqd->wr_busy_queues++;
- entity->prio_changed = 1;
+ entity->ioprio_changed = 1;
bfq_log_bfqq(bfqd, bfqq,
"non-idle wrais starting at %lu, rais_max_time %u",
jiffies,
@@ -1072,11 +1048,11 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
struct bfq_queue *bfqq;
bic = bfq_bic_lookup(bfqd, tsk->io_context);
- if (!bic)
+ if (bic == NULL)
return NULL;
bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
- if (bfqq)
+ if (bfqq != NULL)
return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
return NULL;
@@ -1092,7 +1068,8 @@ static void bfq_activate_request(struct request_queue *q, struct request *rq)
(long long unsigned)bfqd->last_position);
}
-static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
+static inline void bfq_deactivate_request(struct request_queue *q,
+ struct request *rq)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
@@ -1124,7 +1101,7 @@ static void bfq_remove_request(struct request *rq)
/*
* Remove queue from request-position tree as it is empty.
*/
- if (bfqq->pos_root) {
+ if (bfqq->pos_root != NULL) {
rb_erase(&bfqq->pos_node, bfqq->pos_root);
bfqq->pos_root = NULL;
}
@@ -1134,9 +1111,6 @@ static void bfq_remove_request(struct request *rq)
BUG_ON(bfqq->meta_pending == 0);
bfqq->meta_pending--;
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
-#endif
}
static int bfq_merge(struct request_queue *q, struct request **req,
@@ -1146,7 +1120,7 @@ static int bfq_merge(struct request_queue *q, struct request **req,
struct request *__rq;
__rq = bfq_find_rq_fmerge(bfqd, bio);
- if (__rq && elv_rq_merge_ok(__rq, bio)) {
+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
*req = __rq;
return ELEVATOR_FRONT_MERGE;
}
@@ -1173,7 +1147,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,
prev = bfqq->next_rq;
next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
bfqd->last_position);
- BUG_ON(!next_rq);
+ BUG_ON(next_rq == NULL);
bfqq->next_rq = next_rq;
/*
* If next_rq changes, update both the queue's budget to
@@ -1182,19 +1156,11 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,
*/
if (prev != bfqq->next_rq) {
bfq_updated_next_req(bfqd, bfqq);
- bfq_pos_tree_add_move(bfqd, bfqq);
+ bfq_rq_pos_tree_add(bfqd, bfqq);
}
}
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfq_bio_merged(struct request_queue *q, struct request *req,
- struct bio *bio)
-{
- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw);
-}
-#endif
-
static void bfq_merged_requests(struct request_queue *q, struct request *rq,
struct request *next)
{
@@ -1221,21 +1187,18 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq,
bfqq->next_rq = rq;
bfq_remove_request(next);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
-#endif
}
/* Must be called with bfqq != NULL */
-static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
{
- BUG_ON(!bfqq);
+ BUG_ON(bfqq == NULL);
if (bfq_bfqq_busy(bfqq))
bfqq->bfqd->wr_busy_queues--;
bfqq->wr_coeff = 1;
bfqq->wr_cur_max_time = 0;
/* Trigger a weight change on the next activation of the queue */
- bfqq->entity.prio_changed = 1;
+ bfqq->entity.ioprio_changed = 1;
}
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
@@ -1245,9 +1208,9 @@ static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
for (i = 0; i < 2; i++)
for (j = 0; j < IOPRIO_BE_NR; j++)
- if (bfqg->async_bfqq[i][j])
+ if (bfqg->async_bfqq[i][j] != NULL)
bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
- if (bfqg->async_idle_bfqq)
+ if (bfqg->async_idle_bfqq != NULL)
bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
}
@@ -1266,7 +1229,7 @@ static void bfq_end_wr(struct bfq_data *bfqd)
spin_unlock_irq(bfqd->queue->queue_lock);
}
-static sector_t bfq_io_struct_pos(void *io_struct, bool request)
+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
{
if (request)
return blk_rq_pos(io_struct);
@@ -1274,18 +1237,25 @@ static sector_t bfq_io_struct_pos(void *io_struct, bool request)
return ((struct bio *)io_struct)->bi_iter.bi_sector;
}
-static int bfq_rq_close_to_sector(void *io_struct, bool request,
- sector_t sector)
+static inline sector_t bfq_dist_from(sector_t pos1,
+ sector_t pos2)
{
- return abs64(bfq_io_struct_pos(io_struct, request) - sector) <=
- BFQQ_SEEK_THR;
+ if (pos1 >= pos2)
+ return pos1 - pos2;
+ else
+ return pos2 - pos1;
}
-static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
sector_t sector)
{
- struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
+ BFQQ_SEEK_THR;
+}
+
+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
+{
+ struct rb_root *root = &bfqd->rq_pos_tree;
struct rb_node *parent, *node;
struct bfq_queue *__bfqq;
@@ -1297,7 +1267,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
* request, choose it.
*/
__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
- if (__bfqq)
+ if (__bfqq != NULL)
return __bfqq;
/*
@@ -1313,7 +1283,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
node = rb_next(&__bfqq->pos_node);
else
node = rb_prev(&__bfqq->pos_node);
- if (!node)
+ if (node == NULL)
return NULL;
__bfqq = rb_entry(node, struct bfq_queue, pos_node);
@@ -1323,21 +1293,56 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
return NULL;
}
-static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
- struct bfq_queue *cur_bfqq,
- sector_t sector)
+/*
+ * bfqd - obvious
+ * cur_bfqq - passed in so that we don't decide that the current queue
+ * is closely cooperating with itself
+ * sector - used as a reference point to search for a close queue
+ */
+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
+ struct bfq_queue *cur_bfqq,
+ sector_t sector)
{
struct bfq_queue *bfqq;
+ if (bfq_class_idle(cur_bfqq))
+ return NULL;
+ if (!bfq_bfqq_sync(cur_bfqq))
+ return NULL;
+ if (BFQQ_SEEKY(cur_bfqq))
+ return NULL;
+
+ /* If device has only one backlogged bfq_queue, don't search. */
+ if (bfqd->busy_queues == 1)
+ return NULL;
+
+ /*
+ * We should notice if some of the queues are cooperating, e.g.
+ * working closely on the same area of the disk. In that case,
+	 * we can group them together and not waste time idling.
+ */
+ bfqq = bfqq_close(bfqd, sector);
+ if (bfqq == NULL || bfqq == cur_bfqq)
+ return NULL;
+
+ /*
+ * Do not merge queues from different bfq_groups.
+ */
+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
+ return NULL;
+
/*
- * We shall notice if some of the queues are cooperating,
- * e.g., working closely on the same area of the device. In
- * that case, we can group them together and: 1) don't waste
- * time idling, and 2) serve the union of their requests in
- * the best possible order for throughput.
+ * It only makes sense to merge sync queues.
*/
- bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
- if (!bfqq || bfqq == cur_bfqq)
+ if (!bfq_bfqq_sync(bfqq))
+ return NULL;
+ if (BFQQ_SEEKY(bfqq))
+ return NULL;
+
+ /*
+ * Do not merge queues of different priority classes.
+ */
+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
return NULL;
return bfqq;
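/*
 * Editor's sketch (standalone C, not part of the patch): the closeness test
 * the cooperator search above relies on. Two positions count as close when
 * their absolute sector distance stays within a seek threshold; the
 * threshold value used here is only illustrative, not BFQ's BFQQ_SEEK_THR.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Unsigned-safe absolute difference, same shape as bfq_dist_from(). */
static sector_t dist_from(sector_t pos1, sector_t pos2)
{
	return pos1 >= pos2 ? pos1 - pos2 : pos2 - pos1;
}

static bool close_to_sector(sector_t pos, sector_t sector, sector_t seek_thr)
{
	return dist_from(pos, sector) <= seek_thr;
}

int main(void)
{
	const sector_t seek_thr = 8 * 1024; /* assumed threshold, in sectors */

	printf("%d\n", close_to_sector(100000, 104096, seek_thr)); /* 1: close */
	printf("%d\n", close_to_sector(100000, 900000, seek_thr)); /* 0: far apart */
	return 0;
}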
@@ -1404,32 +1409,6 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
return new_bfqq;
}
-static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
- struct bfq_queue *new_bfqq)
-{
- if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
- (bfqq->ioprio_class != new_bfqq->ioprio_class))
- return false;
-
- /*
- * If either of the queues has already been detected as seeky,
- * then merging it with the other queue is unlikely to lead to
- * sequential I/O.
- */
- if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
- return false;
-
- /*
- * Interleaved I/O is known to be done by (some) applications
- * only for reads, so it does not make sense to merge async
- * queues.
- */
- if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
- return false;
-
- return true;
-}
-
/*
* Attempt to schedule a merge of bfqq with the currently in-service queue
* or with a close queue among the scheduled queues.
@@ -1451,52 +1430,56 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfqq->new_bfqq)
return bfqq->new_bfqq;
+
if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
return NULL;
- /* If device has only one backlogged bfq_queue, don't search. */
- if (bfqd->busy_queues == 1)
- return NULL;
in_service_bfqq = bfqd->in_service_queue;
- if (!in_service_bfqq || in_service_bfqq == bfqq ||
+ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
!bfqd->in_service_bic ||
unlikely(in_service_bfqq == &bfqd->oom_bfqq))
goto check_scheduled;
+ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
+ goto check_scheduled;
+
+ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
+ goto check_scheduled;
+
+ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
+ goto check_scheduled;
+
if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
- bfqq->entity.parent == in_service_bfqq->entity.parent &&
- bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
+ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
- if (new_bfqq)
- return new_bfqq;
+ if (new_bfqq != NULL)
+ return new_bfqq; /* Merge with in-service queue */
}
+
/*
* Check whether there is a cooperator among currently scheduled
* queues. The only thing we need is that the bio/request is not
* NULL, as we need it to establish whether a cooperator exists.
*/
check_scheduled:
- new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
- bfq_io_struct_pos(io_struct, request));
-
- BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
-
- if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
- bfq_may_be_close_cooperator(bfqq, new_bfqq))
+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
+ bfq_io_struct_pos(io_struct, request));
+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))
return bfq_setup_merge(bfqq, new_bfqq);
return NULL;
}
-static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+static inline void
+bfq_bfqq_save_state(struct bfq_queue *bfqq)
{
/*
- * If !bfqq->bic, the queue is already shared or its requests
+ * If bfqq->bic == NULL, the queue is already shared or its requests
* have already been redirected to a shared queue; both idle window
* and weight raising state have already been saved. Do nothing.
*/
- if (!bfqq->bic)
+ if (bfqq->bic == NULL)
return;
if (bfqq->bic->wr_time_left)
/*
@@ -1540,7 +1523,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
bfqq->bic->failed_cooperations = 0;
}
-static void bfq_get_bic_reference(struct bfq_queue *bfqq)
+static inline void
+bfq_get_bic_reference(struct bfq_queue *bfqq)
{
/*
* If bfqq->bic has a non-NULL value, the bic to which it belongs
@@ -1588,7 +1572,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
bfq_put_queue(bfqq);
}
-static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
+static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
{
struct bfq_io_cq *bic = bfqq->bic;
struct bfq_data *bfqd = bfqq->bfqd;
@@ -1619,7 +1603,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
* Queue lock is held here.
*/
bic = bfq_bic_lookup(bfqd, current->io_context);
- if (!bic)
+ if (bic == NULL)
return 0;
bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
@@ -1627,9 +1611,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
* We take advantage of this function to perform an early merge
* of the queues of possible cooperating processes.
*/
- if (bfqq) {
+ if (bfqq != NULL) {
new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
- if (new_bfqq) {
+ if (new_bfqq != NULL) {
bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
/*
* If we get here, the bio will be queued in the
@@ -1647,10 +1631,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
- if (bfqq) {
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
-#endif
+ if (bfqq != NULL) {
bfq_mark_bfqq_must_alloc(bfqq);
bfq_mark_bfqq_budget_new(bfqq);
bfq_clear_bfqq_fifo_expire(bfqq);
@@ -1658,7 +1639,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
bfq_log_bfqq(bfqd, bfqq,
- "set_in_service_queue, cur-budget = %d",
+ "set_in_service_queue, cur-budget = %lu",
bfqq->entity.budget);
}
@@ -1681,9 +1662,9 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
* stored in bfqd, which is dynamically updated according to the
* estimated disk peak rate; otherwise return the default max budget
*/
-static int bfq_max_budget(struct bfq_data *bfqd)
+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
{
- if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+ if (bfqd->budgets_assigned < 194)
return bfq_default_max_budget;
else
return bfqd->bfq_max_budget;
@@ -1693,9 +1674,9 @@ static int bfq_max_budget(struct bfq_data *bfqd)
* Return min budget, which is a fraction of the current or default
* max budget (trying with 1/32)
*/
-static int bfq_min_budget(struct bfq_data *bfqd)
+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
{
- if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+ if (bfqd->budgets_assigned < 194)
return bfq_default_max_budget / 32;
else
return bfqd->bfq_max_budget / 32;
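/*
 * Editor's illustration (userspace C, not part of the patch) of the budget
 * fallback above: until enough budgets have been assigned (the literal 194
 * in the patch), the compile-time default maximum is used, and the minimum
 * budget is always 1/32 of whichever maximum is in effect. The default
 * value below is assumed for the example, not the kernel's constant.
 */
#include <stdio.h>

#define DEFAULT_MAX_BUDGET 16384UL /* stand-in for bfq_default_max_budget */

static unsigned long max_budget(unsigned long budgets_assigned,
                                unsigned long measured_max)
{
	return budgets_assigned < 194 ? DEFAULT_MAX_BUDGET : measured_max;
}

static unsigned long min_budget(unsigned long budgets_assigned,
                                unsigned long measured_max)
{
	return max_budget(budgets_assigned, measured_max) / 32;
}

int main(void)
{
	/* Early on the default wins; once tuned, the measured estimate is used. */
	printf("early: max=%lu min=%lu\n",
	       max_budget(10, 40000), min_budget(10, 40000));
	printf("later: max=%lu min=%lu\n",
	       max_budget(500, 40000), min_budget(500, 40000));
	return 0;
}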
@@ -1711,7 +1692,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
/* Processes have exited, don't wait. */
bic = bfqd->in_service_bic;
- if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
return;
bfq_mark_bfqq_wait_request(bfqq);
@@ -1737,15 +1718,12 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
bfq_max_budget(bfqq->bfqd) / 8) ||
bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
- bfq_symmetric_scenario(bfqd))
+ symmetric_scenario)
sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
else if (bfqq->wr_coeff > 1)
sl = sl * 3;
bfqd->last_idling_start = ktime_get();
mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
-#endif
bfq_log(bfqd, "arm idle: %u/%u ms",
jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
}
@@ -1799,10 +1777,6 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
if (bfq_bfqq_sync(bfqq))
bfqd->sync_flight++;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq),
- rq->cmd_flags);
-#endif
}
/*
@@ -1828,7 +1802,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
return rq;
}
-static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
return entity->budget - entity->service;
@@ -1862,7 +1836,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
/*
* Resort priority tree of potential close cooperators.
*/
- bfq_pos_tree_add_move(bfqd, bfqq);
+ bfq_rq_pos_tree_add(bfqd, bfqq);
}
}
@@ -1872,24 +1846,24 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* @bfqq: queue to update.
* @reason: reason for expiration.
*
- * Handle the feedback on @bfqq budget at queue expiration.
- * See the body for detailed comments.
+ * Handle the feedback on @bfqq budget. See the body for detailed
+ * comments.
*/
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
enum bfqq_expiration reason)
{
struct request *next_rq;
- int budget, min_budget;
+ unsigned long budget, min_budget;
budget = bfqq->max_budget;
min_budget = bfq_min_budget(bfqd);
BUG_ON(bfqq != bfqd->in_service_queue);
- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
budget, bfq_min_budget(bfqd));
bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
@@ -1966,19 +1940,18 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
default:
return;
}
- } else
- /*
- * Async queues get always the maximum possible budget
- * (their ability to dispatch is limited by
- * @bfqd->bfq_max_budget_async_rq).
- */
+ } else /* async queue */
+		/* async queues always get the maximum possible budget
+ * (their ability to dispatch is limited by
+ * @bfqd->bfq_max_budget_async_rq).
+ */
budget = bfqd->bfq_max_budget;
bfqq->max_budget = budget;
- if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
- !bfqd->bfq_user_max_budget)
- bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
+ bfqq->max_budget > bfqd->bfq_max_budget)
+ bfqq->max_budget = bfqd->bfq_max_budget;
/*
* Make sure that we have enough budget for the next request.
@@ -1987,14 +1960,14 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
* update.
*/
next_rq = bfqq->next_rq;
- if (next_rq)
+ if (next_rq != NULL)
bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(next_rq, bfqq));
else
bfqq->entity.budget = bfqq->max_budget;
- bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
- next_rq ? blk_rq_sectors(next_rq) : 0,
+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
bfqq->entity.budget);
}
@@ -2020,15 +1993,15 @@ static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
* seeky processes, and hence reduce their chances to lower the
* throughput. See the code for more details.
*/
-static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- bool compensate, enum bfqq_expiration reason)
+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ int compensate, enum bfqq_expiration reason)
{
u64 bw, usecs, expected, timeout;
ktime_t delta;
int update = 0;
if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
- return false;
+ return 0;
if (compensate)
delta = bfqd->last_idling_start;
@@ -2039,7 +2012,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/* Don't trust short/unrealistic values. */
if (usecs < 100 || usecs >= LONG_MAX)
- return false;
+ return 0;
/*
* Calculate the bandwidth for the last slice. We use a 64 bit
@@ -2088,7 +2061,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->bfq_max_budget =
bfq_calc_max_budget(bfqd->peak_rate,
timeout);
- bfq_log(bfqd, "new max_budget=%d",
+ bfq_log(bfqd, "new max_budget=%lu",
bfqd->bfq_max_budget);
}
if (bfqd->device_speed == BFQ_BFQD_FAST &&
@@ -2113,7 +2086,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* and for the moment return false.
*/
if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
- return false;
+ return 0;
/*
* A process is considered ``slow'' (i.e., seeky, so that we
@@ -2188,8 +2161,8 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* seems to be quite precise also in embedded systems and KVM/QEMU virtual
* machines.
*/
-static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
+static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
{
return max(bfqq->last_idle_bklogged +
HZ * bfqq->service_from_backlogged /
@@ -2202,7 +2175,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
* the current time will be lower than this time instant according to the macro
* time_is_before_jiffies().
*/
-static unsigned long bfq_infinity_from_now(unsigned long now)
+static inline unsigned long bfq_infinity_from_now(unsigned long now)
{
return now + ULONG_MAX / 2;
}
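/*
 * Editor's sketch (userspace C, not part of the patch) of the arithmetic in
 * the two helpers above. HZ and the guard offset are assumptions for the
 * example; the kernel's second max() argument is cut off by the diff
 * context, so only its role as a lower bound is reproduced. "Infinity" is
 * encoded as now + ULONG_MAX/2, so that the current time always looks
 * earlier than it to jiffies-style comparisons.
 */
#include <limits.h>
#include <stdio.h>

#define HZ 250UL          /* assumed tick rate */
#define GUARD_JIFFIES 4UL /* assumed small guard offset */

static unsigned long softrt_next_start(unsigned long last_idle_bklogged,
                                       unsigned long service_from_backlogged,
                                       unsigned long max_softrt_rate,
                                       unsigned long now)
{
	unsigned long rate_based = last_idle_bklogged +
	                           HZ * service_from_backlogged / max_softrt_rate;
	unsigned long floor = now + GUARD_JIFFIES;

	return rate_based > floor ? rate_based : floor;
}

static unsigned long infinity_from_now(unsigned long now)
{
	return now + ULONG_MAX / 2;
}

int main(void)
{
	unsigned long now = 100000;

	printf("next soft-rt start: %lu\n",
	       softrt_next_start(now - 50, 2048, 7000, now));
	printf("practically infinite instant: %lu\n", infinity_from_now(now));
	return 0;
}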
@@ -2239,14 +2212,13 @@ static unsigned long bfq_infinity_from_now(unsigned long now)
*/
static void bfq_bfqq_expire(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
- bool compensate,
+ int compensate,
enum bfqq_expiration reason)
{
- bool slow;
+ int slow;
BUG_ON(bfqq != bfqd->in_service_queue);
- /*
- * Update disk peak rate for autotuning and check whether the
+ /* Update disk peak rate for autotuning and check whether the
* process is slow (see bfq_update_peak_rate).
*/
slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
@@ -2340,12 +2312,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
* just checked on request arrivals and completions, as well as on
* idle timer expirations.
*/
-static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
if (bfq_bfqq_budget_new(bfqq) ||
time_before(jiffies, bfqq->budget_timeout))
- return false;
- return true;
+ return 0;
+ return 1;
}
/*
@@ -2356,7 +2328,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
* does not hold, or if the queue is slow enough to deserve only to be
* kicked off for preserving a high throughput.
*/
-static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
bfq_log_bfqq(bfqq->bfqd, bfqq,
"may_budget_timeout: wait_request %d left %d timeout %d",
@@ -2371,278 +2343,183 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
}
/*
- * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for that queue. As a consequence, since
- * device idling plays a critical role for both throughput boosting
- * and service guarantees, the return value of this function plays a
- * critical role as well.
+ * Device idling is allowed only for the queues for which this function
+ * returns true. For this reason, the return value of this function plays a
+ * critical role for both throughput boosting and service guarantees. The
+ * return value is computed through a logical expression. In this rather
+ * long comment, we try to briefly describe all the details and motivations
+ * behind the components of this logical expression.
+ *
+ * First, the expression is false if bfqq is not sync, or if: bfqq happened
+ * to become active during a large burst of queue activations, and the
+ * pattern of requests bfqq contains boosts the throughput if bfqq is
+ * expired. In fact, queues that became active during a large burst benefit
+ * only from throughput, as discussed in the comments to bfq_handle_burst.
+ * In this respect, expiring bfqq certainly boosts the throughput on NCQ-
+ * capable flash-based devices, whereas, on rotational devices, it boosts
+ * the throughput only if bfqq contains random requests.
+ *
+ * On the opposite end, if (a) bfqq is sync, (b) the above burst-related
+ * condition does not hold, and (c) bfqq is being weight-raised, then the
+ * expression always evaluates to true, as device idling is instrumental
+ * for preserving low-latency guarantees (see [1]). If, instead, conditions
+ * (a) and (b) do hold, but (c) does not, then the expression evaluates to
+ * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and
+ * (2) at least one of the following two conditions holds.
+ * The first condition is that the device is not performing NCQ, because
+ * idling the device most certainly boosts the throughput if this condition
+ * holds and bfqq is I/O-bound and has been granted a non-null idle window.
+ * The second compound condition is made of the logical AND of two components.
+ *
+ * The first component is true only if there is no weight-raised busy
+ * queue. This guarantees that the device is not idled for a sync non-
+ * weight-raised queue when there are busy weight-raised queues. The former
+ * is then expired immediately if empty. Combined with the timestamping
+ * rules of BFQ (see [1] for details), this causes sync non-weight-raised
+ * queues to get a lower number of requests served, and hence to ask for a
+ * lower number of requests from the request pool, before the busy weight-
+ * raised queues get served again.
+ *
+ * This is beneficial for the processes associated with weight-raised
+ * queues, when the request pool is saturated (e.g., in the presence of
+ * write hogs). In fact, if the processes associated with the other queues
+ * ask for requests at a lower rate, then weight-raised processes have a
+ * higher probability to get a request from the pool immediately (or at
+ * least soon) when they need one. Hence they have a higher probability to
+ * actually get a fraction of the disk throughput proportional to their
+ * high weight. This is especially true with NCQ-capable drives, which
+ * enqueue several requests in advance and further reorder internally-
+ * queued requests.
+ *
+ * In the end, mistreating non-weight-raised queues when there are busy
+ * weight-raised queues seems to mitigate starvation problems in the
+ * presence of heavy write workloads and NCQ, and hence to guarantee a
+ * higher application and system responsiveness in these hostile scenarios.
+ *
+ * If the first component of the compound condition is instead true, i.e.,
+ * there is no weight-raised busy queue, then the second component of the
+ * compound condition takes into account service-guarantee and throughput
+ * issues related to NCQ (recall that the compound condition is evaluated
+ * only if the device is detected as supporting NCQ).
*
- * In a nutshell, this function returns true only if idling is
- * beneficial for throughput or, even if detrimental for throughput,
- * idling is however necessary to preserve service guarantees (low
- * latency, desired throughput distribution, ...). In particular, on
- * NCQ-capable devices, this function tries to return false, so as to
- * help keep the drives' internal queues full, whenever this helps the
- * device boost the throughput without causing any service-guarantee
- * issue.
+ * As for service guarantees, allowing the drive to enqueue more than one
+ * request at a time, and hence delegating de facto final scheduling
+ * decisions to the drive's internal scheduler, causes loss of control on
+ * the actual request service order. In this respect, when the drive is
+ * allowed to enqueue more than one request at a time, the service
+ * distribution enforced by the drive's internal scheduler is likely to
+ * coincide with the desired device-throughput distribution only in the
+ * following, perfectly symmetric, scenario:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ * weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ * number of children.
+ *
+ * Even in such a scenario, sequential I/O may still receive a preferential
+ * treatment, but this is not likely to be a big issue with flash-based
+ * devices, because of their non-dramatic loss of throughput with random
+ * I/O. Things do differ with HDDs, for which additional care is taken, as
+ * explained after completing the discussion for flash-based devices.
*
- * In more detail, the return value of this function is obtained by,
- * first, computing a number of boolean variables that take into
- * account throughput and service-guarantee issues, and, then,
- * combining these variables in a logical expression. Most of the
- * issues taken into account are not trivial. We discuss these issues
- * while introducing the variables.
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore BFQ evaluates instead the following stronger sub-conditions,
+ * for which it is much easier to maintain the needed state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, hence no state needs
+ * to be maintained in this case.
+ *
+ * According to the above considerations, the second component of the
+ * compound condition evaluates to true if any of the above symmetry
+ * sub-conditions does not hold, or the device is not flash-based. Therefore,
+ * if the first component is also true, then idling is allowed for a sync
+ * queue. These are the only sub-conditions considered if the device is
+ * flash-based, as, for such a device, it is sensible to force idling only
+ * for service-guarantee issues. In fact, as for throughput, idling
+ * NCQ-capable flash-based devices would not boost the throughput even
+ * with sequential I/O; rather it would lower the throughput in proportion
+ * to how fast the device is. In the end, (only) if all the three
+ * sub-conditions hold and the device is flash-based, the compound
+ * condition evaluates to false and therefore no idling is performed.
+ *
+ * As already said, things change with a rotational device, where idling
+ * boosts the throughput with sequential I/O (even with NCQ). Hence, for
+ * such a device the second component of the compound condition evaluates
+ * to true also if the following additional sub-condition does not hold:
+ * the queue is constantly seeky. Unfortunately, this different behavior
+ * with respect to flash-based devices causes an additional asymmetry: if
+ * some sync queues enjoy idling and some other sync queues do not, then
+ * the latter get a low share of the device throughput, simply because the
+ * former get many requests served after being set as in service, whereas
+ * the latter do not. As a consequence, to guarantee the desired throughput
+ * distribution, on HDDs the compound expression evaluates to true (and
+ * hence device idling is performed) also if the following last symmetry
+ * condition does not hold: no other queue is benefiting from idling. Also
+ * this last condition is actually replaced with a simpler-to-maintain and
+ * stronger condition: there is no busy queue which is not constantly seeky
+ * (and hence may also benefit from idling).
+ *
+ * To sum up, when all the required symmetry and throughput-boosting
+ * sub-conditions hold, the second component of the compound condition
+ * evaluates to false, and hence no idling is performed. This helps to
+ * keep the drives' internal queues full on NCQ-capable devices, and hence
+ * to boost the throughput, without causing 'almost' any loss of service
+ * guarantees. The 'almost' follows from the fact that, if the internal
+ * queue of one such device is filled while all the sub-conditions hold,
+ * but at some point in time some sub-condition stops holding, then it may
+ * become impossible to let requests be served in the new desired order
+ * until all the requests already queued in the device have been served.
*/
-static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
- bool idling_boosts_thr, idling_boosts_thr_without_issues,
- all_queues_seeky, on_hdd_and_not_all_queues_seeky,
- idling_needed_for_service_guarantees,
- asymmetric_scenario;
-
- /*
- * The next variable takes into account the cases where idling
- * boosts the throughput.
- *
- * The value of the variable is computed considering, first, that
- * idling is virtually always beneficial for the throughput if:
- * (a) the device is not NCQ-capable, or
- * (b) regardless of the presence of NCQ, the device is rotational
- * and the request pattern for bfqq is I/O-bound and sequential.
- *
- * Secondly, and in contrast to the above item (b), idling an
- * NCQ-capable flash-based device would not boost the
- * throughput even with sequential I/O; rather it would lower
- * the throughput in proportion to how fast the device
- * is. Accordingly, the next variable is true if any of the
- * above conditions (a) and (b) is true, and, in particular,
- * happens to be false if bfqd is an NCQ-capable flash-based
- * device.
- */
- idling_boosts_thr = !bfqd->hw_tag ||
- (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
- bfq_bfqq_idle_window(bfqq)) ;
+#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \
+ bfqd->busy_in_flight_queues == \
+ bfqd->const_seeky_busy_in_flight_queues)
- /*
- * The value of the next variable,
- * idling_boosts_thr_without_issues, is equal to that of
- * idling_boosts_thr, unless a special case holds. In this
- * special case, described below, idling may cause problems to
- * weight-raised queues.
- *
- * When the request pool is saturated (e.g., in the presence
- * of write hogs), if the processes associated with
- * non-weight-raised queues ask for requests at a lower rate,
- * then processes associated with weight-raised queues have a
- * higher probability to get a request from the pool
- * immediately (or at least soon) when they need one. Thus
- * they have a higher probability to actually get a fraction
- * of the device throughput proportional to their high
- * weight. This is especially true with NCQ-capable drives,
- * which enqueue several requests in advance, and further
- * reorder internally-queued requests.
- *
- * For this reason, we force to false the value of
- * idling_boosts_thr_without_issues if there are weight-raised
- * busy queues. In this case, and if bfqq is not weight-raised,
- * this guarantees that the device is not idled for bfqq (if,
- * instead, bfqq is weight-raised, then idling will be
- * guaranteed by another variable, see below). Combined with
- * the timestamping rules of BFQ (see [1] for details), this
- * behavior causes bfqq, and hence any sync non-weight-raised
- * queue, to get a lower number of requests served, and thus
- * to ask for a lower number of requests from the request
- * pool, before the busy weight-raised queues get served
- * again. This often mitigates starvation problems in the
- * presence of heavy write workloads and NCQ, thereby
- * guaranteeing a higher application and system responsiveness
- * in these hostile scenarios.
- */
- idling_boosts_thr_without_issues = idling_boosts_thr &&
- bfqd->wr_busy_queues == 0;
-
- /*
- * There are then two cases where idling must be performed not
- * for throughput concerns, but to preserve service
- * guarantees. In the description of these cases, we say, for
- * short, that a queue is sequential/random if the process
- * associated to the queue issues sequential/random requests
- * (in the second case the queue may be tagged as seeky or
- * even constantly_seeky).
- *
- * To introduce the first case, we note that, since
- * bfq_bfqq_idle_window(bfqq) is false if the device is
- * NCQ-capable and bfqq is random (see
- * bfq_update_idle_window()), then, from the above two
- * assignments it follows that
- * idling_boosts_thr_without_issues is false if the device is
- * NCQ-capable and bfqq is random. Therefore, for this case,
- * device idling would never be allowed if we used just
- * idling_boosts_thr_without_issues to decide whether to allow
- * it. And, beneficially, this would imply that throughput
- * would always be boosted also with random I/O on NCQ-capable
- * HDDs.
- *
- * But we must be careful on this point, to avoid an unfair
- * treatment for bfqq. In fact, because of the same above
- * assignments, idling_boosts_thr_without_issues is, on the
- * other hand, true if 1) the device is an HDD and bfqq is
- * sequential, and 2) there are no busy weight-raised
- * queues. As a consequence, if we used just
- * idling_boosts_thr_without_issues to decide whether to idle
- * the device, then with an HDD we might easily bump into a
- * scenario where queues that are sequential and I/O-bound
- * would enjoy idling, whereas random queues would not. The
- * latter might then get a low share of the device throughput,
- * simply because the former would get many requests served
- * after being set as in service, while the latter would not.
- *
- * To address this issue, we start by setting to true a
- * sentinel variable, on_hdd_and_not_all_queues_seeky, if the
- * device is rotational and not all queues with pending or
- * in-flight requests are constantly seeky (i.e., there are
- * active sequential queues, and bfqq might then be mistreated
- * if it does not enjoy idling because it is random).
- */
- all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&
- bfqd->busy_in_flight_queues ==
- bfqd->const_seeky_busy_in_flight_queues;
-
- on_hdd_and_not_all_queues_seeky =
- !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;
-
- /*
- * To introduce the second case where idling needs to be
- * performed to preserve service guarantees, we can note that
- * allowing the drive to enqueue more than one request at a
- * time, and hence delegating de facto final scheduling
- * decisions to the drive's internal scheduler, causes loss of
- * control on the actual request service order. In particular,
- * the critical situation is when requests from different
- * processes happens to be present, at the same time, in the
- * internal queue(s) of the drive. In such a situation, the
- * drive, by deciding the service order of the
- * internally-queued requests, does determine also the actual
- * throughput distribution among these processes. But the
- * drive typically has no notion or concern about per-process
- * throughput distribution, and makes its decisions only on a
- * per-request basis. Therefore, the service distribution
- * enforced by the drive's internal scheduler is likely to
- * coincide with the desired device-throughput distribution
- * only in a completely symmetric scenario where:
- * (i) each of these processes must get the same throughput as
- * the others;
- * (ii) all these processes have the same I/O pattern
- (either sequential or random).
- * In fact, in such a scenario, the drive will tend to treat
- * the requests of each of these processes in about the same
- * way as the requests of the others, and thus to provide
- * each of these processes with about the same throughput
- * (which is exactly the desired throughput distribution). In
- * contrast, in any asymmetric scenario, device idling is
- * certainly needed to guarantee that bfqq receives its
- * assigned fraction of the device throughput (see [1] for
- * details).
- *
- * We address this issue by controlling, actually, only the
- * symmetry sub-condition (i), i.e., provided that
- * sub-condition (i) holds, idling is not performed,
- * regardless of whether sub-condition (ii) holds. In other
- * words, only if sub-condition (i) holds, then idling is
- * allowed, and the device tends to be prevented from queueing
- * many requests, possibly of several processes. The reason
- * for not controlling also sub-condition (ii) is that, first,
- * in the case of an HDD, the asymmetry in terms of types of
- * I/O patterns is already taken in to account in the above
- * sentinel variable
- * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a
- * flash-based device, we prefer however to privilege
- * throughput (and idling lowers throughput for this type of
- * devices), for the following reasons:
- * 1) differently from HDDs, the service time of random
- * requests is not orders of magnitudes lower than the service
- * time of sequential requests; thus, even if processes doing
- * sequential I/O get a preferential treatment with respect to
- * others doing random I/O, the consequences are not as
- * dramatic as with HDDs;
- * 2) if a process doing random I/O does need strong
- * throughput guarantees, it is hopefully already being
- * weight-raised, or the user is likely to have assigned it a
- * higher weight than the other processes (and thus
- * sub-condition (i) is likely to be false, which triggers
- * idling).
- *
- * According to the above considerations, the next variable is
- * true (only) if sub-condition (i) holds. To compute the
- * value of this variable, we not only use the return value of
- * the function bfq_symmetric_scenario(), but also check
- * whether bfqq is being weight-raised, because
- * bfq_symmetric_scenario() does not take into account also
- * weight-raised queues (see comments to
- * bfq_weights_tree_add()).
- *
- * As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops to hold, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
- */
- asymmetric_scenario = bfqq->wr_coeff > 1 ||
- !bfq_symmetric_scenario(bfqd);
+#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \
+ bfqd->hw_tag && \
+ (blk_queue_nonrot(bfqd->queue) || \
+ bfq_bfqq_constantly_seeky(bfqq)))
- /*
- * Finally, there is a case where maximizing throughput is the
- * best choice even if it may cause unfairness toward
- * bfqq. Such a case is when bfqq became active in a burst of
- * queue activations. Queues that became active during a large
- * burst benefit only from throughput, as discussed in the
- * comments to bfq_handle_burst. Thus, if bfqq became active
- * in a burst and not idling the device maximizes throughput,
- * then the device must not be idled, because not idling the
- * device provides bfqq and all other queues in the burst with
- * maximum benefit. Combining this and the two cases above, we
- * can now establish when idling is actually needed to
- * preserve service guarantees.
- */
- idling_needed_for_service_guarantees =
- (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&
- !bfq_bfqq_in_large_burst(bfqq);
+/*
+ * Condition for expiring a non-weight-raised queue (and hence not idling
+ * the device).
+ */
+#define cond_for_expiring_non_wr (bfqd->hw_tag && \
+ (bfqd->wr_busy_queues > 0 || \
+ (blk_queue_nonrot(bfqd->queue) || \
+ cond_for_seeky_on_ncq_hdd)))
- /*
- * We have now all the components we need to compute the return
- * value of the function, which is true only if both the following
- * conditions hold:
- * 1) bfqq is sync, because idling make sense only for sync queues;
- * 2) idling either boosts the throughput (without issues), or
- * is necessary to preserve service guarantees.
- */
return bfq_bfqq_sync(bfqq) &&
- (idling_boosts_thr_without_issues ||
- idling_needed_for_service_guarantees);
+ !cond_for_expiring_in_burst &&
+ (bfqq->wr_coeff > 1 || !symmetric_scenario ||
+ (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&
+ !cond_for_expiring_non_wr)
+ );
}
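/*
 * Editor's illustration (not kernel code): the compound condition computed
 * above, restated as one standalone function over plain booleans instead of
 * the bfqq/bfqd fields and the three helper macros, to make the precedence
 * of the sub-conditions easier to follow. "symmetric" stands for the
 * symmetric_scenario value defined elsewhere in the patch.
 */
#include <stdbool.h>

static bool must_not_expire(bool sync, bool wr_raised, bool symmetric,
                            bool io_bound, bool idle_window,
                            bool in_large_burst, bool constantly_seeky,
                            bool all_busy_queues_seeky, bool hw_tag,
                            bool nonrot, bool wr_busy_queues)
{
	bool seeky_on_ncq_hdd = constantly_seeky && all_busy_queues_seeky;
	bool expiring_in_burst = in_large_burst && hw_tag &&
	                         (nonrot || constantly_seeky);
	bool expiring_non_wr = hw_tag &&
	                       (wr_busy_queues || nonrot || seeky_on_ncq_hdd);

	return sync && !expiring_in_burst &&
	       (wr_raised || !symmetric ||
	        (io_bound && idle_window && !expiring_non_wr));
}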
/*
- * If the in-service queue is empty but the function bfq_bfqq_may_idle
- * returns true, then:
+ * If the in-service queue is empty but sync, and the function
+ * bfq_bfqq_must_not_expire returns true, then:
* 1) the queue must remain in service and cannot be expired, and
- * 2) the device must be idled to wait for the possible arrival of a new
+ * 2) the disk must be idled to wait for the possible arrival of a new
* request for the queue.
- * See the comments to the function bfq_bfqq_may_idle for the reasons
+ * See the comments to the function bfq_bfqq_must_not_expire for the reasons
* why performing device idling is the best choice to boost the throughput
- * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * and preserve service guarantees when bfq_bfqq_must_not_expire itself
* returns true.
*/
-static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
- bfq_bfqq_may_idle(bfqq);
+ bfq_bfqq_must_not_expire(bfqq);
}
/*
@@ -2656,7 +2533,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
bfqq = bfqd->in_service_queue;
- if (!bfqq)
+ if (bfqq == NULL)
goto new_queue;
bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
@@ -2671,7 +2548,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
* If bfqq has requests queued and it has enough budget left to
* serve them, keep the queue, otherwise expire it.
*/
- if (next_rq) {
+ if (next_rq != NULL) {
if (bfq_serv_to_charge(next_rq, bfqq) >
bfq_bfqq_budget_left(bfqq)) {
reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
@@ -2698,9 +2575,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
*/
bfq_clear_bfqq_wait_request(bfqq);
del_timer(&bfqd->idle_slice_timer);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_idle_time(bfqq_group(bfqq));
-#endif
}
goto keep_queue;
}
@@ -2712,18 +2586,18 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
* may idle after their completion, then keep it anyway.
*/
if (timer_pending(&bfqd->idle_slice_timer) ||
- (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
bfqq = NULL;
goto keep_queue;
}
reason = BFQ_BFQQ_NO_MORE_REQUESTS;
expire:
- bfq_bfqq_expire(bfqd, bfqq, false, reason);
+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
new_queue:
bfqq = bfq_set_in_service_queue(bfqd);
bfq_log(bfqd, "select_queue: new queue %d returned",
- bfqq ? bfqq->pid : 0);
+ bfqq != NULL ? bfqq->pid : 0);
keep_queue:
return bfqq;
}
@@ -2741,7 +2615,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
entity->orig_weight * bfqq->wr_coeff);
- if (entity->prio_changed)
+ if (entity->ioprio_changed)
bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
/*
@@ -2785,7 +2659,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
/* Follow expired path, else get first next available. */
rq = bfq_check_fifo(bfqq);
- if (!rq)
+ if (rq == NULL)
rq = bfqq->next_rq;
service_to_charge = bfq_serv_to_charge(rq, bfqq);
@@ -2821,14 +2695,14 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
bfq_update_wr_data(bfqd, bfqq);
bfq_log_bfqq(bfqd, bfqq,
- "dispatched %u sec req (%llu), budg left %d",
+ "dispatched %u sec req (%llu), budg left %lu",
blk_rq_sectors(rq),
(long long unsigned)blk_rq_pos(rq),
bfq_bfqq_budget_left(bfqq));
dispatched++;
- if (!bfqd->in_service_bic) {
+ if (bfqd->in_service_bic == NULL) {
atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
bfqd->in_service_bic = RQ_BIC(rq);
}
@@ -2841,7 +2715,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
return dispatched;
expire:
- bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
return dispatched;
}
@@ -2849,7 +2723,7 @@ static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
{
int dispatched = 0;
- while (bfqq->next_rq) {
+ while (bfqq->next_rq != NULL) {
bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
dispatched++;
}
@@ -2869,7 +2743,7 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd)
int dispatched = 0;
bfqq = bfqd->in_service_queue;
- if (bfqq)
+ if (bfqq != NULL)
__bfq_bfqq_expire(bfqd, bfqq);
/*
@@ -2905,7 +2779,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
return bfq_forced_dispatch(bfqd);
bfqq = bfq_select_queue(bfqd);
- if (!bfqq)
+ if (bfqq == NULL)
return 0;
if (bfq_class_idle(bfqq))
@@ -2945,9 +2819,6 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
static void bfq_put_queue(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_group *bfqg = bfqq_group(bfqq);
-#endif
BUG_ON(atomic_read(&bfqq->ref) <= 0);
@@ -2956,9 +2827,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
if (!atomic_dec_and_test(&bfqq->ref))
return;
- BUG_ON(rb_first(&bfqq->sort_list));
+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
- BUG_ON(bfqq->entity.tree);
+ BUG_ON(bfqq->entity.tree != NULL);
BUG_ON(bfq_bfqq_busy(bfqq));
BUG_ON(bfqd->in_service_queue == bfqq);
@@ -2976,9 +2847,6 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
kmem_cache_free(bfq_pool, bfqq);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_put(bfqg);
-#endif
}
static void bfq_put_cooperator(struct bfq_queue *bfqq)
@@ -3015,7 +2883,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_put_queue(bfqq);
}
-static void bfq_init_icq(struct io_cq *icq)
+static inline void bfq_init_icq(struct io_cq *icq)
{
struct bfq_io_cq *bic = icq_to_bic(icq);
@@ -3082,38 +2950,40 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b
/*
* No prio set, inherit CPU scheduling settings.
*/
- bfqq->new_ioprio = task_nice_ioprio(tsk);
- bfqq->new_ioprio_class = task_nice_ioclass(tsk);
+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
break;
case IOPRIO_CLASS_RT:
- bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
- bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
break;
case IOPRIO_CLASS_BE:
- bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
- bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
break;
case IOPRIO_CLASS_IDLE:
- bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
- bfqq->new_ioprio = 7;
+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
+ bfqq->entity.new_ioprio = 7;
bfq_clear_bfqq_idle_window(bfqq);
break;
}
- if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) {
+ if (bfqq->entity.new_ioprio < 0 ||
+ bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {
printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",
- bfqq->new_ioprio);
+ bfqq->entity.new_ioprio);
BUG();
}
- bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
- bfqq->entity.prio_changed = 1;
+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio);
+ bfqq->entity.ioprio_changed = 1;
}
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic)
{
struct bfq_data *bfqd;
struct bfq_queue *bfqq, *new_bfqq;
+ struct bfq_group *bfqg;
unsigned long uninitialized_var(flags);
int ioprio = bic->icq.ioc->ioprio;
@@ -3123,16 +2993,18 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
* This condition may trigger on a newly created bic, be sure to
* drop the lock before returning.
*/
- if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
goto out;
bic->ioprio = ioprio;
bfqq = bic->bfqq[BLK_RW_ASYNC];
- if (bfqq) {
- new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic,
+ if (bfqq != NULL) {
+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
+ sched_data);
+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
GFP_ATOMIC);
- if (new_bfqq) {
+ if (new_bfqq != NULL) {
bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
bfq_log_bfqq(bfqd, bfqq,
"check_ioprio_change: bfqq %p %d",
@@ -3142,7 +3014,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
}
bfqq = bic->bfqq[BLK_RW_SYNC];
- if (bfqq)
+ if (bfqq != NULL)
bfq_set_next_ioprio_data(bfqq, bic);
out:
@@ -3166,8 +3038,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (!bfq_class_idle(bfqq))
bfq_mark_bfqq_idle_window(bfqq);
bfq_mark_bfqq_sync(bfqq);
- } else
- bfq_clear_bfqq_sync(bfqq);
+ }
bfq_mark_bfqq_IO_bound(bfqq);
/* Tentative initial value to trade off between thr and lat */
@@ -3184,19 +3055,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
- struct bio *bio, int is_sync,
+ struct bfq_group *bfqg,
+ int is_sync,
struct bfq_io_cq *bic,
gfp_t gfp_mask)
{
- struct bfq_group *bfqg;
struct bfq_queue *bfqq, *new_bfqq = NULL;
- struct blkcg *blkcg;
retry:
- rcu_read_lock();
-
- blkcg = bio_blkcg(bio);
- bfqg = bfq_find_alloc_group(bfqd, blkcg);
/* bic always exists here */
bfqq = bic_to_bfqq(bic, is_sync);
@@ -3204,19 +3070,18 @@ retry:
* Always try a new alloc if we fall back to the OOM bfqq
* originally, since it should just be a temporary situation.
*/
- if (!bfqq || bfqq == &bfqd->oom_bfqq) {
+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
bfqq = NULL;
- if (new_bfqq) {
+ if (new_bfqq != NULL) {
bfqq = new_bfqq;
new_bfqq = NULL;
} else if (gfp_mask & __GFP_WAIT) {
- rcu_read_unlock();
spin_unlock_irq(bfqd->queue->queue_lock);
new_bfqq = kmem_cache_alloc_node(bfq_pool,
gfp_mask | __GFP_ZERO,
bfqd->queue->node);
spin_lock_irq(bfqd->queue->queue_lock);
- if (new_bfqq)
+ if (new_bfqq != NULL)
goto retry;
} else {
bfqq = kmem_cache_alloc_node(bfq_pool,
@@ -3224,7 +3089,7 @@ retry:
bfqd->queue->node);
}
- if (bfqq) {
+ if (bfqq != NULL) {
bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
is_sync);
bfq_init_entity(&bfqq->entity, bfqg);
@@ -3235,11 +3100,9 @@ retry:
}
}
- if (new_bfqq)
+ if (new_bfqq != NULL)
kmem_cache_free(bfq_pool, new_bfqq);
- rcu_read_unlock();
-
return bfqq;
}
@@ -3263,7 +3126,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
}
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
- struct bio *bio, int is_sync,
+ struct bfq_group *bfqg, int is_sync,
struct bfq_io_cq *bic, gfp_t gfp_mask)
{
const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
@@ -3272,26 +3135,19 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
struct bfq_queue *bfqq = NULL;
if (!is_sync) {
- struct blkcg *blkcg;
- struct bfq_group *bfqg;
-
- rcu_read_lock();
- blkcg = bio_blkcg(bio);
- rcu_read_unlock();
- bfqg = bfq_find_alloc_group(bfqd, blkcg);
async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
ioprio);
bfqq = *async_bfqq;
}
- if (!bfqq)
- bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask);
+ if (bfqq == NULL)
+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
/*
* Pin the queue now that it's allocated, scheduler exit will
* prune it.
*/
- if (!is_sync && !(*async_bfqq)) {
+ if (!is_sync && *async_bfqq == NULL) {
atomic_inc(&bfqq->ref);
bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
bfqq, atomic_read(&bfqq->ref));
@@ -3424,9 +3280,9 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
- bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
- blk_rq_sectors(rq) < 32;
- bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+ blk_rq_sectors(rq) < 32;
+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
/*
* There is just this request queued: if the request
@@ -3453,9 +3309,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
bfq_clear_bfqq_wait_request(bfqq);
del_timer(&bfqd->idle_slice_timer);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_idle_time(bfqq_group(bfqq));
-#endif
/*
* The queue is not empty, because a new request just
@@ -3465,8 +3318,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* See [1] for more details.
*/
if (budget_timeout)
- bfq_bfqq_expire(bfqd, bfqq, false,
- BFQ_BFQQ_BUDGET_TIMEOUT);
+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
/*
* Let the request rip immediately, or let a new queue be
@@ -3490,7 +3342,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
*/
if (!in_interrupt()) {
new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
- if (new_bfqq) {
+ if (new_bfqq != NULL) {
if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
/*
@@ -3518,7 +3370,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
* from assigning it a full weight-raising period. See the detailed
* comments about this field in bfq_init_icq().
*/
- if (bfqq->bic)
+ if (bfqq->bic != NULL)
bfqq->bic->wr_time_left = 0;
rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
list_add_tail(&rq->queuelist, &bfqq->fifo);
@@ -3566,11 +3418,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
BUG_ON(!bfqq->dispatched);
bfqd->rq_in_driver--;
bfqq->dispatched--;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_completion(bfqq_group(bfqq),
- rq_start_time_ns(rq),
- rq_io_start_time_ns(rq), rq->cmd_flags);
-#endif
if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
bfq_weights_tree_remove(bfqd, &bfqq->entity,
@@ -3615,12 +3462,11 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
bfq_arm_slice_timer(bfqd);
goto out;
} else if (bfq_may_expire_for_budg_timeout(bfqq))
- bfq_bfqq_expire(bfqd, bfqq, false,
- BFQ_BFQQ_BUDGET_TIMEOUT);
+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
(bfqq->dispatched == 0 ||
- !bfq_bfqq_may_idle(bfqq)))
- bfq_bfqq_expire(bfqd, bfqq, false,
+ !bfq_bfqq_must_not_expire(bfqq)))
+ bfq_bfqq_expire(bfqd, bfqq, 0,
BFQ_BFQQ_NO_MORE_REQUESTS);
}
@@ -3631,7 +3477,7 @@ out:
return;
}
-static int __bfq_may_queue(struct bfq_queue *bfqq)
+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
{
if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
bfq_clear_bfqq_must_alloc(bfqq);
@@ -3655,11 +3501,11 @@ static int bfq_may_queue(struct request_queue *q, int rw)
* 'may queue' if that fails.
*/
bic = bfq_bic_lookup(bfqd, tsk->io_context);
- if (!bic)
+ if (bic == NULL)
return ELV_MQUEUE_MAY;
bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
- if (bfqq)
+ if (bfqq != NULL)
return __bfq_may_queue(bfqq);
return ELV_MQUEUE_MAY;
@@ -3672,7 +3518,7 @@ static void bfq_put_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
- if (bfqq) {
+ if (bfqq != NULL) {
const int rw = rq_data_dir(rq);
BUG_ON(!bfqq->allocated[rw]);
@@ -3724,24 +3570,25 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
const int rw = rq_data_dir(rq);
const int is_sync = rq_is_sync(rq);
struct bfq_queue *bfqq;
+ struct bfq_group *bfqg;
unsigned long flags;
bool split = false;
might_sleep_if(gfp_mask & __GFP_WAIT);
- bfq_check_ioprio_change(bic, bio);
+ bfq_check_ioprio_change(bic);
spin_lock_irqsave(q->queue_lock, flags);
- if (!bic)
+ if (bic == NULL)
goto queue_fail;
- bfq_bic_update_cgroup(bic, bio);
+ bfqg = bfq_bic_update_cgroup(bic);
new_queue:
bfqq = bic_to_bfqq(bic, is_sync);
- if (!bfqq || bfqq == &bfqd->oom_bfqq) {
- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);
+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
bic_set_bfqq(bic, bfqq, is_sync);
if (split && is_sync) {
if ((bic->was_in_burst_list && bfqd->large_burst) ||
@@ -3837,7 +3684,7 @@ static void bfq_idle_slice_timer(unsigned long data)
* the in-service queue. This can hardly happen, but in the worst
* case we just expire a queue too early.
*/
- if (bfqq) {
+ if (bfqq != NULL) {
bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
if (bfq_bfqq_budget_timeout(bfqq))
/*
@@ -3857,7 +3704,7 @@ static void bfq_idle_slice_timer(unsigned long data)
else
goto schedule_dispatch;
- bfq_bfqq_expire(bfqd, bfqq, true, reason);
+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
}
schedule_dispatch:
@@ -3872,14 +3719,14 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
cancel_work_sync(&bfqd->unplug_work);
}
-static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
struct bfq_queue **bfqq_ptr)
{
struct bfq_group *root_group = bfqd->root_group;
struct bfq_queue *bfqq = *bfqq_ptr;
bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
- if (bfqq) {
+ if (bfqq != NULL) {
bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
bfqq, atomic_read(&bfqq->ref));
@@ -3915,7 +3762,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
spin_lock_irq(q->queue_lock);
- BUG_ON(bfqd->in_service_queue);
+ BUG_ON(bfqd->in_service_queue != NULL);
list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
bfq_deactivate_bfqq(bfqd, bfqq, 0);
@@ -3928,39 +3775,22 @@ static void bfq_exit_queue(struct elevator_queue *e)
BUG_ON(timer_pending(&bfqd->idle_slice_timer));
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- blkcg_deactivate_policy(q, &blkcg_policy_bfq);
-#endif
-
+ bfq_free_root_group(bfqd);
kfree(bfqd);
}
-static void bfq_init_root_group(struct bfq_group *root_group,
- struct bfq_data *bfqd)
-{
- int i;
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- root_group->entity.parent = NULL;
- root_group->my_entity = NULL;
- root_group->bfqd = bfqd;
-#endif
- root_group->rq_pos_tree = RB_ROOT;
- for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
- root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-}
-
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
+ struct bfq_group *bfqg;
struct bfq_data *bfqd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
- if (!eq)
+ if (eq == NULL)
return -ENOMEM;
bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
- if (!bfqd) {
+ if (bfqd == NULL) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
@@ -3973,16 +3803,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
*/
bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
atomic_inc(&bfqd->oom_bfqq.ref);
- bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
- bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+ bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+ bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;
bfqd->oom_bfqq.entity.new_weight =
- bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+ bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio);
/*
* Trigger weight initialization, according to ioprio, at the
* oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
* class won't be changed any more.
*/
- bfqd->oom_bfqq.entity.prio_changed = 1;
+ bfqd->oom_bfqq.entity.ioprio_changed = 1;
bfqd->queue = q;
@@ -3990,12 +3820,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
- bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
- if (!bfqd->root_group)
- goto out_free;
- bfq_init_root_group(bfqd->root_group, bfqd);
+ bfqg = bfq_alloc_root_group(bfqd, q->node);
+ if (bfqg == NULL) {
+ kfree(bfqd);
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+
+ bfqd->root_group = bfqg;
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
bfqd->active_numerous_groups = 0;
#endif
@@ -4003,6 +3837,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
bfqd->idle_slice_timer.data = (unsigned long)bfqd;
+ bfqd->rq_pos_tree = RB_ROOT;
bfqd->queue_weights_tree = RB_ROOT;
bfqd->group_weights_tree = RB_ROOT;
@@ -4060,23 +3895,18 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->device_speed = BFQ_BFQD_FAST;
return 0;
-
-out_free:
- kfree(bfqd);
- kobject_put(&eq->kobj);
- return -ENOMEM;
}
static void bfq_slab_kill(void)
{
- if (bfq_pool)
+ if (bfq_pool != NULL)
kmem_cache_destroy(bfq_pool);
}
static int __init bfq_slab_setup(void)
{
bfq_pool = KMEM_CACHE(bfq_queue, 0);
- if (!bfq_pool)
+ if (bfq_pool == NULL)
return -ENOMEM;
return 0;
}
@@ -4221,7 +4051,7 @@ static ssize_t bfq_weights_store(struct elevator_queue *e,
return count;
}
-static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
{
u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
@@ -4315,9 +4145,6 @@ static struct elevator_type iosched_bfq = {
.elevator_merge_fn = bfq_merge,
.elevator_merged_fn = bfq_merged_request,
.elevator_merge_req_fn = bfq_merged_requests,
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- .elevator_bio_merged_fn = bfq_bio_merged,
-#endif
.elevator_allow_merge_fn = bfq_allow_merge,
.elevator_dispatch_fn = bfq_dispatch_requests,
.elevator_add_req_fn = bfq_insert_request,
@@ -4343,8 +4170,6 @@ static struct elevator_type iosched_bfq = {
static int __init bfq_init(void)
{
- int ret;
-
/*
* Can be 0 on HZ < 1000 setups.
*/
@@ -4354,15 +4179,8 @@ static int __init bfq_init(void)
if (bfq_timeout_async == 0)
bfq_timeout_async = 1;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- ret = blkcg_policy_register(&blkcg_policy_bfq);
- if (ret)
- return ret;
-#endif
-
- ret = -ENOMEM;
if (bfq_slab_setup())
- goto err_pol_unreg;
+ return -ENOMEM;
/*
* Times to load large popular applications for the typical systems
@@ -4381,27 +4199,15 @@ static int __init bfq_init(void)
device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
- ret = elv_register(&iosched_bfq);
- if (ret)
- goto err_pol_unreg;
-
- pr_info("BFQ I/O-scheduler: v7r9");
+ elv_register(&iosched_bfq);
+ pr_info("BFQ I/O-scheduler: v7r8");
return 0;
-
-err_pol_unreg:
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- blkcg_policy_unregister(&blkcg_policy_bfq);
-#endif
- return ret;
}
static void __exit bfq_exit(void)
{
elv_unregister(&iosched_bfq);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- blkcg_policy_unregister(&blkcg_policy_bfq);
-#endif
bfq_slab_kill();
}
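
Editor's note: the bfq-iosched.c hunks above stop deriving the group from the bio's blkcg and instead recover the owning bfq_group from the queue's entity with container_of(), as in bfq_check_ioprio_change(). Below is a minimal, standalone userspace sketch of that pointer recovery; the struct names mirror the patch but are simplified stand-ins, not kernel code.

/* Standalone model of the container_of() step used in the hunk above. */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bfq_sched_data { int dummy; };

struct bfq_group {                      /* parent object */
	int weight;
	struct bfq_sched_data sched_data; /* embedded member */
};

struct bfq_entity {                     /* what a queue actually holds */
	struct bfq_sched_data *sched_data; /* points into some bfq_group */
};

int main(void)
{
	struct bfq_group grp = { .weight = 100 };
	struct bfq_entity ent = { .sched_data = &grp.sched_data };

	/* Same recovery as in bfq_check_ioprio_change(): from the embedded
	 * sched_data pointer back to its enclosing group. */
	struct bfq_group *owner =
		container_of(ent.sched_data, struct bfq_group, sched_data);

	printf("owner weight = %d (same object: %d)\n",
	       owner->weight, owner == &grp);
	return 0;
}
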
diff --git a/block/bfq-sched.c b/block/bfq-sched.c
index 9328a1f09..d0890c6d4 100644
--- a/block/bfq-sched.c
+++ b/block/bfq-sched.c
@@ -10,27 +10,24 @@
* Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
*/
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
#define for_each_entity(entity) \
- for (; entity ; entity = entity->parent)
+ for (; entity != NULL; entity = entity->parent)
#define for_each_entity_safe(entity, parent) \
for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
-
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
int extract,
struct bfq_data *bfqd);
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-
-static void bfq_update_budget(struct bfq_entity *next_in_service)
+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
{
struct bfq_entity *bfqg_entity;
struct bfq_group *bfqg;
struct bfq_sched_data *group_sd;
- BUG_ON(!next_in_service);
+ BUG_ON(next_in_service == NULL);
group_sd = next_in_service->sched_data;
@@ -41,7 +38,7 @@ static void bfq_update_budget(struct bfq_entity *next_in_service)
* as it must never become an in-service entity.
*/
bfqg_entity = bfqg->my_entity;
- if (bfqg_entity)
+ if (bfqg_entity != NULL)
bfqg_entity->budget = next_in_service->budget;
}
@@ -49,7 +46,7 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd)
{
struct bfq_entity *next_in_service;
- if (sd->in_service_entity)
+ if (sd->in_service_entity != NULL)
/* will update/requeue at the end of service */
return 0;
@@ -63,35 +60,35 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd)
next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
sd->next_in_service = next_in_service;
- if (next_in_service)
+ if (next_in_service != NULL)
bfq_update_budget(next_in_service);
return 1;
}
-static void bfq_check_next_in_service(struct bfq_sched_data *sd,
- struct bfq_entity *entity)
+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
+ struct bfq_entity *entity)
{
BUG_ON(sd->next_in_service != entity);
}
#else
#define for_each_entity(entity) \
- for (; entity ; entity = NULL)
+ for (; entity != NULL; entity = NULL)
#define for_each_entity_safe(entity, parent) \
- for (parent = NULL; entity ; entity = parent)
+ for (parent = NULL; entity != NULL; entity = parent)
-static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
{
return 0;
}
-static void bfq_check_next_in_service(struct bfq_sched_data *sd,
- struct bfq_entity *entity)
+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
+ struct bfq_entity *entity)
{
}
-static void bfq_update_budget(struct bfq_entity *next_in_service)
+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
{
}
#endif
@@ -112,18 +109,18 @@ static void bfq_update_budget(struct bfq_entity *next_in_service)
*
* Return @a > @b, dealing with wrapping correctly.
*/
-static int bfq_gt(u64 a, u64 b)
+static inline int bfq_gt(u64 a, u64 b)
{
return (s64)(a - b) > 0;
}
-static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
struct bfq_queue *bfqq = NULL;
- BUG_ON(!entity);
+ BUG_ON(entity == NULL);
- if (!entity->my_sched_data)
+ if (entity->my_sched_data == NULL)
bfqq = container_of(entity, struct bfq_queue, entity);
return bfqq;
@@ -135,7 +132,8 @@ static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
* @service: amount of service.
* @weight: scale factor (weight of an entity or weight sum).
*/
-static u64 bfq_delta(unsigned long service, unsigned long weight)
+static inline u64 bfq_delta(unsigned long service,
+ unsigned long weight)
{
u64 d = (u64)service << WFQ_SERVICE_SHIFT;
@@ -148,7 +146,8 @@ static u64 bfq_delta(unsigned long service, unsigned long weight)
* @entity: the entity to act upon.
* @service: the service to be charged to the entity.
*/
-static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+static inline void bfq_calc_finish(struct bfq_entity *entity,
+ unsigned long service)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
@@ -157,7 +156,7 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
entity->finish = entity->start +
bfq_delta(service, entity->weight);
- if (bfqq) {
+ if (bfqq != NULL) {
bfq_log_bfqq(bfqq->bfqd, bfqq,
"calc_finish: serv %lu, w %d",
service, entity->weight);
@@ -177,11 +176,11 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
* conversion mechanism because, e.g., in the tree walking functions,
* the check for a %NULL value would be redundant.
*/
-static struct bfq_entity *bfq_entity_of(struct rb_node *node)
+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
struct bfq_entity *entity = NULL;
- if (node)
+ if (node != NULL)
entity = rb_entry(node, struct bfq_entity, rb_node);
return entity;
@@ -192,7 +191,8 @@ static struct bfq_entity *bfq_entity_of(struct rb_node *node)
* @root: the tree root.
* @entity: the entity to remove.
*/
-static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
+static inline void bfq_extract(struct rb_root *root,
+ struct bfq_entity *entity)
{
BUG_ON(entity->tree != root);
@@ -225,7 +225,7 @@ static void bfq_idle_extract(struct bfq_service_tree *st,
bfq_extract(&st->idle, entity);
- if (bfqq)
+ if (bfqq != NULL)
list_del(&bfqq->bfqq_list);
}
@@ -243,9 +243,9 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
- BUG_ON(entity->tree);
+ BUG_ON(entity->tree != NULL);
- while (*node) {
+ while (*node != NULL) {
parent = *node;
entry = rb_entry(parent, struct bfq_entity, rb_node);
@@ -271,11 +271,12 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
* that the subtree rooted at @node (which may be its left or its right
* child) has a valid min_start value.
*/
-static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
+static inline void bfq_update_min(struct bfq_entity *entity,
+ struct rb_node *node)
{
struct bfq_entity *child;
- if (node) {
+ if (node != NULL) {
child = rb_entry(node, struct bfq_entity, rb_node);
if (bfq_gt(entity->min_start, child->min_start))
entity->min_start = child->min_start;
@@ -290,7 +291,7 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
* this function updates its min_start value. The left and right subtrees
* are assumed to hold a correct min_start value.
*/
-static void bfq_update_active_node(struct rb_node *node)
+static inline void bfq_update_active_node(struct rb_node *node)
{
struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
@@ -317,12 +318,12 @@ up:
bfq_update_active_node(node);
parent = rb_parent(node);
- if (!parent)
+ if (parent == NULL)
return;
- if (node == parent->rb_left && parent->rb_right)
+ if (node == parent->rb_left && parent->rb_right != NULL)
bfq_update_active_node(parent->rb_right);
- else if (parent->rb_left)
+ else if (parent->rb_left != NULL)
bfq_update_active_node(parent->rb_left);
node = parent;
@@ -354,7 +355,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node = &entity->rb_node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
struct bfq_sched_data *sd = NULL;
struct bfq_group *bfqg = NULL;
struct bfq_data *bfqd = NULL;
@@ -362,22 +363,22 @@ static void bfq_active_insert(struct bfq_service_tree *st,
bfq_insert(&st->active, entity);
- if (node->rb_left)
+ if (node->rb_left != NULL)
node = node->rb_left;
- else if (node->rb_right)
+ else if (node->rb_right != NULL)
node = node->rb_right;
bfq_update_active_tree(node);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
sd = entity->sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
BUG_ON(!bfqg);
bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
- if (bfqq)
+ if (bfqq != NULL)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
else { /* bfq_group */
BUG_ON(!bfqd);
bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
@@ -396,32 +397,31 @@ static void bfq_active_insert(struct bfq_service_tree *st,
* bfq_ioprio_to_weight - calc a weight from an ioprio.
* @ioprio: the ioprio value to convert.
*/
-static unsigned short bfq_ioprio_to_weight(int ioprio)
+static inline unsigned short bfq_ioprio_to_weight(int ioprio)
{
BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
- return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio;
+ return IOPRIO_BE_NR - ioprio;
}
/**
* bfq_weight_to_ioprio - calc an ioprio from a weight.
* @weight: the weight value to convert.
*
- * To preserve as much as possible the old only-ioprio user interface,
+ * To preserve as much as possible the old only-ioprio user interface,
* 0 is used as an escape ioprio value for weights (numerically) equal or
- * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
+ * larger than IOPRIO_BE_NR.
*/
-static unsigned short bfq_weight_to_ioprio(int weight)
+static inline unsigned short bfq_weight_to_ioprio(int weight)
{
BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
- return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?
- 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}
-static void bfq_get_entity(struct bfq_entity *entity)
+static inline void bfq_get_entity(struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- if (bfqq) {
+ if (bfqq != NULL) {
atomic_inc(&bfqq->ref);
bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
bfqq, atomic_read(&bfqq->ref));
@@ -441,15 +441,15 @@ static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
struct rb_node *deepest;
- if (!node->rb_right && !node->rb_left)
+ if (node->rb_right == NULL && node->rb_left == NULL)
deepest = rb_parent(node);
- else if (!node->rb_right)
+ else if (node->rb_right == NULL)
deepest = node->rb_left;
- else if (!node->rb_left)
+ else if (node->rb_left == NULL)
deepest = node->rb_right;
else {
deepest = rb_next(node);
- if (deepest->rb_right)
+ if (deepest->rb_right != NULL)
deepest = deepest->rb_right;
else if (rb_parent(deepest) != node)
deepest = rb_parent(deepest);
@@ -468,7 +468,7 @@ static void bfq_active_extract(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
struct bfq_sched_data *sd = NULL;
struct bfq_group *bfqg = NULL;
struct bfq_data *bfqd = NULL;
@@ -477,18 +477,18 @@ static void bfq_active_extract(struct bfq_service_tree *st,
node = bfq_find_deepest(&entity->rb_node);
bfq_extract(&st->active, entity);
- if (node)
+ if (node != NULL)
bfq_update_active_tree(node);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
sd = entity->sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
BUG_ON(!bfqg);
bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
- if (bfqq)
+ if (bfqq != NULL)
list_del(&bfqq->bfqq_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
else { /* bfq_group */
BUG_ON(!bfqd);
bfq_weights_tree_remove(bfqd, entity,
@@ -519,14 +519,14 @@ static void bfq_idle_insert(struct bfq_service_tree *st,
struct bfq_entity *first_idle = st->first_idle;
struct bfq_entity *last_idle = st->last_idle;
- if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
st->first_idle = entity;
- if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
st->last_idle = entity;
bfq_insert(&st->idle, entity);
- if (bfqq)
+ if (bfqq != NULL)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
@@ -549,7 +549,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
entity->on_st = 0;
st->wsum -= entity->weight;
- if (bfqq) {
+ if (bfqq != NULL) {
sd = entity->sched_data;
bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
bfqq, atomic_read(&bfqq->ref));
@@ -581,7 +581,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st)
struct bfq_entity *first_idle = st->first_idle;
struct bfq_entity *last_idle = st->last_idle;
- if (RB_EMPTY_ROOT(&st->active) && last_idle &&
+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
!bfq_gt(last_idle->finish, st->vtime)) {
/*
* Forget the whole idle tree, increasing the vtime past
@@ -590,7 +590,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st)
st->vtime = last_idle->finish;
}
- if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
bfq_put_idle_entity(st, first_idle);
}
@@ -600,19 +600,19 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
{
struct bfq_service_tree *new_st = old_st;
- if (entity->prio_changed) {
+ if (entity->ioprio_changed) {
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
unsigned short prev_weight, new_weight;
struct bfq_data *bfqd = NULL;
struct rb_root *root;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
struct bfq_sched_data *sd;
struct bfq_group *bfqg;
#endif
- if (bfqq)
+ if (bfqq != NULL)
bfqd = bfqq->bfqd;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
else {
sd = entity->my_sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
@@ -634,14 +634,12 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
BUG();
}
entity->orig_weight = entity->new_weight;
- if (bfqq)
- bfqq->ioprio =
- bfq_weight_to_ioprio(entity->orig_weight);
+ entity->ioprio =
+ bfq_weight_to_ioprio(entity->orig_weight);
}
- if (bfqq)
- bfqq->ioprio_class = bfqq->new_ioprio_class;
- entity->prio_changed = 0;
+ entity->ioprio_class = entity->new_ioprio_class;
+ entity->ioprio_changed = 0;
/*
* NOTE: here we may be changing the weight too early,
@@ -654,7 +652,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
prev_weight = entity->weight;
new_weight = entity->orig_weight *
- (bfqq ? bfqq->wr_coeff : 1);
+ (bfqq != NULL ? bfqq->wr_coeff : 1);
/*
* If the weight of the entity changes, remove the entity
* from its old weight counter (if there is a counter
@@ -685,10 +683,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
return new_st;
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
-#endif
-
/**
* bfq_bfqq_served - update the scheduler status after selection for
* service.
@@ -699,7 +693,7 @@ static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
* are synchronized every time a new bfqq is selected for service. By now,
* we keep it to better check consistency.
*/
-static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
{
struct bfq_entity *entity = &bfqq->entity;
struct bfq_service_tree *st;
@@ -714,10 +708,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
st->vtime += bfq_delta(served, st->wsum);
bfq_forget_idle(st);
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
-#endif
- bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
}
/**
@@ -730,7 +721,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
* budget. In this way we should obtain a sort of time-domain
* fairness among all the seeky/slow queues.
*/
-static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
@@ -755,7 +746,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity)
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
if (entity == sd->in_service_entity) {
- BUG_ON(entity->tree);
+ BUG_ON(entity->tree != NULL);
/*
* If we are requeueing the current entity we have
* to take care of not charging to it service it has
@@ -846,7 +837,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
if (!entity->on_st)
return 0;
- BUG_ON(was_in_service && entity->tree);
+ BUG_ON(was_in_service && entity->tree != NULL);
if (was_in_service) {
bfq_calc_finish(entity, entity->service);
@@ -855,7 +846,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
bfq_active_extract(st, entity);
else if (entity->tree == &st->idle)
bfq_idle_extract(st, entity);
- else if (entity->tree)
+ else if (entity->tree != NULL)
BUG();
if (was_in_service || sd->next_in_service == entity)
@@ -893,7 +884,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
*/
break;
- if (sd->next_in_service)
+ if (sd->next_in_service != NULL)
/*
* The parent entity is still backlogged and
* the budgets on the path towards the root
@@ -962,7 +953,7 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
struct bfq_entity *entry, *first = NULL;
struct rb_node *node = st->active.rb_node;
- while (node) {
+ while (node != NULL) {
entry = rb_entry(node, struct bfq_entity, rb_node);
left:
if (!bfq_gt(entry->start, st->vtime))
@@ -970,7 +961,7 @@ left:
BUG_ON(bfq_gt(entry->min_start, st->vtime));
- if (node->rb_left) {
+ if (node->rb_left != NULL) {
entry = rb_entry(node->rb_left,
struct bfq_entity, rb_node);
if (!bfq_gt(entry->min_start, st->vtime)) {
@@ -978,12 +969,12 @@ left:
goto left;
}
}
- if (first)
+ if (first != NULL)
break;
node = node->rb_right;
}
- BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
return first;
}
@@ -1039,13 +1030,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
struct bfq_entity *entity;
int i = 0;
- BUG_ON(sd->in_service_entity);
+ BUG_ON(sd->in_service_entity != NULL);
- if (bfqd &&
+ if (bfqd != NULL &&
jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
true);
- if (entity) {
+ if (entity != NULL) {
i = BFQ_IOPRIO_CLASSES - 1;
bfqd->bfq_class_idle_last_service = jiffies;
sd->next_in_service = entity;
@@ -1053,7 +1044,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
}
for (; i < BFQ_IOPRIO_CLASSES; i++) {
entity = __bfq_lookup_next_entity(st + i, false);
- if (entity) {
+ if (entity != NULL) {
if (extract) {
bfq_check_next_in_service(sd, entity);
bfq_active_extract(st + i, entity);
@@ -1076,27 +1067,27 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
struct bfq_sched_data *sd;
struct bfq_queue *bfqq;
- BUG_ON(bfqd->in_service_queue);
+ BUG_ON(bfqd->in_service_queue != NULL);
if (bfqd->busy_queues == 0)
return NULL;
sd = &bfqd->root_group->sched_data;
- for (; sd ; sd = entity->my_sched_data) {
+ for (; sd != NULL; sd = entity->my_sched_data) {
entity = bfq_lookup_next_entity(sd, 1, bfqd);
- BUG_ON(!entity);
+ BUG_ON(entity == NULL);
entity->service = 0;
}
bfqq = bfq_entity_to_bfqq(entity);
- BUG_ON(!bfqq);
+ BUG_ON(bfqq == NULL);
return bfqq;
}
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
- if (bfqd->in_service_bic) {
+ if (bfqd->in_service_bic != NULL) {
put_io_context(bfqd->in_service_bic->icq.ioc);
bfqd->in_service_bic = NULL;
}
@@ -1123,10 +1114,6 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_activate_entity(entity);
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
-#endif
-
/*
* Called when the bfqq no longer has requests pending, remove it from
* the service tree.
@@ -1160,10 +1147,6 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_update_dequeue(bfqq_group(bfqq));
-#endif
-
bfq_deactivate_bfqq(bfqd, bfqq, requeue);
}
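
Editor's note: the bfq-sched.c hunk above narrows the ioprio-to-weight mapping from the v7r9 range (IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF) back to the v7r8 one. The sketch below compares the two conversions and the clamped inverse; it assumes IOPRIO_BE_NR == 8 and BFQ_WEIGHT_CONVERSION_COEFF == 10 (per the kernel headers and the bfq.h hunk later in this diff) and is plain userspace C, not kernel code.

/* Worked comparison of the two ioprio <-> weight mappings touched above. */
#include <stdio.h>

#define IOPRIO_BE_NR			8
#define BFQ_WEIGHT_CONVERSION_COEFF	10

static unsigned short weight_v7r8(int ioprio)   /* the '+' side */
{
	return IOPRIO_BE_NR - ioprio;                            /* 8 .. 1  */
}

static unsigned short weight_v7r9(int ioprio)   /* the '-' side */
{
	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; /* 80 .. 73 */
}

static unsigned short ioprio_from_weight_v7r8(int weight)
{
	/* 0 is the escape value for weights >= IOPRIO_BE_NR */
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %u (v7r8) / %u (v7r9)\n",
		       ioprio, weight_v7r8(ioprio), weight_v7r9(ioprio));

	printf("weight 4 -> ioprio %u, weight 100 -> ioprio %u\n",
	       ioprio_from_weight_v7r8(4), ioprio_from_weight_v7r8(100));
	return 0;
}
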
diff --git a/block/bfq.h b/block/bfq.h
index 320c4389b..93d3f6e95 100644
--- a/block/bfq.h
+++ b/block/bfq.h
@@ -1,5 +1,5 @@
/*
- * BFQ-v7r9 for 4.2.0: data structures and common functions prototypes.
+ * BFQ-v7r8 for 4.3.0: data structures and common functions prototypes.
*
* Based on ideas and code from CFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
@@ -17,14 +17,12 @@
#include <linux/hrtimer.h>
#include <linux/ioprio.h>
#include <linux/rbtree.h>
-#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES 3
#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
-#define BFQ_MIN_WEIGHT 1
-#define BFQ_MAX_WEIGHT 1000
-#define BFQ_WEIGHT_CONVERSION_COEFF 10
+#define BFQ_MIN_WEIGHT 1
+#define BFQ_MAX_WEIGHT 1000
#define BFQ_DEFAULT_QUEUE_IOPRIO 4
@@ -119,8 +117,12 @@ struct bfq_weight_counter {
* @ioprio: the ioprio in use.
* @new_weight: when a weight change is requested, the new weight value.
* @orig_weight: original weight, used to implement weight boosting
- * @prio_changed: flag, true when the user requested a weight, ioprio or
- * ioprio_class change.
+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
+ * @ioprio_class: the ioprio_class in use.
+ * @new_ioprio_class: when an ioprio_class change is requested, the new
+ * ioprio_class value.
+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
+ * ioprio_class change.
*
* A bfq_entity is used to represent either a bfq_queue (leaf node in the
* cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
@@ -132,7 +134,7 @@ struct bfq_weight_counter {
* allow different weights on different devices, but this
* functionality is not exported to userspace by now. Priorities and
* weights are updated lazily, first storing the new values into the
- * new_* fields, then setting the @prio_changed flag. As soon as
+ * new_* fields, then setting the @ioprio_changed flag. As soon as
* there is a transition in the entity state that allows the priority
* update to take place the effective and the requested priority
* values are synchronized.
@@ -159,7 +161,7 @@ struct bfq_entity {
u64 min_start;
- int service, budget;
+ unsigned long service, budget;
unsigned short weight, new_weight;
unsigned short orig_weight;
@@ -168,7 +170,10 @@ struct bfq_entity {
struct bfq_sched_data *my_sched_data;
struct bfq_sched_data *sched_data;
- int prio_changed;
+ unsigned short ioprio, new_ioprio;
+ unsigned short ioprio_class, new_ioprio_class;
+
+ int ioprio_changed;
};
struct bfq_group;
@@ -177,14 +182,10 @@ struct bfq_group;
* struct bfq_queue - leaf schedulable entity.
* @ref: reference counter.
* @bfqd: parent bfq_data.
- * @new_ioprio: when an ioprio change is requested, the new ioprio value.
- * @ioprio_class: the ioprio_class in use.
- * @new_ioprio_class: when an ioprio_class change is requested, the new
- * ioprio_class value.
* @new_bfqq: shared bfq_queue if queue is cooperating with
* one or more other queues.
- * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree).
- * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree).
+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
* @sort_list: sorted list of pending requests.
* @next_rq: if fifo isn't expired, next request to serve.
* @queued: nr of requests queued in @sort_list.
@@ -238,9 +239,6 @@ struct bfq_queue {
atomic_t ref;
struct bfq_data *bfqd;
- unsigned short ioprio, new_ioprio;
- unsigned short ioprio_class, new_ioprio_class;
-
/* fields for cooperating queues handling */
struct bfq_queue *new_bfqq;
struct rb_node pos_node;
@@ -255,7 +253,7 @@ struct bfq_queue {
struct bfq_entity entity;
- int max_budget;
+ unsigned long max_budget;
unsigned long budget_timeout;
int dispatched;
@@ -304,8 +302,6 @@ struct bfq_ttime {
* @icq: associated io_cq structure
* @bfqq: array of two process queues, the sync and the async
* @ttime: associated @bfq_ttime struct
- * @ioprio: per (request_queue, blkcg) ioprio.
- * @blkcg_id: id of the blkcg the related io_cq belongs to.
* @wr_time_left: snapshot of the time left before weight raising ends
* for the sync queue associated to this process; this
* snapshot is taken to remember this value while the weight
@@ -333,10 +329,6 @@ struct bfq_io_cq {
struct bfq_ttime ttime;
int ioprio;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- uint64_t blkcg_id; /* the current blkcg ID */
-#endif
-
unsigned int wr_time_left;
bool saved_idle_window;
bool saved_IO_bound;
@@ -357,6 +349,9 @@ enum bfq_device_speed {
* struct bfq_data - per device data structure.
* @queue: request queue for the managed device.
* @root_group: root bfq_group for the device.
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ * determining if two or more queues have interleaving
+ * requests (see bfq_close_cooperator()).
* @active_numerous_groups: number of bfq_groups containing more than one
* active @bfq_entity.
* @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
@@ -490,8 +485,9 @@ struct bfq_data {
struct request_queue *queue;
struct bfq_group *root_group;
+ struct rb_root rq_pos_tree;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#ifdef CONFIG_CGROUP_BFQIO
int active_numerous_groups;
#endif
@@ -524,7 +520,7 @@ struct bfq_data {
ktime_t last_idling_start;
int peak_rate_samples;
u64 peak_rate;
- int bfq_max_budget;
+ unsigned long bfq_max_budget;
struct hlist_head group_list;
struct list_head active_list;
@@ -536,8 +532,8 @@ struct bfq_data {
unsigned int bfq_slice_idle;
u64 bfq_class_idle_last_service;
- int bfq_user_max_budget;
- int bfq_max_budget_async_rq;
+ unsigned int bfq_user_max_budget;
+ unsigned int bfq_max_budget_async_rq;
unsigned int bfq_timeout[2];
unsigned int bfq_coop_thresh;
@@ -597,15 +593,15 @@ enum bfqq_state_flags {
};
#define BFQ_BFQQ_FNS(name) \
-static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
{ \
(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
} \
-static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
{ \
(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
} \
-static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
{ \
return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
}
@@ -644,64 +640,14 @@ enum bfqq_expiration {
BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
};
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-struct bfqg_stats {
- /* total bytes transferred */
- struct blkg_rwstat service_bytes;
- /* total IOs serviced, post merge */
- struct blkg_rwstat serviced;
- /* number of ios merged */
- struct blkg_rwstat merged;
- /* total time spent on device in ns, may not be accurate w/ queueing */
- struct blkg_rwstat service_time;
- /* total time spent waiting in scheduler queue in ns */
- struct blkg_rwstat wait_time;
- /* number of IOs queued up */
- struct blkg_rwstat queued;
- /* total sectors transferred */
- struct blkg_stat sectors;
- /* total disk time and nr sectors dispatched by this group */
- struct blkg_stat time;
- /* time not charged to this cgroup */
- struct blkg_stat unaccounted_time;
- /* sum of number of ios queued across all samples */
- struct blkg_stat avg_queue_size_sum;
- /* count of samples taken for average */
- struct blkg_stat avg_queue_size_samples;
- /* how many times this group has been removed from service tree */
- struct blkg_stat dequeue;
- /* total time spent waiting for it to be assigned a timeslice. */
- struct blkg_stat group_wait_time;
- /* time spent idling for this blkcg_gq */
- struct blkg_stat idle_time;
- /* total time with empty current active q with other requests queued */
- struct blkg_stat empty_time;
- /* fields after this shouldn't be cleared on stat reset */
- uint64_t start_group_wait_time;
- uint64_t start_idle_time;
- uint64_t start_empty_time;
- uint16_t flags;
-};
-
-/*
- * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
- *
- * @ps: @blkcg_policy_storage that this structure inherits
- * @weight: weight of the bfq_group
- */
-struct bfq_group_data {
- /* must be the first member */
- struct blkcg_policy_data pd;
-
- unsigned short weight;
-};
-
+#ifdef CONFIG_CGROUP_BFQIO
/**
* struct bfq_group - per (device, cgroup) data structure.
* @entity: schedulable entity to insert into the parent group sched_data.
* @sched_data: own sched_data, to contain child entities (they may be
* both bfq_queues and bfq_groups).
+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
+ * list of the containing cgroup's bfqio_cgroup.
* @bfqd_node: node to be inserted into the @bfqd->group_list list
* of the groups active on the same device; used for cleanup.
* @bfqd: the bfq_data for the device this group acts upon.
@@ -717,26 +663,23 @@ struct bfq_group_data {
* are groups with more than one active @bfq_entity
* (see the comments to the function
* bfq_bfqq_must_not_expire()).
- * @rq_pos_tree: rbtree sorted by next_request position, used when
- * determining if two or more queues have interleaving
- * requests (see bfq_find_close_cooperator()).
*
* Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
* there is a set of bfq_groups, each one collecting the lower-level
* entities belonging to the group that are acting on the same device.
*
* Locking works as follows:
+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
+ * via RCU from its readers.
* o @bfqd is protected by the queue lock, RCU is used to access it
* from the readers.
* o All the other fields are protected by the @bfqd queue lock.
*/
struct bfq_group {
- /* must be the first member */
- struct blkg_policy_data pd;
-
struct bfq_entity entity;
struct bfq_sched_data sched_data;
+ struct hlist_node group_node;
struct hlist_node bfqd_node;
void *bfqd;
@@ -747,33 +690,44 @@ struct bfq_group {
struct bfq_entity *my_entity;
int active_entities;
+};
- struct rb_root rq_pos_tree;
+/**
+ * struct bfqio_cgroup - bfq cgroup data structure.
+ * @css: subsystem state for bfq in the containing cgroup.
+ * @online: flag marked when the subsystem is inserted.
+ * @weight: cgroup weight.
+ * @ioprio: cgroup ioprio.
+ * @ioprio_class: cgroup ioprio_class.
+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
+ * @group_data: list containing the bfq_group belonging to this cgroup.
+ *
+ * @group_data is accessed using RCU, with @lock protecting the updates,
+ * @ioprio and @ioprio_class are protected by @lock.
+ */
+struct bfqio_cgroup {
+ struct cgroup_subsys_state css;
+ bool online;
- struct bfqg_stats stats;
- struct bfqg_stats dead_stats; /* stats pushed from dead children */
-};
+ unsigned short weight, ioprio, ioprio_class;
+ spinlock_t lock;
+ struct hlist_head group_data;
+};
#else
struct bfq_group {
struct bfq_sched_data sched_data;
struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
struct bfq_queue *async_idle_bfqq;
-
- struct rb_root rq_pos_tree;
};
#endif
-static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
-
-static struct bfq_service_tree *
+static inline struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
struct bfq_sched_data *sched_data = entity->sched_data;
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :
- BFQ_DEFAULT_GRP_CLASS;
+ unsigned int idx = entity->ioprio_class - 1;
BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
BUG_ON(sched_data == NULL);
@@ -781,18 +735,19 @@ bfq_entity_service_tree(struct bfq_entity *entity)
return sched_data->service_tree + idx;
}
-static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
+ bool is_sync)
{
return bic->bfqq[is_sync];
}
-static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
- bool is_sync)
+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
+ struct bfq_queue *bfqq, bool is_sync)
{
bic->bfqq[is_sync] = bfqq;
}
-static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
return bic->icq.q->elevator->elevator_data;
}
@@ -811,7 +766,8 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
* the function returns NULL, with the queue unlocked, otherwise it
* returns the dereferenced pointer, with the queue locked.
*/
-static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags)
+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
+ unsigned long *flags)
{
struct bfq_data *bfqd;
@@ -820,9 +776,7 @@ static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags)
if (bfqd != NULL) {
spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
- if (ptr == NULL)
- printk(KERN_CRIT "get_bfqd_locked pointer NULL\n");
- else if (*ptr == bfqd)
+ if (*ptr == bfqd)
goto out;
spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
}
@@ -833,37 +787,17 @@ out:
return bfqd;
}
-static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)
+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
+ unsigned long *flags)
{
spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- if (!group_entity)
- group_entity = &bfqq->bfqd->root_group->entity;
-
- return container_of(group_entity, struct bfq_group, entity);
-}
-
-#else
-
-static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- return bfqq->bfqd->root_group;
-}
-
-#endif
-
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic);
static void bfq_put_queue(struct bfq_queue *bfqq);
static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
- struct bio *bio, int is_sync,
+ struct bfq_group *bfqg, int is_sync,
struct bfq_io_cq *bic, gfp_t gfp_mask);
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
struct bfq_group *bfqg);
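
Editor's note: with the bfq.h changes above, the ioprio/ioprio_class fields move from bfq_queue into bfq_entity, so bfq_entity_service_tree() can index the per-class trees directly with entity->ioprio_class - 1. A tiny model of that indexing follows; the class values (RT=1, BE=2, IDLE=3) follow include/linux/ioprio.h, and everything else is an illustrative stand-in.

/* Minimal model of the service-tree indexing done by
 * bfq_entity_service_tree() after this patch. */
#include <stdio.h>
#include <assert.h>

#define BFQ_IOPRIO_CLASSES 3

enum { IOPRIO_CLASS_RT = 1, IOPRIO_CLASS_BE = 2, IOPRIO_CLASS_IDLE = 3 };

struct bfq_service_tree { const char *name; };

struct bfq_sched_data {
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
};

struct bfq_entity {
	unsigned short ioprio_class;
	struct bfq_sched_data *sched_data;
};

static struct bfq_service_tree *
entity_service_tree(struct bfq_entity *entity)
{
	unsigned int idx = entity->ioprio_class - 1; /* RT->0, BE->1, IDLE->2 */

	assert(idx < BFQ_IOPRIO_CLASSES);
	return entity->sched_data->service_tree + idx;
}

int main(void)
{
	struct bfq_sched_data sd = {
		.service_tree = { { "rt" }, { "be" }, { "idle" } },
	};
	struct bfq_entity e = { .ioprio_class = IOPRIO_CLASS_BE,
				.sched_data = &sd };

	printf("BE entity lands on the \"%s\" tree\n",
	       entity_service_tree(&e)->name);
	return 0;
}
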
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 719b7152a..14b8faf8b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -140,6 +140,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt;
+ if (bip->bip_vcnt &&
+ bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+ &bip->bip_vec[bip->bip_vcnt - 1], offset))
+ return 0;
+
iv->bv_page = page;
iv->bv_len = len;
iv->bv_offset = offset;
@@ -355,13 +360,12 @@ static void bio_integrity_verify_fn(struct work_struct *work)
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- int error;
- error = bio_integrity_process(bio, bi->verify_fn);
+ bio->bi_error = bio_integrity_process(bio, bi->verify_fn);
/* Restore original bio completion handler */
bio->bi_end_io = bip->bip_end_io;
- bio_endio(bio, error);
+ bio_endio(bio);
}
/**
@@ -376,7 +380,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
* in process context. This function postpones completion
* accordingly.
*/
-void bio_integrity_endio(struct bio *bio, int error)
+void bio_integrity_endio(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
@@ -386,9 +390,9 @@ void bio_integrity_endio(struct bio *bio, int error)
* integrity metadata. Restore original bio end_io handler
* and run it.
*/
- if (error) {
+ if (bio->bi_error) {
bio->bi_end_io = bip->bip_end_io;
- bio_endio(bio, error);
+ bio_endio(bio);
return;
}
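
Editor's note: the bio-integrity.c hunks adopt the newer completion convention in which the status is carried in bio->bi_error and bio_endio() no longer takes an error argument. The toy model below shows how a completion callback reads the status under that convention; 'struct bio' here is a deliberately simplified stand-in, not the kernel's definition.

/* Toy model of the bi_error / bio_endio() convention switched to above. */
#include <stdio.h>

struct bio {
	int bi_error;                    /* 0 on success, -Exxx on failure */
	void (*bi_end_io)(struct bio *); /* no error argument any more */
	void *bi_private;
};

static void bio_endio(struct bio *bio)
{
	if (bio->bi_end_io)
		bio->bi_end_io(bio);     /* callback inspects bio->bi_error */
}

static void my_end_io(struct bio *bio)
{
	printf("completed with status %d\n", bio->bi_error);
}

int main(void)
{
	struct bio bio = { .bi_end_io = my_end_io };

	bio.bi_error = 0;                /* old style: bio_endio(bio, 0)    */
	bio_endio(&bio);

	bio.bi_error = -5;               /* old style: bio_endio(bio, -EIO) */
	bio_endio(&bio);
	return 0;
}
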
diff --git a/block/bio.c b/block/bio.c
index d6e5ba339..ad3f276d7 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -269,7 +269,6 @@ static void bio_free(struct bio *bio)
void bio_init(struct bio *bio)
{
memset(bio, 0, sizeof(*bio));
- bio->bi_flags = 1 << BIO_UPTODATE;
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
}
@@ -292,14 +291,17 @@ void bio_reset(struct bio *bio)
__bio_free(bio);
memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags | (1 << BIO_UPTODATE);
+ bio->bi_flags = flags;
atomic_set(&bio->__bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);
-static void bio_chain_endio(struct bio *bio, int error)
+static void bio_chain_endio(struct bio *bio)
{
- bio_endio(bio->bi_private, error);
+ struct bio *parent = bio->bi_private;
+
+ parent->bi_error = bio->bi_error;
+ bio_endio(parent);
bio_put(bio);
}
@@ -309,7 +311,7 @@ static void bio_chain_endio(struct bio *bio, int error)
*/
static inline void bio_inc_remaining(struct bio *bio)
{
- bio->bi_flags |= (1 << BIO_CHAIN);
+ bio_set_flag(bio, BIO_CHAIN);
smp_mb__before_atomic();
atomic_inc(&bio->__bi_remaining);
}
@@ -493,7 +495,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
if (unlikely(!bvl))
goto err_free;
- bio->bi_flags |= 1 << BIO_OWNS_VEC;
+ bio_set_flag(bio, BIO_OWNS_VEC);
} else if (nr_iovecs) {
bvl = bio->bi_inline_vecs;
}
@@ -578,7 +580,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
* so we don't set nor calculate new physical/hw segment counts here
*/
bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_flags |= 1 << BIO_CLONED;
+ bio_set_flag(bio, BIO_CLONED);
bio->bi_rw = bio_src->bi_rw;
bio->bi_iter = bio_src->bi_iter;
bio->bi_io_vec = bio_src->bi_io_vec;
@@ -692,31 +694,22 @@ integrity_clone:
EXPORT_SYMBOL(bio_clone_bioset);
/**
- * bio_get_nr_vecs - return approx number of vecs
- * @bdev: I/O target
+ * bio_add_pc_page - attempt to add page to bio
+ * @q: the target queue
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
*
- * Return the approximate number of pages we can send to this target.
- * There's no guarantee that you will be able to fit this number of pages
- * into a bio, it does not account for dynamic restrictions that vary
- * on offset.
+ * Attempt to add a page to the bio_vec maplist. This can fail for a
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ *
+ * This should only be used by REQ_PC bios.
*/
-int bio_get_nr_vecs(struct block_device *bdev)
-{
- struct request_queue *q = bdev_get_queue(bdev);
- int nr_pages;
-
- nr_pages = min_t(unsigned,
- queue_max_segments(q),
- queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
-
- return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
-
-}
-EXPORT_SYMBOL(bio_get_nr_vecs);
-
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors)
+int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
+ *page, unsigned int len, unsigned int offset)
{
int retried_segments = 0;
struct bio_vec *bvec;
@@ -727,7 +720,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (unlikely(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
+ if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
return 0;
/*
@@ -740,28 +733,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
- unsigned int prev_bv_len = prev->bv_len;
prev->bv_len += len;
-
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- /* prev_bvec is already charged in
- bi_size, discharge it in order to
- simulate merging updated prev_bvec
- as new bvec. */
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_iter.bi_sector,
- .bi_size = bio->bi_iter.bi_size -
- prev_bv_len,
- .bi_rw = bio->bi_rw,
- };
-
- if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
- prev->bv_len -= len;
- return 0;
- }
- }
-
bio->bi_iter.bi_size += len;
goto done;
}
@@ -770,8 +742,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) &&
- bvec_gap_to_prev(prev, offset))
+ if (bvec_gap_to_prev(q, prev, offset))
return 0;
}
@@ -804,30 +775,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
blk_recount_segments(q, bio);
}
- /*
- * if queue has other restrictions (eg varying max sector size
- * depending on offset), it can specify a merge_bvec_fn in the
- * queue to get further control
- */
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_iter.bi_sector,
- .bi_size = bio->bi_iter.bi_size - len,
- .bi_rw = bio->bi_rw,
- };
-
- /*
- * merge_bvec_fn() returns number of bytes it can accept
- * at this offset
- */
- if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
- goto failed;
- }
-
/* If we may be able to merge these biovecs, force a recount */
if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+ bio_clear_flag(bio, BIO_SEG_VALID);
done:
return len;
@@ -841,28 +791,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
blk_recount_segments(q, bio);
return 0;
}
-
-/**
- * bio_add_pc_page - attempt to add page to bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- *
- * This should only be used by REQ_PC bios.
- */
-int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- return __bio_add_page(q, bio, page, len, offset,
- queue_max_hw_sectors(q));
-}
EXPORT_SYMBOL(bio_add_pc_page);
/**
@@ -872,22 +800,47 @@ EXPORT_SYMBOL(bio_add_pc_page);
* @len: vec entry length
* @offset: vec entry offset
*
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
+ * Attempt to add a page to the bio_vec maplist. This will only fail
+ * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
*/
-int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
- unsigned int offset)
+int bio_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- unsigned int max_sectors;
+ struct bio_vec *bv;
+
+ /*
+ * cloned bio must not modify vec list
+ */
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+ return 0;
- max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
- if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size)
- max_sectors = len >> 9;
+ /*
+ * For filesystems with a blocksize smaller than the pagesize
+ * we will often be called with the same page as last time and
+ * a consecutive offset. Optimize this special case.
+ */
+ if (bio->bi_vcnt > 0) {
+ bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
- return __bio_add_page(q, bio, page, len, offset, max_sectors);
+ if (page == bv->bv_page &&
+ offset == bv->bv_offset + bv->bv_len) {
+ bv->bv_len += len;
+ goto done;
+ }
+ }
+
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return 0;
+
+ bv = &bio->bi_io_vec[bio->bi_vcnt];
+ bv->bv_page = page;
+ bv->bv_len = len;
+ bv->bv_offset = offset;
+
+ bio->bi_vcnt++;
+done:
+ bio->bi_iter.bi_size += len;
+ return len;
}
EXPORT_SYMBOL(bio_add_page);
@@ -896,11 +849,11 @@ struct submit_bio_ret {
int error;
};
-static void submit_bio_wait_endio(struct bio *bio, int error)
+static void submit_bio_wait_endio(struct bio *bio)
{
struct submit_bio_ret *ret = bio->bi_private;
- ret->error = error;
+ ret->error = bio->bi_error;
complete(&ret->event);
}
@@ -1388,7 +1341,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
if (iter->type & WRITE)
bio->bi_rw |= REQ_WRITE;
- bio->bi_flags |= (1 << BIO_USER_MAPPED);
+ bio_set_flag(bio, BIO_USER_MAPPED);
/*
* subtle -- if __bio_map_user() ended up bouncing a bio,
@@ -1445,7 +1398,7 @@ void bio_unmap_user(struct bio *bio)
}
EXPORT_SYMBOL(bio_unmap_user);
-static void bio_map_kern_endio(struct bio *bio, int err)
+static void bio_map_kern_endio(struct bio *bio)
{
bio_put(bio);
}
@@ -1501,13 +1454,13 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
}
EXPORT_SYMBOL(bio_map_kern);
-static void bio_copy_kern_endio(struct bio *bio, int err)
+static void bio_copy_kern_endio(struct bio *bio)
{
bio_free_pages(bio);
bio_put(bio);
}
-static void bio_copy_kern_endio_read(struct bio *bio, int err)
+static void bio_copy_kern_endio_read(struct bio *bio)
{
char *p = bio->bi_private;
struct bio_vec *bvec;
@@ -1518,7 +1471,7 @@ static void bio_copy_kern_endio_read(struct bio *bio, int err)
p += bvec->bv_len;
}
- bio_copy_kern_endio(bio, err);
+ bio_copy_kern_endio(bio);
}
/**
@@ -1768,7 +1721,7 @@ static inline bool bio_remaining_done(struct bio *bio)
BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
if (atomic_dec_and_test(&bio->__bi_remaining)) {
- clear_bit(BIO_CHAIN, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_CHAIN);
return true;
}
@@ -1778,25 +1731,15 @@ static inline bool bio_remaining_done(struct bio *bio)
/**
* bio_endio - end I/O on a bio
* @bio: bio
- * @error: error, if any
*
* Description:
- * bio_endio() will end I/O on the whole bio. bio_endio() is the
- * preferred way to end I/O on a bio, it takes care of clearing
- * BIO_UPTODATE on error. @error is 0 on success, and and one of the
- * established -Exxxx (-EIO, for instance) error values in case
- * something went wrong. No one should call bi_end_io() directly on a
- * bio unless they own it and thus know that it has an end_io
- * function.
+ * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
+ * way to end I/O on a bio. No one should call bi_end_io() directly on a
+ * bio unless they own it and thus know that it has an end_io function.
**/
-void bio_endio(struct bio *bio, int error)
+void bio_endio(struct bio *bio)
{
while (bio) {
- if (error)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
-
if (unlikely(!bio_remaining_done(bio)))
break;
@@ -1810,11 +1753,12 @@ void bio_endio(struct bio *bio, int error)
*/
if (bio->bi_end_io == bio_chain_endio) {
struct bio *parent = bio->bi_private;
+ parent->bi_error = bio->bi_error;
bio_put(bio);
bio = parent;
} else {
if (bio->bi_end_io)
- bio->bi_end_io(bio, error);
+ bio->bi_end_io(bio);
bio = NULL;
}
}
@@ -1882,7 +1826,7 @@ void bio_trim(struct bio *bio, int offset, int size)
if (offset == 0 && size == bio->bi_iter.bi_size)
return;
- clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
bio_advance(bio, offset << 9);
@@ -2046,7 +1990,7 @@ int bio_associate_current(struct bio *bio)
get_io_context_active(ioc);
bio->bi_ioc = ioc;
- bio->bi_css = task_get_css(current, blkio_cgrp_id);
+ bio->bi_css = task_get_css(current, io_cgrp_id);
return 0;
}
EXPORT_SYMBOL_GPL(bio_associate_current);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9cc48d1d7..55512dd62 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -24,6 +24,7 @@
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
+#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
@@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg)
return;
for (i = 0; i < BLKCG_MAX_POLS; i++)
- kfree(blkg->pd[i]);
+ if (blkg->pd[i])
+ blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
- blk_exit_rl(&blkg->rl);
+ if (blkg->blkcg != &blkcg_root)
+ blk_exit_rl(&blkg->rl);
+
+ blkg_rwstat_exit(&blkg->stat_ios);
+ blkg_rwstat_exit(&blkg->stat_bytes);
kfree(blkg);
}
@@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (!blkg)
return NULL;
+ if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
+ blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+ goto err_free;
+
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
blkg->blkcg = blkcg;
@@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
continue;
/* alloc per-policy data and attach it to blkg */
- pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+ pd = pol->pd_alloc_fn(gfp_mask, q->node);
if (!pd)
goto err_free;
@@ -129,26 +139,11 @@ err_free:
return NULL;
}
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state. If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
- bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+ struct request_queue *q, bool update_hint)
{
struct blkcg_gq *blkg;
- blkg = rcu_dereference(blkcg->blkg_hint);
- if (blkg && blkg->q == q)
- return blkg;
-
/*
* Hint didn't match. Look up from the radix tree. Note that the
* hint can only be updated under queue_lock as otherwise @blkg
@@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
return NULL;
}
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair. This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- if (unlikely(blk_queue_bypass(q)))
- return NULL;
- return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
/*
* If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return.
+ * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
*/
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct request_queue *q,
@@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* blkg holds a reference to blkcg */
if (!css_tryget_online(&blkcg->css)) {
- ret = -EINVAL;
+ ret = -ENODEV;
goto err_free_blkg;
}
wb_congested = wb_congested_get_create(&q->backing_dev_info,
- blkcg->css.id, GFP_ATOMIC);
+ blkcg->css.id, GFP_NOWAIT);
if (!wb_congested) {
ret = -ENOMEM;
goto err_put_css;
@@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* allocate */
if (!new_blkg) {
- new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+ new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
goto err_put_congested;
@@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
- ret = -EINVAL;
+ ret = -ENODEV;
goto err_put_congested;
}
blkg_get(blkg->parent);
@@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_init_fn)
- pol->pd_init_fn(blkg);
+ pol->pd_init_fn(blkg->pd[i]);
}
/* insert */
@@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_online_fn)
- pol->pd_online_fn(blkg);
+ pol->pd_online_fn(blkg->pd[i]);
}
}
blkg->online = true;
@@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
* we shouldn't allow anything to go through for a bypassing queue.
*/
if (unlikely(blk_queue_bypass(q)))
- return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
+ return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
blkg = __blkg_lookup(blkcg, q, true);
if (blkg)
@@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
return blkg;
}
}
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
static void blkg_destroy(struct blkcg_gq *blkg)
{
struct blkcg *blkcg = blkg->blkcg;
+ struct blkcg_gq *parent = blkg->parent;
int i;
lockdep_assert_held(blkg->q->queue_lock);
@@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_offline_fn)
- pol->pd_offline_fn(blkg);
+ pol->pd_offline_fn(blkg->pd[i]);
+ }
+
+ if (parent) {
+ blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
+ blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
}
+
blkg->online = false;
radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -403,15 +386,6 @@ static void blkg_destroy_all(struct request_queue *q)
void __blkg_release_rcu(struct rcu_head *rcu_head)
{
struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
- int i;
-
- /* tell policies that this one is being freed */
- for (i = 0; i < BLKCG_MAX_POLS; i++) {
- struct blkcg_policy *pol = blkcg_policy[i];
-
- if (blkg->pd[i] && pol->pd_exit_fn)
- pol->pd_exit_fn(blkg);
- }
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
@@ -475,12 +449,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
* anyway. If you get hit by a race, retry.
*/
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+ blkg_rwstat_reset(&blkg->stat_bytes);
+ blkg_rwstat_reset(&blkg->stat_ios);
+
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
- if (blkcg_policy_enabled(blkg->q, pol) &&
- pol->pd_reset_stats_fn)
- pol->pd_reset_stats_fn(blkg);
+ if (blkg->pd[i] && pol->pd_reset_stats_fn)
+ pol->pd_reset_stats_fn(blkg->pd[i]);
}
}
@@ -489,13 +465,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
return 0;
}
-static const char *blkg_dev_name(struct blkcg_gq *blkg)
+const char *blkg_dev_name(struct blkcg_gq *blkg)
{
/* some drivers (floppy) instantiate a queue w/o disk registered */
if (blkg->q->backing_dev_info.dev)
return dev_name(blkg->q->backing_dev_info.dev);
return NULL;
}
+EXPORT_SYMBOL_GPL(blkg_dev_name);
/**
* blkcg_print_blkgs - helper for printing per-blkg data
@@ -584,9 +561,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
for (i = 0; i < BLKG_RWSTAT_NR; i++)
seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
- (unsigned long long)rwstat->cnt[i]);
+ (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
- v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+ v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
return v;
}
@@ -623,31 +601,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
+static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+
+ return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_bytes.
+ * cftype->private must be set to the blkcg_policy.
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+ offsetof(struct blkcg_gq, stat_bytes), true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+ offsetof(struct blkcg_gq, stat_ios), true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd,
+ int off)
+{
+ struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+ NULL, off);
+ return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat_field_recursive,
+ (void *)seq_cft(sf)->private,
+ offsetof(struct blkcg_gq, stat_bytes), true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat_field_recursive,
+ (void *)seq_cft(sf)->private,
+ offsetof(struct blkcg_gq, stat_ios), true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
+
/**
* blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts. The caller must be holding the
+ * queue lock for online tests.
*
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum. The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
*/
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol, int off)
{
- struct blkcg_policy *pol = blkcg_policy[pd->plid];
struct blkcg_gq *pos_blkg;
struct cgroup_subsys_state *pos_css;
u64 sum = 0;
- lockdep_assert_held(pd->blkg->q->queue_lock);
+ lockdep_assert_held(blkg->q->queue_lock);
rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
- struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
- struct blkg_stat *stat = (void *)pos_pd + off;
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct blkg_stat *stat;
+
+ if (!pos_blkg->online)
+ continue;
+
+ if (pol)
+ stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+ else
+ stat = (void *)blkg + off;
- if (pos_blkg->online)
- sum += blkg_stat_read(stat);
+ sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
}
rcu_read_unlock();
@@ -657,37 +726,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
/**
* blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts. The caller must be holding the
+ * queue lock for online tests.
*
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum. The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
*/
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
- int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol, int off)
{
- struct blkcg_policy *pol = blkcg_policy[pd->plid];
struct blkcg_gq *pos_blkg;
struct cgroup_subsys_state *pos_css;
struct blkg_rwstat sum = { };
int i;
- lockdep_assert_held(pd->blkg->q->queue_lock);
+ lockdep_assert_held(blkg->q->queue_lock);
rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
- struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
- struct blkg_rwstat *rwstat = (void *)pos_pd + off;
- struct blkg_rwstat tmp;
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct blkg_rwstat *rwstat;
if (!pos_blkg->online)
continue;
- tmp = blkg_rwstat_read(rwstat);
+ if (pol)
+ rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+ else
+ rwstat = (void *)pos_blkg + off;
for (i = 0; i < BLKG_RWSTAT_NR; i++)
- sum.cnt[i] += tmp.cnt[i];
+ atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
+ &sum.aux_cnt[i]);
}
rcu_read_unlock();
@@ -703,29 +778,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
* @ctx: blkg_conf_ctx to be filled
*
* Parse per-blkg config update from @input and initialize @ctx with the
- * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value. This function returns with RCU read lock and queue lock held and
- * must be paired with blkg_conf_finish().
+ * result. @ctx->blkg points to the blkg to be updated and @ctx->body the
+ * part of @input following MAJ:MIN. This function returns with RCU read
+ * lock and queue lock held and must be paired with blkg_conf_finish().
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- const char *input, struct blkg_conf_ctx *ctx)
+ char *input, struct blkg_conf_ctx *ctx)
__acquires(rcu) __acquires(disk->queue->queue_lock)
{
struct gendisk *disk;
struct blkcg_gq *blkg;
unsigned int major, minor;
- unsigned long long v;
- int part, ret;
+ int key_len, part, ret;
+ char *body;
- if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+ if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
return -EINVAL;
+ body = input + key_len;
+ if (!isspace(*body))
+ return -EINVAL;
+ body = skip_spaces(body);
+
disk = get_gendisk(MKDEV(major, minor), &part);
if (!disk)
- return -EINVAL;
+ return -ENODEV;
if (part) {
put_disk(disk);
- return -EINVAL;
+ return -ENODEV;
}
rcu_read_lock();
@@ -734,7 +814,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (blkcg_policy_enabled(disk->queue, pol))
blkg = blkg_lookup_create(blkcg, disk->queue);
else
- blkg = ERR_PTR(-EINVAL);
+ blkg = ERR_PTR(-EOPNOTSUPP);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
@@ -756,7 +836,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
ctx->disk = disk;
ctx->blkg = blkg;
- ctx->v = v;
+ ctx->body = body;
return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
@@ -777,8 +857,55 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct blkcg_gq *blkg;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ const char *dname;
+ struct blkg_rwstat rwstat;
+ u64 rbytes, wbytes, rios, wios;
+
+ dname = blkg_dev_name(blkg);
+ if (!dname)
+ continue;
+
+ spin_lock_irq(blkg->q->queue_lock);
+
+ rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes));
+ rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+ wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+ offsetof(struct blkcg_gq, stat_ios));
+ rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+ wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ spin_unlock_irq(blkg->q->queue_lock);
+
+ if (rbytes || wbytes || rios || wios)
+ seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+ dname, rbytes, wbytes, rios, wios);
+ }
+
+ rcu_read_unlock();
+ return 0;
+}
+
struct cftype blkcg_files[] = {
{
+ .name = "stat",
+ .seq_show = blkcg_print_stat,
+ },
+ { } /* terminate */
+};
+
+struct cftype blkcg_legacy_files[] = {
+ {
.name = "reset_stats",
.write_u64 = blkcg_reset_stats,
},
@@ -825,18 +952,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
struct blkcg *blkcg = css_to_blkcg(css);
+ int i;
mutex_lock(&blkcg_pol_mutex);
+
list_del(&blkcg->all_blkcgs_node);
- mutex_unlock(&blkcg_pol_mutex);
- if (blkcg != &blkcg_root) {
- int i;
+ for (i = 0; i < BLKCG_MAX_POLS; i++)
+ if (blkcg->cpd[i])
+ blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
- for (i = 0; i < BLKCG_MAX_POLS; i++)
- kfree(blkcg->pd[i]);
- kfree(blkcg);
- }
+ mutex_unlock(&blkcg_pol_mutex);
+
+ kfree(blkcg);
}
static struct cgroup_subsys_state *
@@ -850,13 +978,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
if (!parent_css) {
blkcg = &blkcg_root;
- goto done;
- }
-
- blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
- if (!blkcg) {
- ret = ERR_PTR(-ENOMEM);
- goto free_blkcg;
+ } else {
+ blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+ if (!blkcg) {
+ ret = ERR_PTR(-ENOMEM);
+ goto free_blkcg;
+ }
}
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
@@ -869,23 +996,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
* check if the policy requires any specific per-cgroup
* data: if it does, allocate and initialize it.
*/
- if (!pol || !pol->cpd_size)
+ if (!pol || !pol->cpd_alloc_fn)
continue;
- BUG_ON(blkcg->pd[i]);
- cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+ cpd = pol->cpd_alloc_fn(GFP_KERNEL);
if (!cpd) {
ret = ERR_PTR(-ENOMEM);
goto free_pd_blkcg;
}
- blkcg->pd[i] = cpd;
+ blkcg->cpd[i] = cpd;
+ cpd->blkcg = blkcg;
cpd->plid = i;
- pol->cpd_init_fn(blkcg);
+ if (pol->cpd_init_fn)
+ pol->cpd_init_fn(cpd);
}
-done:
spin_lock_init(&blkcg->lock);
- INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
+ INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -897,7 +1024,8 @@ done:
free_pd_blkcg:
for (i--; i >= 0; i--)
- kfree(blkcg->pd[i]);
+ if (blkcg->cpd[i])
+ blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
free_blkcg:
kfree(blkcg);
mutex_unlock(&blkcg_pol_mutex);
@@ -941,7 +1069,7 @@ int blkcg_init_queue(struct request_queue *q)
radix_tree_preload_end();
if (IS_ERR(blkg)) {
- kfree(new_blkg);
+ blkg_free(new_blkg);
return PTR_ERR(blkg);
}
@@ -1018,12 +1146,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
return ret;
}
-struct cgroup_subsys blkio_cgrp_subsys = {
+static void blkcg_bind(struct cgroup_subsys_state *root_css)
+{
+ int i;
+
+ mutex_lock(&blkcg_pol_mutex);
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+ struct blkcg *blkcg;
+
+ if (!pol || !pol->cpd_bind_fn)
+ continue;
+
+ list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
+ if (blkcg->cpd[pol->plid])
+ pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
+ }
+ mutex_unlock(&blkcg_pol_mutex);
+}
+
+struct cgroup_subsys io_cgrp_subsys = {
.css_alloc = blkcg_css_alloc,
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
- .legacy_cftypes = blkcg_files,
+ .bind = blkcg_bind,
+ .dfl_cftypes = blkcg_files,
+ .legacy_cftypes = blkcg_legacy_files,
+ .legacy_name = "blkio",
#ifdef CONFIG_MEMCG
/*
* This ensures that, if available, memcg is automatically enabled
@@ -1033,7 +1184,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
.depends_on = 1 << memory_cgrp_id,
#endif
};
-EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
+EXPORT_SYMBOL_GPL(io_cgrp_subsys);
/**
* blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1054,65 +1205,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol)
{
- LIST_HEAD(pds);
+ struct blkg_policy_data *pd_prealloc = NULL;
struct blkcg_gq *blkg;
- struct blkg_policy_data *pd, *nd;
- int cnt = 0, ret;
+ int ret;
if (blkcg_policy_enabled(q, pol))
return 0;
- /* count and allocate policy_data for all existing blkgs */
blk_queue_bypass_start(q);
- spin_lock_irq(q->queue_lock);
- list_for_each_entry(blkg, &q->blkg_list, q_node)
- cnt++;
- spin_unlock_irq(q->queue_lock);
-
- /* allocate per-blkg policy data for all existing blkgs */
- while (cnt--) {
- pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
- if (!pd) {
+pd_prealloc:
+ if (!pd_prealloc) {
+ pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
+ if (!pd_prealloc) {
ret = -ENOMEM;
- goto out_free;
+ goto out_bypass_end;
}
- list_add_tail(&pd->alloc_node, &pds);
}
- /*
- * Install the allocated pds and cpds. With @q bypassing, no new blkg
- * should have been created while the queue lock was dropped.
- */
spin_lock_irq(q->queue_lock);
list_for_each_entry(blkg, &q->blkg_list, q_node) {
- if (WARN_ON(list_empty(&pds))) {
- /* umm... this shouldn't happen, just abort */
- ret = -ENOMEM;
- goto out_unlock;
- }
- pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
- list_del_init(&pd->alloc_node);
+ struct blkg_policy_data *pd;
- /* grab blkcg lock too while installing @pd on @blkg */
- spin_lock(&blkg->blkcg->lock);
+ if (blkg->pd[pol->plid])
+ continue;
+
+ pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+ if (!pd)
+ swap(pd, pd_prealloc);
+ if (!pd) {
+ spin_unlock_irq(q->queue_lock);
+ goto pd_prealloc;
+ }
blkg->pd[pol->plid] = pd;
pd->blkg = blkg;
pd->plid = pol->plid;
- pol->pd_init_fn(blkg);
-
- spin_unlock(&blkg->blkcg->lock);
+ if (pol->pd_init_fn)
+ pol->pd_init_fn(pd);
}
__set_bit(pol->plid, q->blkcg_pols);
ret = 0;
-out_unlock:
+
spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
blk_queue_bypass_end(q);
- list_for_each_entry_safe(pd, nd, &pds, alloc_node)
- kfree(pd);
+ if (pd_prealloc)
+ pol->pd_free_fn(pd_prealloc);
return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1142,13 +1282,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
/* grab blkcg lock too while removing @pd from @blkg */
spin_lock(&blkg->blkcg->lock);
- if (pol->pd_offline_fn)
- pol->pd_offline_fn(blkg);
- if (pol->pd_exit_fn)
- pol->pd_exit_fn(blkg);
-
- kfree(blkg->pd[pol->plid]);
- blkg->pd[pol->plid] = NULL;
+ if (blkg->pd[pol->plid]) {
+ if (pol->pd_offline_fn)
+ pol->pd_offline_fn(blkg->pd[pol->plid]);
+ pol->pd_free_fn(blkg->pd[pol->plid]);
+ blkg->pd[pol->plid] = NULL;
+ }
spin_unlock(&blkg->blkcg->lock);
}
@@ -1170,9 +1309,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg *blkcg;
int i, ret;
- if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
- return -EINVAL;
-
mutex_lock(&blkcg_pol_register_mutex);
mutex_lock(&blkcg_pol_mutex);
@@ -1189,36 +1325,42 @@ int blkcg_policy_register(struct blkcg_policy *pol)
blkcg_policy[pol->plid] = pol;
/* allocate and install cpd's */
- if (pol->cpd_size) {
+ if (pol->cpd_alloc_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
struct blkcg_policy_data *cpd;
- cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+ cpd = pol->cpd_alloc_fn(GFP_KERNEL);
if (!cpd) {
mutex_unlock(&blkcg_pol_mutex);
goto err_free_cpds;
}
- blkcg->pd[pol->plid] = cpd;
+ blkcg->cpd[pol->plid] = cpd;
+ cpd->blkcg = blkcg;
cpd->plid = pol->plid;
- pol->cpd_init_fn(blkcg);
+ pol->cpd_init_fn(cpd);
}
}
mutex_unlock(&blkcg_pol_mutex);
/* everything is in place, add intf files for the new policy */
- if (pol->cftypes)
- WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
- pol->cftypes));
+ if (pol->dfl_cftypes)
+ WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+ pol->dfl_cftypes));
+ if (pol->legacy_cftypes)
+ WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
+ pol->legacy_cftypes));
mutex_unlock(&blkcg_pol_register_mutex);
return 0;
err_free_cpds:
- if (pol->cpd_size) {
+ if (pol->cpd_alloc_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- kfree(blkcg->pd[pol->plid]);
- blkcg->pd[pol->plid] = NULL;
+ if (blkcg->cpd[pol->plid]) {
+ pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+ blkcg->cpd[pol->plid] = NULL;
+ }
}
}
blkcg_policy[pol->plid] = NULL;
@@ -1245,16 +1387,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
goto out_unlock;
/* kill the intf files first */
- if (pol->cftypes)
- cgroup_rm_cftypes(pol->cftypes);
+ if (pol->dfl_cftypes)
+ cgroup_rm_cftypes(pol->dfl_cftypes);
+ if (pol->legacy_cftypes)
+ cgroup_rm_cftypes(pol->legacy_cftypes);
/* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex);
- if (pol->cpd_size) {
+ if (pol->cpd_alloc_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- kfree(blkcg->pd[pol->plid]);
- blkcg->pd[pol->plid] = NULL;
+ if (blkcg->cpd[pol->plid]) {
+ pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+ blkcg->cpd[pol->plid] = NULL;
+ }
}
}
blkcg_policy[pol->plid] = NULL;
diff --git a/block/blk-core.c b/block/blk-core.c
index d1a63defd..48a6cc8df 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -145,18 +145,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
unsigned int nbytes, int error)
{
if (error)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
+ bio->bi_error = error;
if (unlikely(rq->cmd_flags & REQ_QUIET))
- set_bit(BIO_QUIET, &bio->bi_flags);
+ bio_set_flag(bio, BIO_QUIET);
bio_advance(bio, nbytes);
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
- bio_endio(bio, error);
+ bio_endio(bio);
}
void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -580,7 +578,7 @@ void blk_cleanup_queue(struct request_queue *q)
q->queue_lock = &q->__queue_lock;
spin_unlock_irq(lock);
- bdi_destroy(&q->backing_dev_info);
+ bdi_unregister(&q->backing_dev_info);
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
@@ -647,6 +645,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (q->id < 0)
goto fail_q;
+ q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!q->bio_split)
+ goto fail_id;
+
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
@@ -655,7 +657,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
err = bdi_init(&q->backing_dev_info);
if (err)
- goto fail_id;
+ goto fail_split;
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
@@ -697,6 +699,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
fail_bdi:
bdi_destroy(&q->backing_dev_info);
+fail_split:
+ bioset_free(q->bio_split);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
fail_q:
@@ -1614,6 +1618,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
struct request *req;
unsigned int request_count = 0;
+ blk_queue_split(q, &bio, q->bio_split);
+
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
@@ -1622,7 +1628,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_endio(bio, -EIO);
+ bio->bi_error = -EIO;
+ bio_endio(bio);
return;
}
@@ -1675,7 +1682,8 @@ get_rq:
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
- bio_endio(bio, PTR_ERR(req)); /* @q is dead */
+ bio->bi_error = PTR_ERR(req);
+ bio_endio(bio);
goto out_unlock;
}
@@ -1834,15 +1842,6 @@ generic_make_request_checks(struct bio *bio)
goto end_io;
}
- if (likely(bio_is_rw(bio) &&
- nr_sectors > queue_max_hw_sectors(q))) {
- printk(KERN_ERR "bio too big device %s (%u > %u)\n",
- bdevname(bio->bi_bdev, b),
- bio_sectors(bio),
- queue_max_hw_sectors(q));
- goto end_io;
- }
-
part = bio->bi_bdev->bd_part;
if (should_fail_request(part, bio->bi_iter.bi_size) ||
should_fail_request(&part_to_disk(part)->part0,
@@ -1891,14 +1890,15 @@ generic_make_request_checks(struct bio *bio)
*/
create_io_context(GFP_ATOMIC, q->node);
- if (blk_throtl_bio(q, bio))
- return false; /* throttled, will be resubmitted later */
+ if (!blkcg_bio_issue_check(q, bio))
+ return false;
trace_block_bio_queue(q, bio);
return true;
end_io:
- bio_endio(bio, err);
+ bio->bi_error = err;
+ bio_endio(bio);
return false;
}
@@ -1991,7 +1991,7 @@ void submit_bio(int rw, struct bio *bio)
bio->bi_rw |= rw;
if (unlikely(trap_non_toi_io))
- BUG_ON(!(bio->bi_flags & (1 << BIO_TOI)));
+ BUG_ON(!bio_flagged(bio, BIO_TOI));
/*
* If it's a regular read/write or a barrier with data attached,
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index f548b64be..75f29cf70 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -204,6 +204,9 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
q->limits.max_integrity_segments)
return false;
+ if (integrity_req_gap_back_merge(req, next->bio))
+ return false;
+
return true;
}
EXPORT_SYMBOL(blk_integrity_merge_rq);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 7688ee3f5..9ebf65379 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -11,16 +11,16 @@
struct bio_batch {
atomic_t done;
- unsigned long flags;
+ int error;
struct completion *wait;
};
-static void bio_batch_end_io(struct bio *bio, int err)
+static void bio_batch_end_io(struct bio *bio)
{
struct bio_batch *bb = bio->bi_private;
- if (err && (err != -EOPNOTSUPP))
- clear_bit(BIO_UPTODATE, &bb->flags);
+ if (bio->bi_error && bio->bi_error != -EOPNOTSUPP)
+ bb->error = bio->bi_error;
if (atomic_dec_and_test(&bb->done))
complete(bb->wait);
bio_put(bio);
@@ -43,7 +43,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD;
- unsigned int max_discard_sectors, granularity;
+ unsigned int granularity;
int alignment;
struct bio_batch bb;
struct bio *bio;
@@ -60,17 +60,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
granularity = max(q->limits.discard_granularity >> 9, 1U);
alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
- /*
- * Ensure that max_discard_sectors is of the proper
- * granularity, so that requests stay aligned after a split.
- */
- max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
- max_discard_sectors -= max_discard_sectors % granularity;
- if (unlikely(!max_discard_sectors)) {
- /* Avoid infinite loop below. Being cautious never hurts. */
- return -EOPNOTSUPP;
- }
-
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secdiscard(q))
return -EOPNOTSUPP;
@@ -78,7 +67,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
atomic_set(&bb.done, 1);
- bb.flags = 1 << BIO_UPTODATE;
+ bb.error = 0;
bb.wait = &wait;
blk_start_plug(&plug);
@@ -92,7 +81,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
break;
}
- req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+ /* Make sure bi_size doesn't overflow */
+ req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
/*
* If splitting a request, and the next starting sector would be
@@ -134,9 +124,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
- if (!test_bit(BIO_UPTODATE, &bb.flags))
- ret = -EIO;
-
+ if (bb.error)
+ return bb.error;
return ret;
}
EXPORT_SYMBOL(blkdev_issue_discard);
@@ -166,13 +155,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
- max_write_same_sectors = q->limits.max_write_same_sectors;
-
- if (max_write_same_sectors == 0)
- return -EOPNOTSUPP;
+ /* Ensure that max_write_same_sectors doesn't overflow bi_size */
+ max_write_same_sectors = UINT_MAX >> 9;
atomic_set(&bb.done, 1);
- bb.flags = 1 << BIO_UPTODATE;
+ bb.error = 0;
bb.wait = &wait;
while (nr_sects) {
@@ -208,9 +195,8 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
- if (!test_bit(BIO_UPTODATE, &bb.flags))
- ret = -ENOTSUPP;
-
+ if (bb.error)
+ return bb.error;
return ret;
}
EXPORT_SYMBOL(blkdev_issue_write_same);
@@ -236,7 +222,7 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
DECLARE_COMPLETION_ONSTACK(wait);
atomic_set(&bb.done, 1);
- bb.flags = 1 << BIO_UPTODATE;
+ bb.error = 0;
bb.wait = &wait;
ret = 0;
@@ -270,10 +256,8 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
- if (!test_bit(BIO_UPTODATE, &bb.flags))
- /* One of bios in the batch was completed with error.*/
- ret = -EIO;
-
+ if (bb.error)
+ return bb.error;
return ret;
}
diff --git a/block/blk-map.c b/block/blk-map.c
index da310a105..f565e11f4 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -9,6 +9,24 @@
#include "blk.h"
+static bool iovec_gap_to_prv(struct request_queue *q,
+ struct iovec *prv, struct iovec *cur)
+{
+ unsigned long prev_end;
+
+ if (!queue_virt_boundary(q))
+ return false;
+
+ if (prv->iov_base == NULL && prv->iov_len == 0)
+ /* prv is not set - don't check */
+ return false;
+
+ prev_end = (unsigned long)(prv->iov_base + prv->iov_len);
+
+ return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) ||
+ prev_end & queue_virt_boundary(q));
+}
+
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio)
{
@@ -67,7 +85,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct bio *bio;
int unaligned = 0;
struct iov_iter i;
- struct iovec iov;
+ struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
if (!iter || !iter->count)
return -EINVAL;
@@ -81,8 +99,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
/*
* Keep going so we check length of all segments
*/
- if (uaddr & queue_dma_alignment(q))
+ if ((uaddr & queue_dma_alignment(q)) ||
+ iovec_gap_to_prv(q, &prv, &iov))
unaligned = 1;
+
+ prv.iov_base = iov.iov_base;
+ prv.iov_len = iov.iov_len;
}
if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
@@ -94,7 +116,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
return PTR_ERR(bio);
if (map_data && map_data->null_mapped)
- bio->bi_flags |= (1 << BIO_NULL_MAPPED);
+ bio_set_flag(bio, BIO_NULL_MAPPED);
if (bio->bi_iter.bi_size != iter->count) {
/*
@@ -103,7 +125,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
* normal IO completion path
*/
bio_get(bio);
- bio_endio(bio, 0);
+ bio_endio(bio);
__blk_rq_unmap_user(bio);
return -EINVAL;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 30a0d9f89..c4e9c37f3 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,12 +9,134 @@
#include "blk.h"
+static struct bio *blk_bio_discard_split(struct request_queue *q,
+ struct bio *bio,
+ struct bio_set *bs)
+{
+ unsigned int max_discard_sectors, granularity;
+ int alignment;
+ sector_t tmp;
+ unsigned split_sectors;
+
+ /* Zero-sector (unknown) and one-sector granularities are the same. */
+ granularity = max(q->limits.discard_granularity >> 9, 1U);
+
+ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+ max_discard_sectors -= max_discard_sectors % granularity;
+
+ if (unlikely(!max_discard_sectors)) {
+ /* XXX: warn */
+ return NULL;
+ }
+
+ if (bio_sectors(bio) <= max_discard_sectors)
+ return NULL;
+
+ split_sectors = max_discard_sectors;
+
+ /*
+ * If the next starting sector would be misaligned, stop the discard at
+ * the previous aligned sector.
+ */
+ alignment = (q->limits.discard_alignment >> 9) % granularity;
+
+ tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
+ tmp = sector_div(tmp, granularity);
+
+ if (split_sectors > tmp)
+ split_sectors -= tmp;
+
+ return bio_split(bio, split_sectors, GFP_NOIO, bs);
+}
+
+static struct bio *blk_bio_write_same_split(struct request_queue *q,
+ struct bio *bio,
+ struct bio_set *bs)
+{
+ if (!q->limits.max_write_same_sectors)
+ return NULL;
+
+ if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
+ return NULL;
+
+ return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
+}
+
+static struct bio *blk_bio_segment_split(struct request_queue *q,
+ struct bio *bio,
+ struct bio_set *bs)
+{
+ struct bio_vec bv, bvprv, *bvprvp = NULL;
+ struct bvec_iter iter;
+ unsigned seg_size = 0, nsegs = 0, sectors = 0;
+
+ bio_for_each_segment(bv, bio, iter) {
+ if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q))
+ goto split;
+
+ /*
+ * If the queue doesn't support SG gaps and adding this
+ * offset would create a gap, disallow it.
+ */
+ if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+ goto split;
+
+ if (bvprvp && blk_queue_cluster(q)) {
+ if (seg_size + bv.bv_len > queue_max_segment_size(q))
+ goto new_segment;
+ if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv))
+ goto new_segment;
+ if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv))
+ goto new_segment;
+
+ seg_size += bv.bv_len;
+ bvprv = bv;
+ bvprvp = &bv;
+ sectors += bv.bv_len >> 9;
+ continue;
+ }
+new_segment:
+ if (nsegs == queue_max_segments(q))
+ goto split;
+
+ nsegs++;
+ bvprv = bv;
+ bvprvp = &bv;
+ seg_size = bv.bv_len;
+ sectors += bv.bv_len >> 9;
+ }
+
+ return NULL;
+split:
+ return bio_split(bio, sectors, GFP_NOIO, bs);
+}
+
+void blk_queue_split(struct request_queue *q, struct bio **bio,
+ struct bio_set *bs)
+{
+ struct bio *split;
+
+ if ((*bio)->bi_rw & REQ_DISCARD)
+ split = blk_bio_discard_split(q, *bio, bs);
+ else if ((*bio)->bi_rw & REQ_WRITE_SAME)
+ split = blk_bio_write_same_split(q, *bio, bs);
+ else
+ split = blk_bio_segment_split(q, *bio, q->bio_split);
+
+ if (split) {
+ bio_chain(split, *bio);
+ generic_make_request(*bio);
+ *bio = split;
+ }
+}
+EXPORT_SYMBOL(blk_queue_split);
+
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
struct bio *bio,
bool no_sg_merge)
{
struct bio_vec bv, bvprv = { NULL };
- int cluster, high, highprv = 1;
+ int cluster, prev = 0;
unsigned int seg_size, nr_phys_segs;
struct bio *fbio, *bbio;
struct bvec_iter iter;
@@ -36,7 +158,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
cluster = blk_queue_cluster(q);
seg_size = 0;
nr_phys_segs = 0;
- high = 0;
for_each_bio(bio) {
bio_for_each_segment(bv, bio, iter) {
/*
@@ -46,13 +167,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
if (no_sg_merge)
goto new_segment;
- /*
- * the trick here is making sure that a high page is
- * never considered part of another segment, since
- * that might change with the bounce page.
- */
- high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
- if (!high && !highprv && cluster) {
+ if (prev && cluster) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
goto new_segment;
@@ -72,8 +187,8 @@ new_segment:
nr_phys_segs++;
bvprv = bv;
+ prev = 1;
seg_size = bv.bv_len;
- highprv = high;
}
bbio = bio;
}
@@ -116,7 +231,7 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
bio->bi_next = nxt;
}
- bio->bi_flags |= (1 << BIO_SEG_VALID);
+ bio_set_flag(bio, BIO_SEG_VALID);
}
EXPORT_SYMBOL(blk_recount_segments);
@@ -266,7 +381,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
if (rq->cmd_flags & REQ_WRITE)
memset(q->dma_drain_buffer, 0, q->dma_drain_size);
- sg->page_link &= ~0x02;
+ sg_unmark_end(sg);
sg = sg_next(sg);
sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
q->dma_drain_size,
@@ -312,6 +427,11 @@ no_merge:
int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
+ if (req_gap_back_merge(req, bio))
+ return 0;
+ if (blk_integrity_rq(req) &&
+ integrity_req_gap_back_merge(req, bio))
+ return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req)) {
req->cmd_flags |= REQ_NOMERGE;
@@ -330,6 +450,12 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
+
+ if (req_gap_front_merge(req, bio))
+ return 0;
+ if (blk_integrity_rq(req) &&
+ integrity_req_gap_front_merge(req, bio))
+ return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req)) {
req->cmd_flags |= REQ_NOMERGE;
@@ -356,14 +482,6 @@ static bool req_no_special_merge(struct request *req)
return !q->mq_ops && req->special;
}
-static int req_gap_to_prev(struct request *req, struct request *next)
-{
- struct bio *prev = req->biotail;
-
- return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1],
- next->bio->bi_io_vec[0].bv_offset);
-}
-
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
@@ -378,8 +496,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (req_no_special_merge(req) || req_no_special_merge(next))
return 0;
- if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) &&
- req_gap_to_prev(req, next))
+ if (req_gap_back_merge(req, next->bio))
return 0;
/*
@@ -564,8 +681,6 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
- struct request_queue *q = rq->q;
-
if (!rq_mergeable(rq) || !bio_mergeable(bio))
return false;
@@ -589,15 +704,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
!blk_write_same_mergeable(rq->bio, bio))
return false;
- /* Only check gaps if the bio carries data */
- if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
- struct bio_vec *bprev;
-
- bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
- if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset))
- return false;
- }
-
return true;
}
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 1e28ddb65..8764c241e 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
return cpu;
}
-int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
+int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+ const struct cpumask *online_mask)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
@@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
cpumask_clear(cpus);
nr_cpus = nr_uniq_cpus = 0;
- for_each_online_cpu(i) {
+ for_each_cpu(i, online_mask) {
nr_cpus++;
first_sibling = get_first_sibling(i);
if (!cpumask_test_cpu(first_sibling, cpus))
@@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
queue = 0;
for_each_possible_cpu(i) {
- if (!cpu_online(i)) {
+ if (!cpumask_test_cpu(i, online_mask)) {
map[i] = 0;
continue;
}
@@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
if (!map)
return NULL;
- if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
+ if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
return map;
kfree(map);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 279c5d674..788fffd9b 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -229,8 +229,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
unsigned int i, first = 1;
ssize_t ret = 0;
- blk_mq_disable_hotplug();
-
for_each_cpu(i, hctx->cpumask) {
if (first)
ret += sprintf(ret + page, "%u", i);
@@ -240,8 +238,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
first = 0;
}
- blk_mq_enable_hotplug();
-
ret += sprintf(ret + page, "\n");
return ret;
}
@@ -343,7 +339,7 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
struct blk_mq_ctx *ctx;
int i;
- if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+ if (!hctx->nr_ctx)
return;
hctx_for_each_ctx(hctx, ctx, i)
@@ -358,7 +354,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
struct blk_mq_ctx *ctx;
int i, ret;
- if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+ if (!hctx->nr_ctx)
return 0;
ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
@@ -381,6 +377,8 @@ void blk_mq_unregister_disk(struct gendisk *disk)
struct blk_mq_ctx *ctx;
int i, j;
+ blk_mq_disable_hotplug();
+
queue_for_each_hw_ctx(q, hctx, i) {
blk_mq_unregister_hctx(hctx);
@@ -395,6 +393,9 @@ void blk_mq_unregister_disk(struct gendisk *disk)
kobject_put(&q->mq_kobj);
kobject_put(&disk_to_dev(disk)->kobj);
+
+ q->mq_sysfs_init_done = false;
+ blk_mq_enable_hotplug();
}
static void blk_mq_sysfs_init(struct request_queue *q)
@@ -425,27 +426,30 @@ int blk_mq_register_disk(struct gendisk *disk)
struct blk_mq_hw_ctx *hctx;
int ret, i;
+ blk_mq_disable_hotplug();
+
blk_mq_sysfs_init(q);
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
- return ret;
+ goto out;
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
queue_for_each_hw_ctx(q, hctx, i) {
- hctx->flags |= BLK_MQ_F_SYSFS_UP;
ret = blk_mq_register_hctx(hctx);
if (ret)
break;
}
- if (ret) {
+ if (ret)
blk_mq_unregister_disk(disk);
- return ret;
- }
+ else
+ q->mq_sysfs_init_done = true;
+out:
+ blk_mq_enable_hotplug();
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_register_disk);
@@ -454,6 +458,9 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
+ if (!q->mq_sysfs_init_done)
+ return;
+
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
}
@@ -463,6 +470,9 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
+ if (!q->mq_sysfs_init_done)
+ return ret;
+
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9115c6d59..ec2d11915 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -471,17 +471,30 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
}
EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
-void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv)
{
- struct blk_mq_tags *tags = hctx->tags;
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ struct blk_mq_tags *tags = hctx->tags;
+
+ /*
+		 * If no software queues are currently mapped to this
+		 * hardware queue, there's nothing to check.
+ */
+ if (!blk_mq_hw_queue_mapped(hctx))
+ continue;
+
+ if (tags->nr_reserved_tags)
+ bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
+ bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+ false);
+ }
- if (tags->nr_reserved_tags)
- bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
- bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
- false);
}
-EXPORT_SYMBOL(blk_mq_tag_busy_iter);
static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
{
@@ -628,6 +641,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags)
{
bt_free(&tags->bitmap_tags);
bt_free(&tags->breserved_tags);
+ free_cpumask_var(tags->cpumask);
kfree(tags);
}
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 9eb2cf4f0..d468a79f2 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -58,6 +58,8 @@ extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+ void *priv);
enum {
BLK_MQ_TAG_CACHE_MIN = 1,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c69902695..85f014327 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -393,14 +393,16 @@ void __blk_mq_complete_request(struct request *rq)
* Ends all I/O on a request. It does not handle partial completions.
* The actual completion happens out-of-order, through a IPI handler.
**/
-void blk_mq_complete_request(struct request *rq)
+void blk_mq_complete_request(struct request *rq, int error)
{
struct request_queue *q = rq->q;
if (unlikely(blk_should_fake_timeout(q)))
return;
- if (!blk_mark_rq_complete(rq))
+ if (!blk_mark_rq_complete(rq)) {
+ rq->errors = error;
__blk_mq_complete_request(rq);
+ }
}
EXPORT_SYMBOL(blk_mq_complete_request);
@@ -616,10 +618,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
* If a request wasn't started before the queue was
* marked dying, kill it here or it'll go unnoticed.
*/
- if (unlikely(blk_queue_dying(rq->q))) {
- rq->errors = -EIO;
- blk_mq_complete_request(rq);
- }
+ if (unlikely(blk_queue_dying(rq->q)))
+ blk_mq_complete_request(rq, -EIO);
return;
}
if (rq->cmd_flags & REQ_NO_TIMEOUT)
@@ -641,24 +641,16 @@ static void blk_mq_rq_timer(unsigned long priv)
.next = 0,
.next_set = 0,
};
- struct blk_mq_hw_ctx *hctx;
int i;
- queue_for_each_hw_ctx(q, hctx, i) {
- /*
- * If not software queues are currently mapped to this
- * hardware queue, there's nothing to check
- */
- if (!blk_mq_hw_queue_mapped(hctx))
- continue;
-
- blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
- }
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.next_set) {
data.next = blk_rq_timeout(round_jiffies_up(data.next));
mod_timer(&q->timeout, data.next);
} else {
+ struct blk_mq_hw_ctx *hctx;
+
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
@@ -1185,7 +1177,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
struct blk_mq_alloc_data alloc_data;
if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
return NULL;
}
@@ -1269,10 +1261,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
return;
}
+ blk_queue_split(q, &bio, q->bio_split);
+
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
return;
@@ -1354,10 +1348,12 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
return;
}
+ blk_queue_split(q, &bio, q->bio_split);
+
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, &request_count, NULL))
return;
@@ -1785,13 +1781,19 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
-static void blk_mq_map_swqueue(struct request_queue *q)
+static void blk_mq_map_swqueue(struct request_queue *q,
+ const struct cpumask *online_mask)
{
unsigned int i;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
+ /*
+	 * Avoid others reading an incomplete hctx->cpumask through sysfs
+ */
+ mutex_lock(&q->sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -1802,7 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/
queue_for_each_ctx(q, ctx, i) {
/* If the cpu isn't online, the cpu is mapped to first hctx */
- if (!cpu_online(i))
+ if (!cpumask_test_cpu(i, online_mask))
continue;
hctx = q->mq_ops->map_queue(q, i);
@@ -1811,6 +1813,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx->ctxs[hctx->nr_ctx++] = ctx;
}
+ mutex_unlock(&q->sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_ctxmap *map = &hctx->ctx_map;
@@ -1848,7 +1852,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}
queue_for_each_ctx(q, ctx, i) {
- if (!cpu_online(i))
+ if (!cpumask_test_cpu(i, online_mask))
continue;
hctx = q->mq_ops->map_queue(q, i);
@@ -1921,6 +1925,9 @@ void blk_mq_release(struct request_queue *q)
kfree(hctx);
}
+ kfree(q->mq_map);
+ q->mq_map = NULL;
+
kfree(q->queue_hw_ctx);
/* ctx kobj stays in queue_ctx */
@@ -2030,13 +2037,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
if (blk_mq_init_hw_queues(q, set))
goto err_hctxs;
+ get_online_cpus();
mutex_lock(&all_q_mutex);
- list_add_tail(&q->all_q_node, &all_q_list);
- mutex_unlock(&all_q_mutex);
+ list_add_tail(&q->all_q_node, &all_q_list);
blk_mq_add_queue_tag_set(set, q);
+ blk_mq_map_swqueue(q, cpu_online_mask);
- blk_mq_map_swqueue(q);
+ mutex_unlock(&all_q_mutex);
+ put_online_cpus();
return q;
@@ -2060,30 +2069,27 @@ void blk_mq_free_queue(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
+ mutex_lock(&all_q_mutex);
+ list_del_init(&q->all_q_node);
+ mutex_unlock(&all_q_mutex);
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
blk_mq_free_hw_queues(q, set);
percpu_ref_exit(&q->mq_usage_counter);
-
- kfree(q->mq_map);
-
- q->mq_map = NULL;
-
- mutex_lock(&all_q_mutex);
- list_del_init(&q->all_q_node);
- mutex_unlock(&all_q_mutex);
}
/* Basically redo blk_mq_init_queue with queue frozen */
-static void blk_mq_queue_reinit(struct request_queue *q)
+static void blk_mq_queue_reinit(struct request_queue *q,
+ const struct cpumask *online_mask)
{
WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
blk_mq_sysfs_unregister(q);
- blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
+ blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
/*
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
@@ -2091,7 +2097,7 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 * involves freeing and re-allocating memory, worth doing?)
*/
- blk_mq_map_swqueue(q);
+ blk_mq_map_swqueue(q, online_mask);
blk_mq_sysfs_register(q);
}
@@ -2100,16 +2106,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
unsigned long action, void *hcpu)
{
struct request_queue *q;
+ int cpu = (unsigned long)hcpu;
+ /*
+ * New online cpumask which is going to be set in this hotplug event.
+	 * Declare this cpumask as global, as cpu-hotplug operations are invoked
+	 * one-by-one and dynamically allocating it could result in a failure.
+ */
+ static struct cpumask online_new;
/*
- * Before new mappings are established, hotadded cpu might already
- * start handling requests. This doesn't break anything as we map
- * offline CPUs to first hardware queue. We will re-init the queue
- * below to get optimal settings.
+ * Before hotadded cpu starts handling requests, new mappings must
+ * be established. Otherwise, these requests in hw queue might
+ * never be dispatched.
+ *
+ * For example, there is a single hw queue (hctx) and two CPU queues
+ * (ctx0 for CPU0, and ctx1 for CPU1).
+ *
+ * Now CPU1 is just onlined and a request is inserted into
+ * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is
+ * still zero.
+ *
+ * And then while running hw queue, flush_busy_ctxs() finds bit0 is
+ * set in pending bitmap and tries to retrieve requests in
+	 * hctx->ctxs[0]->rq_list. But hctx->ctxs[0] is a pointer to ctx0,
+ * so the request in ctx1->rq_list is ignored.
*/
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
- action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ cpumask_copy(&online_new, cpu_online_mask);
+ break;
+ case CPU_UP_PREPARE:
+ cpumask_copy(&online_new, cpu_online_mask);
+ cpumask_set_cpu(cpu, &online_new);
+ break;
+ default:
return NOTIFY_OK;
+ }
mutex_lock(&all_q_mutex);
@@ -2133,7 +2166,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
}
list_for_each_entry(q, &all_q_list, all_q_node)
- blk_mq_queue_reinit(q);
+ blk_mq_queue_reinit(q, &online_new);
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_unfreeze_queue(q);
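
The notifier above computes the prospective online mask before the event takes effect: CPU_UP_PREPARE adds the incoming CPU to a copy of cpu_online_mask, while CPU_DEAD and CPU_UP_CANCELED just take the current mask; every queue is then frozen, remapped against that mask, and unfrozen. A stand-alone sketch of the mask computation, with a plain bitmask standing in for struct cpumask and local constants in place of the notifier codes:

#include <stdio.h>

enum { UP_PREPARE, DEAD, UP_CANCELED };

static unsigned long compute_online_new(unsigned long online, int cpu, int action)
{
	unsigned long mask = online;		/* start from the current mask */

	if (action == UP_PREPARE)
		mask |= 1UL << cpu;		/* CPU is about to come online */
	/* DEAD / UP_CANCELED: the current mask already excludes the CPU */
	return mask;
}

int main(void)
{
	printf("%#lx\n", compute_online_new(0x3, 2, UP_PREPARE));	/* 0x7 */
	printf("%#lx\n", compute_online_new(0x3, 2, DEAD));		/* 0x3 */
	return 0;
}
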
@@ -2263,10 +2296,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
- if (set->tags[i]) {
+ if (set->tags[i])
blk_mq_free_rq_map(set, set->tags[i], i);
- free_cpumask_var(set->tags[i]->cpumask);
- }
}
kfree(set->tags);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6a48c4c0d..f4fea7964 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
* CPU -> queue mappings
*/
extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
-extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+ const struct cpumask *online_mask);
extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
/*
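
blk_mq_update_queue_map() now receives the mask to map against instead of reading cpu_online_mask itself, which is what lets the hotplug notifier pass in the prospective mask. A simplified model of such a mapping is below; it spreads online CPUs round-robin over the hardware queues and sends offline CPUs to queue 0, ignoring the sibling and NUMA awareness of the real code.

#include <stdio.h>

#define NR_CPUS_M 8

static void update_queue_map(unsigned int *map, unsigned int nr_queues,
			     unsigned long online_mask)
{
	unsigned int cpu, idx = 0;

	for (cpu = 0; cpu < NR_CPUS_M; cpu++) {
		if (online_mask & (1UL << cpu))
			map[cpu] = idx++ % nr_queues;	/* spread online CPUs */
		else
			map[cpu] = 0;			/* offline -> first queue */
	}
}

int main(void)
{
	unsigned int map[NR_CPUS_M], cpu;

	update_queue_map(map, 2, 0x2d);		/* CPUs 0, 2, 3, 5 online */
	for (cpu = 0; cpu < NR_CPUS_M; cpu++)
		printf("cpu%u -> hctx%u\n", cpu, map[cpu]);
	return 0;
}
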
diff --git a/block/blk-settings.c b/block/blk-settings.c
index e0057d035..7d8f129a1 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -53,28 +53,6 @@ void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
}
EXPORT_SYMBOL(blk_queue_unprep_rq);
-/**
- * blk_queue_merge_bvec - set a merge_bvec function for queue
- * @q: queue
- * @mbfn: merge_bvec_fn
- *
- * Usually queues have static limitations on the max sectors or segments that
- * we can put in a request. Stacking drivers may have some settings that
- * are dynamic, and thus we have to query the queue whether it is ok to
- * add a new bio_vec to a bio at a given offset or not. If the block device
- * has such limitations, it needs to register a merge_bvec_fn to control
- * the size of bio's sent to it. Note that a block device *must* allow a
- * single page to be added to an empty bio. The block device driver may want
- * to use the bio_split() function to deal with these bio's. By default
- * no merge_bvec_fn is defined for a queue, and only the fixed limits are
- * honored.
- */
-void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
-{
- q->merge_bvec_fn = mbfn;
-}
-EXPORT_SYMBOL(blk_queue_merge_bvec);
-
void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
{
q->softirq_done_fn = fn;
@@ -111,11 +89,13 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_segments = BLK_MAX_SEGMENTS;
lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+ lim->virt_boundary_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_discard_sectors = 0;
+ lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
@@ -257,7 +237,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_
__func__, max_hw_sectors);
}
- limits->max_sectors = limits->max_hw_sectors = max_hw_sectors;
+ limits->max_hw_sectors = max_hw_sectors;
+ limits->max_sectors = min_t(unsigned int, max_hw_sectors,
+ BLK_DEF_MAX_SECTORS);
}
EXPORT_SYMBOL(blk_limits_max_hw_sectors);
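
blk_limits_max_hw_sectors() now records the driver limit in max_hw_sectors but caps the default soft limit at BLK_DEF_MAX_SECTORS, so a device that advertises huge transfers no longer inflates max_sectors automatically; the soft limit can still be raised through sysfs up to the hardware value. A stand-alone model of the clamp, with an illustrative value for the default cap:

#include <stdio.h>

#define DEF_MAX_SECTORS_M 2560u		/* stand-in for BLK_DEF_MAX_SECTORS */

struct limits_m { unsigned int max_sectors, max_hw_sectors; };

static void set_max_hw_sectors(struct limits_m *lim, unsigned int max_hw)
{
	lim->max_hw_sectors = max_hw;
	lim->max_sectors = max_hw < DEF_MAX_SECTORS_M ? max_hw : DEF_MAX_SECTORS_M;
}

int main(void)
{
	struct limits_m lim;

	set_max_hw_sectors(&lim, 65535);	/* large device limit */
	printf("hw=%u soft=%u\n", lim.max_hw_sectors, lim.max_sectors);
	return 0;
}
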
@@ -303,6 +285,7 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors);
void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors)
{
+ q->limits.max_hw_discard_sectors = max_discard_sectors;
q->limits.max_discard_sectors = max_discard_sectors;
}
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
@@ -550,6 +533,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
+ t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask,
+ b->virt_boundary_mask);
t->max_segments = min_not_zero(t->max_segments, b->max_segments);
t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
@@ -641,6 +626,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
b->max_discard_sectors);
+ t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors,
+ b->max_hw_discard_sectors);
t->discard_granularity = max(t->discard_granularity,
b->discard_granularity);
t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) %
@@ -788,6 +775,17 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
EXPORT_SYMBOL(blk_queue_segment_boundary);
/**
+ * blk_queue_virt_boundary - set boundary rules for bio merging
+ * @q: the request queue for the device
+ * @mask: the memory boundary mask
+ **/
+void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
+{
+ q->limits.virt_boundary_mask = mask;
+}
+EXPORT_SYMBOL(blk_queue_virt_boundary);
+
+/**
* blk_queue_dma_alignment - set dma length and memory alignment
* @q: the request queue for the device
* @mask: alignment mask
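
blk_queue_virt_boundary() lets a driver declare that adjacent segments may only be merged when they do not leave a hole relative to the given boundary, the typical constraint of devices that build page-list style scatter gather. The check it enables is roughly the one below; the 4 KiB mask is an assumption for the example, not a value from the patch.

#include <stdbool.h>
#include <stdio.h>

#define BOUNDARY_MASK_M 0xfffUL		/* hypothetical 4 KiB virt boundary */

/* Gap if the next segment does not start on the boundary or the previous
 * one does not end on it; a gap forbids growing the bio across them. */
static bool gap_between(unsigned long prev_offset, unsigned long prev_len,
			unsigned long next_offset)
{
	return (next_offset & BOUNDARY_MASK_M) ||
	       ((prev_offset + prev_len) & BOUNDARY_MASK_M);
}

int main(void)
{
	printf("%d\n", gap_between(0, 4096, 0));	/* 0: mergeable */
	printf("%d\n", gap_between(0, 2048, 0));	/* 1: gap, prev ends mid-page */
	return 0;
}
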
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 6264b382d..07b42f5ad 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -145,12 +145,43 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
return queue_var_show(q->limits.discard_granularity, page);
}
+static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
+{
+ unsigned long long val;
+
+ val = q->limits.max_hw_discard_sectors << 9;
+ return sprintf(page, "%llu\n", val);
+}
+
static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
{
return sprintf(page, "%llu\n",
(unsigned long long)q->limits.max_discard_sectors << 9);
}
+static ssize_t queue_discard_max_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long max_discard;
+ ssize_t ret = queue_var_store(&max_discard, page, count);
+
+ if (ret < 0)
+ return ret;
+
+ if (max_discard & (q->limits.discard_granularity - 1))
+ return -EINVAL;
+
+ max_discard >>= 9;
+ if (max_discard > UINT_MAX)
+ return -EINVAL;
+
+ if (max_discard > q->limits.max_hw_discard_sectors)
+ max_discard = q->limits.max_hw_discard_sectors;
+
+ q->limits.max_discard_sectors = max_discard;
+ return ret;
+}
+
static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_discard_zeroes_data(q), page);
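
The new store handler parses a byte count, rejects values that are not aligned to discard_granularity, converts to 512-byte sectors, and clamps the result to max_hw_discard_sectors, so user space can lower but never exceed the hardware limit. A user-space model of that validation, with example values for the granularity and the hardware limit:

#include <errno.h>
#include <stdio.h>

#define GRANULARITY_M		4096u		/* bytes, example value */
#define MAX_HW_DISCARD_M	(1u << 22)	/* sectors, example value */

static int store_discard_max(unsigned long long bytes, unsigned int *sectors_out)
{
	unsigned long long sectors;

	if (bytes & (GRANULARITY_M - 1))
		return -EINVAL;			/* must be granularity aligned */

	sectors = bytes >> 9;			/* bytes -> 512 B sectors */
	if (sectors > MAX_HW_DISCARD_M)
		sectors = MAX_HW_DISCARD_M;	/* clamp to the hardware limit */

	*sectors_out = (unsigned int)sectors;
	return 0;
}

int main(void)
{
	unsigned int s;

	if (!store_discard_max(1ULL << 33, &s))	/* 8 GiB request */
		printf("max_discard_sectors=%u\n", s);
	return 0;
}
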
@@ -360,9 +391,15 @@ static struct queue_sysfs_entry queue_discard_granularity_entry = {
.show = queue_discard_granularity_show,
};
+static struct queue_sysfs_entry queue_discard_max_hw_entry = {
+ .attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO },
+ .show = queue_discard_max_hw_show,
+};
+
static struct queue_sysfs_entry queue_discard_max_entry = {
- .attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
+ .attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR },
.show = queue_discard_max_show,
+ .store = queue_discard_max_store,
};
static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
@@ -421,6 +458,7 @@ static struct attribute *default_attrs[] = {
&queue_io_opt_entry.attr,
&queue_discard_granularity_entry.attr,
&queue_discard_max_entry.attr,
+ &queue_discard_max_hw_entry.attr,
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
&queue_nonrot_entry.attr,
@@ -502,6 +540,7 @@ static void blk_release_queue(struct kobject *kobj)
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
+ bdi_exit(&q->backing_dev_info);
blkcg_exit_queue(q);
if (q->elevator) {
@@ -523,6 +562,9 @@ static void blk_release_queue(struct kobject *kobj)
blk_trace_shutdown(q);
+ if (q->bio_split)
+ bioset_free(q->bio_split);
+
ida_simple_remove(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b23193518..c75a2636d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -83,14 +83,6 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
-/* Per-cpu group stats */
-struct tg_stats_cpu {
- /* total bytes transferred */
- struct blkg_rwstat service_bytes;
- /* total IOs serviced, post merge */
- struct blkg_rwstat serviced;
-};
-
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
@@ -141,12 +133,6 @@ struct throtl_grp {
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
-
- /* Per cpu stats pointer */
- struct tg_stats_cpu __percpu *stats_cpu;
-
- /* List of tgs waiting for per cpu stats memory to be allocated */
- struct list_head stats_alloc_node;
};
struct throtl_data
@@ -168,13 +154,6 @@ struct throtl_data
struct work_struct dispatch_work;
};
-/* list and work item to allocate percpu group stats */
-static DEFINE_SPINLOCK(tg_stats_alloc_lock);
-static LIST_HEAD(tg_stats_alloc_list);
-
-static void tg_stats_alloc_fn(struct work_struct *);
-static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
-
static void throtl_pending_timer_fn(unsigned long arg);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
return pd_to_blkg(&tg->pd);
}
-static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
-{
- return blkg_to_tg(td->queue->root_blkg);
-}
-
/**
 * sq_to_tg - return the throtl_grp the specified service queue belongs to
* @sq: the throtl_service_queue of interest
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
} \
} while (0)
-static void tg_stats_init(struct tg_stats_cpu *tg_stats)
-{
- blkg_rwstat_init(&tg_stats->service_bytes);
- blkg_rwstat_init(&tg_stats->serviced);
-}
-
-/*
- * Worker for allocating per cpu stat for tgs. This is scheduled on the
- * system_wq once there are some groups on the alloc_list waiting for
- * allocation.
- */
-static void tg_stats_alloc_fn(struct work_struct *work)
-{
- static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
- struct delayed_work *dwork = to_delayed_work(work);
- bool empty = false;
-
-alloc_stats:
- if (!stats_cpu) {
- int cpu;
-
- stats_cpu = alloc_percpu(struct tg_stats_cpu);
- if (!stats_cpu) {
- /* allocation failed, try again after some time */
- schedule_delayed_work(dwork, msecs_to_jiffies(10));
- return;
- }
- for_each_possible_cpu(cpu)
- tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
- }
-
- spin_lock_irq(&tg_stats_alloc_lock);
-
- if (!list_empty(&tg_stats_alloc_list)) {
- struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
- struct throtl_grp,
- stats_alloc_node);
- swap(tg->stats_cpu, stats_cpu);
- list_del_init(&tg->stats_alloc_node);
- }
-
- empty = list_empty(&tg_stats_alloc_list);
- spin_unlock_irq(&tg_stats_alloc_lock);
- if (!empty)
- goto alloc_stats;
-}
-
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
INIT_LIST_HEAD(&qn->node);
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
}
/* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq,
- struct throtl_service_queue *parent_sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
INIT_LIST_HEAD(&sq->queued[0]);
INIT_LIST_HEAD(&sq->queued[1]);
sq->pending_tree = RB_ROOT;
- sq->parent_sq = parent_sq;
setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
(unsigned long)sq);
}
-static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
{
- del_timer_sync(&sq->pending_timer);
+ struct throtl_grp *tg;
+ int rw;
+
+ tg = kzalloc_node(sizeof(*tg), gfp, node);
+ if (!tg)
+ return NULL;
+
+ throtl_service_queue_init(&tg->service_queue);
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+ throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+ }
+
+ RB_CLEAR_NODE(&tg->rb_node);
+ tg->bps[READ] = -1;
+ tg->bps[WRITE] = -1;
+ tg->iops[READ] = -1;
+ tg->iops[WRITE] = -1;
+
+ return &tg->pd;
}
-static void throtl_pd_init(struct blkcg_gq *blkg)
+static void throtl_pd_init(struct blkg_policy_data *pd)
{
- struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_grp *tg = pd_to_tg(pd);
+ struct blkcg_gq *blkg = tg_to_blkg(tg);
struct throtl_data *td = blkg->q->td;
- struct throtl_service_queue *parent_sq;
- unsigned long flags;
- int rw;
+ struct throtl_service_queue *sq = &tg->service_queue;
/*
* If on the default hierarchy, we switch to properly hierarchical
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
* Limits of a group don't interact with limits of other groups
* regardless of the position of the group in the hierarchy.
*/
- parent_sq = &td->service_queue;
-
+ sq->parent_sq = &td->service_queue;
if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
- parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
-
- throtl_service_queue_init(&tg->service_queue, parent_sq);
-
- for (rw = READ; rw <= WRITE; rw++) {
- throtl_qnode_init(&tg->qnode_on_self[rw], tg);
- throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
- }
-
- RB_CLEAR_NODE(&tg->rb_node);
+ sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
tg->td = td;
-
- tg->bps[READ] = -1;
- tg->bps[WRITE] = -1;
- tg->iops[READ] = -1;
- tg->iops[WRITE] = -1;
-
- /*
- * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
- * but percpu allocator can't be called from IO path. Queue tg on
- * tg_stats_alloc_list and allocate from work item.
- */
- spin_lock_irqsave(&tg_stats_alloc_lock, flags);
- list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
- schedule_delayed_work(&tg_stats_alloc_work, 0);
- spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
}
/*
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg)
(tg->bps[rw] != -1 || tg->iops[rw] != -1);
}
-static void throtl_pd_online(struct blkcg_gq *blkg)
+static void throtl_pd_online(struct blkg_policy_data *pd)
{
/*
	 * We don't want new groups to escape the limits of their ancestors.
* Update has_rules[] after a new group is brought online.
*/
- tg_update_has_rules(blkg_to_tg(blkg));
-}
-
-static void throtl_pd_exit(struct blkcg_gq *blkg)
-{
- struct throtl_grp *tg = blkg_to_tg(blkg);
- unsigned long flags;
-
- spin_lock_irqsave(&tg_stats_alloc_lock, flags);
- list_del_init(&tg->stats_alloc_node);
- spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
-
- free_percpu(tg->stats_cpu);
-
- throtl_service_queue_exit(&tg->service_queue);
-}
-
-static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
-{
- struct throtl_grp *tg = blkg_to_tg(blkg);
- int cpu;
-
- if (tg->stats_cpu == NULL)
- return;
-
- for_each_possible_cpu(cpu) {
- struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
- blkg_rwstat_reset(&sc->service_bytes);
- blkg_rwstat_reset(&sc->serviced);
- }
-}
-
-static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
- struct blkcg *blkcg)
-{
- /*
- * This is the common case when there are no blkcgs. Avoid lookup
- * in this case
- */
- if (blkcg == &blkcg_root)
- return td_root_tg(td);
-
- return blkg_to_tg(blkg_lookup(blkcg, td->queue));
+ tg_update_has_rules(pd_to_tg(pd));
}
-static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
- struct blkcg *blkcg)
+static void throtl_pd_free(struct blkg_policy_data *pd)
{
- struct request_queue *q = td->queue;
- struct throtl_grp *tg = NULL;
-
- /*
- * This is the common case when there are no blkcgs. Avoid lookup
- * in this case
- */
- if (blkcg == &blkcg_root) {
- tg = td_root_tg(td);
- } else {
- struct blkcg_gq *blkg;
-
- blkg = blkg_lookup_create(blkcg, q);
-
- /* if %NULL and @q is alive, fall back to root_tg */
- if (!IS_ERR(blkg))
- tg = blkg_to_tg(blkg);
- else if (!blk_queue_dying(q))
- tg = td_root_tg(td);
- }
+ struct throtl_grp *tg = pd_to_tg(pd);
- return tg;
+ del_timer_sync(&tg->service_queue.pending_timer);
+ kfree(tg);
}
static struct throtl_grp *
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
return 0;
}
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
- int rw)
-{
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct tg_stats_cpu *stats_cpu;
- unsigned long flags;
-
- /* If per cpu stats are not allocated yet, don't do any accounting. */
- if (tg->stats_cpu == NULL)
- return;
-
- /*
- * Disabling interrupts to provide mutual exclusion between two
- * writes on same cpu. It probably is not needed for 64bit. Not
- * optimizing that case yet.
- */
- local_irq_save(flags);
-
- stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
- blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
- blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
- local_irq_restore(flags);
-}
-
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
* more than once as a throttled bio will go through blk-throtl the
* second time when it eventually gets issued. Set it when a bio
* is being charged to a tg.
- *
- * Dispatch stats aren't recursive and each @bio should only be
- * accounted by the @tg it was originally associated with. Let's
- * update the stats when setting REQ_THROTTLED for the first time
- * which is guaranteed to be for the @bio's original tg.
*/
- if (!(bio->bi_rw & REQ_THROTTLED)) {
+ if (!(bio->bi_rw & REQ_THROTTLED))
bio->bi_rw |= REQ_THROTTLED;
- throtl_update_dispatch_stats(tg_to_blkg(tg),
- bio->bi_iter.bi_size, bio->bi_rw);
- }
}
/**
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
}
}
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct throtl_grp *tg = pd_to_tg(pd);
- struct blkg_rwstat rwstat = { }, tmp;
- int i, cpu;
-
- if (tg->stats_cpu == NULL)
- return 0;
-
- for_each_possible_cpu(cpu) {
- struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
- tmp = blkg_rwstat_read((void *)sc + off);
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- rwstat.cnt[i] += tmp.cnt[i];
- }
-
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
- &blkcg_policy_throtl, seq_cft(sf)->private, true);
- return 0;
-}
-
static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
return 0;
}
-static ssize_t tg_set_conf(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off, bool is_u64)
+static void tg_conf_updated(struct throtl_grp *tg)
{
- struct blkcg *blkcg = css_to_blkcg(of_css(of));
- struct blkg_conf_ctx ctx;
- struct throtl_grp *tg;
- struct throtl_service_queue *sq;
- struct blkcg_gq *blkg;
+ struct throtl_service_queue *sq = &tg->service_queue;
struct cgroup_subsys_state *pos_css;
- int ret;
-
- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
- if (ret)
- return ret;
-
- tg = blkg_to_tg(ctx.blkg);
- sq = &tg->service_queue;
-
- if (!ctx.v)
- ctx.v = -1;
-
- if (is_u64)
- *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
- else
- *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
+ struct blkcg_gq *blkg;
throtl_log(&tg->service_queue,
"limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
* restrictions in the whole hierarchy and allows them to bypass
* blk-throttle.
*/
- blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
+ blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
tg_update_has_rules(blkg_to_tg(blkg));
/*
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
tg_update_disptime(tg);
throtl_schedule_next_dispatch(sq->parent_sq, true);
}
+}
+
+static ssize_t tg_set_conf(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off, bool is_u64)
+{
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct blkg_conf_ctx ctx;
+ struct throtl_grp *tg;
+ int ret;
+ u64 v;
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if (sscanf(ctx.body, "%llu", &v) != 1)
+ goto out_finish;
+ if (!v)
+ v = -1;
+
+ tg = blkg_to_tg(ctx.blkg);
+
+ if (is_u64)
+ *(u64 *)((void *)tg + of_cft(of)->private) = v;
+ else
+ *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
+
+ tg_conf_updated(tg);
+ ret = 0;
+out_finish:
blkg_conf_finish(&ctx);
- return nbytes;
+ return ret ?: nbytes;
}
static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
return tg_set_conf(of, buf, nbytes, off, false);
}
-static struct cftype throtl_files[] = {
+static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
.private = offsetof(struct throtl_grp, bps[READ]),
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = {
},
{
.name = "throttle.io_service_bytes",
- .private = offsetof(struct tg_stats_cpu, service_bytes),
- .seq_show = tg_print_cpu_rwstat,
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_bytes,
},
{
.name = "throttle.io_serviced",
- .private = offsetof(struct tg_stats_cpu, serviced),
- .seq_show = tg_print_cpu_rwstat,
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_ios,
+ },
+ { } /* terminate */
+};
+
+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ struct throtl_grp *tg = pd_to_tg(pd);
+ const char *dname = blkg_dev_name(pd->blkg);
+ char bufs[4][21] = { "max", "max", "max", "max" };
+
+ if (!dname)
+ return 0;
+ if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+ tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+ return 0;
+
+ if (tg->bps[READ] != -1)
+ snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+ if (tg->bps[WRITE] != -1)
+ snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+ if (tg->iops[READ] != -1)
+ snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+ if (tg->iops[WRITE] != -1)
+ snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+ seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+ dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+ return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+ &blkcg_policy_throtl, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct blkg_conf_ctx ctx;
+ struct throtl_grp *tg;
+ u64 v[4];
+ int ret;
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+ if (ret)
+ return ret;
+
+ tg = blkg_to_tg(ctx.blkg);
+
+ v[0] = tg->bps[READ];
+ v[1] = tg->bps[WRITE];
+ v[2] = tg->iops[READ];
+ v[3] = tg->iops[WRITE];
+
+ while (true) {
+ char tok[27]; /* wiops=18446744073709551616 */
+ char *p;
+ u64 val = -1;
+ int len;
+
+ if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+ break;
+ if (tok[0] == '\0')
+ break;
+ ctx.body += len;
+
+ ret = -EINVAL;
+ p = tok;
+ strsep(&p, "=");
+ if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+ goto out_finish;
+
+ ret = -ERANGE;
+ if (!val)
+ goto out_finish;
+
+ ret = -EINVAL;
+ if (!strcmp(tok, "rbps"))
+ v[0] = val;
+ else if (!strcmp(tok, "wbps"))
+ v[1] = val;
+ else if (!strcmp(tok, "riops"))
+ v[2] = min_t(u64, val, UINT_MAX);
+ else if (!strcmp(tok, "wiops"))
+ v[3] = min_t(u64, val, UINT_MAX);
+ else
+ goto out_finish;
+ }
+
+ tg->bps[READ] = v[0];
+ tg->bps[WRITE] = v[1];
+ tg->iops[READ] = v[2];
+ tg->iops[WRITE] = v[3];
+
+ tg_conf_updated(tg);
+ ret = 0;
+out_finish:
+ blkg_conf_finish(&ctx);
+ return ret ?: nbytes;
+}
+
+static struct cftype throtl_files[] = {
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = tg_print_max,
+ .write = tg_set_max,
},
{ } /* terminate */
};
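
tg_set_max above accepts a sequence of key=value tokens where each value is either a number or the literal "max" (unlimited, stored as all ones), with riops and wiops additionally capped at UINT_MAX. A stand-alone parser for the same token format, simplified to two keys:

#include <limits.h>
#include <stdio.h>
#include <string.h>

/* Returns 1 if tok matched key, 0 if it is some other key, -1 on error. */
static int parse_limit(const char *tok, const char *key, unsigned long long *out)
{
	size_t klen = strlen(key);

	if (strncmp(tok, key, klen) || tok[klen] != '=')
		return 0;
	if (!strcmp(tok + klen + 1, "max"))
		*out = ~0ULL;			/* unlimited */
	else if (sscanf(tok + klen + 1, "%llu", out) != 1 || !*out)
		return -1;			/* malformed or zero */
	return 1;
}

int main(void)
{
	char buf[] = "rbps=1048576 wiops=max";
	unsigned long long rbps = ~0ULL, wiops = ~0ULL;
	char *tok, *save = NULL;

	for (tok = strtok_r(buf, " ", &save); tok; tok = strtok_r(NULL, " ", &save))
		if (parse_limit(tok, "rbps", &rbps) < 0 ||
		    parse_limit(tok, "wiops", &wiops) < 0)
			return 1;

	if (wiops > UINT_MAX)
		wiops = UINT_MAX;		/* iops limits are 32 bit */
	printf("rbps=%llu wiops=%llu\n", rbps, wiops);
	return 0;
}
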
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q)
}
static struct blkcg_policy blkcg_policy_throtl = {
- .pd_size = sizeof(struct throtl_grp),
- .cftypes = throtl_files,
+ .dfl_cftypes = throtl_files,
+ .legacy_cftypes = throtl_legacy_files,
+ .pd_alloc_fn = throtl_pd_alloc,
.pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
- .pd_exit_fn = throtl_pd_exit,
- .pd_reset_stats_fn = throtl_pd_reset_stats,
+ .pd_free_fn = throtl_pd_free,
};
-bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
+bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+ struct bio *bio)
{
- struct throtl_data *td = q->td;
struct throtl_qnode *qn = NULL;
- struct throtl_grp *tg;
+ struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
- struct blkcg *blkcg;
bool throttled = false;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
/* see throtl_charge_bio() */
- if (bio->bi_rw & REQ_THROTTLED)
+ if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
goto out;
- /*
- * A throtl_grp pointer retrieved under rcu can be used to access
- * basic fields like stats and io rates. If a group has no rules,
- * just update the dispatch stats in lockless manner and return.
- */
- rcu_read_lock();
- blkcg = bio_blkcg(bio);
- tg = throtl_lookup_tg(td, blkcg);
- if (tg) {
- if (!tg->has_rules[rw]) {
- throtl_update_dispatch_stats(tg_to_blkg(tg),
- bio->bi_iter.bi_size, bio->bi_rw);
- goto out_unlock_rcu;
- }
- }
-
- /*
- * Either group has not been allocated yet or it is not an unlimited
- * IO group
- */
spin_lock_irq(q->queue_lock);
- tg = throtl_lookup_create_tg(td, blkcg);
- if (unlikely(!tg))
+
+ if (unlikely(blk_queue_bypass(q)))
goto out_unlock;
sq = &tg->service_queue;
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
out_unlock:
spin_unlock_irq(q->queue_lock);
-out_unlock_rcu:
- rcu_read_unlock();
out:
/*
* As multiple blk-throtls may stack in the same issue path, we
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q)
return -ENOMEM;
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
- throtl_service_queue_init(&td->service_queue, NULL);
+ throtl_service_queue_init(&td->service_queue);
q->td = td;
td->queue = q;
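
With the per-cpu stats gone, blk_throtl_bio() now takes the blkcg_gq resolved by its caller under rcu_read_lock(), falls back to the root group when none is passed, and only takes queue_lock when the group actually has rules for the bio's direction. A rough model of that two-stage check; the group type and the lock are simplified stand-ins:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct group_m { bool has_rules[2]; };		/* index 0 = READ, 1 = WRITE */
static pthread_mutex_t queue_lock_m = PTHREAD_MUTEX_INITIALIZER;

static bool throttle_bio(struct group_m *grp, int rw)
{
	if (!grp->has_rules[rw])
		return false;			/* fast path: no limits, no lock */

	pthread_mutex_lock(&queue_lock_m);	/* slow path: charge, maybe queue */
	/* tg_may_dispatch()/throtl_charge_bio() equivalents would run here */
	pthread_mutex_unlock(&queue_lock_m);
	return true;
}

int main(void)
{
	struct group_m grp = { .has_rules = { false, true } };

	printf("read throttled: %d\n", throttle_bio(&grp, 0));
	printf("write throttled: %d\n", throttle_bio(&grp, 1));
	return 0;
}
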
diff --git a/block/blk.h b/block/blk.h
index 838188b35..98614ad37 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
-extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
-{
- return false;
-}
static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
diff --git a/block/bounce.c b/block/bounce.c
index b17311227..1cb5dd3a5 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -123,17 +123,19 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
}
}
-static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+static void bounce_end_io(struct bio *bio, mempool_t *pool)
{
struct bio *bio_orig = bio->bi_private;
struct bio_vec *bvec, *org_vec;
int i;
+ int start = bio_orig->bi_iter.bi_idx;
/*
* free up bounce indirect pages used
*/
bio_for_each_segment_all(bvec, bio, i) {
- org_vec = bio_orig->bi_io_vec + i;
+ org_vec = bio_orig->bi_io_vec + i + start;
+
if (bvec->bv_page == org_vec->bv_page)
continue;
@@ -141,61 +143,44 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
mempool_free(bvec->bv_page, pool);
}
- bio_endio(bio_orig, err);
+ bio_orig->bi_error = bio->bi_error;
+ bio_endio(bio_orig);
bio_put(bio);
}
-static void bounce_end_io_write(struct bio *bio, int err)
+static void bounce_end_io_write(struct bio *bio)
{
- bounce_end_io(bio, page_pool, err);
+ bounce_end_io(bio, page_pool);
}
-static void bounce_end_io_write_isa(struct bio *bio, int err)
+static void bounce_end_io_write_isa(struct bio *bio)
{
- bounce_end_io(bio, isa_page_pool, err);
+ bounce_end_io(bio, isa_page_pool);
}
-static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
{
struct bio *bio_orig = bio->bi_private;
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (!bio->bi_error)
copy_to_high_bio_irq(bio_orig, bio);
- bounce_end_io(bio, pool, err);
-}
-
-static void bounce_end_io_read(struct bio *bio, int err)
-{
- __bounce_end_io_read(bio, page_pool, err);
+ bounce_end_io(bio, pool);
}
-static void bounce_end_io_read_isa(struct bio *bio, int err)
+static void bounce_end_io_read(struct bio *bio)
{
- __bounce_end_io_read(bio, isa_page_pool, err);
+ __bounce_end_io_read(bio, page_pool);
}
-#ifdef CONFIG_NEED_BOUNCE_POOL
-static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
-{
- if (bio_data_dir(bio) != WRITE)
- return 0;
-
- if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
- return 0;
-
- return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
-}
-#else
-static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
+static void bounce_end_io_read_isa(struct bio *bio)
{
- return 0;
+ __bounce_end_io_read(bio, isa_page_pool);
}
-#endif /* CONFIG_NEED_BOUNCE_POOL */
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
- mempool_t *pool, int force)
+ mempool_t *pool)
{
struct bio *bio;
int rw = bio_data_dir(*bio_orig);
@@ -203,8 +188,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
struct bvec_iter iter;
unsigned i;
- if (force)
- goto bounce;
bio_for_each_segment(from, *bio_orig, iter)
if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
goto bounce;
@@ -216,7 +199,7 @@ bounce:
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;
- if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
+ if (page_to_pfn(page) <= queue_bounce_pfn(q))
continue;
to->bv_page = mempool_alloc(pool, q->bounce_gfp);
@@ -254,7 +237,6 @@ bounce:
void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
{
- int must_bounce;
mempool_t *pool;
/*
@@ -263,15 +245,13 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
if (!bio_has_data(*bio_orig))
return;
- must_bounce = must_snapshot_stable_pages(q, *bio_orig);
-
/*
* for non-isa bounce case, just check if the bounce pfn is equal
* to or bigger than the highest pfn in the system -- in that case,
* don't waste time iterating over bio segments
*/
if (!(q->bounce_gfp & GFP_DMA)) {
- if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
+ if (queue_bounce_pfn(q) >= blk_max_pfn)
return;
pool = page_pool;
} else {
@@ -282,7 +262,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
/*
* slow path
*/
- __blk_queue_bounce(q, bio_orig, pool, must_bounce);
+ __blk_queue_bounce(q, bio_orig, pool);
}
EXPORT_SYMBOL(blk_queue_bounce);
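
With the (bio, err) completion signature gone, the bounce handlers copy bio->bi_error from the clone into the original bio and complete the original, and the bounce-page walk now starts at the original bio's bi_iter.bi_idx. A small model of the error propagation through a chained completion; struct bio_m is a stand-in, not the kernel's struct bio:

#include <stdio.h>

struct bio_m {
	int bi_error;
	struct bio_m *bi_private;	/* original bio, for the bounce clone */
};

static void bio_endio_m(struct bio_m *bio)
{
	printf("completed with error %d\n", bio->bi_error);
}

static void bounce_end_io_m(struct bio_m *clone)
{
	struct bio_m *orig = clone->bi_private;

	/* the real code frees the bounce pages here */
	orig->bi_error = clone->bi_error;	/* propagate status */
	bio_endio_m(orig);
}

int main(void)
{
	struct bio_m orig = { 0, NULL };
	struct bio_m clone = { -5 /* -EIO */, &orig };

	bounce_end_io_m(&clone);
	return 0;
}
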
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c62bb2e65..04de88463 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool;
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
/* blkio-related constants */
-#define CFQ_WEIGHT_MIN 10
-#define CFQ_WEIGHT_MAX 1000
-#define CFQ_WEIGHT_DEFAULT 500
+#define CFQ_WEIGHT_LEGACY_MIN 10
+#define CFQ_WEIGHT_LEGACY_DFL 500
+#define CFQ_WEIGHT_LEGACY_MAX 1000
struct cfq_ttime {
unsigned long last_end_request;
@@ -177,10 +177,6 @@ enum wl_type_t {
struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
- /* total bytes transferred */
- struct blkg_rwstat service_bytes;
- /* total IOs serviced, post merge */
- struct blkg_rwstat serviced;
/* number of ios merged */
struct blkg_rwstat merged;
/* total time spent on device in ns, may not be accurate w/ queueing */
@@ -189,8 +185,6 @@ struct cfqg_stats {
struct blkg_rwstat wait_time;
/* number of IOs queued up */
struct blkg_rwstat queued;
- /* total sectors transferred */
- struct blkg_stat sectors;
/* total disk time and nr sectors dispatched by this group */
struct blkg_stat time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -220,7 +214,7 @@ struct cfqg_stats {
/* Per-cgroup data */
struct cfq_group_data {
/* must be the first member */
- struct blkcg_policy_data pd;
+ struct blkcg_policy_data cpd;
unsigned int weight;
unsigned int leaf_weight;
@@ -304,7 +298,11 @@ struct cfq_group {
int dispatched;
struct cfq_ttime ttime;
struct cfqg_stats stats; /* stats for this cfqg */
- struct cfqg_stats dead_stats; /* stats pushed from dead children */
+
+ /* async queue for each priority case */
+ struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
+ struct cfq_queue *async_idle_cfqq;
+
};
struct cfq_io_cq {
@@ -370,12 +368,6 @@ struct cfq_data {
struct cfq_queue *active_queue;
struct cfq_io_cq *active_cic;
- /*
- * async queue for each priority case
- */
- struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
- struct cfq_queue *async_idle_cfqq;
-
sector_t last_position;
/*
@@ -401,6 +393,7 @@ struct cfq_data {
};
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
+static void cfq_put_queue(struct cfq_queue *cfqq);
static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
enum wl_class_t class,
@@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
static struct cfq_group_data
*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
{
- return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+ return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
}
static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
@@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
}
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
- uint64_t bytes, int rw)
-{
- blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
- blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
- blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
-}
-
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
uint64_t start_time, uint64_t io_start_time, int rw)
{
@@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
static void cfqg_stats_reset(struct cfqg_stats *stats)
{
/* queued stats shouldn't be cleared */
- blkg_rwstat_reset(&stats->service_bytes);
- blkg_rwstat_reset(&stats->serviced);
blkg_rwstat_reset(&stats->merged);
blkg_rwstat_reset(&stats->service_time);
blkg_rwstat_reset(&stats->wait_time);
@@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
}
/* @to += @from */
-static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
{
/* queued stats shouldn't be cleared */
- blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
- blkg_rwstat_merge(&to->serviced, &from->serviced);
- blkg_rwstat_merge(&to->merged, &from->merged);
- blkg_rwstat_merge(&to->service_time, &from->service_time);
- blkg_rwstat_merge(&to->wait_time, &from->wait_time);
- blkg_stat_merge(&from->time, &from->time);
+ blkg_rwstat_add_aux(&to->merged, &from->merged);
+ blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+ blkg_stat_add_aux(&from->time, &from->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
- blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
- blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
- blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
- blkg_stat_merge(&to->dequeue, &from->dequeue);
- blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
- blkg_stat_merge(&to->idle_time, &from->idle_time);
- blkg_stat_merge(&to->empty_time, &from->empty_time);
+ blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+ blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+ blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+ blkg_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}
/*
- * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
* recursive stats can still account for the amount used by this cfqg after
* it's gone.
*/
@@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
if (unlikely(!parent))
return;
- cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
- cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+ cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
cfqg_stats_reset(&cfqg->stats);
- cfqg_stats_reset(&cfqg->dead_stats);
}
#else /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
unsigned long time, unsigned long unaccounted_time) { }
static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
- uint64_t bytes, int rw) { }
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
uint64_t start_time, uint64_t io_start_time, int rw) { }
@@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
- struct cfq_io_cq *cic, struct bio *bio,
- gfp_t gfp_mask);
+ struct cfq_io_cq *cic, struct bio *bio);
static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
@@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfqg_stats_init(struct cfqg_stats *stats)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+ bool on_dfl, bool reset_dev, bool is_leaf_weight);
+
+static void cfqg_stats_exit(struct cfqg_stats *stats)
{
- blkg_rwstat_init(&stats->service_bytes);
- blkg_rwstat_init(&stats->serviced);
- blkg_rwstat_init(&stats->merged);
- blkg_rwstat_init(&stats->service_time);
- blkg_rwstat_init(&stats->wait_time);
- blkg_rwstat_init(&stats->queued);
+ blkg_rwstat_exit(&stats->merged);
+ blkg_rwstat_exit(&stats->service_time);
+ blkg_rwstat_exit(&stats->wait_time);
+ blkg_rwstat_exit(&stats->queued);
+ blkg_stat_exit(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_stat_exit(&stats->unaccounted_time);
+ blkg_stat_exit(&stats->avg_queue_size_sum);
+ blkg_stat_exit(&stats->avg_queue_size_samples);
+ blkg_stat_exit(&stats->dequeue);
+ blkg_stat_exit(&stats->group_wait_time);
+ blkg_stat_exit(&stats->idle_time);
+ blkg_stat_exit(&stats->empty_time);
+#endif
+}
- blkg_stat_init(&stats->sectors);
- blkg_stat_init(&stats->time);
+static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
+{
+ if (blkg_rwstat_init(&stats->merged, gfp) ||
+ blkg_rwstat_init(&stats->service_time, gfp) ||
+ blkg_rwstat_init(&stats->wait_time, gfp) ||
+ blkg_rwstat_init(&stats->queued, gfp) ||
+ blkg_stat_init(&stats->time, gfp))
+ goto err;
#ifdef CONFIG_DEBUG_BLK_CGROUP
- blkg_stat_init(&stats->unaccounted_time);
- blkg_stat_init(&stats->avg_queue_size_sum);
- blkg_stat_init(&stats->avg_queue_size_samples);
- blkg_stat_init(&stats->dequeue);
- blkg_stat_init(&stats->group_wait_time);
- blkg_stat_init(&stats->idle_time);
- blkg_stat_init(&stats->empty_time);
+ if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+ blkg_stat_init(&stats->dequeue, gfp) ||
+ blkg_stat_init(&stats->group_wait_time, gfp) ||
+ blkg_stat_init(&stats->idle_time, gfp) ||
+ blkg_stat_init(&stats->empty_time, gfp))
+ goto err;
#endif
+ return 0;
+err:
+ cfqg_stats_exit(stats);
+ return -ENOMEM;
}
-static void cfq_cpd_init(const struct blkcg *blkcg)
+static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
{
- struct cfq_group_data *cgd =
- cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+ struct cfq_group_data *cgd;
- if (blkcg == &blkcg_root) {
- cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
- cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
- } else {
- cgd->weight = CFQ_WEIGHT_DEFAULT;
- cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
- }
+ cgd = kzalloc(sizeof(*cgd), GFP_KERNEL);
+ if (!cgd)
+ return NULL;
+ return &cgd->cpd;
+}
+
+static void cfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+ struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
+ unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
+ CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+ if (cpd_to_blkcg(cpd) == &blkcg_root)
+ weight *= 2;
+
+ cgd->weight = weight;
+ cgd->leaf_weight = weight;
}
-static void cfq_pd_init(struct blkcg_gq *blkg)
+static void cfq_cpd_free(struct blkcg_policy_data *cpd)
{
- struct cfq_group *cfqg = blkg_to_cfqg(blkg);
- struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
+ kfree(cpd_to_cfqgd(cpd));
+}
+
+static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
+{
+ struct blkcg *blkcg = cpd_to_blkcg(cpd);
+ bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
+ unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+ if (blkcg == &blkcg_root)
+ weight *= 2;
+
+ WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
+ WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
+}
+
+static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
+{
+ struct cfq_group *cfqg;
+
+ cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
+ if (!cfqg)
+ return NULL;
cfq_init_cfqg_base(cfqg);
+ if (cfqg_stats_init(&cfqg->stats, gfp)) {
+ kfree(cfqg);
+ return NULL;
+ }
+
+ return &cfqg->pd;
+}
+
+static void cfq_pd_init(struct blkg_policy_data *pd)
+{
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
+ struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
+
cfqg->weight = cgd->weight;
cfqg->leaf_weight = cgd->leaf_weight;
- cfqg_stats_init(&cfqg->stats);
- cfqg_stats_init(&cfqg->dead_stats);
}
-static void cfq_pd_offline(struct blkcg_gq *blkg)
+static void cfq_pd_offline(struct blkg_policy_data *pd)
{
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
+ int i;
+
+ for (i = 0; i < IOPRIO_BE_NR; i++) {
+ if (cfqg->async_cfqq[0][i])
+ cfq_put_queue(cfqg->async_cfqq[0][i]);
+ if (cfqg->async_cfqq[1][i])
+ cfq_put_queue(cfqg->async_cfqq[1][i]);
+ }
+
+ if (cfqg->async_idle_cfqq)
+ cfq_put_queue(cfqg->async_idle_cfqq);
+
/*
* @blkg is going offline and will be ignored by
* blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
* that they don't get lost. If IOs complete after this point, the
* stats for them will be lost. Oh well...
*/
- cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
+ cfqg_stats_xfer_dead(cfqg);
}
-/* offset delta from cfqg->stats to cfqg->dead_stats */
-static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
- offsetof(struct cfq_group, stats);
-
-/* to be used by recursive prfill, sums live and dead stats recursively */
-static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+static void cfq_pd_free(struct blkg_policy_data *pd)
{
- u64 sum = 0;
-
- sum += blkg_stat_recursive_sum(pd, off);
- sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
- return sum;
-}
-
-/* to be used by recursive prfill, sums live and dead rwstats recursively */
-static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
- int off)
-{
- struct blkg_rwstat a, b;
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
- a = blkg_rwstat_recursive_sum(pd, off);
- b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
- blkg_rwstat_merge(&a, &b);
- return a;
+ cfqg_stats_exit(&cfqg->stats);
+ return kfree(cfqg);
}
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
{
- struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
cfqg_stats_reset(&cfqg->stats);
- cfqg_stats_reset(&cfqg->dead_stats);
}
-/*
- * Search for the cfq group current task belongs to. request_queue lock must
- * be held.
- */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
- struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+ struct blkcg *blkcg)
{
- struct request_queue *q = cfqd->queue;
- struct cfq_group *cfqg = NULL;
-
- /* avoid lookup for the common case where there's no blkcg */
- if (blkcg == &blkcg_root) {
- cfqg = cfqd->root_group;
- } else {
- struct blkcg_gq *blkg;
-
- blkg = blkg_lookup_create(blkcg, q);
- if (!IS_ERR(blkg))
- cfqg = blkg_to_cfqg(blkg);
- }
+ struct blkcg_gq *blkg;
- return cfqg;
+ blkg = blkg_lookup(blkcg, cfqd->queue);
+ if (likely(blkg))
+ return blkg_to_cfqg(blkg);
+ return NULL;
}
static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
- /* Currently, all async queues are mapped to root group */
- if (!cfq_cfqq_sync(cfqq))
- cfqg = cfqq->cfqd->root_group;
-
cfqq->cfqg = cfqg;
/* cfqq reference on cfqg */
cfqg_get(cfqg);
@@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off,
- bool is_leaf_weight)
+ bool on_dfl, bool is_leaf_weight)
{
+ unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+ unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkg_conf_ctx ctx;
struct cfq_group *cfqg;
struct cfq_group_data *cfqgd;
int ret;
+ u64 v;
ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
if (ret)
return ret;
- ret = -EINVAL;
+ if (sscanf(ctx.body, "%llu", &v) == 1) {
+ /* require "default" on dfl */
+ ret = -ERANGE;
+ if (!v && on_dfl)
+ goto out_finish;
+ } else if (!strcmp(strim(ctx.body), "default")) {
+ v = 0;
+ } else {
+ ret = -EINVAL;
+ goto out_finish;
+ }
+
cfqg = blkg_to_cfqg(ctx.blkg);
cfqgd = blkcg_to_cfqgd(blkcg);
- if (!cfqg || !cfqgd)
- goto err;
- if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+ ret = -ERANGE;
+ if (!v || (v >= min && v <= max)) {
if (!is_leaf_weight) {
- cfqg->dev_weight = ctx.v;
- cfqg->new_weight = ctx.v ?: cfqgd->weight;
+ cfqg->dev_weight = v;
+ cfqg->new_weight = v ?: cfqgd->weight;
} else {
- cfqg->dev_leaf_weight = ctx.v;
- cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
+ cfqg->dev_leaf_weight = v;
+ cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
}
ret = 0;
}
-
-err:
+out_finish:
blkg_conf_finish(&ctx);
return ret ?: nbytes;
}
@@ -1776,25 +1805,27 @@ err:
static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- return __cfqg_set_weight_device(of, buf, nbytes, off, false);
+ return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
}
static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- return __cfqg_set_weight_device(of, buf, nbytes, off, true);
+ return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
}
-static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 val, bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+ bool on_dfl, bool reset_dev, bool is_leaf_weight)
{
+ unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+ unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
struct cfq_group_data *cfqgd;
int ret = 0;
- if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
- return -EINVAL;
+ if (val < min || val > max)
+ return -ERANGE;
spin_lock_irq(&blkcg->lock);
cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
continue;
if (!is_leaf_weight) {
+ if (reset_dev)
+ cfqg->dev_weight = 0;
if (!cfqg->dev_weight)
cfqg->new_weight = cfqgd->weight;
} else {
+ if (reset_dev)
+ cfqg->dev_leaf_weight = 0;
if (!cfqg->dev_leaf_weight)
cfqg->new_leaf_weight = cfqgd->leaf_weight;
}
@@ -1831,13 +1866,13 @@ out:
static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
u64 val)
{
- return __cfq_set_weight(css, cft, val, false);
+ return __cfq_set_weight(css, val, false, false, false);
}
static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- return __cfq_set_weight(css, cft, val, true);
+ return __cfq_set_weight(css, val, false, false, true);
}
static int cfqg_print_stat(struct seq_file *sf, void *v)
@@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v)
static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
-
+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_cfq, off);
return __blkg_prfill_u64(sf, pd, sum);
}
static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
-
+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_cfq, off);
return __blkg_prfill_rwstat(sf, pd, &sum);
}
@@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
return 0;
}
+static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
+ return 0;
+}
+
+static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes));
+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
+ false);
+ return 0;
+}
+
#ifdef CONFIG_DEBUG_BLK_CGROUP
static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
@@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
}
#endif /* CONFIG_DEBUG_BLK_CGROUP */
-static struct cftype cfq_blkcg_files[] = {
+static struct cftype cfq_blkcg_legacy_files[] = {
/* on root, weight is mapped to leaf_weight */
{
.name = "weight_device",
@@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = {
},
{
.name = "sectors",
- .private = offsetof(struct cfq_group, stats.sectors),
- .seq_show = cfqg_print_stat,
+ .seq_show = cfqg_print_stat_sectors,
},
{
.name = "io_service_bytes",
- .private = offsetof(struct cfq_group, stats.service_bytes),
- .seq_show = cfqg_print_rwstat,
+ .private = (unsigned long)&blkcg_policy_cfq,
+ .seq_show = blkg_print_stat_bytes,
},
{
.name = "io_serviced",
- .private = offsetof(struct cfq_group, stats.serviced),
- .seq_show = cfqg_print_rwstat,
+ .private = (unsigned long)&blkcg_policy_cfq,
+ .seq_show = blkg_print_stat_ios,
},
{
.name = "io_service_time",
@@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = {
},
{
.name = "sectors_recursive",
- .private = offsetof(struct cfq_group, stats.sectors),
- .seq_show = cfqg_print_stat_recursive,
+ .seq_show = cfqg_print_stat_sectors_recursive,
},
{
.name = "io_service_bytes_recursive",
- .private = offsetof(struct cfq_group, stats.service_bytes),
- .seq_show = cfqg_print_rwstat_recursive,
+ .private = (unsigned long)&blkcg_policy_cfq,
+ .seq_show = blkg_print_stat_bytes_recursive,
},
{
.name = "io_serviced_recursive",
- .private = offsetof(struct cfq_group, stats.serviced),
- .seq_show = cfqg_print_rwstat_recursive,
+ .private = (unsigned long)&blkcg_policy_cfq,
+ .seq_show = blkg_print_stat_ios_recursive,
},
{
.name = "io_service_time_recursive",
@@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = {
#endif /* CONFIG_DEBUG_BLK_CGROUP */
{ } /* terminate */
};
+
+static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+
+ seq_printf(sf, "default %u\n", cgd->weight);
+ blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
+ &blkcg_policy_cfq, 0, false);
+ return 0;
+}
+
+static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ char *endp;
+ int ret;
+ u64 v;
+
+ buf = strim(buf);
+
+ /* "WEIGHT" or "default WEIGHT" sets the default weight */
+ v = simple_strtoull(buf, &endp, 0);
+ if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+ ret = __cfq_set_weight(of_css(of), v, true, false, false);
+ return ret ?: nbytes;
+ }
+
+ /* "MAJ:MIN WEIGHT" */
+ return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
+}
+
+static struct cftype cfq_blkcg_files[] = {
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cfq_print_weight_on_dfl,
+ .write = cfq_set_weight_on_dfl,
+ },
+ { } /* terminate */
+};
+
#else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
- struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+ struct blkcg *blkcg)
{
return cfqd->root_group;
}
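
On the unified hierarchy the single "weight" file accepts either "WEIGHT" / "default WEIGHT" to set the group default or "MAJ:MIN WEIGHT" for a per-device override, and the accepted range switches from the legacy CFQ bounds to CGROUP_WEIGHT_MIN..CGROUP_WEIGHT_MAX. A stand-alone parse of those two forms; the range constants here are illustrative stand-ins:

#include <stdio.h>

#define WEIGHT_MIN_M 1u
#define WEIGHT_MAX_M 10000u

static int parse_weight(const char *buf, unsigned int *maj, unsigned int *minor,
			unsigned long long *weight)
{
	/* "MAJ:MIN WEIGHT": per-device override (range check elided) */
	if (sscanf(buf, "%u:%u %llu", maj, minor, weight) == 3)
		return 1;

	*maj = *minor = 0;
	/* "default WEIGHT" or bare "WEIGHT": sets the group default */
	if (sscanf(buf, "default %llu", weight) == 1 ||
	    sscanf(buf, "%llu", weight) == 1)
		return (*weight >= WEIGHT_MIN_M && *weight <= WEIGHT_MAX_M) ? 0 : -1;

	return -1;
}

int main(void)
{
	unsigned int maj, minor;
	unsigned long long w;

	printf("%d\n", parse_weight("default 500", &maj, &minor, &w));	/* 0 */
	printf("%d\n", parse_weight("8:16 300", &maj, &minor, &w));	/* 1 */
	return 0;
}
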
@@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
cfqq->nr_sectors += blk_rq_sectors(rq);
- cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
}
/*
@@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq)
struct cfq_io_cq *cic = icq_to_cic(icq);
struct cfq_data *cfqd = cic_to_cfqd(cic);
- if (cic->cfqq[BLK_RW_ASYNC]) {
- cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
- cic->cfqq[BLK_RW_ASYNC] = NULL;
+ if (cic_to_cfqq(cic, false)) {
+ cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
+ cic_set_cfqq(cic, NULL, false);
}
- if (cic->cfqq[BLK_RW_SYNC]) {
- cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
- cic->cfqq[BLK_RW_SYNC] = NULL;
+ if (cic_to_cfqq(cic, true)) {
+ cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
+ cic_set_cfqq(cic, NULL, true);
}
}
@@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
return;
- cfqq = cic->cfqq[BLK_RW_ASYNC];
+ cfqq = cic_to_cfqq(cic, false);
if (cfqq) {
- struct cfq_queue *new_cfqq;
- new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
- GFP_ATOMIC);
- if (new_cfqq) {
- cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
- cfq_put_queue(cfqq);
- }
+ cfq_put_queue(cfqq);
+ cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
+ cic_set_cfqq(cic, cfqq, false);
}
- cfqq = cic->cfqq[BLK_RW_SYNC];
+ cfqq = cic_to_cfqq(cic, true);
if (cfqq)
cfq_mark_cfqq_prio_changed(cfqq);
@@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
struct cfq_data *cfqd = cic_to_cfqd(cic);
- struct cfq_queue *sync_cfqq;
+ struct cfq_queue *cfqq;
uint64_t serial_nr;
rcu_read_lock();
@@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
return;
- sync_cfqq = cic_to_cfqq(cic, 1);
- if (sync_cfqq) {
- /*
- * Drop reference to sync queue. A new sync queue will be
- * assigned in new group upon arrival of a fresh request.
- */
- cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
- cic_set_cfqq(cic, NULL, 1);
- cfq_put_queue(sync_cfqq);
+ /*
+ * Drop reference to queues. New queues will be assigned in new
+ * group upon arrival of fresh requests.
+ */
+ cfqq = cic_to_cfqq(cic, false);
+ if (cfqq) {
+ cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+ cic_set_cfqq(cic, NULL, false);
+ cfq_put_queue(cfqq);
+ }
+
+ cfqq = cic_to_cfqq(cic, true);
+ if (cfqq) {
+ cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+ cic_set_cfqq(cic, NULL, true);
+ cfq_put_queue(cfqq);
}
cic->blkcg_serial_nr = serial_nr;
@@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
-static struct cfq_queue *
-cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
- struct bio *bio, gfp_t gfp_mask)
-{
- struct blkcg *blkcg;
- struct cfq_queue *cfqq, *new_cfqq = NULL;
- struct cfq_group *cfqg;
-
-retry:
- rcu_read_lock();
-
- blkcg = bio_blkcg(bio);
- cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
- if (!cfqg) {
- cfqq = &cfqd->oom_cfqq;
- goto out;
- }
-
- cfqq = cic_to_cfqq(cic, is_sync);
-
- /*
- * Always try a new alloc if we fell back to the OOM cfqq
- * originally, since it should just be a temporary situation.
- */
- if (!cfqq || cfqq == &cfqd->oom_cfqq) {
- cfqq = NULL;
- if (new_cfqq) {
- cfqq = new_cfqq;
- new_cfqq = NULL;
- } else if (gfp_mask & __GFP_WAIT) {
- rcu_read_unlock();
- spin_unlock_irq(cfqd->queue->queue_lock);
- new_cfqq = kmem_cache_alloc_node(cfq_pool,
- gfp_mask | __GFP_ZERO,
- cfqd->queue->node);
- spin_lock_irq(cfqd->queue->queue_lock);
- if (new_cfqq)
- goto retry;
- else
- return &cfqd->oom_cfqq;
- } else {
- cfqq = kmem_cache_alloc_node(cfq_pool,
- gfp_mask | __GFP_ZERO,
- cfqd->queue->node);
- }
-
- if (cfqq) {
- cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
- cfq_init_prio_data(cfqq, cic);
- cfq_link_cfqq_cfqg(cfqq, cfqg);
- cfq_log_cfqq(cfqd, cfqq, "alloced");
- } else
- cfqq = &cfqd->oom_cfqq;
- }
-out:
- if (new_cfqq)
- kmem_cache_free(cfq_pool, new_cfqq);
-
- rcu_read_unlock();
- return cfqq;
-}
-
static struct cfq_queue **
-cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
+cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
{
switch (ioprio_class) {
case IOPRIO_CLASS_RT:
- return &cfqd->async_cfqq[0][ioprio];
+ return &cfqg->async_cfqq[0][ioprio];
case IOPRIO_CLASS_NONE:
ioprio = IOPRIO_NORM;
/* fall through */
case IOPRIO_CLASS_BE:
- return &cfqd->async_cfqq[1][ioprio];
+ return &cfqg->async_cfqq[1][ioprio];
case IOPRIO_CLASS_IDLE:
- return &cfqd->async_idle_cfqq;
+ return &cfqg->async_idle_cfqq;
default:
BUG();
}
@@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
static struct cfq_queue *
cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
- struct bio *bio, gfp_t gfp_mask)
+ struct bio *bio)
{
int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
struct cfq_queue **async_cfqq = NULL;
- struct cfq_queue *cfqq = NULL;
+ struct cfq_queue *cfqq;
+ struct cfq_group *cfqg;
+
+ rcu_read_lock();
+ cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+ if (!cfqg) {
+ cfqq = &cfqd->oom_cfqq;
+ goto out;
+ }
if (!is_sync) {
if (!ioprio_valid(cic->ioprio)) {
@@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
ioprio = task_nice_ioprio(tsk);
ioprio_class = task_nice_ioclass(tsk);
}
- async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
+ async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
cfqq = *async_cfqq;
+ if (cfqq)
+ goto out;
}
- if (!cfqq)
- cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
+ cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
+ cfqd->queue->node);
+ if (!cfqq) {
+ cfqq = &cfqd->oom_cfqq;
+ goto out;
+ }
- /*
- * pin the queue now that it's allocated, scheduler exit will prune it
- */
- if (!is_sync && !(*async_cfqq)) {
+ cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
+ cfq_init_prio_data(cfqq, cic);
+ cfq_link_cfqq_cfqg(cfqq, cfqg);
+ cfq_log_cfqq(cfqd, cfqq, "alloced");
+
+ if (async_cfqq) {
+ /* a new async queue is created, pin and remember */
cfqq->ref++;
*async_cfqq = cfqq;
}
-
+out:
cfqq->ref++;
+ rcu_read_unlock();
return cfqq;
}
@@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
const bool is_sync = rq_is_sync(rq);
struct cfq_queue *cfqq;
- might_sleep_if(gfp_mask & __GFP_WAIT);
-
spin_lock_irq(q->queue_lock);
check_ioprio_changed(cic, bio);
@@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
- cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
+ if (cfqq)
+ cfq_put_queue(cfqq);
+ cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
cic_set_cfqq(cic, cfqq, is_sync);
} else {
/*
@@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
cancel_work_sync(&cfqd->unplug_work);
}
-static void cfq_put_async_queues(struct cfq_data *cfqd)
-{
- int i;
-
- for (i = 0; i < IOPRIO_BE_NR; i++) {
- if (cfqd->async_cfqq[0][i])
- cfq_put_queue(cfqd->async_cfqq[0][i]);
- if (cfqd->async_cfqq[1][i])
- cfq_put_queue(cfqd->async_cfqq[1][i]);
- }
-
- if (cfqd->async_idle_cfqq)
- cfq_put_queue(cfqd->async_idle_cfqq);
-}
-
static void cfq_exit_queue(struct elevator_queue *e)
{
struct cfq_data *cfqd = e->elevator_data;
@@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
if (cfqd->active_queue)
__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
- cfq_put_async_queues(cfqd);
-
spin_unlock_irq(q->queue_lock);
cfq_shutdown_timer_wq(cfqd);
@@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
goto out_free;
cfq_init_cfqg_base(cfqd->root_group);
+ cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
+ cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
#endif
- cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
- cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
/*
* Not strictly needed (since RB_ROOT just clears the node and we
@@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
cfqd->prio_trees[i] = RB_ROOT;
/*
- * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
+ * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
* Grab a permanent reference to it, so that the normal code flow
* will not attempt to free it. oom_cfqq is linked to root_group
* but shouldn't hold a reference as it'll never be unlinked. Lose
@@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_cfq = {
- .pd_size = sizeof(struct cfq_group),
- .cpd_size = sizeof(struct cfq_group_data),
- .cftypes = cfq_blkcg_files,
+ .dfl_cftypes = cfq_blkcg_files,
+ .legacy_cftypes = cfq_blkcg_legacy_files,
+ .cpd_alloc_fn = cfq_cpd_alloc,
.cpd_init_fn = cfq_cpd_init,
+ .cpd_free_fn = cfq_cpd_free,
+ .cpd_bind_fn = cfq_cpd_bind,
+
+ .pd_alloc_fn = cfq_pd_alloc,
.pd_init_fn = cfq_pd_init,
.pd_offline_fn = cfq_pd_offline,
+ .pd_free_fn = cfq_pd_free,
.pd_reset_stats_fn = cfq_pd_reset_stats,
};
#endif
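
The cfq_set_weight_on_dfl() helper added above accepts either a bare weight, a "default WEIGHT" pair, or a "MAJ:MIN WEIGHT" pair. The fragment below is a stand-alone, user-space sketch of that input grammar and is not part of the patch: simple_strtoull() and strim() are replaced by their libc counterparts, and the MAJ:MIN branch is only an assumed illustration of what __cfqg_set_weight_device() handles on the kernel side.

/*
 * Hypothetical user-space sketch of the string formats accepted by the
 * new "weight" file; names and behaviour here are illustration only.
 */
#include <stdio.h>
#include <stdlib.h>

static void parse(const char *buf)
{
	unsigned long long v, maj, min;
	char *endp;

	/* "WEIGHT" or "default WEIGHT" sets the default weight */
	v = strtoull(buf, &endp, 0);
	if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
		printf("set default weight to %llu\n", v);
		return;
	}

	/* "MAJ:MIN WEIGHT" sets a per-device weight */
	if (sscanf(buf, "%llu:%llu %llu", &maj, &min, &v) == 3) {
		printf("set weight of %llu:%llu to %llu\n", maj, min, v);
		return;
	}

	printf("invalid input: %s\n", buf);
}

int main(void)
{
	parse("100");
	parse("default 250");
	parse("8:0 300");
	return 0;
}

In the kernel hunk itself the first branch ends in __cfq_set_weight() and the second is delegated to __cfqg_set_weight_device().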
diff --git a/block/genhd.c b/block/genhd.c
index 6d1003f0d..398dab06d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1112,8 +1112,7 @@ static void disk_release(struct device *dev)
disk_release_events(disk);
kfree(disk->random);
disk_replace_part_tbl(disk, NULL);
- free_part_stats(&disk->part0);
- free_part_info(&disk->part0);
+ hd_free_part(&disk->part0);
if (disk->queue)
blk_put_queue(disk->queue);
kfree(disk);
@@ -1287,7 +1286,11 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
* converted to make use of bd_mutex and sequence counters.
*/
seqcount_init(&disk->part0.nr_sects_seq);
- hd_ref_init(&disk->part0);
+ if (hd_ref_init(&disk->part0)) {
+ hd_free_part(&disk->part0);
+ kfree(disk);
+ return NULL;
+ }
disk->minors = minors;
rand_initialize_disk(disk);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 0d9e5f97f..e77111332 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -212,8 +212,7 @@ static void part_release(struct device *dev)
{
struct hd_struct *p = dev_to_part(dev);
blk_free_devt(dev->devt);
- free_part_stats(p);
- free_part_info(p);
+ hd_free_part(p);
kfree(p);
}
@@ -233,8 +232,9 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
put_device(part_to_dev(part));
}
-void __delete_partition(struct hd_struct *part)
+void __delete_partition(struct percpu_ref *ref)
{
+ struct hd_struct *part = container_of(ref, struct hd_struct, ref);
call_rcu(&part->rcu_head, delete_partition_rcu_cb);
}
@@ -255,7 +255,7 @@ void delete_partition(struct gendisk *disk, int partno)
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
- hd_struct_put(part);
+ hd_struct_kill(part);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -356,8 +356,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
- hd_ref_init(p);
- return p;
+ if (!hd_ref_init(p))
+ return p;
out_free_info:
free_part_info(p);
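
__delete_partition() now receives the embedded percpu_ref and recovers the enclosing hd_struct with container_of(). The snippet below is a minimal stand-alone illustration of that pattern only; struct percpu_ref is stubbed out and all names are invented for the example.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_ref { int dummy; };		/* stand-in for struct percpu_ref */

struct part {
	int partno;
	struct fake_ref ref;		/* embedded reference counter */
};

/* release callback: only ever handed the embedded ref, not the part */
static void release(struct fake_ref *ref)
{
	struct part *p = container_of(ref, struct part, ref);

	printf("releasing partition %d\n", p->partno);
}

int main(void)
{
	struct part p = { .partno = 3 };

	release(&p.ref);
	return 0;
}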
diff --git a/block/uuid.c b/block/uuid.c
index 722d53b63..4610d7b8f 100644
--- a/block/uuid.c
+++ b/block/uuid.c
@@ -142,12 +142,12 @@ static int null_uuid(const char *uuid)
}
-static void uuid_end_bio(struct bio *bio, int err)
+static void uuid_end_bio(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
- if(!test_bit(BIO_UPTODATE, &bio->bi_flags))
- SetPageError(page);
+ if (bio->bi_error)
+ SetPageError(page);
unlock_page(page);
bio_put(bio);
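
uuid_end_bio() is converted from the old two-argument completion handler that tested BIO_UPTODATE to the single-argument form that reads bio->bi_error. The sketch below is not kernel code: struct bio is reduced to the one field needed to show the new convention, so it can be compiled and run in user space as an illustration.

#include <stdio.h>

struct bio {
	int bi_error;		/* 0 on success, negative errno on failure */
};

/* new-style completion: error status is carried in the bio itself */
static void demo_end_bio(struct bio *bio)
{
	if (bio->bi_error)
		fprintf(stderr, "I/O failed: %d\n", bio->bi_error);
	else
		printf("I/O completed\n");
}

int main(void)
{
	struct bio ok = { .bi_error = 0 };
	struct bio bad = { .bi_error = -5 };	/* -EIO */

	demo_end_bio(&ok);
	demo_end_bio(&bad);
	return 0;
}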