Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig.iosched | 6
-rw-r--r-- | block/bfq-cgroup.c | 1282
-rw-r--r-- | block/bfq-ioc.c | 6
-rw-r--r-- | block/bfq-iosched.c | 1092
-rw-r--r-- | block/bfq-sched.c | 209
-rw-r--r-- | block/bfq.h | 206
-rw-r--r-- | block/bio-integrity.c | 16
-rw-r--r-- | block/bio.c | 214
-rw-r--r-- | block/blk-cgroup.c | 524
-rw-r--r-- | block/blk-core.c | 44
-rw-r--r-- | block/blk-integrity.c | 3
-rw-r--r-- | block/blk-lib.c | 52
-rw-r--r-- | block/blk-map.c | 30
-rw-r--r-- | block/blk-merge.c | 172
-rw-r--r-- | block/blk-mq-cpumap.c | 9
-rw-r--r-- | block/blk-mq-sysfs.c | 34
-rw-r--r-- | block/blk-mq-tag.c | 28
-rw-r--r-- | block/blk-mq-tag.h | 2
-rw-r--r-- | block/blk-mq.c | 125
-rw-r--r-- | block/blk-mq.h | 3
-rw-r--r-- | block/blk-settings.c | 44
-rw-r--r-- | block/blk-sysfs.c | 44
-rw-r--r-- | block/blk-throttle.c | 505
-rw-r--r-- | block/blk.h | 5
-rw-r--r-- | block/bounce.c | 62
-rw-r--r-- | block/cfq-iosched.c | 651
-rw-r--r-- | block/genhd.c | 9
-rw-r--r-- | block/partition-generic.c | 12
-rw-r--r-- | block/uuid.c | 6
29 files changed, 2590 insertions, 2805 deletions
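Note on the largest hunks: the bfq-cgroup.c changes below replace BFQ's blkcg-policy interface with a standalone bfqio cgroup controller that exposes per-group weight, ioprio and ioprio_class control files. As an illustrative sketch only (not part of the patch), the snippet below shows how userspace might set a group's BFQ weight through that controller; the mount point /sys/fs/cgroup/bfqio and the group name grp0 are assumptions, and the file name assumes the usual subsystem-name prefix applied to legacy cftypes.

/*
 * Illustrative only -- not part of this patch.  Sets the BFQ weight of
 * an (assumed) cgroup "grp0" via the bfqio controller added below; the
 * written value is parsed by cgroup core and handed to
 * bfqio_cgroup_weight_write(), which rejects anything outside
 * [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT].
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/bfqio/grp0/bfqio.weight"; /* assumed mount/group */
	const char buf[] = "500";	/* must lie within the accepted weight range */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open bfqio.weight");
		return 1;
	}
	if (write(fd, buf, sizeof(buf) - 1) != (ssize_t)(sizeof(buf) - 1))
		perror("write bfqio.weight");
	close(fd);
	return 0;
}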
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 1fc1a4dc5..01da733dc 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -51,12 +51,14 @@ config IOSCHED_BFQ applications. If compiled built-in (saying Y here), BFQ can be configured to support hierarchical scheduling. -config BFQ_GROUP_IOSCHED +config CGROUP_BFQIO bool "BFQ hierarchical scheduling support" depends on CGROUPS && IOSCHED_BFQ=y default n ---help--- - Enable hierarchical scheduling in BFQ, using the blkio controller. + Enable hierarchical scheduling in BFQ, using the cgroups + filesystem interface. The name of the subsystem will be + bfqio. choice prompt "Default I/O scheduler" diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index bc34d7a2b..11e2f1d4e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -13,480 +13,254 @@ * file. */ -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO -/* bfqg stats flags */ -enum bfqg_stats_flags { - BFQG_stats_waiting = 0, - BFQG_stats_idling, - BFQG_stats_empty, -}; - -#define BFQG_FLAG_FNS(name) \ -static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -{ \ - stats->flags |= (1 << BFQG_stats_##name); \ -} \ -static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -{ \ - stats->flags &= ~(1 << BFQG_stats_##name); \ -} \ -static int bfqg_stats_##name(struct bfqg_stats *stats) \ -{ \ - return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -} \ - -BFQG_FLAG_FNS(waiting) -BFQG_FLAG_FNS(idling) -BFQG_FLAG_FNS(empty) -#undef BFQG_FLAG_FNS - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -{ - unsigned long long now; - - if (!bfqg_stats_waiting(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) - blkg_stat_add(&stats->group_wait_time, - now - stats->start_group_wait_time); - bfqg_stats_clear_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (bfqg_stats_waiting(stats)) - return; - if (bfqg == curr_bfqg) - return; - stats->start_group_wait_time = sched_clock(); - bfqg_stats_mark_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -{ - unsigned long long now; - - if (!bfqg_stats_empty(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) - blkg_stat_add(&stats->empty_time, - now - stats->start_empty_time); - bfqg_stats_clear_empty(stats); -} - -static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -{ - blkg_stat_add(&bfqg->stats.dequeue, 1); -} +static DEFINE_MUTEX(bfqio_mutex); -static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) +static bool bfqio_is_removed(struct bfqio_cgroup *bgrp) { - struct bfqg_stats *stats = &bfqg->stats; - - if (blkg_rwstat_total(&stats->queued)) - return; - - /* - * group is already marked empty. This can happen if bfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. - */ - if (bfqg_stats_empty(stats)) - return; - - stats->start_empty_time = sched_clock(); - bfqg_stats_mark_empty(stats); + return bgrp ? 
!bgrp->online : false; } -static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (bfqg_stats_idling(stats)) { - unsigned long long now = sched_clock(); - - if (time_after64(now, stats->start_idle_time)) - blkg_stat_add(&stats->idle_time, - now - stats->start_idle_time); - bfqg_stats_clear_idling(stats); - } -} - -static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - stats->start_idle_time = sched_clock(); - bfqg_stats_mark_idling(stats); -} - -static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_total(&stats->queued)); - blkg_stat_add(&stats->avg_queue_size_samples, 1); - bfqg_stats_update_group_wait_time(stats); -} - -static struct blkcg_policy blkcg_policy_bfq; - -/* - * blk-cgroup policy-related handlers - * The following functions help in converting between blk-cgroup - * internal structures and BFQ-specific structures. - */ - -static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct bfq_group, pd) : NULL; -} +static struct bfqio_cgroup bfqio_root_cgroup = { + .weight = BFQ_DEFAULT_GRP_WEIGHT, + .ioprio = BFQ_DEFAULT_GRP_IOPRIO, + .ioprio_class = BFQ_DEFAULT_GRP_CLASS, +}; -static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) { - return pd_to_blkg(&bfqg->pd); + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; } -static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) +static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css) { - return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); + return css ? container_of(css, struct bfqio_cgroup, css) : NULL; } /* - * bfq_group handlers - * The following functions help in navigating the bfq_group hierarchy - * by allowing to find the parent of a bfq_group or the bfq_group - * associated to a bfq_queue. + * Search the bfq_group for bfqd into the hash table (by now only a list) + * of bgrp. Must be called under rcu_read_lock(). */ - -static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, + struct bfq_data *bfqd) { - struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; - - return pblkg ? blkg_to_bfqg(pblkg) : NULL; -} - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - return group_entity ? container_of(group_entity, struct bfq_group, - entity) : - bfqq->bfqd->root_group; -} - -/* - * The following two functions handle get and put of a bfq_group by - * wrapping the related blk-cgroup hooks. 
- */ - -static void bfqg_get(struct bfq_group *bfqg) -{ - return blkg_get(bfqg_to_blkg(bfqg)); -} - -static void bfqg_put(struct bfq_group *bfqg) -{ - return blkg_put(bfqg_to_blkg(bfqg)); -} + struct bfq_group *bfqg; + void *key; -static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, - int rw) -{ - blkg_rwstat_add(&bfqg->stats.queued, rw, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -} + hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) { + key = rcu_dereference(bfqg->bfqd); + if (key == bfqd) + return bfqg; + } -static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) -{ - blkg_rwstat_add(&bfqg->stats.queued, rw, -1); + return NULL; } -static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, + struct bfq_group *bfqg) { - blkg_rwstat_add(&bfqg->stats.merged, rw, 1); -} + struct bfq_entity *entity = &bfqg->entity; -static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); + /* + * If the weight of the entity has never been set via the sysfs + * interface, then bgrp->weight == 0. In this case we initialize + * the weight from the current ioprio value. Otherwise, the group + * weight, if set, has priority over the ioprio value. + */ + if (bgrp->weight == 0) { + entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); + entity->new_ioprio = bgrp->ioprio; + } else { + if (bgrp->weight < BFQ_MIN_WEIGHT || + bgrp->weight > BFQ_MAX_WEIGHT) { + printk(KERN_CRIT "bfq_group_init_entity: " + "bgrp->weight %d\n", bgrp->weight); + BUG(); + } + entity->new_weight = bgrp->weight; + entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); + } + entity->orig_weight = entity->weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; + entity->my_sched_data = &bfqg->sched_data; + bfqg->active_entities = 0; } -static void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int rw) +static inline void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) { - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); - if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, rw, - io_start_time - start_time); -} + struct bfq_entity *entity; -/* @stats = 0 */ -static void bfqg_stats_reset(struct bfqg_stats *stats) -{ - if (!stats) - return; - - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); - blkg_stat_reset(&stats->unaccounted_time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -} + BUG_ON(parent == NULL); + BUG_ON(bfqg 
== NULL); -/* @to += @from */ -static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) -{ - if (!to || !from) - return; - - /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; } -/* - * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' - * recursive stats can still account for the amount used by this bfqg after - * it's gone. +/** + * bfq_group_chain_alloc - allocate a chain of groups. + * @bfqd: queue descriptor. + * @css: the leaf cgroup_subsys_state this chain starts from. + * + * Allocate a chain of groups starting from the one belonging to + * @cgroup up to the root cgroup. Stop if a cgroup on the chain + * to the root has already an allocated group on @bfqd. */ -static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, + struct cgroup_subsys_state *css) { - struct bfq_group *parent; - - if (!bfqg) /* root_group */ - return; + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; - parent = bfqg_parent(bfqg); + for (; css != NULL; css = css->parent) { + bgrp = css_to_bfqio(css); - lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); - - if (unlikely(!parent)) - return; + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) { + /* + * All the cgroups in the path from there to the + * root must have a bfq_group for bfqd, so we don't + * need any more allocations. + */ + break; + } - bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); - bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); -} + bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); + if (bfqg == NULL) + goto cleanup; -static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + bfq_group_init_entity(bgrp, bfqg); + bfqg->my_entity = &bfqg->entity; - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; - bfqg_get(bfqg); + if (leaf == NULL) { + leaf = bfqg; + prev = leaf; + } else { + bfq_group_set_parent(prev, bfqg); + /* + * Build a list of allocated nodes using the bfqd + * filed, that is still unused and will be + * initialized only after the node will be + * connected. 
+ */ + prev->bfqd = bfqg; + prev = bfqg; + } } - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; -} - -static void bfqg_stats_init(struct bfqg_stats *stats) -{ - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); - - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); - - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); -} - -static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) - { - return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; - } - -static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -{ - return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -} - -static void bfq_cpd_init(const struct blkcg *blkcg) -{ - struct bfq_group_data *d = - cpd_to_bfqgd(blkcg->pd[blkcg_policy_bfq.plid]); - - d->weight = BFQ_DEFAULT_GRP_WEIGHT; -} - -static void bfq_pd_init(struct blkcg_gq *blkg) -{ - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = blkg->q->elevator->elevator_data; - struct bfq_entity *entity = &bfqg->entity; - struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); - entity->orig_weight = entity->weight = entity->new_weight = d->weight; - entity->my_sched_data = &bfqg->sched_data; - bfqg->my_entity = entity; /* - * the root_group's will be set to NULL - * in bfq_init_queue() - */ - bfqg->bfqd = bfqd; - bfqg->active_entities = 0; - bfqg->rq_pos_tree = RB_ROOT; + return leaf; - /* if the root_group does not exist, we are handling it right now */ - if (bfqd->root_group && bfqg != bfqd->root_group) - hlist_add_head(&bfqg->bfqd_node, &bfqd->group_list); +cleanup: + while (leaf != NULL) { + prev = leaf; + leaf = leaf->bfqd; + kfree(prev); + } - bfqg_stats_init(&bfqg->stats); - bfqg_stats_init(&bfqg->dead_stats); + return NULL; } -/* offset delta from bfqg->stats to bfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - - offsetof(struct bfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +/** + * bfq_group_chain_link - link an allocated group chain to a cgroup + * hierarchy. + * @bfqd: the queue descriptor. + * @css: the leaf cgroup_subsys_state to start from. + * @leaf: the leaf group (to be associated to @cgroup). + * + * Try to link a chain of groups to a cgroup hierarchy, connecting the + * nodes bottom-up, so we can be sure that when we find a cgroup in the + * hierarchy that already as a group associated to @bfqd all the nodes + * in the path to the root cgroup have one too. + * + * On locking: the queue lock protects the hierarchy (there is a hierarchy + * per device) while the bfqio_cgroup lock protects the list of groups + * belonging to the same cgroup. 
+ */ +static void bfq_group_chain_link(struct bfq_data *bfqd, + struct cgroup_subsys_state *css, + struct bfq_group *leaf) { - u64 sum = 0; + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *next, *prev = NULL; + unsigned long flags; - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; -} + assert_spin_locked(bfqd->queue->queue_lock); -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat a, b; + for (; css != NULL && leaf != NULL; css = css->parent) { + bgrp = css_to_bfqio(css); + next = leaf->bfqd; - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; -} + bfqg = bfqio_lookup_group(bgrp, bfqd); + BUG_ON(bfqg != NULL); -static void bfq_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct bfq_group *bfqg = blkg_to_bfqg(blkg); + spin_lock_irqsave(&bgrp->lock, flags); - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); -} + rcu_assign_pointer(leaf->bfqd, bfqd); + hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); + hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); -static void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) -{ - struct bfq_entity *entity; + spin_unlock_irqrestore(&bgrp->lock, flags); - BUG_ON(!parent); - BUG_ON(!bfqg); - BUG_ON(bfqg == parent); + prev = leaf; + leaf = next; + } - entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; + BUG_ON(css == NULL && leaf != NULL); + if (css != NULL && prev != NULL) { + bgrp = css_to_bfqio(css); + bfqg = bfqio_lookup_group(bgrp, bfqd); + bfq_group_set_parent(prev, bfqg); + } } +/** + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. + * @bfqd: queue descriptor. + * @cgroup: cgroup being searched for. + * + * Return a group associated to @bfqd in @cgroup, allocating one if + * necessary. When a group is returned all the cgroups in the path + * to the root have a group associated to @bfqd. + * + * If the allocation fails, return the root group: this breaks guarantees + * but is a safe fallback. If this loss becomes a problem it can be + * mitigated using the equivalent weight (given by the product of the + * weights of the groups in the path from @group to the root) in the + * root scheduler. + * + * We allocate all the missing nodes in the path from the leaf cgroup + * to the root and we connect the nodes only after all the allocations + * have been successful. 
+ */ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) + struct cgroup_subsys_state *css) { - struct request_queue *q = bfqd->queue; - struct bfq_group *bfqg = NULL, *parent; - struct bfq_entity *entity = NULL; + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + struct bfq_group *bfqg; - assert_spin_locked(bfqd->queue->queue_lock); + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) + return bfqg; - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { + bfqg = bfq_group_chain_alloc(bfqd, css); + if (bfqg != NULL) + bfq_group_chain_link(bfqd, css, bfqg); + else bfqg = bfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - bfqg = blkg_to_bfqg(blkg); - else /* fallback to root_group */ - bfqg = bfqd->root_group; - } - - BUG_ON(!bfqg); - - /* - * Update chain of bfq_groups as we might be handling a leaf group - * which, along with some of its relatives, has not been hooked yet - * to the private hierarchy of BFQ. - */ - entity = &bfqg->entity; - for_each_entity(entity) { - bfqg = container_of(entity, struct bfq_group, entity); - BUG_ON(!bfqg); - if (bfqg != bfqd->root_group) { - parent = bfqg_parent(bfqg); - if (!parent) - parent = bfqd->root_group; - BUG_ON(!parent); - bfq_group_set_parent(bfqg, parent); - } - } return bfqg; } -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); - /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. @@ -522,7 +296,6 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, 0); } else if (entity->on_st) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - bfqg_put(bfqq_group(bfqq)); /* * Here we use a reference to bfqg. We don't need a refcounter @@ -531,15 +304,11 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; - bfqg_get(bfqg); - if (busy) { - bfq_pos_tree_add_move(bfqd, bfqq); - if (resume) - bfq_activate_bfqq(bfqd, bfqq); - } + if (busy && resume) + bfq_activate_bfqq(bfqd, bfqq); - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); } @@ -547,9 +316,9 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * __bfq_bic_change_cgroup - move @bic to @cgroup. * @bfqd: the queue descriptor. * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. + * @cgroup: the cgroup to move to. * - * Move bic to blkcg, assuming that bfqd->queue is locked; the caller + * Move bic to cgroup, assuming that bfqd->queue is locked; the caller * has to make sure that the reference to cgroup is valid across the call. 
* * NOTE: an alternative approach might have been to store the current @@ -558,17 +327,18 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct blkcg *blkcg) + struct cgroup_subsys_state *css) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; struct bfq_entity *entity; + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; - lockdep_assert_held(bfqd->queue->queue_lock); + bgrp = css_to_bfqio(css); - bfqg = bfq_find_alloc_group(bfqd, blkcg); - if (async_bfqq) { + bfqg = bfq_find_alloc_group(bfqd, css); + if (async_bfqq != NULL) { entity = &async_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) { @@ -580,7 +350,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, } } - if (sync_bfqq) { + if (sync_bfqq != NULL) { entity = &sync_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); @@ -589,39 +359,74 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, return bfqg; } -static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +/** + * bfq_bic_change_cgroup - move @bic to @cgroup. + * @bic: the bic being migrated. + * @cgroup: the destination cgroup. + * + * When the task owning @bic is moved to @cgroup, @bic is immediately + * moved into its new parent group. + */ +static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, + struct cgroup_subsys_state *css) +{ + struct bfq_data *bfqd; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), + &flags); + if (bfqd != NULL) { + __bfq_bic_change_cgroup(bfqd, bic, css); + bfq_put_bfqd_unlock(bfqd, &flags); + } +} + +/** + * bfq_bic_update_cgroup - update the cgroup of @bic. + * @bic: the @bic to update. + * + * Make sure that @bic is enqueued in the cgroup of the current task. + * We need this in addition to moving bics during the cgroup attach + * phase because the task owning @bic could be at its first disk + * access or we may end up in the root cgroup as the result of a + * memory allocation failure and here we try to move to the right + * group. + * + * Must be called under the queue lock. It is safe to use the returned + * value even after the rcu_read_unlock() as the migration/destruction + * paths act under the queue lock too. IOW it is impossible to race with + * group migration/destruction and end up with an invalid group as: + * a) here cgroup has not yet been destroyed, nor its destroy callback + * has started execution, as current holds a reference to it, + * b) if it is destroyed after rcu_read_unlock() [after current is + * migrated to a different cgroup] its attach() callback will have + * taken care of remove all the references to the old cgroup data. + */ +static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct blkcg *blkcg; - struct bfq_group *bfqg = NULL; - uint64_t id; + struct bfq_group *bfqg; + struct cgroup_subsys_state *css; + + BUG_ON(bfqd == NULL); rcu_read_lock(); - blkcg = bio_blkcg(bio); - id = blkcg->css.serial_nr; + css = task_css(current, bfqio_cgrp_id); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, css); rcu_read_unlock(); - /* - * Check whether blkcg has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. 
- */ - if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) - return; - - bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); - BUG_ON(!bfqg); - bic->blkcg_id = id; + return bfqg; } /** * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. * @st: the service tree being flushed. */ -static void bfq_flush_idle_tree(struct bfq_service_tree *st) +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) { struct bfq_entity *entity = st->first_idle; - for (; entity ; entity = st->first_idle) + for (; entity != NULL; entity = st->first_idle) __bfq_deactivate_entity(entity, 0); } @@ -630,12 +435,12 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) * @bfqd: the device data structure with the root group. * @entity: the entity to move. */ -static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) +static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(!bfqq); + BUG_ON(bfqq == NULL); bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); return; } @@ -649,9 +454,9 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, * * Needs queue_lock to be taken and reference to be valid over the call. */ -static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) +static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) { struct rb_root *active = &st->active; struct bfq_entity *entity = NULL; @@ -659,10 +464,10 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, if (!RB_EMPTY_ROOT(&st->active)) entity = bfq_entity_of(rb_first(active)); - for (; entity ; entity = bfq_entity_of(rb_first(active))) + for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) bfq_reparent_leaf_entity(bfqd, entity); - if (bfqg->sched_data.in_service_entity) + if (bfqg->sched_data.in_service_entity != NULL) bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.in_service_entity); @@ -671,21 +476,20 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, /** * bfq_destroy_group - destroy @bfqg. + * @bgrp: the bfqio_cgroup containing @bfqg. * @bfqg: the group being destroyed. * * Destroy @bfqg, making sure that it is not referenced from its parent. - * blkio already grabs the queue_lock for us, so no need to use RCU-based magic */ -static void bfq_pd_offline(struct blkcg_gq *blkg) +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) { + struct bfq_data *bfqd; struct bfq_service_tree *st; - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = bfqg->bfqd; struct bfq_entity *entity = bfqg->my_entity; + unsigned long uninitialized_var(flags); int i; - if (!entity) /* root group */ - return; + hlist_del(&bfqg->group_node); /* * Empty all service_trees belonging to this group before @@ -714,19 +518,37 @@ static void bfq_pd_offline(struct blkcg_gq *blkg) * There is no need to put the sync queues, as the * scheduler has taken no reference. 
*/ - bfq_reparent_active_entities(bfqd, bfqg, st); + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_put_bfqd_unlock(bfqd, &flags); + } BUG_ON(!RB_EMPTY_ROOT(&st->active)); BUG_ON(!RB_EMPTY_ROOT(&st->idle)); } - BUG_ON(bfqg->sched_data.next_in_service); - BUG_ON(bfqg->sched_data.in_service_entity); + BUG_ON(bfqg->sched_data.next_in_service != NULL); + BUG_ON(bfqg->sched_data.in_service_entity != NULL); - hlist_del(&bfqg->bfqd_node); - __bfq_deactivate_entity(entity, 0); - bfq_put_async_queues(bfqd, bfqg); - BUG_ON(entity->tree); + /* + * We may race with device destruction, take extra care when + * dereferencing bfqg->bfqd. + */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + hlist_del(&bfqg->bfqd_node); + __bfq_deactivate_entity(entity, 0); + bfq_put_async_queues(bfqd, bfqg); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(entity->tree != NULL); - bfqg_stats_xfer_dead(bfqg); + /* + * No need to defer the kfree() to the end of the RCU grace + * period: we are called from the destroy() callback of our + * cgroup, so we can be sure that no one is a) still using + * this cgroup or b) doing lookups in it. + */ + kfree(bfqg); } static void bfq_end_wr_async(struct bfq_data *bfqd) @@ -773,309 +595,312 @@ static void bfq_disconnect_groups(struct bfq_data *bfqd) } } -static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, - struct cftype *cftype) +static inline void bfq_free_root_group(struct bfq_data *bfqd) { - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - int ret = -EINVAL; + struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; + struct bfq_group *bfqg = bfqd->root_group; + + bfq_put_async_queues(bfqd, bfqg); - spin_lock_irq(&blkcg->lock); - ret = bfqgd->weight; - spin_unlock_irq(&blkcg->lock); + spin_lock_irq(&bgrp->lock); + hlist_del_rcu(&bfqg->group_node); + spin_unlock_irq(&bgrp->lock); - return ret; + /* + * No need to synchronize_rcu() here: since the device is gone + * there cannot be any read-side access to its root_group. + */ + kfree(bfqg); } -static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 val) +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) { - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - struct blkcg_gq *blkg; - int ret = -EINVAL; - - if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) - return ret; - - ret = 0; - spin_lock_irq(&blkcg->lock); - bfqgd->weight = (unsigned short)val; - hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - if (!bfqg) - continue; - /* - * Setting the prio_changed flag of the entity - * to 1 with new_weight == weight would re-set - * the value of the weight to its ioprio mapping. - * Set the flag only if necessary. - */ - if ((unsigned short)val != bfqg->entity.new_weight) { - bfqg->entity.new_weight = (unsigned short)val; - /* - * Make sure that the above new value has been - * stored in bfqg->entity.new_weight before - * setting the prio_changed flag. In fact, - * this flag may be read asynchronously (in - * critical sections protected by a different - * lock than that held here), and finding this - * flag set may cause the execution of the code - * for updating parameters whose value may - * depend also on bfqg->entity.new_weight (in - * __bfq_entity_update_weight_prio). 
- * This barrier makes sure that the new value - * of bfqg->entity.new_weight is correctly - * seen in that code. - */ - smp_wmb(); - bfqg->entity.prio_changed = 1; - } - } - spin_unlock_irq(&blkcg->lock); + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; + int i; - return ret; -} + bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); + if (bfqg == NULL) + return NULL; -static int bfqg_print_stat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_bfq, seq_cft(sf)->private, false); - return 0; -} + bfqg->entity.parent = NULL; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -static int bfqg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_bfq, seq_cft(sf)->private, true); - return 0; + bgrp = &bfqio_root_cgroup; + spin_lock_irq(&bgrp->lock); + rcu_assign_pointer(bfqg->bfqd, bfqd); + hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); + spin_unlock_irq(&bgrp->lock); + + return bfqg; } -static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - u64 sum = bfqg_stat_pd_recursive_sum(pd, off); +#define SHOW_FUNCTION(__VAR) \ +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \ + struct cftype *cftype) \ +{ \ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ + u64 ret = -ENODEV; \ + \ + mutex_lock(&bfqio_mutex); \ + if (bfqio_is_removed(bgrp)) \ + goto out_unlock; \ + \ + spin_lock_irq(&bgrp->lock); \ + ret = bgrp->__VAR; \ + spin_unlock_irq(&bgrp->lock); \ + \ +out_unlock: \ + mutex_unlock(&bfqio_mutex); \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ + struct bfq_group *bfqg; \ + int ret = -EINVAL; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return ret; \ + \ + ret = -ENODEV; \ + mutex_lock(&bfqio_mutex); \ + if (bfqio_is_removed(bgrp)) \ + goto out_unlock; \ + ret = 0; \ + \ + spin_lock_irq(&bgrp->lock); \ + bgrp->__VAR = (unsigned short)val; \ + hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ + /* \ + * Setting the ioprio_changed flag of the entity \ + * to 1 with new_##__VAR == ##__VAR would re-set \ + * the value of the weight to its ioprio mapping. \ + * Set the flag only if necessary. \ + */ \ + if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ + bfqg->entity.new_##__VAR = (unsigned short)val; \ + /* \ + * Make sure that the above new value has been \ + * stored in bfqg->entity.new_##__VAR before \ + * setting the ioprio_changed flag. In fact, \ + * this flag may be read asynchronously (in \ + * critical sections protected by a different \ + * lock than that held here), and finding this \ + * flag set may cause the execution of the code \ + * for updating parameters whose value may \ + * depend also on bfqg->entity.new_##__VAR (in \ + * __bfq_entity_update_weight_prio). \ + * This barrier makes sure that the new value \ + * of bfqg->entity.new_##__VAR is correctly \ + * seen in that code. 
\ + */ \ + smp_wmb(); \ + bfqg->entity.ioprio_changed = 1; \ + } \ + } \ + spin_unlock_irq(&bgrp->lock); \ + \ +out_unlock: \ + mutex_unlock(&bfqio_mutex); \ + return ret; \ +} + +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION - return __blkg_prfill_u64(sf, pd, sum); -} +static struct cftype bfqio_files[] = { + { + .name = "weight", + .read_u64 = bfqio_cgroup_weight_read, + .write_u64 = bfqio_cgroup_weight_write, + }, + { + .name = "ioprio", + .read_u64 = bfqio_cgroup_ioprio_read, + .write_u64 = bfqio_cgroup_ioprio_write, + }, + { + .name = "ioprio_class", + .read_u64 = bfqio_cgroup_ioprio_class_read, + .write_u64 = bfqio_cgroup_ioprio_class_write, + }, + { }, /* terminate */ +}; -static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state + *parent_css) { - struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); + struct bfqio_cgroup *bgrp; - return __blkg_prfill_rwstat(sf, pd, &sum); -} + if (parent_css != NULL) { + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); + if (bgrp == NULL) + return ERR_PTR(-ENOMEM); + } else + bgrp = &bfqio_root_cgroup; -static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_stat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, false); - return 0; + spin_lock_init(&bgrp->lock); + INIT_HLIST_HEAD(&bgrp->group_data); + bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; + bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; + + return &bgrp->css; } -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +/* + * We cannot support shared io contexts, as we have no means to support + * two tasks with the same ioc in two different groups without major rework + * of the main bic/bfqq data structures. By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int bfqio_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, true); - return 0; + struct task_struct *task; + struct io_context *ioc; + int ret = 0; + + cgroup_taskset_for_each(task, tset) { + /* + * task_lock() is needed to avoid races with + * exit_io_context() + */ + task_lock(task); + ioc = task->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too + * young or exiting: if it has still no ioc the + * ioc can't be shared, if the task is exiting the + * attach will fail anyway, no matter what we + * return here. 
+ */ + ret = -EINVAL; + task_unlock(task); + if (ret) + break; + } + + return ret; } -static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) +static void bfqio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - struct bfq_group *bfqg = pd_to_bfqg(pd); - u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); - u64 v = 0; + struct task_struct *task; + struct io_context *ioc; + struct io_cq *icq; - if (samples) { - v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); - v = div64_u64(v, samples); + /* + * IMPORTANT NOTE: The move of more than one process at a time to a + * new group has not yet been tested. + */ + cgroup_taskset_for_each(task, tset) { + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + /* + * Handle cgroup change here. + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) + if (!strncmp( + icq->q->elevator->type->elevator_name, + "bfq", ELV_NAME_MAX)) + bfq_bic_change_cgroup(icq_to_bic(icq), + css); + rcu_read_unlock(); + put_io_context(ioc); + } } - __blkg_prfill_u64(sf, pd, v); - return 0; } -/* print avg_queue_size */ -static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) +static void bfqio_destroy(struct cgroup_subsys_state *css) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, - 0, false); - return 0; + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + struct hlist_node *tmp; + struct bfq_group *bfqg; + + /* + * Since we are destroying the cgroup, there are no more tasks + * referencing it, and all the RCU grace periods that may have + * referenced it are ended (as the destruction of the parent + * cgroup is RCU-safe); bgrp->group_data will not be accessed by + * anything else and we don't need any synchronization. 
+ */ + hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node) + bfq_destroy_group(bgrp, bfqg); + + BUG_ON(!hlist_empty(&bgrp->group_data)); + + kfree(bgrp); } -static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +static int bfqio_css_online(struct cgroup_subsys_state *css) { - int ret; + struct bfqio_cgroup *bgrp = css_to_bfqio(css); - ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); - if (ret) - return NULL; + mutex_lock(&bfqio_mutex); + bgrp->online = true; + mutex_unlock(&bfqio_mutex); - return blkg_to_bfqg(bfqd->queue->root_blkg); + return 0; } -static struct cftype bfqio_files[] = { - { - .name = "bfq.weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - /* statistics, cover only the tasks in the bfqg */ - { - .name = "bfq.time", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.sectors", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.io_service_bytes", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_serviced", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_service_time", - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_wait_time", - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_merged", - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_queued", - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, +static void bfqio_css_offline(struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); - /* the same statictics which cover the bfqg and its descendants */ - { - .name = "bfq.time_recursive", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.sectors_recursive", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.io_service_bytes_recursive", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_serviced_recursive", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_service_time_recursive", - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_wait_time_recursive", - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_merged_recursive", - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_queued_recursive", - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.avg_queue_size", - .seq_show = bfqg_print_avg_queue_size, - }, - { - .name = "bfq.group_wait_time", - .private = offsetof(struct bfq_group, stats.group_wait_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.idle_time", - .private = offsetof(struct bfq_group, stats.idle_time), - .seq_show = bfqg_print_stat, - }, - { - .name 
= "bfq.empty_time", - .private = offsetof(struct bfq_group, stats.empty_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.dequeue", - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.unaccounted_time", - .private = offsetof(struct bfq_group, stats.unaccounted_time), - .seq_show = bfqg_print_stat, - }, - { } /* terminate */ -}; + mutex_lock(&bfqio_mutex); + bgrp->online = false; + mutex_unlock(&bfqio_mutex); +} -static struct blkcg_policy blkcg_policy_bfq = { - .pd_size = sizeof(struct bfq_group), - .cpd_size = sizeof(struct bfq_group_data), - .cftypes = bfqio_files, - .pd_init_fn = bfq_pd_init, - .cpd_init_fn = bfq_cpd_init, - .pd_offline_fn = bfq_pd_offline, - .pd_reset_stats_fn = bfq_pd_reset_stats, +struct cgroup_subsys bfqio_cgrp_subsys = { + .css_alloc = bfqio_create, + .css_online = bfqio_css_online, + .css_offline = bfqio_css_offline, + .can_attach = bfqio_can_attach, + .attach = bfqio_attach, + .css_free = bfqio_destroy, + .legacy_cftypes = bfqio_files, }; - #else - -static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; - } + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; entity->sched_data = &bfqg->sched_data; } -static struct bfq_group * -bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +static inline struct bfq_group * +bfq_bic_update_cgroup(struct bfq_io_cq *bic) { struct bfq_data *bfqd = bic_to_bfqd(bic); return bfqd->root_group; } -static void bfq_bfqq_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) +static inline void bfq_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_entity *entity, + struct bfq_group *bfqg) { } @@ -1084,24 +909,23 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static void bfq_disconnect_groups(struct bfq_data *bfqd) +static inline void bfq_disconnect_groups(struct bfq_data *bfqd) { bfq_put_async_queues(bfqd, bfqd->root_group); } -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static inline void bfq_free_root_group(struct bfq_data *bfqd) { - return bfqd->root_group; + kfree(bfqd->root_group); } -static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) { struct bfq_group *bfqg; int i; bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (!bfqg) + if (bfqg == NULL) return NULL; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c index fb7bb8f08..7f6b0004c 100644 --- a/block/bfq-ioc.c +++ b/block/bfq-ioc.c @@ -14,7 +14,7 @@ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. * @icq: the iocontext queue. */ -static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) { /* bic->icq is the first member, %NULL will convert to %NULL */ return container_of(icq, struct bfq_io_cq, icq); @@ -27,8 +27,8 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) * * Queue lock must be held. 
*/ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc) +static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) { if (ioc) return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cf487f926..71b51c1b4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -82,9 +82,6 @@ static const int bfq_back_penalty = 2; /* Idling period duration, in jiffies. */ static int bfq_slice_idle = HZ / 125; -/* Minimum number of assigned budgets for which stats are safe to compute. */ -static const int bfq_stats_min_budgets = 194; - /* Default maximum budget values, in sectors and number of requests. */ static const int bfq_default_max_budget = 16 * 1024; static const int bfq_max_budget_async_rq = 4; @@ -166,22 +163,38 @@ static int device_speed_thresh[2]; #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -static void bfq_schedule_dispatch(struct bfq_data *bfqd); +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); #include "bfq-ioc.c" #include "bfq-sched.c" #include "bfq-cgroup.c" -#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_RT) #define bfq_sample_valid(samples) ((samples) > 80) /* + * The following macro groups conditions that need to be evaluated when + * checking if existing queues and groups form a symmetric scenario + * and therefore idling can be reduced or disabled for some of the + * queues. See the comment to the function bfq_bfqq_must_not_expire() + * for further details. + */ +#ifdef CONFIG_CGROUP_BFQIO +#define symmetric_scenario (!bfqd->active_numerous_groups && \ + !bfq_differentiated_weights(bfqd)) +#else +#define symmetric_scenario (!bfq_differentiated_weights(bfqd)) +#endif + +/* * We regard a request as SYNC, if either it's a read or has the SYNC bit * set (in which case it could also be a direct WRITE). */ -static int bfq_bio_sync(struct bio *bio) +static inline int bfq_bio_sync(struct bio *bio) { if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) return 1; @@ -193,7 +206,7 @@ static int bfq_bio_sync(struct bio *bio) * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. */ -static void bfq_schedule_dispatch(struct bfq_data *bfqd) +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) { if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); @@ -217,9 +230,9 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ - if (!rq1 || rq1 == rq2) + if (rq1 == NULL || rq1 == rq2) return rq2; - if (!rq2) + if (rq2 == NULL) return rq1; if (rq_is_sync(rq1) && !rq_is_sync(rq2)) @@ -332,17 +345,17 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", (long long unsigned)sector, - bfqq ? bfqq->pid : 0); + bfqq != NULL ? 
bfqq->pid : 0); return bfqq; } -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; struct bfq_queue *__bfqq; - if (bfqq->pos_root) { + if (bfqq->pos_root != NULL) { rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } @@ -352,10 +365,10 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (!bfqq->next_rq) return; - bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + bfqq->pos_root = &bfqd->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); - if (!__bfqq) { + if (__bfqq == NULL) { rb_link_node(&bfqq->pos_node, parent, p); rb_insert_color(&bfqq->pos_node, bfqq->pos_root); } else @@ -365,7 +378,7 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * Tell whether there are active queues or groups with differentiated weights. */ -static bool bfq_differentiated_weights(struct bfq_data *bfqd) +static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) { /* * For weights to differ, at least one of the trees must contain @@ -374,7 +387,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && (bfqd->queue_weights_tree.rb_node->rb_left || bfqd->queue_weights_tree.rb_node->rb_right) -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO ) || (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && (bfqd->group_weights_tree.rb_node->rb_left || @@ -384,40 +397,6 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) } /* - * The following function returns true if every queue must receive the - * same share of the throughput (this condition is used when deciding - * whether idling may be disabled, see the comments in the function - * bfq_bfqq_may_idle()). - * - * Such a scenario occurs when: - * 1) all active queues have the same weight, - * 2) all active groups at the same level in the groups tree have the same - * weight, - * 3) all active groups at the same level in the groups tree have the same - * number of children. - * - * Unfortunately, keeping the necessary state for evaluating exactly the - * above symmetry conditions would be quite complex and time-consuming. - * Therefore this function evaluates, instead, the following stronger - * sub-conditions, for which it is much easier to maintain the needed - * state: - * 1) all active queues have the same weight, - * 2) all active groups have the same weight, - * 3) all active groups have at most one active child each. - * In particular, the last two conditions are always true if hierarchical - * support and the cgroups interface are not enabled, thus no state needs - * to be maintained in this case. - */ -static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -{ - return -#ifdef CONFIG_BFQ_GROUP_IOSCHED - !bfqd->active_numerous_groups && -#endif - !bfq_differentiated_weights(bfqd); -} - -/* * If the weight-counter tree passed as input contains no counter for * the weight of the input entity, then add that counter; otherwise just * increment the existing counter. 
@@ -516,10 +495,10 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - if (rbprev) + if (rbprev != NULL) prev = rb_entry_rq(rbprev); - if (rbnext) + if (rbnext != NULL) next = rb_entry_rq(rbnext); else { rbnext = rb_first(&bfqq->sort_list); @@ -531,8 +510,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, } /* see the definition of bfq_async_charge_factor for details */ -static unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) +static inline unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) { return blk_rq_sectors(rq) * (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * @@ -558,7 +537,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, struct request *next_rq = bfqq->next_rq; unsigned long new_budget; - if (!next_rq) + if (next_rq == NULL) return; if (bfqq == bfqd->in_service_queue) @@ -581,7 +560,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } } -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) { u64 dur; @@ -594,12 +573,13 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } -static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) +static inline unsigned +bfq_bfqq_cooperations(struct bfq_queue *bfqq) { return bfqq->bic ? bfqq->bic->cooperations : 0; } -static void +static inline void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { if (bic->saved_idle_window) @@ -623,7 +603,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bic->wr_time_left; bfqq->last_wr_start_finish = jiffies; - bfqq->entity.prio_changed = 1; + bfqq->entity.ioprio_changed = 1; } /* * Clear wr_time_left to prevent bfq_bfqq_save_state() from @@ -633,12 +613,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bic->wr_time_left = 0; } +/* Must be called with the queue_lock held. */ static int bfqq_process_refs(struct bfq_queue *bfqq) { int process_refs, io_refs; - lockdep_assert_held(bfqq->bfqd->queue->queue_lock); - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); @@ -646,7 +625,8 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) } /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static inline void bfq_reset_burst_list(struct bfq_data *bfqd, + struct bfq_queue *bfqq) { struct bfq_queue *item; struct hlist_node *n; @@ -878,14 +858,14 @@ static void bfq_add_request(struct request *rq) */ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(!next_rq); + BUG_ON(next_rq == NULL); bfqq->next_rq = next_rq; /* * Adjust priority tree position, if next_rq changes. 
*/ if (prev != bfqq->next_rq) - bfq_pos_tree_add_move(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); if (!bfq_bfqq_busy(bfqq)) { bool soft_rt, coop_or_in_burst, @@ -893,10 +873,6 @@ static void bfq_add_request(struct request *rq) bfqq->budget_timeout + bfqd->bfq_wr_min_idle_time); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, - rq->cmd_flags); -#endif if (bfq_bfqq_sync(bfqq)) { bool already_in_burst = !hlist_unhashed(&bfqq->burst_list_node) || @@ -941,7 +917,7 @@ static void bfq_add_request(struct request *rq) goto add_bfqq_busy; if (bfq_bfqq_just_split(bfqq)) - goto set_prio_changed; + goto set_ioprio_changed; /* * If the queue: @@ -953,7 +929,7 @@ static void bfq_add_request(struct request *rq) * start a weight-raising period. */ if (old_wr_coeff == 1 && (interactive || soft_rt) && - (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { + (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { bfqq->wr_coeff = bfqd->bfq_wr_coeff; if (interactive) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); @@ -1032,9 +1008,9 @@ static void bfq_add_request(struct request *rq) bfqd->bfq_wr_rt_max_time; } } -set_prio_changed: +set_ioprio_changed: if (old_wr_coeff != bfqq->wr_coeff) - entity->prio_changed = 1; + entity->ioprio_changed = 1; add_bfqq_busy: bfqq->last_idle_bklogged = jiffies; bfqq->service_from_backlogged = 0; @@ -1049,7 +1025,7 @@ add_bfqq_busy: bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqd->wr_busy_queues++; - entity->prio_changed = 1; + entity->ioprio_changed = 1; bfq_log_bfqq(bfqd, bfqq, "non-idle wrais starting at %lu, rais_max_time %u", jiffies, @@ -1072,11 +1048,11 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, struct bfq_queue *bfqq; bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) + if (bic == NULL) return NULL; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); - if (bfqq) + if (bfqq != NULL) return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); return NULL; @@ -1092,7 +1068,8 @@ static void bfq_activate_request(struct request_queue *q, struct request *rq) (long long unsigned)bfqd->last_position); } -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +static inline void bfq_deactivate_request(struct request_queue *q, + struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; @@ -1124,7 +1101,7 @@ static void bfq_remove_request(struct request *rq) /* * Remove queue from request-position tree as it is empty. 
*/ - if (bfqq->pos_root) { + if (bfqq->pos_root != NULL) { rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } @@ -1134,9 +1111,6 @@ static void bfq_remove_request(struct request *rq) BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -#endif } static int bfq_merge(struct request_queue *q, struct request **req, @@ -1146,7 +1120,7 @@ static int bfq_merge(struct request_queue *q, struct request **req, struct request *__rq; __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq && elv_rq_merge_ok(__rq, bio)) { + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; } @@ -1173,7 +1147,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, bfqd->last_position); - BUG_ON(!next_rq); + BUG_ON(next_rq == NULL); bfqq->next_rq = next_rq; /* * If next_rq changes, update both the queue's budget to @@ -1182,19 +1156,11 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, */ if (prev != bfqq->next_rq) { bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); } } } -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) -{ - bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); -} -#endif - static void bfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { @@ -1221,21 +1187,18 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(next); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -#endif } /* Must be called with bfqq != NULL */ -static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { - BUG_ON(!bfqq); + BUG_ON(bfqq == NULL); if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; /* Trigger a weight change on the next activation of the queue */ - bfqq->entity.prio_changed = 1; + bfqq->entity.ioprio_changed = 1; } static void bfq_end_wr_async_queues(struct bfq_data *bfqd, @@ -1245,9 +1208,9 @@ static void bfq_end_wr_async_queues(struct bfq_data *bfqd, for (i = 0; i < 2; i++) for (j = 0; j < IOPRIO_BE_NR; j++) - if (bfqg->async_bfqq[i][j]) + if (bfqg->async_bfqq[i][j] != NULL) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq) + if (bfqg->async_idle_bfqq != NULL) bfq_bfqq_end_wr(bfqg->async_idle_bfqq); } @@ -1266,7 +1229,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) spin_unlock_irq(bfqd->queue->queue_lock); } -static sector_t bfq_io_struct_pos(void *io_struct, bool request) +static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) { if (request) return blk_rq_pos(io_struct); @@ -1274,18 +1237,25 @@ static sector_t bfq_io_struct_pos(void *io_struct, bool request) return ((struct bio *)io_struct)->bi_iter.bi_sector; } -static int bfq_rq_close_to_sector(void *io_struct, bool request, - sector_t sector) +static inline sector_t bfq_dist_from(sector_t pos1, + sector_t pos2) { - return abs64(bfq_io_struct_pos(io_struct, request) - sector) <= - BFQQ_SEEK_THR; + if (pos1 >= pos2) + return pos1 - pos2; + else + return pos2 - pos1; } -static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, - struct bfq_queue *bfqq, 
+static inline int bfq_rq_close_to_sector(void *io_struct, bool request, sector_t sector) { - struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= + BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) +{ + struct rb_root *root = &bfqd->rq_pos_tree; struct rb_node *parent, *node; struct bfq_queue *__bfqq; @@ -1297,7 +1267,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, * request, choose it. */ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); - if (__bfqq) + if (__bfqq != NULL) return __bfqq; /* @@ -1313,7 +1283,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, node = rb_next(&__bfqq->pos_node); else node = rb_prev(&__bfqq->pos_node); - if (!node) + if (node == NULL) return NULL; __bfqq = rb_entry(node, struct bfq_queue, pos_node); @@ -1323,21 +1293,56 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, return NULL; } -static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, - struct bfq_queue *cur_bfqq, - sector_t sector) +/* + * bfqd - obvious + * cur_bfqq - passed in so that we don't decide that the current queue + * is closely cooperating with itself + * sector - used as a reference point to search for a close queue + */ +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) { struct bfq_queue *bfqq; + if (bfq_class_idle(cur_bfqq)) + return NULL; + if (!bfq_bfqq_sync(cur_bfqq)) + return NULL; + if (BFQQ_SEEKY(cur_bfqq)) + return NULL; + + /* If device has only one backlogged bfq_queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + /* + * We should notice if some of the queues are cooperating, e.g. + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + bfqq = bfqq_close(bfqd, sector); + if (bfqq == NULL || bfqq == cur_bfqq) + return NULL; + + /* + * Do not merge queues from different bfq_groups. + */ + if (bfqq->entity.parent != cur_bfqq->entity.parent) + return NULL; + /* - * We shall notice if some of the queues are cooperating, - * e.g., working closely on the same area of the device. In - * that case, we can group them together and: 1) don't waste - * time idling, and 2) serve the union of their requests in - * the best possible order for throughput. + * It only makes sense to merge sync queues. */ - bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); - if (!bfqq || bfqq == cur_bfqq) + if (!bfq_bfqq_sync(bfqq)) + return NULL; + if (BFQQ_SEEKY(bfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes. + */ + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) return NULL; return bfqq; @@ -1404,32 +1409,6 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) return new_bfqq; } -static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) -{ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; - - /* - * If either of the queues has already been detected as seeky, - * then merging it with the other queue is unlikely to lead to - * sequential I/O. - */ - if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) - return false; - - /* - * Interleaved I/O is known to be done by (some) applications - * only for reads, so it does not make sense to merge async - * queues. 
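bfq_dist_from() and bfq_rq_close_to_sector() above decide whether two queues are working on nearby sectors before a cooperator lookup is attempted. The same check as plain user-space C; the threshold value is assumed here and stands in for BFQQ_SEEK_THR:

#include <stdio.h>

typedef unsigned long long sector_t;

#define SEEK_THR 8192ULL	/* assumed threshold in sectors; stands in for BFQQ_SEEK_THR */

sector_t dist_from(sector_t pos1, sector_t pos2)
{
	return pos1 >= pos2 ? pos1 - pos2 : pos2 - pos1;
}

int close_to_sector(sector_t pos, sector_t sector)
{
	return dist_from(pos, sector) <= SEEK_THR;
}

int main(void)
{
	printf("%d\n", close_to_sector(100000, 100100));	/* 1: close enough to consider merging */
	printf("%d\n", close_to_sector(100000, 900000));	/* 0: too far apart */
	return 0;
}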
- */ - if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) - return false; - - return true; -} - /* * Attempt to schedule a merge of bfqq with the currently in-service queue * or with a close queue among the scheduled queues. @@ -1451,52 +1430,56 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->new_bfqq) return bfqq->new_bfqq; + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; - /* If device has only one backlogged bfq_queue, don't search. */ - if (bfqd->busy_queues == 1) - return NULL; in_service_bfqq = bfqd->in_service_queue; - if (!in_service_bfqq || in_service_bfqq == bfqq || + if (in_service_bfqq == NULL || in_service_bfqq == bfqq || !bfqd->in_service_bic || unlikely(in_service_bfqq == &bfqd->oom_bfqq)) goto check_scheduled; + if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) + goto check_scheduled; + + if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) + goto check_scheduled; + + if (in_service_bfqq->entity.parent != bfqq->entity.parent) + goto check_scheduled; + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { + bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); - if (new_bfqq) - return new_bfqq; + if (new_bfqq != NULL) + return new_bfqq; /* Merge with in-service queue */ } + /* * Check whether there is a cooperator among currently scheduled * queues. The only thing we need is that the bio/request is not * NULL, as we need it to establish whether a cooperator exists. */ check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - - if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) + new_bfqq = bfq_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); return NULL; } -static void bfq_bfqq_save_state(struct bfq_queue *bfqq) +static inline void +bfq_bfqq_save_state(struct bfq_queue *bfqq) { /* - * If !bfqq->bic, the queue is already shared or its requests + * If bfqq->bic == NULL, the queue is already shared or its requests * have already been redirected to a shared queue; both idle window * and weight raising state have already been saved. Do nothing. */ - if (!bfqq->bic) + if (bfqq->bic == NULL) return; if (bfqq->bic->wr_time_left) /* @@ -1540,7 +1523,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bfqq->bic->failed_cooperations = 0; } -static void bfq_get_bic_reference(struct bfq_queue *bfqq) +static inline void +bfq_get_bic_reference(struct bfq_queue *bfqq) { /* * If bfqq->bic has a non-NULL value, the bic to which it belongs @@ -1588,7 +1572,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } -static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) { struct bfq_io_cq *bic = bfqq->bic; struct bfq_data *bfqd = bfqq->bfqd; @@ -1619,7 +1603,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * Queue lock is held here. 
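The checks added to bfq_close_cooperator() and bfq_setup_cooperator() above gate queue merging: both queues must be sync and non-seeky, neither may belong to the idle class, and they must share the priority class and the parent group. A compact model of that predicate, with plain ints standing in for the kernel flag helpers:

struct queue_info {
	int sync;		/* bfq_bfqq_sync()  */
	int seeky;		/* BFQQ_SEEKY()     */
	int class_idle;		/* bfq_class_idle() */
	int class_rt;		/* bfq_class_rt()   */
	const void *parent;	/* entity.parent    */
};

/* Non-zero iff the two queues pass the merge-eligibility checks above. */
int may_merge(const struct queue_info *a, const struct queue_info *b)
{
	if (!a->sync || !b->sync)
		return 0;
	if (a->seeky || b->seeky)
		return 0;
	if (a->class_idle || b->class_idle)
		return 0;
	if (a->class_rt != b->class_rt)
		return 0;
	return a->parent == b->parent;
}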
*/ bic = bfq_bic_lookup(bfqd, current->io_context); - if (!bic) + if (bic == NULL) return 0; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); @@ -1627,9 +1611,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. */ - if (bfqq) { + if (bfqq != NULL) { new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); - if (new_bfqq) { + if (new_bfqq != NULL) { bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); /* * If we get here, the bio will be queued in the @@ -1647,10 +1631,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - if (bfqq) { -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -#endif + if (bfqq != NULL) { bfq_mark_bfqq_must_alloc(bfqq); bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); @@ -1658,7 +1639,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; bfq_log_bfqq(bfqd, bfqq, - "set_in_service_queue, cur-budget = %d", + "set_in_service_queue, cur-budget = %lu", bfqq->entity.budget); } @@ -1681,9 +1662,9 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) * stored in bfqd, which is dynamically updated according to the * estimated disk peak rate; otherwise return the default max budget */ -static int bfq_max_budget(struct bfq_data *bfqd) +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) { - if (bfqd->budgets_assigned < bfq_stats_min_budgets) + if (bfqd->budgets_assigned < 194) return bfq_default_max_budget; else return bfqd->bfq_max_budget; @@ -1693,9 +1674,9 @@ static int bfq_max_budget(struct bfq_data *bfqd) * Return min budget, which is a fraction of the current or default * max budget (trying with 1/32) */ -static int bfq_min_budget(struct bfq_data *bfqd) +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) { - if (bfqd->budgets_assigned < bfq_stats_min_budgets) + if (bfqd->budgets_assigned < 194) return bfq_default_max_budget / 32; else return bfqd->bfq_max_budget / 32; @@ -1711,7 +1692,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) /* Processes have exited, don't wait. 
*/ bic = bfqd->in_service_bic; - if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) + if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) return; bfq_mark_bfqq_wait_request(bfqq); @@ -1737,15 +1718,12 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > bfq_max_budget(bfqq->bfqd) / 8) || bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && - bfq_symmetric_scenario(bfqd)) + symmetric_scenario) sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); else if (bfqq->wr_coeff > 1) sl = sl * 3; bfqd->last_idling_start = ktime_get(); mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -#endif bfq_log(bfqd, "arm idle: %u/%u ms", jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); } @@ -1799,10 +1777,6 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) if (bfq_bfqq_sync(bfqq)) bfqd->sync_flight++; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), - rq->cmd_flags); -#endif } /* @@ -1828,7 +1802,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) return rq; } -static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; return entity->budget - entity->service; @@ -1862,7 +1836,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * Resort priority tree of potential close cooperators. */ - bfq_pos_tree_add_move(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); } } @@ -1872,24 +1846,24 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) * @bfqq: queue to update. * @reason: reason for expiration. * - * Handle the feedback on @bfqq budget at queue expiration. - * See the body for detailed comments. + * Handle the feedback on @bfqq budget. See the body for detailed + * comments. */ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct bfq_queue *bfqq, enum bfqq_expiration reason) { struct request *next_rq; - int budget, min_budget; + unsigned long budget, min_budget; budget = bfqq->max_budget; min_budget = bfq_min_budget(bfqd); BUG_ON(bfqq != bfqd->in_service_queue); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", budget, bfq_min_budget(bfqd)); bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); @@ -1966,19 +1940,18 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, default: return; } - } else - /* - * Async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). - */ + } else /* async queue */ + /* async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). 
+ */ budget = bfqd->bfq_max_budget; bfqq->max_budget = budget; - if (bfqd->budgets_assigned >= bfq_stats_min_budgets && - !bfqd->bfq_user_max_budget) - bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && + bfqq->max_budget > bfqd->bfq_max_budget) + bfqq->max_budget = bfqd->bfq_max_budget; /* * Make sure that we have enough budget for the next request. @@ -1987,14 +1960,14 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, * update. */ next_rq = bfqq->next_rq; - if (next_rq) + if (next_rq != NULL) bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); else bfqq->entity.budget = bfqq->max_budget; - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", - next_rq ? blk_rq_sectors(next_rq) : 0, + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", + next_rq != NULL ? blk_rq_sectors(next_rq) : 0, bfqq->entity.budget); } @@ -2020,15 +1993,15 @@ static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) * seeky processes, and hence reduce their chances to lower the * throughput. See the code for more details. */ -static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason) +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int compensate, enum bfqq_expiration reason) { u64 bw, usecs, expected, timeout; ktime_t delta; int update = 0; if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) - return false; + return 0; if (compensate) delta = bfqd->last_idling_start; @@ -2039,7 +2012,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* Don't trust short/unrealistic values. */ if (usecs < 100 || usecs >= LONG_MAX) - return false; + return 0; /* * Calculate the bandwidth for the last slice. We use a 64 bit @@ -2088,7 +2061,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd->peak_rate, timeout); - bfq_log(bfqd, "new max_budget=%d", + bfq_log(bfqd, "new max_budget=%lu", bfqd->bfq_max_budget); } if (bfqd->device_speed == BFQ_BFQD_FAST && @@ -2113,7 +2086,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * and for the moment return false. */ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return false; + return 0; /* * A process is considered ``slow'' (i.e., seeky, so that we @@ -2188,8 +2161,8 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * seems to be quite precise also in embedded systems and KVM/QEMU virtual * machines. */ -static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) +static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) { return max(bfqq->last_idle_bklogged + HZ * bfqq->service_from_backlogged / @@ -2202,7 +2175,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, * the current time will be lower than this time instant according to the macro * time_is_before_jiffies(). 
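After the budget feedback above, the recomputed per-queue budget is capped by the device-wide estimate once enough budgets have been assigned and no user override is set, as in the hunk touching bfqq->max_budget. The same clamp as a stand-alone helper; the magic 194 is copied from that hunk:

/* Cap a queue's recomputed budget by the device-wide estimate. */
unsigned long clamp_max_budget(unsigned long queue_budget,
			       unsigned long device_budget,
			       int budgets_assigned,
			       unsigned long user_max_budget)
{
	if (budgets_assigned >= 194 && user_max_budget == 0 &&
	    queue_budget > device_budget)
		return device_budget;
	return queue_budget;
}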
*/ -static unsigned long bfq_infinity_from_now(unsigned long now) +static inline unsigned long bfq_infinity_from_now(unsigned long now) { return now + ULONG_MAX / 2; } @@ -2239,14 +2212,13 @@ static unsigned long bfq_infinity_from_now(unsigned long now) */ static void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, + int compensate, enum bfqq_expiration reason) { - bool slow; + int slow; BUG_ON(bfqq != bfqd->in_service_queue); - /* - * Update disk peak rate for autotuning and check whether the + /* Update disk peak rate for autotuning and check whether the * process is slow (see bfq_update_peak_rate). */ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); @@ -2340,12 +2312,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * just checked on request arrivals and completions, as well as on * idle timer expirations. */ -static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { if (bfq_bfqq_budget_new(bfqq) || time_before(jiffies, bfqq->budget_timeout)) - return false; - return true; + return 0; + return 1; } /* @@ -2356,7 +2328,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) * does not hold, or if the queue is slow enough to deserve only to be * kicked off for preserving a high throughput. */ -static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "may_budget_timeout: wait_request %d left %d timeout %d", @@ -2371,278 +2343,183 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) } /* - * For a queue that becomes empty, device idling is allowed only if - * this function returns true for that queue. As a consequence, since - * device idling plays a critical role for both throughput boosting - * and service guarantees, the return value of this function plays a - * critical role as well. + * Device idling is allowed only for the queues for which this function + * returns true. For this reason, the return value of this function plays a + * critical role for both throughput boosting and service guarantees. The + * return value is computed through a logical expression. In this rather + * long comment, we try to briefly describe all the details and motivations + * behind the components of this logical expression. + * + * First, the expression is false if bfqq is not sync, or if: bfqq happened + * to become active during a large burst of queue activations, and the + * pattern of requests bfqq contains boosts the throughput if bfqq is + * expired. In fact, queues that became active during a large burst benefit + * only from throughput, as discussed in the comments to bfq_handle_burst. + * In this respect, expiring bfqq certainly boosts the throughput on NCQ- + * capable flash-based devices, whereas, on rotational devices, it boosts + * the throughput only if bfqq contains random requests. + * + * On the opposite end, if (a) bfqq is sync, (b) the above burst-related + * condition does not hold, and (c) bfqq is being weight-raised, then the + * expression always evaluates to true, as device idling is instrumental + * for preserving low-latency guarantees (see [1]). If, instead, conditions + * (a) and (b) do hold, but (c) does not, then the expression evaluates to + * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and + * (2) at least one of the following two conditions holds. 
+ * The first condition is that the device is not performing NCQ, because + * idling the device most certainly boosts the throughput if this condition + * holds and bfqq is I/O-bound and has been granted a non-null idle window. + * The second compound condition is made of the logical AND of two components. + * + * The first component is true only if there is no weight-raised busy + * queue. This guarantees that the device is not idled for a sync non- + * weight-raised queue when there are busy weight-raised queues. The former + * is then expired immediately if empty. Combined with the timestamping + * rules of BFQ (see [1] for details), this causes sync non-weight-raised + * queues to get a lower number of requests served, and hence to ask for a + * lower number of requests from the request pool, before the busy weight- + * raised queues get served again. + * + * This is beneficial for the processes associated with weight-raised + * queues, when the request pool is saturated (e.g., in the presence of + * write hogs). In fact, if the processes associated with the other queues + * ask for requests at a lower rate, then weight-raised processes have a + * higher probability to get a request from the pool immediately (or at + * least soon) when they need one. Hence they have a higher probability to + * actually get a fraction of the disk throughput proportional to their + * high weight. This is especially true with NCQ-capable drives, which + * enqueue several requests in advance and further reorder internally- + * queued requests. + * + * In the end, mistreating non-weight-raised queues when there are busy + * weight-raised queues seems to mitigate starvation problems in the + * presence of heavy write workloads and NCQ, and hence to guarantee a + * higher application and system responsiveness in these hostile scenarios. + * + * If the first component of the compound condition is instead true, i.e., + * there is no weight-raised busy queue, then the second component of the + * compound condition takes into account service-guarantee and throughput + * issues related to NCQ (recall that the compound condition is evaluated + * only if the device is detected as supporting NCQ). * - * In a nutshell, this function returns true only if idling is - * beneficial for throughput or, even if detrimental for throughput, - * idling is however necessary to preserve service guarantees (low - * latency, desired throughput distribution, ...). In particular, on - * NCQ-capable devices, this function tries to return false, so as to - * help keep the drives' internal queues full, whenever this helps the - * device boost the throughput without causing any service-guarantee - * issue. + * As for service guarantees, allowing the drive to enqueue more than one + * request at a time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of control on + * the actual request service order. In this respect, when the drive is + * allowed to enqueue more than one request at a time, the service + * distribution enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution only in the + * following, perfectly symmetric, scenario: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. 
+ * + * Even in such a scenario, sequential I/O may still receive a preferential + * treatment, but this is not likely to be a big issue with flash-based + * devices, because of their non-dramatic loss of throughput with random + * I/O. Things do differ with HDDs, for which additional care is taken, as + * explained after completing the discussion for flash-based devices. * - * In more detail, the return value of this function is obtained by, - * first, computing a number of boolean variables that take into - * account throughput and service-guarantee issues, and, then, - * combining these variables in a logical expression. Most of the - * issues taken into account are not trivial. We discuss these issues - * while introducing the variables. + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. + * Therefore BFQ evaluates instead the following stronger sub-conditions, + * for which it is much easier to maintain the needed state: + * 1) all active queues have the same weight, + * 2) all active groups have the same weight, + * 3) all active groups have at most one active child each. + * In particular, the last two conditions are always true if hierarchical + * support and the cgroups interface are not enabled, hence no state needs + * to be maintained in this case. + * + * According to the above considerations, the second component of the + * compound condition evaluates to true if any of the above symmetry + * sub-condition does not hold, or the device is not flash-based. Therefore, + * if also the first component is true, then idling is allowed for a sync + * queue. These are the only sub-conditions considered if the device is + * flash-based, as, for such a device, it is sensible to force idling only + * for service-guarantee issues. In fact, as for throughput, idling + * NCQ-capable flash-based devices would not boost the throughput even + * with sequential I/O; rather it would lower the throughput in proportion + * to how fast the device is. In the end, (only) if all the three + * sub-conditions hold and the device is flash-based, the compound + * condition evaluates to false and therefore no idling is performed. + * + * As already said, things change with a rotational device, where idling + * boosts the throughput with sequential I/O (even with NCQ). Hence, for + * such a device the second component of the compound condition evaluates + * to true also if the following additional sub-condition does not hold: + * the queue is constantly seeky. Unfortunately, this different behavior + * with respect to flash-based devices causes an additional asymmetry: if + * some sync queues enjoy idling and some other sync queues do not, then + * the latter get a low share of the device throughput, simply because the + * former get many requests served after being set as in service, whereas + * the latter do not. As a consequence, to guarantee the desired throughput + * distribution, on HDDs the compound expression evaluates to true (and + * hence device idling is performed) also if the following last symmetry + * condition does not hold: no other queue is benefiting from idling. Also + * this last condition is actually replaced with a simpler-to-maintain and + * stronger condition: there is no busy queue which is not constantly seeky + * (and hence may also benefit from idling). 
+ * + * To sum up, when all the required symmetry and throughput-boosting + * sub-conditions hold, the second component of the compound condition + * evaluates to false, and hence no idling is performed. This helps to + * keep the drives' internal queues full on NCQ-capable devices, and hence + * to boost the throughput, without causing 'almost' any loss of service + * guarantees. The 'almost' follows from the fact that, if the internal + * queue of one such device is filled while all the sub-conditions hold, + * but at some point in time some sub-condition stops to hold, then it may + * become impossible to let requests be served in the new desired order + * until all the requests already queued in the device have been served. */ -static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) +static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; - bool idling_boosts_thr, idling_boosts_thr_without_issues, - all_queues_seeky, on_hdd_and_not_all_queues_seeky, - idling_needed_for_service_guarantees, - asymmetric_scenario; - - /* - * The next variable takes into account the cases where idling - * boosts the throughput. - * - * The value of the variable is computed considering, first, that - * idling is virtually always beneficial for the throughput if: - * (a) the device is not NCQ-capable, or - * (b) regardless of the presence of NCQ, the device is rotational - * and the request pattern for bfqq is I/O-bound and sequential. - * - * Secondly, and in contrast to the above item (b), idling an - * NCQ-capable flash-based device would not boost the - * throughput even with sequential I/O; rather it would lower - * the throughput in proportion to how fast the device - * is. Accordingly, the next variable is true if any of the - * above conditions (a) and (b) is true, and, in particular, - * happens to be false if bfqd is an NCQ-capable flash-based - * device. - */ - idling_boosts_thr = !bfqd->hw_tag || - (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && - bfq_bfqq_idle_window(bfqq)) ; +#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \ + bfqd->busy_in_flight_queues == \ + bfqd->const_seeky_busy_in_flight_queues) - /* - * The value of the next variable, - * idling_boosts_thr_without_issues, is equal to that of - * idling_boosts_thr, unless a special case holds. In this - * special case, described below, idling may cause problems to - * weight-raised queues. - * - * When the request pool is saturated (e.g., in the presence - * of write hogs), if the processes associated with - * non-weight-raised queues ask for requests at a lower rate, - * then processes associated with weight-raised queues have a - * higher probability to get a request from the pool - * immediately (or at least soon) when they need one. Thus - * they have a higher probability to actually get a fraction - * of the device throughput proportional to their high - * weight. This is especially true with NCQ-capable drives, - * which enqueue several requests in advance, and further - * reorder internally-queued requests. - * - * For this reason, we force to false the value of - * idling_boosts_thr_without_issues if there are weight-raised - * busy queues. In this case, and if bfqq is not weight-raised, - * this guarantees that the device is not idled for bfqq (if, - * instead, bfqq is weight-raised, then idling will be - * guaranteed by another variable, see below). 
Combined with - * the timestamping rules of BFQ (see [1] for details), this - * behavior causes bfqq, and hence any sync non-weight-raised - * queue, to get a lower number of requests served, and thus - * to ask for a lower number of requests from the request - * pool, before the busy weight-raised queues get served - * again. This often mitigates starvation problems in the - * presence of heavy write workloads and NCQ, thereby - * guaranteeing a higher application and system responsiveness - * in these hostile scenarios. - */ - idling_boosts_thr_without_issues = idling_boosts_thr && - bfqd->wr_busy_queues == 0; - - /* - * There are then two cases where idling must be performed not - * for throughput concerns, but to preserve service - * guarantees. In the description of these cases, we say, for - * short, that a queue is sequential/random if the process - * associated to the queue issues sequential/random requests - * (in the second case the queue may be tagged as seeky or - * even constantly_seeky). - * - * To introduce the first case, we note that, since - * bfq_bfqq_idle_window(bfqq) is false if the device is - * NCQ-capable and bfqq is random (see - * bfq_update_idle_window()), then, from the above two - * assignments it follows that - * idling_boosts_thr_without_issues is false if the device is - * NCQ-capable and bfqq is random. Therefore, for this case, - * device idling would never be allowed if we used just - * idling_boosts_thr_without_issues to decide whether to allow - * it. And, beneficially, this would imply that throughput - * would always be boosted also with random I/O on NCQ-capable - * HDDs. - * - * But we must be careful on this point, to avoid an unfair - * treatment for bfqq. In fact, because of the same above - * assignments, idling_boosts_thr_without_issues is, on the - * other hand, true if 1) the device is an HDD and bfqq is - * sequential, and 2) there are no busy weight-raised - * queues. As a consequence, if we used just - * idling_boosts_thr_without_issues to decide whether to idle - * the device, then with an HDD we might easily bump into a - * scenario where queues that are sequential and I/O-bound - * would enjoy idling, whereas random queues would not. The - * latter might then get a low share of the device throughput, - * simply because the former would get many requests served - * after being set as in service, while the latter would not. - * - * To address this issue, we start by setting to true a - * sentinel variable, on_hdd_and_not_all_queues_seeky, if the - * device is rotational and not all queues with pending or - * in-flight requests are constantly seeky (i.e., there are - * active sequential queues, and bfqq might then be mistreated - * if it does not enjoy idling because it is random). - */ - all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && - bfqd->busy_in_flight_queues == - bfqd->const_seeky_busy_in_flight_queues; - - on_hdd_and_not_all_queues_seeky = - !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; - - /* - * To introduce the second case where idling needs to be - * performed to preserve service guarantees, we can note that - * allowing the drive to enqueue more than one request at a - * time, and hence delegating de facto final scheduling - * decisions to the drive's internal scheduler, causes loss of - * control on the actual request service order. In particular, - * the critical situation is when requests from different - * processes happens to be present, at the same time, in the - * internal queue(s) of the drive. 
In such a situation, the - * drive, by deciding the service order of the - * internally-queued requests, does determine also the actual - * throughput distribution among these processes. But the - * drive typically has no notion or concern about per-process - * throughput distribution, and makes its decisions only on a - * per-request basis. Therefore, the service distribution - * enforced by the drive's internal scheduler is likely to - * coincide with the desired device-throughput distribution - * only in a completely symmetric scenario where: - * (i) each of these processes must get the same throughput as - * the others; - * (ii) all these processes have the same I/O pattern - (either sequential or random). - * In fact, in such a scenario, the drive will tend to treat - * the requests of each of these processes in about the same - * way as the requests of the others, and thus to provide - * each of these processes with about the same throughput - * (which is exactly the desired throughput distribution). In - * contrast, in any asymmetric scenario, device idling is - * certainly needed to guarantee that bfqq receives its - * assigned fraction of the device throughput (see [1] for - * details). - * - * We address this issue by controlling, actually, only the - * symmetry sub-condition (i), i.e., provided that - * sub-condition (i) holds, idling is not performed, - * regardless of whether sub-condition (ii) holds. In other - * words, only if sub-condition (i) holds, then idling is - * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that, first, - * in the case of an HDD, the asymmetry in terms of types of - * I/O patterns is already taken in to account in the above - * sentinel variable - * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a - * flash-based device, we prefer however to privilege - * throughput (and idling lowers throughput for this type of - * devices), for the following reasons: - * 1) differently from HDDs, the service time of random - * requests is not orders of magnitudes lower than the service - * time of sequential requests; thus, even if processes doing - * sequential I/O get a preferential treatment with respect to - * others doing random I/O, the consequences are not as - * dramatic as with HDDs; - * 2) if a process doing random I/O does need strong - * throughput guarantees, it is hopefully already being - * weight-raised, or the user is likely to have assigned it a - * higher weight than the other processes (and thus - * sub-condition (i) is likely to be false, which triggers - * idling). - * - * According to the above considerations, the next variable is - * true (only) if sub-condition (i) holds. To compute the - * value of this variable, we not only use the return value of - * the function bfq_symmetric_scenario(), but also check - * whether bfqq is being weight-raised, because - * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments to - * bfq_weights_tree_add()). 
- * - * As a side note, it is worth considering that the above - * device-idling countermeasures may however fail in the - * following unlucky scenario: if idling is (correctly) - * disabled in a time period during which all symmetry - * sub-conditions hold, and hence the device is allowed to - * enqueue many requests, but at some later point in time some - * sub-condition stops to hold, then it may become impossible - * to let requests be served in the desired order until all - * the requests already queued in the device have been served. - */ - asymmetric_scenario = bfqq->wr_coeff > 1 || - !bfq_symmetric_scenario(bfqd); +#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \ + bfqd->hw_tag && \ + (blk_queue_nonrot(bfqd->queue) || \ + bfq_bfqq_constantly_seeky(bfqq))) - /* - * Finally, there is a case where maximizing throughput is the - * best choice even if it may cause unfairness toward - * bfqq. Such a case is when bfqq became active in a burst of - * queue activations. Queues that became active during a large - * burst benefit only from throughput, as discussed in the - * comments to bfq_handle_burst. Thus, if bfqq became active - * in a burst and not idling the device maximizes throughput, - * then the device must no be idled, because not idling the - * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the two cases above, we - * can now establish when idling is actually needed to - * preserve service guarantees. - */ - idling_needed_for_service_guarantees = - (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && - !bfq_bfqq_in_large_burst(bfqq); +/* + * Condition for expiring a non-weight-raised queue (and hence not idling + * the device). + */ +#define cond_for_expiring_non_wr (bfqd->hw_tag && \ + (bfqd->wr_busy_queues > 0 || \ + (blk_queue_nonrot(bfqd->queue) || \ + cond_for_seeky_on_ncq_hdd))) - /* - * We have now all the components we need to compute the return - * value of the function, which is true only if both the following - * conditions hold: - * 1) bfqq is sync, because idling make sense only for sync queues; - * 2) idling either boosts the throughput (without issues), or - * is necessary to preserve service guarantees. - */ return bfq_bfqq_sync(bfqq) && - (idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees); + !cond_for_expiring_in_burst && + (bfqq->wr_coeff > 1 || !symmetric_scenario || + (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) && + !cond_for_expiring_non_wr) + ); } /* - * If the in-service queue is empty but the function bfq_bfqq_may_idle - * returns true, then: + * If the in-service queue is empty but sync, and the function + * bfq_bfqq_must_not_expire returns true, then: * 1) the queue must remain in service and cannot be expired, and - * 2) the device must be idled to wait for the possible arrival of a new + * 2) the disk must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments to the function bfq_bfqq_may_idle for the reasons + * See the comments to the function bfq_bfqq_must_not_expire for the reasons * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself + * and preserve service guarantees when bfq_bfqq_must_not_expire itself * returns true. 
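The return statement built above reduces to a fixed boolean expression over a handful of per-queue and per-device conditions. Restated as a stand-alone predicate, with the cond_for_* macros and symmetric_scenario collapsed into plain flags for illustration:

#include <stdbool.h>

struct idle_inputs {
	bool sync;			/* bfq_bfqq_sync()            */
	bool expiring_in_burst;		/* cond_for_expiring_in_burst */
	bool weight_raised;		/* bfqq->wr_coeff > 1         */
	bool symmetric_scenario;
	bool io_bound;			/* bfq_bfqq_IO_bound()        */
	bool idle_window;		/* bfq_bfqq_idle_window()     */
	bool expiring_non_wr;		/* cond_for_expiring_non_wr   */
};

/* Mirrors the return expression of bfq_bfqq_must_not_expire() above. */
bool must_not_expire(const struct idle_inputs *in)
{
	return in->sync &&
	       !in->expiring_in_burst &&
	       (in->weight_raised || !in->symmetric_scenario ||
		(in->io_bound && in->idle_window && !in->expiring_non_wr));
}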
*/ -static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && - bfq_bfqq_may_idle(bfqq); + bfq_bfqq_must_not_expire(bfqq); } /* @@ -2656,7 +2533,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; bfqq = bfqd->in_service_queue; - if (!bfqq) + if (bfqq == NULL) goto new_queue; bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); @@ -2671,7 +2548,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * If bfqq has requests queued and it has enough budget left to * serve them, keep the queue, otherwise expire it. */ - if (next_rq) { + if (next_rq != NULL) { if (bfq_serv_to_charge(next_rq, bfqq) > bfq_bfqq_budget_left(bfqq)) { reason = BFQ_BFQQ_BUDGET_EXHAUSTED; @@ -2698,9 +2575,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) */ bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif } goto keep_queue; } @@ -2712,18 +2586,18 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * may idle after their completion, then keep it anyway. */ if (timer_pending(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { bfqq = NULL; goto keep_queue; } reason = BFQ_BFQQ_NO_MORE_REQUESTS; expire: - bfq_bfqq_expire(bfqd, bfqq, false, reason); + bfq_bfqq_expire(bfqd, bfqq, 0, reason); new_queue: bfqq = bfq_set_in_service_queue(bfqd); bfq_log(bfqd, "select_queue: new queue %d returned", - bfqq ? bfqq->pid : 0); + bfqq != NULL ? bfqq->pid : 0); keep_queue: return bfqq; } @@ -2741,7 +2615,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != entity->orig_weight * bfqq->wr_coeff); - if (entity->prio_changed) + if (entity->ioprio_changed) bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); /* @@ -2785,7 +2659,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, /* Follow expired path, else get first next available. 
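In bfq_select_queue() above, the in-service queue is kept only while its next request still fits in the budget it has left; otherwise it is expired with BFQ_BFQQ_BUDGET_EXHAUSTED. A one-function sketch of that decision:

enum select_decision { KEEP_QUEUE, EXPIRE_BUDGET_EXHAUSTED };

enum select_decision check_next_request(unsigned long serv_to_charge,
					unsigned long budget_left)
{
	return serv_to_charge > budget_left ? EXPIRE_BUDGET_EXHAUSTED
					    : KEEP_QUEUE;
}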
*/ rq = bfq_check_fifo(bfqq); - if (!rq) + if (rq == NULL) rq = bfqq->next_rq; service_to_charge = bfq_serv_to_charge(rq, bfqq); @@ -2821,14 +2695,14 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, - "dispatched %u sec req (%llu), budg left %d", + "dispatched %u sec req (%llu), budg left %lu", blk_rq_sectors(rq), (long long unsigned)blk_rq_pos(rq), bfq_bfqq_budget_left(bfqq)); dispatched++; - if (!bfqd->in_service_bic) { + if (bfqd->in_service_bic == NULL) { atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); bfqd->in_service_bic = RQ_BIC(rq); } @@ -2841,7 +2715,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, return dispatched; expire: - bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); return dispatched; } @@ -2849,7 +2723,7 @@ static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) { int dispatched = 0; - while (bfqq->next_rq) { + while (bfqq->next_rq != NULL) { bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); dispatched++; } @@ -2869,7 +2743,7 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) int dispatched = 0; bfqq = bfqd->in_service_queue; - if (bfqq) + if (bfqq != NULL) __bfq_bfqq_expire(bfqd, bfqq); /* @@ -2905,7 +2779,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) return bfq_forced_dispatch(bfqd); bfqq = bfq_select_queue(bfqd); - if (!bfqq) + if (bfqq == NULL) return 0; if (bfq_class_idle(bfqq)) @@ -2945,9 +2819,6 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) static void bfq_put_queue(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_group *bfqg = bfqq_group(bfqq); -#endif BUG_ON(atomic_read(&bfqq->ref) <= 0); @@ -2956,9 +2827,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq) if (!atomic_dec_and_test(&bfqq->ref)) return; - BUG_ON(rb_first(&bfqq->sort_list)); + BUG_ON(rb_first(&bfqq->sort_list) != NULL); BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); + BUG_ON(bfqq->entity.tree != NULL); BUG_ON(bfq_bfqq_busy(bfqq)); BUG_ON(bfqd->in_service_queue == bfqq); @@ -2976,9 +2847,6 @@ static void bfq_put_queue(struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_put(bfqg); -#endif } static void bfq_put_cooperator(struct bfq_queue *bfqq) @@ -3015,7 +2883,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_init_icq(struct io_cq *icq) +static inline void bfq_init_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); @@ -3082,38 +2950,40 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b /* * No prio set, inherit CPU scheduling settings. 
*/ - bfqq->new_ioprio = task_nice_ioprio(tsk); - bfqq->new_ioprio_class = task_nice_ioclass(tsk); + bfqq->entity.new_ioprio = task_nice_ioprio(tsk); + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->new_ioprio_class = IOPRIO_CLASS_RT; + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->new_ioprio_class = IOPRIO_CLASS_BE; + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: - bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->entity.new_ioprio = 7; bfq_clear_bfqq_idle_window(bfqq); break; } - if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->entity.new_ioprio < 0 || + bfqq->entity.new_ioprio >= IOPRIO_BE_NR) { printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", - bfqq->new_ioprio); + bfqq->entity.new_ioprio); BUG(); } - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio); + bfqq->entity.ioprio_changed = 1; } -static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) +static void bfq_check_ioprio_change(struct bfq_io_cq *bic) { struct bfq_data *bfqd; struct bfq_queue *bfqq, *new_bfqq; + struct bfq_group *bfqg; unsigned long uninitialized_var(flags); int ioprio = bic->icq.ioc->ioprio; @@ -3123,16 +2993,18 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) * This condition may trigger on a newly created bic, be sure to * drop the lock before returning. 
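bfq_set_next_ioprio_data() above copies the ioprio class picked by the bic into the entity fields. A condensed model of that switch; the class values follow the usual IOPRIO_CLASS_* ordering and the task_nice_* arguments stand in for the scheduler-derived defaults:

enum { CLS_NONE, CLS_RT, CLS_BE, CLS_IDLE };	/* matches IOPRIO_CLASS_* ordering */

struct next_prio { int ioprio; int ioprio_class; };

struct next_prio set_next_ioprio(int cls, int prio_data,
				 int task_nice_prio, int task_nice_class)
{
	struct next_prio out;

	switch (cls) {
	case CLS_RT:
		out.ioprio = prio_data;
		out.ioprio_class = CLS_RT;
		break;
	case CLS_BE:
		out.ioprio = prio_data;
		out.ioprio_class = CLS_BE;
		break;
	case CLS_IDLE:
		out.ioprio = 7;		/* idle class is pinned to the lowest level */
		out.ioprio_class = CLS_IDLE;
		break;
	default:			/* no prio set: inherit CPU scheduling settings */
		out.ioprio = task_nice_prio;
		out.ioprio_class = task_nice_class;
		break;
	}
	return out;
}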
*/ - if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) + if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) goto out; bic->ioprio = ioprio; bfqq = bic->bfqq[BLK_RW_ASYNC]; - if (bfqq) { - new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, + if (bfqq != NULL) { + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, + sched_data); + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, GFP_ATOMIC); - if (new_bfqq) { + if (new_bfqq != NULL) { bic->bfqq[BLK_RW_ASYNC] = new_bfqq; bfq_log_bfqq(bfqd, bfqq, "check_ioprio_change: bfqq %p %d", @@ -3142,7 +3014,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) } bfqq = bic->bfqq[BLK_RW_SYNC]; - if (bfqq) + if (bfqq != NULL) bfq_set_next_ioprio_data(bfqq, bic); out: @@ -3166,8 +3038,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfq_class_idle(bfqq)) bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); + } bfq_mark_bfqq_IO_bound(bfqq); /* Tentative initial value to trade off between thr and lat */ @@ -3184,19 +3055,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, } static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, + struct bfq_group *bfqg, + int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask) { - struct bfq_group *bfqg; struct bfq_queue *bfqq, *new_bfqq = NULL; - struct blkcg *blkcg; retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - bfqg = bfq_find_alloc_group(bfqd, blkcg); /* bic always exists here */ bfqq = bic_to_bfqq(bic, is_sync); @@ -3204,19 +3070,18 @@ retry: * Always try a new alloc if we fall back to the OOM bfqq * originally, since it should just be a temporary situation. */ - if (!bfqq || bfqq == &bfqd->oom_bfqq) { + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { bfqq = NULL; - if (new_bfqq) { + if (new_bfqq != NULL) { bfqq = new_bfqq; new_bfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); spin_unlock_irq(bfqd->queue->queue_lock); new_bfqq = kmem_cache_alloc_node(bfq_pool, gfp_mask | __GFP_ZERO, bfqd->queue->node); spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq) + if (new_bfqq != NULL) goto retry; } else { bfqq = kmem_cache_alloc_node(bfq_pool, @@ -3224,7 +3089,7 @@ retry: bfqd->queue->node); } - if (bfqq) { + if (bfqq != NULL) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, is_sync); bfq_init_entity(&bfqq->entity, bfqg); @@ -3235,11 +3100,9 @@ retry: } } - if (new_bfqq) + if (new_bfqq != NULL) kmem_cache_free(bfq_pool, new_bfqq); - rcu_read_unlock(); - return bfqq; } @@ -3263,7 +3126,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, + struct bfq_group *bfqg, int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); @@ -3272,26 +3135,19 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq = NULL; if (!is_sync) { - struct blkcg *blkcg; - struct bfq_group *bfqg; - - rcu_read_lock(); - blkcg = bio_blkcg(bio); - rcu_read_unlock(); - bfqg = bfq_find_alloc_group(bfqd, blkcg); async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; } - if (!bfqq) - bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); + if (bfqq == NULL) + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); /* * Pin the queue now that it's allocated, scheduler exit will * prune 
it. */ - if (!is_sync && !(*async_bfqq)) { + if (!is_sync && *async_bfqq == NULL) { atomic_inc(&bfqq->ref); bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", bfqq, atomic_read(&bfqq->ref)); @@ -3424,9 +3280,9 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { - bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && - blk_rq_sectors(rq) < 32; - bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); + int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + int budget_timeout = bfq_bfqq_budget_timeout(bfqq); /* * There is just this request queued: if the request @@ -3453,9 +3309,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif /* * The queue is not empty, because a new request just @@ -3465,8 +3318,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * See [1] for more details. */ if (budget_timeout) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); /* * Let the request rip immediately, or let a new queue be @@ -3490,7 +3342,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) */ if (!in_interrupt()) { new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); - if (new_bfqq) { + if (new_bfqq != NULL) { if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); /* @@ -3518,7 +3370,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) * from assigning it a full weight-raising period. See the detailed * comments about this field in bfq_init_icq(). */ - if (bfqq->bic) + if (bfqq->bic != NULL) bfqq->bic->wr_time_left = 0; rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); @@ -3566,11 +3418,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_completion(bfqq_group(bfqq), - rq_start_time_ns(rq), - rq_io_start_time_ns(rq), rq->cmd_flags); -#endif if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { bfq_weights_tree_remove(bfqd, &bfqq->entity, @@ -3615,12 +3462,11 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) bfq_arm_slice_timer(bfqd); goto out; } else if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); else if (RB_EMPTY_ROOT(&bfqq->sort_list) && (bfqq->dispatched == 0 || - !bfq_bfqq_may_idle(bfqq))) - bfq_bfqq_expire(bfqd, bfqq, false, + !bfq_bfqq_must_not_expire(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_NO_MORE_REQUESTS); } @@ -3631,7 +3477,7 @@ out: return; } -static int __bfq_may_queue(struct bfq_queue *bfqq) +static inline int __bfq_may_queue(struct bfq_queue *bfqq) { if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { bfq_clear_bfqq_must_alloc(bfqq); @@ -3655,11 +3501,11 @@ static int bfq_may_queue(struct request_queue *q, int rw) * 'may queue' if that fails. 
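The completion path above chooses between two expiry reasons once the earlier idling branch has not fired: a budget timeout wins, otherwise an empty queue that may not keep idling is expired for lack of requests. A small model of that else-chain:

enum expire_reason { NO_EXPIRE, EXPIRE_BUDGET_TIMEOUT, EXPIRE_NO_MORE_REQUESTS };

enum expire_reason on_request_completed(int may_expire_for_budg_timeout,
					int queue_empty, int dispatched,
					int must_not_expire)
{
	if (may_expire_for_budg_timeout)
		return EXPIRE_BUDGET_TIMEOUT;
	if (queue_empty && (dispatched == 0 || !must_not_expire))
		return EXPIRE_NO_MORE_REQUESTS;
	return NO_EXPIRE;
}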
*/ bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) + if (bic == NULL) return ELV_MQUEUE_MAY; bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); - if (bfqq) + if (bfqq != NULL) return __bfq_may_queue(bfqq); return ELV_MQUEUE_MAY; @@ -3672,7 +3518,7 @@ static void bfq_put_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); - if (bfqq) { + if (bfqq != NULL) { const int rw = rq_data_dir(rq); BUG_ON(!bfqq->allocated[rw]); @@ -3724,24 +3570,25 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, const int rw = rq_data_dir(rq); const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; + struct bfq_group *bfqg; unsigned long flags; bool split = false; might_sleep_if(gfp_mask & __GFP_WAIT); - bfq_check_ioprio_change(bic, bio); + bfq_check_ioprio_change(bic); spin_lock_irqsave(q->queue_lock, flags); - if (!bic) + if (bic == NULL) goto queue_fail; - bfq_bic_update_cgroup(bic, bio); + bfqg = bfq_bic_update_cgroup(bic); new_queue: bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { if ((bic->was_in_burst_list && bfqd->large_burst) || @@ -3837,7 +3684,7 @@ static void bfq_idle_slice_timer(unsigned long data) * the in-service queue. This can hardly happen, but in the worst * case we just expire a queue too early. */ - if (bfqq) { + if (bfqq != NULL) { bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); if (bfq_bfqq_budget_timeout(bfqq)) /* @@ -3857,7 +3704,7 @@ static void bfq_idle_slice_timer(unsigned long data) else goto schedule_dispatch; - bfq_bfqq_expire(bfqd, bfqq, true, reason); + bfq_bfqq_expire(bfqd, bfqq, 1, reason); } schedule_dispatch: @@ -3872,14 +3719,14 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) cancel_work_sync(&bfqd->unplug_work); } -static void __bfq_put_async_bfqq(struct bfq_data *bfqd, +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { struct bfq_group *root_group = bfqd->root_group; struct bfq_queue *bfqq = *bfqq_ptr; bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq) { + if (bfqq != NULL) { bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", bfqq, atomic_read(&bfqq->ref)); @@ -3915,7 +3762,7 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_lock_irq(q->queue_lock); - BUG_ON(bfqd->in_service_queue); + BUG_ON(bfqd->in_service_queue != NULL); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) bfq_deactivate_bfqq(bfqd, bfqq, 0); @@ -3928,39 +3775,22 @@ static void bfq_exit_queue(struct elevator_queue *e) BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); -#endif - + bfq_free_root_group(bfqd); kfree(bfqd); } -static void bfq_init_root_group(struct bfq_group *root_group, - struct bfq_data *bfqd) -{ - int i; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -#endif - root_group->rq_pos_tree = RB_ROOT; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -} - static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { + struct bfq_group *bfqg; struct bfq_data *bfqd; struct elevator_queue *eq; eq = elevator_alloc(q, e); - if (!eq) + if 
(eq == NULL) return -ENOMEM; bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); - if (!bfqd) { + if (bfqd == NULL) { kobject_put(&eq->kobj); return -ENOMEM; } @@ -3973,16 +3803,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) */ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); atomic_inc(&bfqd->oom_bfqq.ref); - bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE; bfqd->oom_bfqq.entity.new_weight = - bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio); /* * Trigger weight initialization, according to ioprio, at the * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio * class won't be changed any more. */ - bfqd->oom_bfqq.entity.prio_changed = 1; + bfqd->oom_bfqq.entity.ioprio_changed = 1; bfqd->queue = q; @@ -3990,12 +3820,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) q->elevator = eq; spin_unlock_irq(q->queue_lock); - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); - if (!bfqd->root_group) - goto out_free; - bfq_init_root_group(bfqd->root_group, bfqd); + bfqg = bfq_alloc_root_group(bfqd, q->node); + if (bfqg == NULL) { + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; + } + + bfqd->root_group = bfqg; bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO bfqd->active_numerous_groups = 0; #endif @@ -4003,6 +3837,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->idle_slice_timer.function = bfq_idle_slice_timer; bfqd->idle_slice_timer.data = (unsigned long)bfqd; + bfqd->rq_pos_tree = RB_ROOT; bfqd->queue_weights_tree = RB_ROOT; bfqd->group_weights_tree = RB_ROOT; @@ -4060,23 +3895,18 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->device_speed = BFQ_BFQD_FAST; return 0; - -out_free: - kfree(bfqd); - kobject_put(&eq->kobj); - return -ENOMEM; } static void bfq_slab_kill(void) { - if (bfq_pool) + if (bfq_pool != NULL) kmem_cache_destroy(bfq_pool); } static int __init bfq_slab_setup(void) { bfq_pool = KMEM_CACHE(bfq_queue, 0); - if (!bfq_pool) + if (bfq_pool == NULL) return -ENOMEM; return 0; } @@ -4221,7 +4051,7 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, return count; } -static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) { u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); @@ -4315,9 +4145,6 @@ static struct elevator_type iosched_bfq = { .elevator_merge_fn = bfq_merge, .elevator_merged_fn = bfq_merged_request, .elevator_merge_req_fn = bfq_merged_requests, -#ifdef CONFIG_BFQ_GROUP_IOSCHED - .elevator_bio_merged_fn = bfq_bio_merged, -#endif .elevator_allow_merge_fn = bfq_allow_merge, .elevator_dispatch_fn = bfq_dispatch_requests, .elevator_add_req_fn = bfq_insert_request, @@ -4343,8 +4170,6 @@ static struct elevator_type iosched_bfq = { static int __init bfq_init(void) { - int ret; - /* * Can be 0 on HZ < 1000 setups. 
*/ @@ -4354,15 +4179,8 @@ static int __init bfq_init(void) if (bfq_timeout_async == 0) bfq_timeout_async = 1; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -#endif - - ret = -ENOMEM; if (bfq_slab_setup()) - goto err_pol_unreg; + return -ENOMEM; /* * Times to load large popular applications for the typical systems @@ -4381,27 +4199,15 @@ static int __init bfq_init(void) device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; - ret = elv_register(&iosched_bfq); - if (ret) - goto err_pol_unreg; - - pr_info("BFQ I/O-scheduler: v7r9"); + elv_register(&iosched_bfq); + pr_info("BFQ I/O-scheduler: v7r8"); return 0; - -err_pol_unreg: -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_bfq); -#endif - return ret; } static void __exit bfq_exit(void) { elv_unregister(&iosched_bfq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_bfq); -#endif bfq_slab_kill(); } diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 9328a1f09..d0890c6d4 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -10,27 +10,24 @@ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> */ -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO #define for_each_entity(entity) \ - for (; entity ; entity = entity->parent) + for (; entity != NULL; entity = entity->parent) #define for_each_entity_safe(entity, parent) \ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) - static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, int extract, struct bfq_data *bfqd); -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - -static void bfq_update_budget(struct bfq_entity *next_in_service) +static inline void bfq_update_budget(struct bfq_entity *next_in_service) { struct bfq_entity *bfqg_entity; struct bfq_group *bfqg; struct bfq_sched_data *group_sd; - BUG_ON(!next_in_service); + BUG_ON(next_in_service == NULL); group_sd = next_in_service->sched_data; @@ -41,7 +38,7 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) * as it must never become an in-service entity. 
*/ bfqg_entity = bfqg->my_entity; - if (bfqg_entity) + if (bfqg_entity != NULL) bfqg_entity->budget = next_in_service->budget; } @@ -49,7 +46,7 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) { struct bfq_entity *next_in_service; - if (sd->in_service_entity) + if (sd->in_service_entity != NULL) /* will update/requeue at the end of service */ return 0; @@ -63,35 +60,35 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) next_in_service = bfq_lookup_next_entity(sd, 0, NULL); sd->next_in_service = next_in_service; - if (next_in_service) + if (next_in_service != NULL) bfq_update_budget(next_in_service); return 1; } -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) { BUG_ON(sd->next_in_service != entity); } #else #define for_each_entity(entity) \ - for (; entity ; entity = NULL) + for (; entity != NULL; entity = NULL) #define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity ; entity = parent) + for (parent = NULL; entity != NULL; entity = parent) -static int bfq_update_next_in_service(struct bfq_sched_data *sd) +static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) { return 0; } -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) { } -static void bfq_update_budget(struct bfq_entity *next_in_service) +static inline void bfq_update_budget(struct bfq_entity *next_in_service) { } #endif @@ -112,18 +109,18 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) * * Return @a > @b, dealing with wrapping correctly. */ -static int bfq_gt(u64 a, u64 b) +static inline int bfq_gt(u64 a, u64 b) { return (s64)(a - b) > 0; } -static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) { struct bfq_queue *bfqq = NULL; - BUG_ON(!entity); + BUG_ON(entity == NULL); - if (!entity->my_sched_data) + if (entity->my_sched_data == NULL) bfqq = container_of(entity, struct bfq_queue, entity); return bfqq; @@ -135,7 +132,8 @@ static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) * @service: amount of service. * @weight: scale factor (weight of an entity or weight sum). */ -static u64 bfq_delta(unsigned long service, unsigned long weight) +static inline u64 bfq_delta(unsigned long service, + unsigned long weight) { u64 d = (u64)service << WFQ_SERVICE_SHIFT; @@ -148,7 +146,8 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) * @entity: the entity to act upon. * @service: the service to be charged to the entity. 
*/ -static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) +static inline void bfq_calc_finish(struct bfq_entity *entity, + unsigned long service) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); @@ -157,7 +156,7 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) entity->finish = entity->start + bfq_delta(service, entity->weight); - if (bfqq) { + if (bfqq != NULL) { bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: serv %lu, w %d", service, entity->weight); @@ -177,11 +176,11 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) * conversion mechanism because, e.g., in the tree walking functions, * the check for a %NULL value would be redundant. */ -static struct bfq_entity *bfq_entity_of(struct rb_node *node) +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) { struct bfq_entity *entity = NULL; - if (node) + if (node != NULL) entity = rb_entry(node, struct bfq_entity, rb_node); return entity; @@ -192,7 +191,8 @@ static struct bfq_entity *bfq_entity_of(struct rb_node *node) * @root: the tree root. * @entity: the entity to remove. */ -static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) +static inline void bfq_extract(struct rb_root *root, + struct bfq_entity *entity) { BUG_ON(entity->tree != root); @@ -225,7 +225,7 @@ static void bfq_idle_extract(struct bfq_service_tree *st, bfq_extract(&st->idle, entity); - if (bfqq) + if (bfqq != NULL) list_del(&bfqq->bfqq_list); } @@ -243,9 +243,9 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) struct rb_node **node = &root->rb_node; struct rb_node *parent = NULL; - BUG_ON(entity->tree); + BUG_ON(entity->tree != NULL); - while (*node) { + while (*node != NULL) { parent = *node; entry = rb_entry(parent, struct bfq_entity, rb_node); @@ -271,11 +271,12 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) * that the subtree rooted at @node (which may be its left or its right * child) has a valid min_start value. */ -static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) +static inline void bfq_update_min(struct bfq_entity *entity, + struct rb_node *node) { struct bfq_entity *child; - if (node) { + if (node != NULL) { child = rb_entry(node, struct bfq_entity, rb_node); if (bfq_gt(entity->min_start, child->min_start)) entity->min_start = child->min_start; @@ -290,7 +291,7 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) * this function updates its min_start value. The left and right subtrees * are assumed to hold a correct min_start value. 
*/ -static void bfq_update_active_node(struct rb_node *node) +static inline void bfq_update_active_node(struct rb_node *node) { struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); @@ -317,12 +318,12 @@ up: bfq_update_active_node(node); parent = rb_parent(node); - if (!parent) + if (parent == NULL) return; - if (node == parent->rb_left && parent->rb_right) + if (node == parent->rb_left && parent->rb_right != NULL) bfq_update_active_node(parent->rb_right); - else if (parent->rb_left) + else if (parent->rb_left != NULL) bfq_update_active_node(parent->rb_left); node = parent; @@ -354,7 +355,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO struct bfq_sched_data *sd = NULL; struct bfq_group *bfqg = NULL; struct bfq_data *bfqd = NULL; @@ -362,22 +363,22 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_insert(&st->active, entity); - if (node->rb_left) + if (node->rb_left != NULL) node = node->rb_left; - else if (node->rb_right) + else if (node->rb_right != NULL) node = node->rb_right; bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO sd = entity->sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); BUG_ON(!bfqg); bfqd = (struct bfq_data *)bfqg->bfqd; #endif - if (bfqq) + if (bfqq != NULL) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO else { /* bfq_group */ BUG_ON(!bfqd); bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); @@ -396,32 +397,31 @@ static void bfq_active_insert(struct bfq_service_tree *st, * bfq_ioprio_to_weight - calc a weight from an ioprio. * @ioprio: the ioprio value to convert. */ -static unsigned short bfq_ioprio_to_weight(int ioprio) +static inline unsigned short bfq_ioprio_to_weight(int ioprio) { BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; + return IOPRIO_BE_NR - ioprio; } /** * bfq_weight_to_ioprio - calc an ioprio from a weight. * @weight: the weight value to convert. * - * To preserve as much as possible the old only-ioprio user interface, + * To preserve as mush as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_BE_NR */ -static unsigned short bfq_weight_to_ioprio(int weight) +static inline unsigned short bfq_weight_to_ioprio(int weight) { BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? - 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; + return IOPRIO_BE_NR - weight < 0 ? 
0 : IOPRIO_BE_NR - weight; } -static void bfq_get_entity(struct bfq_entity *entity) +static inline void bfq_get_entity(struct bfq_entity *entity) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) { + if (bfqq != NULL) { atomic_inc(&bfqq->ref); bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", bfqq, atomic_read(&bfqq->ref)); @@ -441,15 +441,15 @@ static struct rb_node *bfq_find_deepest(struct rb_node *node) { struct rb_node *deepest; - if (!node->rb_right && !node->rb_left) + if (node->rb_right == NULL && node->rb_left == NULL) deepest = rb_parent(node); - else if (!node->rb_right) + else if (node->rb_right == NULL) deepest = node->rb_left; - else if (!node->rb_left) + else if (node->rb_left == NULL) deepest = node->rb_right; else { deepest = rb_next(node); - if (deepest->rb_right) + if (deepest->rb_right != NULL) deepest = deepest->rb_right; else if (rb_parent(deepest) != node) deepest = rb_parent(deepest); @@ -468,7 +468,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO struct bfq_sched_data *sd = NULL; struct bfq_group *bfqg = NULL; struct bfq_data *bfqd = NULL; @@ -477,18 +477,18 @@ static void bfq_active_extract(struct bfq_service_tree *st, node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); - if (node) + if (node != NULL) bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO sd = entity->sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); BUG_ON(!bfqg); bfqd = (struct bfq_data *)bfqg->bfqd; #endif - if (bfqq) + if (bfqq != NULL) list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO else { /* bfq_group */ BUG_ON(!bfqd); bfq_weights_tree_remove(bfqd, entity, @@ -519,14 +519,14 @@ static void bfq_idle_insert(struct bfq_service_tree *st, struct bfq_entity *first_idle = st->first_idle; struct bfq_entity *last_idle = st->last_idle; - if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) st->first_idle = entity; - if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) st->last_idle = entity; bfq_insert(&st->idle, entity); - if (bfqq) + if (bfqq != NULL) list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); } @@ -549,7 +549,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, entity->on_st = 0; st->wsum -= entity->weight; - if (bfqq) { + if (bfqq != NULL) { sd = entity->sched_data; bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", bfqq, atomic_read(&bfqq->ref)); @@ -581,7 +581,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st) struct bfq_entity *first_idle = st->first_idle; struct bfq_entity *last_idle = st->last_idle; - if (RB_EMPTY_ROOT(&st->active) && last_idle && + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && !bfq_gt(last_idle->finish, st->vtime)) { /* * Forget the whole idle tree, increasing the vtime past @@ -590,7 +590,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st) st->vtime = last_idle->finish; } - if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) bfq_put_idle_entity(st, first_idle); } @@ -600,19 +600,19 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, { struct bfq_service_tree *new_st = old_st; - if 
(entity->prio_changed) { + if (entity->ioprio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned short prev_weight, new_weight; struct bfq_data *bfqd = NULL; struct rb_root *root; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO struct bfq_sched_data *sd; struct bfq_group *bfqg; #endif - if (bfqq) + if (bfqq != NULL) bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO else { sd = entity->my_sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); @@ -634,14 +634,12 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, BUG(); } entity->orig_weight = entity->new_weight; - if (bfqq) - bfqq->ioprio = - bfq_weight_to_ioprio(entity->orig_weight); + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); } - if (bfqq) - bfqq->ioprio_class = bfqq->new_ioprio_class; - entity->prio_changed = 0; + entity->ioprio_class = entity->new_ioprio_class; + entity->ioprio_changed = 0; /* * NOTE: here we may be changing the weight too early, @@ -654,7 +652,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, prev_weight = entity->weight; new_weight = entity->orig_weight * - (bfqq ? bfqq->wr_coeff : 1); + (bfqq != NULL ? bfqq->wr_coeff : 1); /* * If the weight of the entity changes, remove the entity * from its old weight counter (if there is a counter @@ -685,10 +683,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, return new_st; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -#endif - /** * bfq_bfqq_served - update the scheduler status after selection for * service. @@ -699,7 +693,7 @@ static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); * are synchronized every time a new bfqq is selected for service. By now, * we keep it to better check consistency. */ -static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) { struct bfq_entity *entity = &bfqq->entity; struct bfq_service_tree *st; @@ -714,10 +708,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st->vtime += bfq_delta(served, st->wsum); bfq_forget_idle(st); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -#endif - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); } /** @@ -730,7 +721,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) * budget. In this way we should obtain a sort of time-domain * fairness among all the seeky/slow queues. 
*/ -static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; @@ -755,7 +746,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity) struct bfq_service_tree *st = bfq_entity_service_tree(entity); if (entity == sd->in_service_entity) { - BUG_ON(entity->tree); + BUG_ON(entity->tree != NULL); /* * If we are requeueing the current entity we have * to take care of not charging to it service it has @@ -846,7 +837,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) if (!entity->on_st) return 0; - BUG_ON(was_in_service && entity->tree); + BUG_ON(was_in_service && entity->tree != NULL); if (was_in_service) { bfq_calc_finish(entity, entity->service); @@ -855,7 +846,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) bfq_active_extract(st, entity); else if (entity->tree == &st->idle) bfq_idle_extract(st, entity); - else if (entity->tree) + else if (entity->tree != NULL) BUG(); if (was_in_service || sd->next_in_service == entity) @@ -893,7 +884,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) */ break; - if (sd->next_in_service) + if (sd->next_in_service != NULL) /* * The parent entity is still backlogged and * the budgets on the path towards the root @@ -962,7 +953,7 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) struct bfq_entity *entry, *first = NULL; struct rb_node *node = st->active.rb_node; - while (node) { + while (node != NULL) { entry = rb_entry(node, struct bfq_entity, rb_node); left: if (!bfq_gt(entry->start, st->vtime)) @@ -970,7 +961,7 @@ left: BUG_ON(bfq_gt(entry->min_start, st->vtime)); - if (node->rb_left) { + if (node->rb_left != NULL) { entry = rb_entry(node->rb_left, struct bfq_entity, rb_node); if (!bfq_gt(entry->min_start, st->vtime)) { @@ -978,12 +969,12 @@ left: goto left; } } - if (first) + if (first != NULL) break; node = node->rb_right; } - BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); return first; } @@ -1039,13 +1030,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, struct bfq_entity *entity; int i = 0; - BUG_ON(sd->in_service_entity); + BUG_ON(sd->in_service_entity != NULL); - if (bfqd && + if (bfqd != NULL && jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); - if (entity) { + if (entity != NULL) { i = BFQ_IOPRIO_CLASSES - 1; bfqd->bfq_class_idle_last_service = jiffies; sd->next_in_service = entity; @@ -1053,7 +1044,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, } for (; i < BFQ_IOPRIO_CLASSES; i++) { entity = __bfq_lookup_next_entity(st + i, false); - if (entity) { + if (entity != NULL) { if (extract) { bfq_check_next_in_service(sd, entity); bfq_active_extract(st + i, entity); @@ -1076,27 +1067,27 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) struct bfq_sched_data *sd; struct bfq_queue *bfqq; - BUG_ON(bfqd->in_service_queue); + BUG_ON(bfqd->in_service_queue != NULL); if (bfqd->busy_queues == 0) return NULL; sd = &bfqd->root_group->sched_data; - for (; sd ; sd = entity->my_sched_data) { + for (; sd != NULL; sd = entity->my_sched_data) { entity = bfq_lookup_next_entity(sd, 1, bfqd); - BUG_ON(!entity); + BUG_ON(entity == NULL); entity->service = 0; } bfqq = bfq_entity_to_bfqq(entity); - 
BUG_ON(!bfqq); + BUG_ON(bfqq == NULL); return bfqq; } static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) { - if (bfqd->in_service_bic) { + if (bfqd->in_service_bic != NULL) { put_io_context(bfqd->in_service_bic->icq.ioc); bfqd->in_service_bic = NULL; } @@ -1123,10 +1114,6 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_activate_entity(entity); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -#endif - /* * Called when the bfqq no longer has requests pending, remove it from * the service tree. @@ -1160,10 +1147,6 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dequeue(bfqq_group(bfqq)); -#endif - bfq_deactivate_bfqq(bfqd, bfqq, requeue); } diff --git a/block/bfq.h b/block/bfq.h index 320c4389b..93d3f6e95 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v7r9 for 4.2.0: data structures and common functions prototypes. + * BFQ-v7r8 for 4.3.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> @@ -17,14 +17,12 @@ #include <linux/hrtimer.h> #include <linux/ioprio.h> #include <linux/rbtree.h> -#include <linux/blk-cgroup.h> #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) -#define BFQ_MIN_WEIGHT 1 -#define BFQ_MAX_WEIGHT 1000 -#define BFQ_WEIGHT_CONVERSION_COEFF 10 +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 #define BFQ_DEFAULT_QUEUE_IOPRIO 4 @@ -119,8 +117,12 @@ struct bfq_weight_counter { * @ioprio: the ioprio in use. * @new_weight: when a weight change is requested, the new weight value. * @orig_weight: original weight, used to implement weight boosting - * @prio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. * * A bfq_entity is used to represent either a bfq_queue (leaf node in the * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each @@ -132,7 +134,7 @@ struct bfq_weight_counter { * allow different weights on different devices, but this * functionality is not exported to userspace by now. Priorities and * weights are updated lazily, first storing the new values into the - * new_* fields, then setting the @prio_changed flag. As soon as + * new_* fields, then setting the @ioprio_changed flag. As soon as * there is a transition in the entity state that allows the priority * update to take place the effective and the requested priority * values are synchronized. @@ -159,7 +161,7 @@ struct bfq_entity { u64 min_start; - int service, budget; + unsigned long service, budget; unsigned short weight, new_weight; unsigned short orig_weight; @@ -168,7 +170,10 @@ struct bfq_entity { struct bfq_sched_data *my_sched_data; struct bfq_sched_data *sched_data; - int prio_changed; + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + + int ioprio_changed; }; struct bfq_group; @@ -177,14 +182,10 @@ struct bfq_group; * struct bfq_queue - leaf schedulable entity. * @ref: reference counter. * @bfqd: parent bfq_data. 
- * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. * @new_bfqq: shared bfq_queue if queue is cooperating with * one or more other queues. - * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). * @sort_list: sorted list of pending requests. * @next_rq: if fifo isn't expired, next request to serve. * @queued: nr of requests queued in @sort_list. @@ -238,9 +239,6 @@ struct bfq_queue { atomic_t ref; struct bfq_data *bfqd; - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; - /* fields for cooperating queues handling */ struct bfq_queue *new_bfqq; struct rb_node pos_node; @@ -255,7 +253,7 @@ struct bfq_queue { struct bfq_entity entity; - int max_budget; + unsigned long max_budget; unsigned long budget_timeout; int dispatched; @@ -304,8 +302,6 @@ struct bfq_ttime { * @icq: associated io_cq structure * @bfqq: array of two process queues, the sync and the async * @ttime: associated @bfq_ttime struct - * @ioprio: per (request_queue, blkcg) ioprio. - * @blkcg_id: id of the blkcg the related io_cq belongs to. * @wr_time_left: snapshot of the time left before weight raising ends * for the sync queue associated to this process; this * snapshot is taken to remember this value while the weight @@ -333,10 +329,6 @@ struct bfq_io_cq { struct bfq_ttime ttime; int ioprio; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ -#endif - unsigned int wr_time_left; bool saved_idle_window; bool saved_IO_bound; @@ -357,6 +349,9 @@ enum bfq_device_speed { * struct bfq_data - per device data structure. * @queue: request queue for the managed device. * @root_group: root bfq_group for the device. + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_close_cooperator()). * @active_numerous_groups: number of bfq_groups containing more than one * active @bfq_entity. 
* @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by @@ -490,8 +485,9 @@ struct bfq_data { struct request_queue *queue; struct bfq_group *root_group; + struct rb_root rq_pos_tree; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#ifdef CONFIG_CGROUP_BFQIO int active_numerous_groups; #endif @@ -524,7 +520,7 @@ struct bfq_data { ktime_t last_idling_start; int peak_rate_samples; u64 peak_rate; - int bfq_max_budget; + unsigned long bfq_max_budget; struct hlist_head group_list; struct list_head active_list; @@ -536,8 +532,8 @@ struct bfq_data { unsigned int bfq_slice_idle; u64 bfq_class_idle_last_service; - int bfq_user_max_budget; - int bfq_max_budget_async_rq; + unsigned int bfq_user_max_budget; + unsigned int bfq_max_budget_async_rq; unsigned int bfq_timeout[2]; unsigned int bfq_coop_thresh; @@ -597,15 +593,15 @@ enum bfqq_state_flags { }; #define BFQ_BFQQ_FNS(name) \ -static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ { \ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ } \ -static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ { \ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ } \ -static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ { \ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ } @@ -644,64 +640,14 @@ enum bfqq_expiration { BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ }; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -struct bfqg_stats { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ - struct blkg_rwstat service_time; - /* total time spent waiting in scheduler queue in ns */ - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ - struct blkg_stat avg_queue_size_samples; - /* how many times this group has been removed from service tree */ - struct blkg_stat dequeue; - /* total time spent waiting for it to be assigned a timeslice. */ - struct blkg_stat group_wait_time; - /* time spent idling for this blkcg_gq */ - struct blkg_stat idle_time; - /* total time with empty current active q with other requests queued */ - struct blkg_stat empty_time; - /* fields after this shouldn't be cleared on stat reset */ - uint64_t start_group_wait_time; - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; -}; - -/* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. - * - * @ps: @blkcg_policy_storage that this structure inherits - * @weight: weight of the bfq_group - */ -struct bfq_group_data { - /* must be the first member */ - struct blkcg_policy_data pd; - - unsigned short weight; -}; - +#ifdef CONFIG_CGROUP_BFQIO /** * struct bfq_group - per (device, cgroup) data structure. * @entity: schedulable entity to insert into the parent group sched_data. 
* @sched_data: own sched_data, to contain child entities (they may be * both bfq_queues and bfq_groups). + * @group_node: node to be inserted into the bfqio_cgroup->group_data + * list of the containing cgroup's bfqio_cgroup. * @bfqd_node: node to be inserted into the @bfqd->group_list list * of the groups active on the same device; used for cleanup. * @bfqd: the bfq_data for the device this group acts upon. @@ -717,26 +663,23 @@ struct bfq_group_data { * are groups with more than one active @bfq_entity * (see the comments to the function * bfq_bfqq_must_not_expire()). - * @rq_pos_tree: rbtree sorted by next_request position, used when - * determining if two or more queues have interleaving - * requests (see bfq_find_close_cooperator()). * * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup * there is a set of bfq_groups, each one collecting the lower-level * entities belonging to the group that are acting on the same device. * * Locking works as follows: + * o @group_node is protected by the bfqio_cgroup lock, and is accessed + * via RCU from its readers. * o @bfqd is protected by the queue lock, RCU is used to access it * from the readers. * o All the other fields are protected by the @bfqd queue lock. */ struct bfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - struct bfq_entity entity; struct bfq_sched_data sched_data; + struct hlist_node group_node; struct hlist_node bfqd_node; void *bfqd; @@ -747,33 +690,44 @@ struct bfq_group { struct bfq_entity *my_entity; int active_entities; +}; - struct rb_root rq_pos_tree; +/** + * struct bfqio_cgroup - bfq cgroup data structure. + * @css: subsystem state for bfq in the containing cgroup. + * @online: flag marked when the subsystem is inserted. + * @weight: cgroup weight. + * @ioprio: cgroup ioprio. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. + * @group_data: list containing the bfq_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @ioprio and @ioprio_class are protected by @lock. + */ +struct bfqio_cgroup { + struct cgroup_subsys_state css; + bool online; - struct bfqg_stats stats; - struct bfqg_stats dead_stats; /* stats pushed from dead children */ -}; + unsigned short weight, ioprio, ioprio_class; + spinlock_t lock; + struct hlist_head group_data; +}; #else struct bfq_group { struct bfq_sched_data sched_data; struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; struct bfq_queue *async_idle_bfqq; - - struct rb_root rq_pos_tree; }; #endif -static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - -static struct bfq_service_tree * +static inline struct bfq_service_tree * bfq_entity_service_tree(struct bfq_entity *entity) { struct bfq_sched_data *sched_data = entity->sched_data; - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int idx = bfqq ? 
bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS; + unsigned int idx = entity->ioprio_class - 1; BUG_ON(idx >= BFQ_IOPRIO_CLASSES); BUG_ON(sched_data == NULL); @@ -781,18 +735,19 @@ bfq_entity_service_tree(struct bfq_entity *entity) return sched_data->service_tree + idx; } -static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, + bool is_sync) { return bic->bfqq[is_sync]; } -static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, - bool is_sync) +static inline void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, bool is_sync) { bic->bfqq[is_sync] = bfqq; } -static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) { return bic->icq.q->elevator->elevator_data; } @@ -811,7 +766,8 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) * the function returns NULL, with the queue unlocked, otherwise it * returns the dereferenced pointer, with the queue locked. */ -static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, + unsigned long *flags) { struct bfq_data *bfqd; @@ -820,9 +776,7 @@ static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) if (bfqd != NULL) { spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (ptr == NULL) - printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); - else if (*ptr == bfqd) + if (*ptr == bfqd) goto out; spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); } @@ -833,37 +787,17 @@ out: return bfqd; } -static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, + unsigned long *flags) { spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - - return container_of(group_entity, struct bfq_group, entity); -} - -#else - -static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - return bfqq->bfqd->root_group; -} - -#endif - -static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); +static void bfq_check_ioprio_change(struct bfq_io_cq *bic); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, + struct bfq_group *bfqg, int is_sync, struct bfq_io_cq *bic, gfp_t gfp_mask); static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 719b7152a..14b8faf8b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -140,6 +140,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; + if (bip->bip_vcnt && + bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + &bip->bip_vec[bip->bip_vcnt - 1], offset)) + return 0; + iv->bv_page = page; iv->bv_len = len; iv->bv_offset = offset; @@ -355,13 +360,12 @@ static void bio_integrity_verify_fn(struct work_struct *work) container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - int error; - 
error = bio_integrity_process(bio, bi->verify_fn); + bio->bi_error = bio_integrity_process(bio, bi->verify_fn); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio(bio); } /** @@ -376,7 +380,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) * in process context. This function postpones completion * accordingly. */ -void bio_integrity_endio(struct bio *bio, int error) +void bio_integrity_endio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -386,9 +390,9 @@ void bio_integrity_endio(struct bio *bio, int error) * integrity metadata. Restore original bio end_io handler * and run it. */ - if (error) { + if (bio->bi_error) { bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio(bio); return; } diff --git a/block/bio.c b/block/bio.c index d6e5ba339..ad3f276d7 100644 --- a/block/bio.c +++ b/block/bio.c @@ -269,7 +269,6 @@ static void bio_free(struct bio *bio) void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); - bio->bi_flags = 1 << BIO_UPTODATE; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); } @@ -292,14 +291,17 @@ void bio_reset(struct bio *bio) __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags | (1 << BIO_UPTODATE); + bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); -static void bio_chain_endio(struct bio *bio, int error) +static void bio_chain_endio(struct bio *bio) { - bio_endio(bio->bi_private, error); + struct bio *parent = bio->bi_private; + + parent->bi_error = bio->bi_error; + bio_endio(parent); bio_put(bio); } @@ -309,7 +311,7 @@ static void bio_chain_endio(struct bio *bio, int error) */ static inline void bio_inc_remaining(struct bio *bio) { - bio->bi_flags |= (1 << BIO_CHAIN); + bio_set_flag(bio, BIO_CHAIN); smp_mb__before_atomic(); atomic_inc(&bio->__bi_remaining); } @@ -493,7 +495,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= 1 << BIO_OWNS_VEC; + bio_set_flag(bio, BIO_OWNS_VEC); } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } @@ -578,7 +580,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) * so we don't set nor calculate new physical/hw segment counts here */ bio->bi_bdev = bio_src->bi_bdev; - bio->bi_flags |= 1 << BIO_CLONED; + bio_set_flag(bio, BIO_CLONED); bio->bi_rw = bio_src->bi_rw; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; @@ -692,31 +694,22 @@ integrity_clone: EXPORT_SYMBOL(bio_clone_bioset); /** - * bio_get_nr_vecs - return approx number of vecs - * @bdev: I/O target + * bio_add_pc_page - attempt to add page to bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset * - * Return the approximate number of pages we can send to this target. - * There's no guarantee that you will be able to fit this number of pages - * into a bio, it does not account for dynamic restrictions that vary - * on offset. + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. 
*/ -int bio_get_nr_vecs(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = min_t(unsigned, - queue_max_segments(q), - queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); - - return min_t(unsigned, nr_pages, BIO_MAX_PAGES); - -} -EXPORT_SYMBOL(bio_get_nr_vecs); - -static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset, - unsigned int max_sectors) +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page + *page, unsigned int len, unsigned int offset) { int retried_segments = 0; struct bio_vec *bvec; @@ -727,7 +720,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q)) return 0; /* @@ -740,28 +733,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { - unsigned int prev_bv_len = prev->bv_len; prev->bv_len += len; - - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - /* prev_bvec is already charged in - bi_size, discharge it in order to - simulate merging updated prev_bvec - as new bvec. */ - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size - - prev_bv_len, - .bi_rw = bio->bi_rw, - }; - - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { - prev->bv_len -= len; - return 0; - } - } - bio->bi_iter.bi_size += len; goto done; } @@ -770,8 +742,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ - if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && - bvec_gap_to_prev(prev, offset)) + if (bvec_gap_to_prev(q, prev, offset)) return 0; } @@ -804,30 +775,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page blk_recount_segments(q, bio); } - /* - * if queue has other restrictions (eg varying max sector size - * depending on offset), it can specify a merge_bvec_fn in the - * queue to get further control - */ - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size - len, - .bi_rw = bio->bi_rw, - }; - - /* - * merge_bvec_fn() returns number of bytes it can accept - * at this offset - */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) - goto failed; - } - /* If we may be able to merge these biovecs, force a recount */ if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) - bio->bi_flags &= ~(1 << BIO_SEG_VALID); + bio_clear_flag(bio, BIO_SEG_VALID); done: return len; @@ -841,28 +791,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page blk_recount_segments(q, bio); return 0; } - -/** - * bio_add_pc_page - attempt to add page to bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by REQ_PC bios. 
- */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return __bio_add_page(q, bio, page, len, offset, - queue_max_hw_sectors(q)); -} EXPORT_SYMBOL(bio_add_pc_page); /** @@ -872,22 +800,47 @@ EXPORT_SYMBOL(bio_add_pc_page); * @len: vec entry length * @offset: vec entry offset * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. + * Attempt to add a page to the bio_vec maplist. This will only fail + * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ -int bio_add_page(struct bio *bio, struct page *page, unsigned int len, - unsigned int offset) +int bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int max_sectors; + struct bio_vec *bv; + + /* + * cloned bio must not modify vec list + */ + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; - max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); - if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size) - max_sectors = len >> 9; + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. + */ + if (bio->bi_vcnt > 0) { + bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - return __bio_add_page(q, bio, page, len, offset, max_sectors); + if (page == bv->bv_page && + offset == bv->bv_offset + bv->bv_len) { + bv->bv_len += len; + goto done; + } + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + + bv = &bio->bi_io_vec[bio->bi_vcnt]; + bv->bv_page = page; + bv->bv_len = len; + bv->bv_offset = offset; + + bio->bi_vcnt++; +done: + bio->bi_iter.bi_size += len; + return len; } EXPORT_SYMBOL(bio_add_page); @@ -896,11 +849,11 @@ struct submit_bio_ret { int error; }; -static void submit_bio_wait_endio(struct bio *bio, int error) +static void submit_bio_wait_endio(struct bio *bio) { struct submit_bio_ret *ret = bio->bi_private; - ret->error = error; + ret->error = bio->bi_error; complete(&ret->event); } @@ -1388,7 +1341,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; - bio->bi_flags |= (1 << BIO_USER_MAPPED); + bio_set_flag(bio, BIO_USER_MAPPED); /* * subtle -- if __bio_map_user() ended up bouncing a bio, @@ -1445,7 +1398,7 @@ void bio_unmap_user(struct bio *bio) } EXPORT_SYMBOL(bio_unmap_user); -static void bio_map_kern_endio(struct bio *bio, int err) +static void bio_map_kern_endio(struct bio *bio) { bio_put(bio); } @@ -1501,13 +1454,13 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, } EXPORT_SYMBOL(bio_map_kern); -static void bio_copy_kern_endio(struct bio *bio, int err) +static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); bio_put(bio); } -static void bio_copy_kern_endio_read(struct bio *bio, int err) +static void bio_copy_kern_endio_read(struct bio *bio) { char *p = bio->bi_private; struct bio_vec *bvec; @@ -1518,7 +1471,7 @@ static void bio_copy_kern_endio_read(struct bio *bio, int err) p += bvec->bv_len; } - bio_copy_kern_endio(bio, err); + bio_copy_kern_endio(bio); } /** @@ -1768,7 +1721,7 @@ static inline bool bio_remaining_done(struct bio *bio) 
BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); if (atomic_dec_and_test(&bio->__bi_remaining)) { - clear_bit(BIO_CHAIN, &bio->bi_flags); + bio_clear_flag(bio, BIO_CHAIN); return true; } @@ -1778,25 +1731,15 @@ static inline bool bio_remaining_done(struct bio *bio) /** * bio_endio - end I/O on a bio * @bio: bio - * @error: error, if any * * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. + * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred + * way to end I/O on a bio. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io function. **/ -void bio_endio(struct bio *bio, int error) +void bio_endio(struct bio *bio) { while (bio) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - if (unlikely(!bio_remaining_done(bio))) break; @@ -1810,11 +1753,12 @@ void bio_endio(struct bio *bio, int error) */ if (bio->bi_end_io == bio_chain_endio) { struct bio *parent = bio->bi_private; + parent->bi_error = bio->bi_error; bio_put(bio); bio = parent; } else { if (bio->bi_end_io) - bio->bi_end_io(bio, error); + bio->bi_end_io(bio); bio = NULL; } } @@ -1882,7 +1826,7 @@ void bio_trim(struct bio *bio, int offset, int size) if (offset == 0 && size == bio->bi_iter.bi_size) return; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); + bio_clear_flag(bio, BIO_SEG_VALID); bio_advance(bio, offset << 9); @@ -2046,7 +1990,7 @@ int bio_associate_current(struct bio *bio) get_io_context_active(ioc); bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, blkio_cgrp_id); + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } EXPORT_SYMBOL_GPL(bio_associate_current); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9cc48d1d7..55512dd62 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -24,6 +24,7 @@ #include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> +#include <linux/ctype.h> #include <linux/blk-cgroup.h> #include "blk.h" @@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blk_exit_rl(&blkg->rl); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; @@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -129,26 +139,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether 
to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, @@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { - ret = -EINVAL; + ret = -ENODEV; goto err_free_blkg; } wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_ATOMIC); + blkcg->css.id, GFP_NOWAIT); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; @@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; @@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - ret = -EINVAL; + ret = -ENODEV; goto err_put_congested; } blkg_get(blkg->parent); @@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; @@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? 
-ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); + } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -403,15 +386,6 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); @@ -475,12 +449,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -489,13 +465,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -584,9 +561,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -623,31 +601,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. 
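The helpers added above locate the counter inside struct blkcg_gq by byte offset: a single prfill callback, blkg_prfill_rwstat_field(), serves both stat_bytes and stat_ios because the cftype only has to pass an offsetof(struct blkcg_gq, ...) value down through blkcg_print_blkgs(). A minimal userspace sketch of that offsetof() pattern follows; the struct and function names in it are invented for illustration and are not part of the patch.

/*
 * Illustrative userspace sketch, not part of the patch: the offsetof()
 * pattern used by the new blkg_prfill_rwstat_field() helper.  All names
 * below (struct grp, print_counter, ...) are invented for the example.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct grp {
	const char *name;
	uint64_t stat_bytes;
	uint64_t stat_ios;
};

/* Generic printer: the caller selects the member by its byte offset. */
static void print_counter(const struct grp *g, size_t off)
{
	const uint64_t *val = (const void *)((const char *)g + off);

	printf("%s %llu\n", g->name, (unsigned long long)*val);
}

int main(void)
{
	struct grp g = { "8:0", 4096, 7 };

	/* One callback serves both files, as with stat_bytes and stat_ios. */
	print_counter(&g, offsetof(struct grp, stat_bytes));
	print_counter(&g, offsetof(struct grp, stat_ios));
	return 0;
}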
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+			  offsetof(struct blkcg_gq, stat_bytes), true);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+			  offsetof(struct blkcg_gq, stat_ios), true);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+					      struct blkg_policy_data *pd,
+					      int off)
+{
+	struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+							      NULL, off);
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat_field_recursive,
+			  (void *)seq_cft(sf)->private,
+			  offsetof(struct blkcg_gq, stat_bytes), true);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat_field_recursive,
+			  (void *)seq_cft(sf)->private,
+			  offsetof(struct blkcg_gq, stat_ios), true);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
+
 /**
  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts. The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum. The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
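As the comment above describes, blkg_stat_recursive_sum() and blkg_rwstat_recursive_sum() now take the blkg and an optional policy and walk the blkg plus all of its online descendants, adding each node's live counter and its aux count. A rough userspace sketch of that walk is shown below; it assumes a made-up node type with an explicit child array in place of the kernel's css-based descendant iterator.

/*
 * Illustrative userspace sketch, not part of the patch: the
 * "node plus all online descendants, live count plus aux count"
 * summation that blkg_stat_recursive_sum() performs.  The node type
 * and the explicit child array are invented; the kernel walks the
 * cgroup tree with blkg_for_each_descendant_pre() instead.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct node {
	bool online;
	uint64_t count;			/* live counter */
	uint64_t aux;			/* carried over from dead children */
	struct node *child[4];
	int nr_children;
};

static uint64_t recursive_sum(const struct node *n)
{
	uint64_t sum = 0;

	/* Offline nodes contribute nothing, but their subtree is still walked. */
	if (n->online)
		sum += n->count + n->aux;

	for (int i = 0; i < n->nr_children; i++)
		sum += recursive_sum(n->child[i]);

	return sum;
}

int main(void)
{
	struct node leaf = { .online = true, .count = 5, .aux = 1 };
	struct node gone = { .online = false, .count = 100 };
	struct node root = { .online = true, .count = 10,
			     .child = { &leaf, &gone }, .nr_children = 2 };

	/* 10 + (5 + 1) = 16; the offline child's counters are skipped. */
	printf("%llu\n", (unsigned long long)recursive_sum(&root));
	return 0;
}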
*/ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; + + if (!pos_blkg->online) + continue; + + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; - if (pos_blkg->online) - sum += blkg_stat_read(stat); + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -657,37 +726,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; - tmp = blkg_rwstat_read(rwstat); + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); @@ -703,29 +778,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). 
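The reworked blkg_conf_prep(), just below, no longer scans a single numeric value; it consumes only the MAJ:MIN key (using sscanf()'s %n to find where the key ends), requires whitespace after it, and hands the rest of the line to the policy as ctx->body. The following standalone C sketch mirrors that parsing; parse_conf() and the sample input are invented for the example.

/*
 * Illustrative userspace sketch, not part of the patch: the
 * "MAJ:MIN <rest of line>" parsing done by the reworked blkg_conf_prep().
 * parse_conf() and the sample input are invented for the example.
 */
#include <stdio.h>
#include <ctype.h>

static int parse_conf(char *input, unsigned int *major, unsigned int *minor,
		      char **body)
{
	int key_len;

	/* %n records how many characters the MAJ:MIN key consumed. */
	if (sscanf(input, "%u:%u%n", major, minor, &key_len) != 2)
		return -1;

	/* The key must be followed by whitespace, then the payload. */
	if (!isspace((unsigned char)input[key_len]))
		return -1;

	*body = input + key_len;
	while (isspace((unsigned char)**body))
		(*body)++;

	return 0;
}

int main(void)
{
	char line[] = "8:16  max=1048576";
	unsigned int major, minor;
	char *body;

	if (parse_conf(line, &major, &minor, &body) == 0)
		printf("dev %u:%u, body \"%s\"\n", major, minor, body);
	return 0;
}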
*/ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; + body = input + key_len; + if (!isspace(*body)) + return -EINVAL; + body = skip_spaces(body); + disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) - return -EINVAL; + return -ENODEV; if (part) { put_disk(disk); - return -EINVAL; + return -ENODEV; } rcu_read_lock(); @@ -734,7 +814,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); @@ -756,7 +836,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); @@ -777,8 +857,55 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + struct cftype blkcg_files[] = { { + .name = "stat", + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + +struct cftype blkcg_legacy_files[] = { + { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, @@ -825,18 +952,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + int i; mutex_lock(&blkcg_pol_mutex); + list_del(&blkcg->all_blkcgs_node); - mutex_unlock(&blkcg_pol_mutex); - if (blkcg != &blkcg_root) { - int i; + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->pd[i]); - kfree(blkcg); - } + mutex_unlock(&blkcg_pol_mutex); + + kfree(blkcg); } static struct cgroup_subsys_state * @@ -850,13 +978,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) { blkcg = &blkcg_root; - goto done; - } - - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); - goto free_blkcg; + } else { + 
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } for (i = 0; i < BLKCG_MAX_POLS ; i++) { @@ -869,23 +996,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ - if (!pol || !pol->cpd_size) + if (!pol || !pol->cpd_alloc_fn) continue; - BUG_ON(blkcg->pd[i]); - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } - blkcg->pd[i] = cpd; + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(blkcg); + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); } -done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -897,7 +1024,8 @@ done: free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->pd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -941,7 +1069,7 @@ int blkcg_init_queue(struct request_queue *q) radix_tree_preload_end(); if (IS_ERR(blkg)) { - kfree(new_blkg); + blkg_free(new_blkg); return PTR_ERR(blkg); } @@ -1018,12 +1146,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .legacy_cftypes = blkcg_files, + .bind = blkcg_bind, + .dfl_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1033,7 +1184,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1054,65 +1205,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); + struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *nd; - int cnt = 0, ret; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - spin_unlock_irq(q->queue_lock); - - /* allocate per-blkg policy data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * 
Install the allocated pds and cpds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, nd, &pds, alloc_node) - kfree(pd); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1142,13 +1282,12 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1170,9 +1309,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); @@ -1189,36 +1325,42 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { mutex_unlock(&blkcg_pol_mutex); goto err_free_cpds; } - blkcg->pd[pol->plid] = cpd; + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; cpd->plid = pol->plid; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, - pol->cftypes)); + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + if (pol->legacy_cftypes) + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, + pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; @@ -1245,16 +1387,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ - if 
(pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; diff --git a/block/blk-core.c b/block/blk-core.c index d1a63defd..48a6cc8df 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -145,18 +145,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + bio->bi_error = error; if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); + bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -580,7 +578,7 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_destroy(&q->backing_dev_info); + bdi_unregister(&q->backing_dev_info); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); @@ -647,6 +645,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (q->id < 0) goto fail_q; + q->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!q->bio_split) + goto fail_id; + q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; @@ -655,7 +657,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) err = bdi_init(&q->backing_dev_info); if (err) - goto fail_id; + goto fail_split; setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); @@ -697,6 +699,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_bdi: bdi_destroy(&q->backing_dev_info); +fail_split: + bioset_free(q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: @@ -1614,6 +1618,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) struct request *req; unsigned int request_count = 0; + blk_queue_split(q, &bio, q->bio_split); + /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1622,7 +1628,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); + bio->bi_error = -EIO; + bio_endio(bio); return; } @@ -1675,7 +1682,8 @@ get_rq: */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { - bio_endio(bio, PTR_ERR(req)); /* @q is dead */ + bio->bi_error = PTR_ERR(req); + bio_endio(bio); goto out_unlock; } @@ -1834,15 +1842,6 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (likely(bio_is_rw(bio) && - nr_sectors > queue_max_hw_sectors(q))) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - part = 
bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, @@ -1891,14 +1890,15 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; end_io: - bio_endio(bio, err); + bio->bi_error = err; + bio_endio(bio); return false; } @@ -1991,7 +1991,7 @@ void submit_bio(int rw, struct bio *bio) bio->bi_rw |= rw; if (unlikely(trap_non_toi_io)) - BUG_ON(!(bio->bi_flags & (1 << BIO_TOI))); + BUG_ON(!bio_flagged(bio, BIO_TOI)); /* * If it's a regular read/write or a barrier with data attached, diff --git a/block/blk-integrity.c b/block/blk-integrity.c index f548b64be..75f29cf70 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -204,6 +204,9 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, q->limits.max_integrity_segments) return false; + if (integrity_req_gap_back_merge(req, next->bio)) + return false; + return true; } EXPORT_SYMBOL(blk_integrity_merge_rq); diff --git a/block/blk-lib.c b/block/blk-lib.c index 7688ee3f5..9ebf65379 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -11,16 +11,16 @@ struct bio_batch { atomic_t done; - unsigned long flags; + int error; struct completion *wait; }; -static void bio_batch_end_io(struct bio *bio, int err) +static void bio_batch_end_io(struct bio *bio) { struct bio_batch *bb = bio->bi_private; - if (err && (err != -EOPNOTSUPP)) - clear_bit(BIO_UPTODATE, &bb->flags); + if (bio->bi_error && bio->bi_error != -EOPNOTSUPP) + bb->error = bio->bi_error; if (atomic_dec_and_test(&bb->done)) complete(bb->wait); bio_put(bio); @@ -43,7 +43,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - unsigned int max_discard_sectors, granularity; + unsigned int granularity; int alignment; struct bio_batch bb; struct bio *bio; @@ -60,17 +60,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; - /* - * Ensure that max_discard_sectors is of the proper - * granularity, so that requests stay aligned after a split. - */ - max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - max_discard_sectors -= max_discard_sectors % granularity; - if (unlikely(!max_discard_sectors)) { - /* Avoid infinite loop below. Being cautious never hurts. 
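With bi_error in place, the bio_batch used by blkdev_issue_discard() and friends keeps the first meaningful error in bb.error instead of clearing an up-to-date flag, while the done counter still decides when the batch has finished. A simplified, single-threaded userspace sketch of that pattern follows; the batch type and errno values are stand-ins, and the extra reference the kernel holds for the submitter is left out to keep the example short.

/*
 * Illustrative userspace sketch, not part of the patch: the bio_batch
 * pattern after the rework, i.e. a shared completion count plus a slot
 * that keeps the first meaningful error.
 */
#include <stdio.h>
#include <errno.h>
#include <stdatomic.h>

struct batch {
	atomic_int done;	/* outstanding members */
	int error;		/* first error other than -EOPNOTSUPP */
};

static void batch_end_io(struct batch *b, int error)
{
	if (error && error != -EOPNOTSUPP)
		b->error = error;

	/* Analogue of atomic_dec_and_test(): did we just hit zero? */
	if (atomic_fetch_sub(&b->done, 1) == 1)
		printf("batch complete, error=%d\n", b->error);
}

int main(void)
{
	struct batch b = { .error = 0 };

	atomic_init(&b.done, 3);	/* three members in flight */

	batch_end_io(&b, 0);
	batch_end_io(&b, -EOPNOTSUPP);	/* ignored, as in bio_batch_end_io() */
	batch_end_io(&b, -EIO);		/* recorded and reported */

	return 0;
}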
*/ - return -EOPNOTSUPP; - } - if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; @@ -78,7 +67,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; blk_start_plug(&plug); @@ -92,7 +81,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, break; } - req_sects = min_t(sector_t, nr_sects, max_discard_sectors); + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); /* * If splitting a request, and the next starting sector would be @@ -134,9 +124,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - ret = -EIO; - + if (bb.error) + return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); @@ -166,13 +155,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; - max_write_same_sectors = q->limits.max_write_same_sectors; - - if (max_write_same_sectors == 0) - return -EOPNOTSUPP; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ + max_write_same_sectors = UINT_MAX >> 9; atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; while (nr_sects) { @@ -208,9 +195,8 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - ret = -ENOTSUPP; - + if (bb.error) + return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); @@ -236,7 +222,7 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; ret = 0; @@ -270,10 +256,8 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - /* One of bios in the batch was completed with error.*/ - ret = -EIO; - + if (bb.error) + return bb.error; return ret; } diff --git a/block/blk-map.c b/block/blk-map.c index da310a105..f565e11f4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,6 +9,24 @@ #include "blk.h" +static bool iovec_gap_to_prv(struct request_queue *q, + struct iovec *prv, struct iovec *cur) +{ + unsigned long prev_end; + + if (!queue_virt_boundary(q)) + return false; + + if (prv->iov_base == NULL && prv->iov_len == 0) + /* prv is not set - don't check */ + return false; + + prev_end = (unsigned long)(prv->iov_base + prv->iov_len); + + return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || + prev_end & queue_virt_boundary(q)); +} + int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -67,7 +85,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct bio *bio; int unaligned = 0; struct iov_iter i; - struct iovec iov; + struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; if (!iter || !iter->count) return -EINVAL; @@ -81,8 +99,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, /* * Keep going so we check length of all segments */ - if (uaddr & queue_dma_alignment(q)) + if ((uaddr & queue_dma_alignment(q)) || + iovec_gap_to_prv(q, &prv, &iov)) unaligned = 1; + + prv.iov_base = iov.iov_base; + 
prv.iov_len = iov.iov_len; } if (unaligned || (q->dma_pad_mask & iter->count) || map_data) @@ -94,7 +116,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, return PTR_ERR(bio); if (map_data && map_data->null_mapped) - bio->bi_flags |= (1 << BIO_NULL_MAPPED); + bio_set_flag(bio, BIO_NULL_MAPPED); if (bio->bi_iter.bi_size != iter->count) { /* @@ -103,7 +125,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, * normal IO completion path */ bio_get(bio); - bio_endio(bio, 0); + bio_endio(bio); __blk_rq_unmap_user(bio); return -EINVAL; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 30a0d9f89..c4e9c37f3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -9,12 +9,134 @@ #include "blk.h" +static struct bio *blk_bio_discard_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs) +{ + unsigned int max_discard_sectors, granularity; + int alignment; + sector_t tmp; + unsigned split_sectors; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + + max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); + max_discard_sectors -= max_discard_sectors % granularity; + + if (unlikely(!max_discard_sectors)) { + /* XXX: warn */ + return NULL; + } + + if (bio_sectors(bio) <= max_discard_sectors) + return NULL; + + split_sectors = max_discard_sectors; + + /* + * If the next starting sector would be misaligned, stop the discard at + * the previous aligned sector. + */ + alignment = (q->limits.discard_alignment >> 9) % granularity; + + tmp = bio->bi_iter.bi_sector + split_sectors - alignment; + tmp = sector_div(tmp, granularity); + + if (split_sectors > tmp) + split_sectors -= tmp; + + return bio_split(bio, split_sectors, GFP_NOIO, bs); +} + +static struct bio *blk_bio_write_same_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs) +{ + if (!q->limits.max_write_same_sectors) + return NULL; + + if (bio_sectors(bio) <= q->limits.max_write_same_sectors) + return NULL; + + return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); +} + +static struct bio *blk_bio_segment_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs) +{ + struct bio_vec bv, bvprv, *bvprvp = NULL; + struct bvec_iter iter; + unsigned seg_size = 0, nsegs = 0, sectors = 0; + + bio_for_each_segment(bv, bio, iter) { + if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) + goto split; + + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. 
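blk_bio_discard_split(), shown above, caps the first fragment at max_discard_sectors and then shortens it so the next fragment starts on a discard-granularity boundary, offset by the device's discard alignment. The arithmetic is easy to check in isolation; the userspace sketch below reproduces it with a plain modulo in place of sector_div() and an invented helper name, so it is a simplification rather than the kernel function itself.

/*
 * Illustrative userspace sketch, not part of the patch: the alignment
 * arithmetic in blk_bio_discard_split(), with a plain modulo standing in
 * for sector_div() and an invented helper name.
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int discard_split_sectors(uint64_t start_sector,
					  unsigned int bio_sectors,
					  unsigned int max_discard_sectors,
					  unsigned int granularity,
					  unsigned int alignment)
{
	unsigned int split;
	uint64_t rem;

	/* Keep the cap itself a multiple of the granularity. */
	max_discard_sectors -= max_discard_sectors % granularity;

	if (bio_sectors <= max_discard_sectors)
		return bio_sectors;		/* no split needed */

	split = max_discard_sectors;

	/* How far past an aligned boundary would the next fragment start? */
	rem = (start_sector + split - alignment) % granularity;
	if (split > rem)
		split -= rem;

	return split;
}

int main(void)
{
	/* 4 KiB granularity (8 sectors), no alignment offset, 512 KiB cap. */
	printf("%u\n", discard_split_sectors(4, 4096, 1024, 8, 0)); /* 1020 */
	return 0;
}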
+ */ + if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) + goto split; + + if (bvprvp && blk_queue_cluster(q)) { + if (seg_size + bv.bv_len > queue_max_segment_size(q)) + goto new_segment; + if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) + goto new_segment; + + seg_size += bv.bv_len; + bvprv = bv; + bvprvp = &bv; + sectors += bv.bv_len >> 9; + continue; + } +new_segment: + if (nsegs == queue_max_segments(q)) + goto split; + + nsegs++; + bvprv = bv; + bvprvp = &bv; + seg_size = bv.bv_len; + sectors += bv.bv_len >> 9; + } + + return NULL; +split: + return bio_split(bio, sectors, GFP_NOIO, bs); +} + +void blk_queue_split(struct request_queue *q, struct bio **bio, + struct bio_set *bs) +{ + struct bio *split; + + if ((*bio)->bi_rw & REQ_DISCARD) + split = blk_bio_discard_split(q, *bio, bs); + else if ((*bio)->bi_rw & REQ_WRITE_SAME) + split = blk_bio_write_same_split(q, *bio, bs); + else + split = blk_bio_segment_split(q, *bio, q->bio_split); + + if (split) { + bio_chain(split, *bio); + generic_make_request(*bio); + *bio = split; + } +} +EXPORT_SYMBOL(blk_queue_split); + static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio, bool no_sg_merge) { struct bio_vec bv, bvprv = { NULL }; - int cluster, high, highprv = 1; + int cluster, prev = 0; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; struct bvec_iter iter; @@ -36,7 +158,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; - high = 0; for_each_bio(bio) { bio_for_each_segment(bv, bio, iter) { /* @@ -46,13 +167,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (no_sg_merge) goto new_segment; - /* - * the trick here is making sure that a high page is - * never considered part of another segment, since - * that might change with the bounce page. 
- */ - high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); - if (!high && !highprv && cluster) { + if (prev && cluster) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; @@ -72,8 +187,8 @@ new_segment: nr_phys_segs++; bvprv = bv; + prev = 1; seg_size = bv.bv_len; - highprv = high; } bbio = bio; } @@ -116,7 +231,7 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) bio->bi_next = nxt; } - bio->bi_flags |= (1 << BIO_SEG_VALID); + bio_set_flag(bio, BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); @@ -266,7 +381,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->cmd_flags & REQ_WRITE) memset(q->dma_drain_buffer, 0, q->dma_drain_size); - sg->page_link &= ~0x02; + sg_unmark_end(sg); sg = sg_next(sg); sg_set_page(sg, virt_to_page(q->dma_drain_buffer), q->dma_drain_size, @@ -312,6 +427,11 @@ no_merge: int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + if (req_gap_back_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_back_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -330,6 +450,12 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + + if (req_gap_front_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_front_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -356,14 +482,6 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } -static int req_gap_to_prev(struct request *req, struct request *next) -{ - struct bio *prev = req->biotail; - - return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1], - next->bio->bi_io_vec[0].bv_offset); -} - static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -378,8 +496,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; - if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) && - req_gap_to_prev(req, next)) + if (req_gap_back_merge(req, next->bio)) return 0; /* @@ -564,8 +681,6 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, bool blk_rq_merge_ok(struct request *rq, struct bio *bio) { - struct request_queue *q = rq->q; - if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; @@ -589,15 +704,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; - /* Only check gaps if the bio carries data */ - if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) { - struct bio_vec *bprev; - - bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1]; - if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) - return false; - } - return true; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 1e28ddb65..8764c241e 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, + const struct cpumask *online_mask) { unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; cpumask_var_t cpus; @@ -41,7 
+42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) cpumask_clear(cpus); nr_cpus = nr_uniq_cpus = 0; - for_each_online_cpu(i) { + for_each_cpu(i, online_mask) { nr_cpus++; first_sibling = get_first_sibling(i); if (!cpumask_test_cpu(first_sibling, cpus)) @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) queue = 0; for_each_possible_cpu(i) { - if (!cpu_online(i)) { + if (!cpumask_test_cpu(i, online_mask)) { map[i] = 0; continue; } @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) if (!map) return NULL; - if (!blk_mq_update_queue_map(map, set->nr_hw_queues)) + if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask)) return map; kfree(map); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 279c5d674..788fffd9b 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -229,8 +229,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) unsigned int i, first = 1; ssize_t ret = 0; - blk_mq_disable_hotplug(); - for_each_cpu(i, hctx->cpumask) { if (first) ret += sprintf(ret + page, "%u", i); @@ -240,8 +238,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) first = 0; } - blk_mq_enable_hotplug(); - ret += sprintf(ret + page, "\n"); return ret; } @@ -343,7 +339,7 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return; hctx_for_each_ctx(hctx, ctx, i) @@ -358,7 +354,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i, ret; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return 0; ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); @@ -381,6 +377,8 @@ void blk_mq_unregister_disk(struct gendisk *disk) struct blk_mq_ctx *ctx; int i, j; + blk_mq_disable_hotplug(); + queue_for_each_hw_ctx(q, hctx, i) { blk_mq_unregister_hctx(hctx); @@ -395,6 +393,9 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&q->mq_kobj); kobject_put(&disk_to_dev(disk)->kobj); + + q->mq_sysfs_init_done = false; + blk_mq_enable_hotplug(); } static void blk_mq_sysfs_init(struct request_queue *q) @@ -425,27 +426,30 @@ int blk_mq_register_disk(struct gendisk *disk) struct blk_mq_hw_ctx *hctx; int ret, i; + blk_mq_disable_hotplug(); + blk_mq_sysfs_init(q); ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) - return ret; + goto out; kobject_uevent(&q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { - hctx->flags |= BLK_MQ_F_SYSFS_UP; ret = blk_mq_register_hctx(hctx); if (ret) break; } - if (ret) { + if (ret) blk_mq_unregister_disk(disk); - return ret; - } + else + q->mq_sysfs_init_done = true; +out: + blk_mq_enable_hotplug(); - return 0; + return ret; } EXPORT_SYMBOL_GPL(blk_mq_register_disk); @@ -454,6 +458,9 @@ void blk_mq_sysfs_unregister(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + if (!q->mq_sysfs_init_done) + return; + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -463,6 +470,9 @@ int blk_mq_sysfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i, ret = 0; + if (!q->mq_sysfs_init_done) + return ret; + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9115c6d59..ec2d11915 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ 
-471,17 +471,30 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, } EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); -void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { - struct blk_mq_tags *tags = hctx->tags; + struct blk_mq_hw_ctx *hctx; + int i; + + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); + } - if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); } -EXPORT_SYMBOL(blk_mq_tag_busy_iter); static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) { @@ -628,6 +641,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) { bt_free(&tags->bitmap_tags); bt_free(&tags->breserved_tags); + free_cpumask_var(tags->cpumask); kfree(tags); } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 9eb2cf4f0..d468a79f2 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -58,6 +58,8 @@ extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv); enum { BLK_MQ_TAG_CACHE_MIN = 1, diff --git a/block/blk-mq.c b/block/blk-mq.c index c69902695..85f014327 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -393,14 +393,16 @@ void __blk_mq_complete_request(struct request *rq) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +void blk_mq_complete_request(struct request *rq, int error) { struct request_queue *q = rq->q; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + if (!blk_mark_rq_complete(rq)) { + rq->errors = error; __blk_mq_complete_request(rq); + } } EXPORT_SYMBOL(blk_mq_complete_request); @@ -616,10 +618,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. 
*/ - if (unlikely(blk_queue_dying(rq->q))) { - rq->errors = -EIO; - blk_mq_complete_request(rq); - } + if (unlikely(blk_queue_dying(rq->q))) + blk_mq_complete_request(rq, -EIO); return; } if (rq->cmd_flags & REQ_NO_TIMEOUT) @@ -641,24 +641,16 @@ static void blk_mq_rq_timer(unsigned long priv) .next = 0, .next_set = 0, }; - struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - /* - * If not software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; - - blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); - } + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { + struct blk_mq_hw_ctx *hctx; + queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1185,7 +1177,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, struct blk_mq_alloc_data alloc_data; if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) { - bio_endio(bio, -EIO); + bio_io_error(bio); return NULL; } @@ -1269,10 +1261,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); + bio_io_error(bio); return; } + blk_queue_split(q, &bio, q->bio_split); + if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return; @@ -1354,10 +1348,12 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); + bio_io_error(bio); return; } + blk_queue_split(q, &bio, q->bio_split); + if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; @@ -1785,13 +1781,19 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } -static void blk_mq_map_swqueue(struct request_queue *q) +static void blk_mq_map_swqueue(struct request_queue *q, + const struct cpumask *online_mask) { unsigned int i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + /* + * Avoid others reading imcomplete hctx->cpumask through sysfs + */ + mutex_lock(&q->sysfs_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -1802,7 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ queue_for_each_ctx(q, ctx, i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ - if (!cpu_online(i)) + if (!cpumask_test_cpu(i, online_mask)) continue; hctx = q->mq_ops->map_queue(q, i); @@ -1811,6 +1813,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->ctxs[hctx->nr_ctx++] = ctx; } + mutex_unlock(&q->sysfs_lock); + queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_ctxmap *map = &hctx->ctx_map; @@ -1848,7 +1852,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) } queue_for_each_ctx(q, ctx, i) { - if (!cpu_online(i)) + if (!cpumask_test_cpu(i, online_mask)) continue; hctx = q->mq_ops->map_queue(q, i); @@ -1921,6 +1925,9 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } + kfree(q->mq_map); + q->mq_map = NULL; + kfree(q->queue_hw_ctx); /* ctx kobj stays in queue_ctx */ @@ -2030,13 +2037,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if 
(blk_mq_init_hw_queues(q, set)) goto err_hctxs; + get_online_cpus(); mutex_lock(&all_q_mutex); - list_add_tail(&q->all_q_node, &all_q_list); - mutex_unlock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); blk_mq_add_queue_tag_set(set, q); + blk_mq_map_swqueue(q, cpu_online_mask); - blk_mq_map_swqueue(q); + mutex_unlock(&all_q_mutex); + put_online_cpus(); return q; @@ -2060,30 +2069,27 @@ void blk_mq_free_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&all_q_mutex); + list_del_init(&q->all_q_node); + mutex_unlock(&all_q_mutex); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); blk_mq_free_hw_queues(q, set); percpu_ref_exit(&q->mq_usage_counter); - - kfree(q->mq_map); - - q->mq_map = NULL; - - mutex_lock(&all_q_mutex); - list_del_init(&q->all_q_node); - mutex_unlock(&all_q_mutex); } /* Basically redo blk_mq_init_queue with queue frozen */ -static void blk_mq_queue_reinit(struct request_queue *q) +static void blk_mq_queue_reinit(struct request_queue *q, + const struct cpumask *online_mask) { WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe @@ -2091,7 +2097,7 @@ static void blk_mq_queue_reinit(struct request_queue *q) * involves free and re-allocate memory, worthy doing?) */ - blk_mq_map_swqueue(q); + blk_mq_map_swqueue(q, online_mask); blk_mq_sysfs_register(q); } @@ -2100,16 +2106,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, unsigned long action, void *hcpu) { struct request_queue *q; + int cpu = (unsigned long)hcpu; + /* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ + static struct cpumask online_new; /* - * Before new mappings are established, hotadded cpu might already - * start handling requests. This doesn't break anything as we map - * offline CPUs to first hardware queue. We will re-init the queue - * below to get optimal settings. + * Before hotadded cpu starts handling requests, new mappings must + * be established. Otherwise, these requests in hw queue might + * never be dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues + * (ctx0 for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into + * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is + * still zero. + * + * And then while running hw queue, flush_busy_ctxs() finds bit0 is + * set in pending bitmap and tries to retrieve requests in + * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, + * so the request in ctx1->rq_list is ignored. 
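As the comment above explains, the hotplug notifier now computes the CPU mask as it will look after the event before remapping software queues: for CPU_UP_PREPARE the incoming CPU is added to a copy of the current online mask so the new mappings exist before it can queue requests, while for CPU_DEAD and CPU_UP_CANCELED the current mask is already correct, as the switch just below shows. The sketch that follows models that decision with a 64-bit word standing in for struct cpumask; the EV_* enum and the helper are invented for the example.

/*
 * Illustrative userspace sketch, not part of the patch: building the
 * CPU mask as it will look after a hotplug event, the way the reworked
 * notifier does before remapping software queues.
 */
#include <stdio.h>
#include <stdint.h>

enum hotplug_event { EV_UP_PREPARE, EV_DEAD, EV_UP_CANCELED };

static uint64_t online_after_event(uint64_t online_now, unsigned int cpu,
				   enum hotplug_event ev)
{
	switch (ev) {
	case EV_UP_PREPARE:
		/* Map the CPU in before it can submit any requests. */
		return online_now | (UINT64_C(1) << cpu);
	case EV_DEAD:
	case EV_UP_CANCELED:
		/* The mask already reflects the departed CPU. */
		return online_now;
	}
	return online_now;
}

int main(void)
{
	uint64_t online = 0x3;	/* CPUs 0 and 1 online */

	printf("UP_PREPARE(2): %#llx\n",
	       (unsigned long long)online_after_event(online, 2, EV_UP_PREPARE));
	printf("DEAD(1):       %#llx\n",
	       (unsigned long long)online_after_event(online & ~UINT64_C(2), 1, EV_DEAD));
	return 0;
}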
*/ - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && - action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DEAD: + case CPU_UP_CANCELED: + cpumask_copy(&online_new, cpu_online_mask); + break; + case CPU_UP_PREPARE: + cpumask_copy(&online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &online_new); + break; + default: return NOTIFY_OK; + } mutex_lock(&all_q_mutex); @@ -2133,7 +2166,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, } list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q); + blk_mq_queue_reinit(q, &online_new); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_unfreeze_queue(q); @@ -2263,10 +2296,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) int i; for (i = 0; i < set->nr_hw_queues; i++) { - if (set->tags[i]) { + if (set->tags[i]) blk_mq_free_rq_map(set, set->tags[i], i); - free_cpumask_var(set->tags[i]->cpumask); - } } kfree(set->tags); diff --git a/block/blk-mq.h b/block/blk-mq.h index 6a48c4c0d..f4fea7964 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void); * CPU -> queue mappings */ extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, + const struct cpumask *online_mask); extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); /* diff --git a/block/blk-settings.c b/block/blk-settings.c index e0057d035..7d8f129a1 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -53,28 +53,6 @@ void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) } EXPORT_SYMBOL(blk_queue_unprep_rq); -/** - * blk_queue_merge_bvec - set a merge_bvec function for queue - * @q: queue - * @mbfn: merge_bvec_fn - * - * Usually queues have static limitations on the max sectors or segments that - * we can put in a request. Stacking drivers may have some settings that - * are dynamic, and thus we have to query the queue whether it is ok to - * add a new bio_vec to a bio at a given offset or not. If the block device - * has such limitations, it needs to register a merge_bvec_fn to control - * the size of bio's sent to it. Note that a block device *must* allow a - * single page to be added to an empty bio. The block device driver may want - * to use the bio_split() function to deal with these bio's. By default - * no merge_bvec_fn is defined for a queue, and only the fixed limits are - * honored. 
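Several of the limits touched here, including the new virt_boundary_mask and max_hw_discard_sectors, are combined in the blk_stack_limits() hunks shown just below with a minimum-but-ignore-zero rule, so an unset limit on either the stacked or the underlying device does not wipe out a real one. A tiny userspace version of that rule follows; the helper name is invented, while the kernel itself uses min_not_zero().

/*
 * Illustrative userspace sketch, not part of the patch: the
 * "minimum, but a zero on either side means unset" rule used when
 * stacking limits such as virt_boundary_mask and max_hw_discard_sectors.
 */
#include <stdio.h>

static unsigned long min_not_zero_ul(unsigned long a, unsigned long b)
{
	if (a == 0)
		return b;
	if (b == 0)
		return a;
	return a < b ? a : b;
}

int main(void)
{
	/* Top device has no virt boundary set, bottom one needs 4 KiB - 1. */
	printf("%#lx\n", min_not_zero_ul(0x0, 0xfff));		/* 0xfff */
	/* Both discard caps set: the smaller one wins. */
	printf("%lu\n", min_not_zero_ul(65536, 32768));		/* 32768 */
	return 0;
}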
- */ -void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) -{ - q->merge_bvec_fn = mbfn; -} -EXPORT_SYMBOL(blk_queue_merge_bvec); - void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) { q->softirq_done_fn = fn; @@ -111,11 +89,13 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segments = BLK_MAX_SEGMENTS; lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; + lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; @@ -257,7 +237,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ __func__, max_hw_sectors); } - limits->max_sectors = limits->max_hw_sectors = max_hw_sectors; + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); } EXPORT_SYMBOL(blk_limits_max_hw_sectors); @@ -303,6 +285,7 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors); void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { + q->limits.max_hw_discard_sectors = max_discard_sectors; q->limits.max_discard_sectors = max_discard_sectors; } EXPORT_SYMBOL(blk_queue_max_discard_sectors); @@ -550,6 +533,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); + t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask, + b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, @@ -641,6 +626,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); + t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, + b->max_hw_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % @@ -788,6 +775,17 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) EXPORT_SYMBOL(blk_queue_segment_boundary); /** + * blk_queue_virt_boundary - set boundary rules for bio merging + * @q: the request queue for the device + * @mask: the memory boundary mask + **/ +void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) +{ + q->limits.virt_boundary_mask = mask; +} +EXPORT_SYMBOL(blk_queue_virt_boundary); + +/** * blk_queue_dma_alignment - set dma length and memory alignment * @q: the request queue for the device * @mask: alignment mask diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6264b382d..07b42f5ad 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -145,12 +145,43 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag return queue_var_show(q->limits.discard_granularity, page); } +static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) +{ + unsigned long long val; + + val = q->limits.max_hw_discard_sectors << 9; + return sprintf(page, "%llu\n", val); +} + static ssize_t queue_discard_max_show(struct request_queue *q, char *page) { return sprintf(page, "%llu\n", (unsigned long long)q->limits.max_discard_sectors << 9); } +static 
ssize_t queue_discard_max_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long max_discard; + ssize_t ret = queue_var_store(&max_discard, page, count); + + if (ret < 0) + return ret; + + if (max_discard & (q->limits.discard_granularity - 1)) + return -EINVAL; + + max_discard >>= 9; + if (max_discard > UINT_MAX) + return -EINVAL; + + if (max_discard > q->limits.max_hw_discard_sectors) + max_discard = q->limits.max_hw_discard_sectors; + + q->limits.max_discard_sectors = max_discard; + return ret; +} + static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) { return queue_var_show(queue_discard_zeroes_data(q), page); @@ -360,9 +391,15 @@ static struct queue_sysfs_entry queue_discard_granularity_entry = { .show = queue_discard_granularity_show, }; +static struct queue_sysfs_entry queue_discard_max_hw_entry = { + .attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO }, + .show = queue_discard_max_hw_show, +}; + static struct queue_sysfs_entry queue_discard_max_entry = { - .attr = {.name = "discard_max_bytes", .mode = S_IRUGO }, + .attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR }, .show = queue_discard_max_show, + .store = queue_discard_max_store, }; static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { @@ -421,6 +458,7 @@ static struct attribute *default_attrs[] = { &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, + &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_nonrot_entry.attr, @@ -502,6 +540,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); + bdi_exit(&q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { @@ -523,6 +562,9 @@ static void blk_release_queue(struct kobject *kobj) blk_trace_shutdown(q); + if (q->bio_split) + bioset_free(q->bio_split); + ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b23193518..c75a2636d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -141,12 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; }; struct throtl_data @@ -168,13 +154,6 @@ struct throtl_data struct work_struct dispatch_work; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - static void throtl_pending_timer_fn(unsigned long arg); static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) @@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp 
*td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest @@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) } \ } while (0) -static void tg_stats_init(struct tg_stats_cpu *tg_stats) -{ - blkg_rwstat_init(&tg_stats->service_bytes); - blkg_rwstat_init(&tg_stats->serviced); -} - -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) -{ - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - int cpu; - - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - schedule_delayed_work(dwork, msecs_to_jiffies(10)); - return; - } - for_each_possible_cpu(cpu) - tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); - } - - spin_lock_irq(&tg_stats_alloc_lock); - - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } - - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; -} - static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { - del_timer_sync(&sq->pending_timer); + struct throtl_grp *tg; + int rw; + + tg = kzalloc_node(sizeof(*tg), gfp, node); + if (!tg) + return NULL; + + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; + + return &tg->pd; } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_pd_init(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - unsigned long flags; - int rw; + struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical @@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. 
*/ - parent_sq = &td->service_queue; - + sq->parent_sq = &td->service_queue; if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - - throtl_service_queue_init(&tg->service_queue, parent_sq); - - for (rw = READ; rw <= WRITE; rw++) { - throtl_qnode_init(&tg->qnode_on_self[rw], tg); - throtl_qnode_init(&tg->qnode_on_parent[rw], tg); - } - - RB_CLEAR_NODE(&tg->rb_node); + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; - - /* - * Ugh... We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. - */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - schedule_delayed_work(&tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } /* @@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg) (tg->bps[rw] != -1 || tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); -} - -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - - free_percpu(tg->stats_cpu); - - throtl_service_queue_exit(&tg->service_queue); -} - -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - int cpu; - - if (tg->stats_cpu == NULL) - return; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } -} - -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); + tg_update_has_rules(pd_to_tg(pd)); } -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) +static void throtl_pd_free(struct blkg_policy_data *pd) { - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. 
Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else if (!blk_queue_dying(q)) - tg = td_root_tg(td); - } + struct throtl_grp *tg = pd_to_tg(pd); - return tg; + del_timer_sync(&tg->service_queue.pending_timer); + kfree(tg); } static struct throtl_grp * @@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; - unsigned long flags; - - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. - */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. 
*/ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - if (tg->stats_cpu == NULL) - return 0; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static ssize_t tg_set_conf(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, bool is_u64) +static void tg_conf_updated(struct throtl_grp *tg) { - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_service_queue *sq; - struct blkcg_gq *blkg; + struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; - int ret; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - tg = blkg_to_tg(ctx.blkg); - sq = &tg->service_queue; - - if (!ctx.v) - ctx.v = -1; - - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; - else - *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; + struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. 
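A related interface change shows up in the rewritten tg_set_conf() below: blkg_conf_prep() no longer hands the caller a pre-parsed value, so the handler reads it out of ctx.body itself instead of relying on ctx.v. A minimal hypothetical per-device knob following the same pattern (the knob is invented and assumed to sit next to blkcg_policy_throtl in this file) might be:

static ssize_t example_set_limit(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	u64 v;
	int ret;

	/* resolves "MAJ:MIN <rest>" to a blkg; <rest> is left in ctx.body */
	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &v) != 1)
		goto out_finish;

	/* ... apply @v to blkg_to_tg(ctx.blkg) under appropriate locking ... */
	ret = 0;
out_finish:
	blkg_conf_finish(&ctx);
	return ret ?: nbytes;
}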
*/ - blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } +} + +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + + tg_conf_updated(tg); + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return nbytes; + return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, @@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), @@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, + }, + { } /* terminate */ +}; + +static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if 
(sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, }, { } /* terminate */ }; @@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), - .cftypes = throtl_files, + .dfl_cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, - .pd_reset_stats_fn = throtl_pd_reset_stats, + .pd_free_fn = throtl_pd_free, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. If a group has no rules, - * just update the dispatch stats in lockless manner and return. 
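The policy descriptor above also moves from a pd_size plus cftypes pair to explicit pd_alloc/pd_init/pd_free callbacks and separate default-vs-legacy cftype arrays. A bare-bones hypothetical policy wired up the same way (every name here is invented for illustration) would look roughly like:

#include <linux/slab.h>
#include <linux/blk-cgroup.h>

struct example_pd {
	struct blkg_policy_data pd;	/* must stay the first member */
	u64 limit;
};

static struct cftype example_dfl_files[] = { { } /* files for the unified hierarchy */ };
static struct cftype example_legacy_files[] = { { } /* files for the v1 hierarchy */ };

static struct blkg_policy_data *example_pd_alloc(gfp_t gfp, int node)
{
	struct example_pd *epd = kzalloc_node(sizeof(*epd), gfp, node);

	return epd ? &epd->pd : NULL;
}

static void example_pd_init(struct blkg_policy_data *pd)
{
	container_of(pd, struct example_pd, pd)->limit = U64_MAX;
}

static void example_pd_free(struct blkg_policy_data *pd)
{
	kfree(container_of(pd, struct example_pd, pd));
}

static struct blkcg_policy example_policy = {
	.dfl_cftypes	= example_dfl_files,
	.legacy_cftypes	= example_legacy_files,
	.pd_alloc_fn	= example_pd_alloc,
	.pd_init_fn	= example_pd_init,
	.pd_free_fn	= example_pd_free,
};

/* registered once at init time with blkcg_policy_register(&example_policy) */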
- */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); - if (unlikely(!tg)) + + if (unlikely(blk_queue_bypass(q))) goto out_unlock; sq = &tg->service_queue; @@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we @@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/block/blk.h b/block/blk.h index 838188b35..98614ad37 100644 --- a/block/blk.h +++ b/block/blk.h @@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/block/bounce.c b/block/bounce.c index b17311227..1cb5dd3a5 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -123,17 +123,19 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) } } -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +static void bounce_end_io(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, *org_vec; int i; + int start = bio_orig->bi_iter.bi_idx; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; + org_vec = bio_orig->bi_io_vec + i + start; + if (bvec->bv_page == org_vec->bv_page) continue; @@ -141,61 +143,44 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) mempool_free(bvec->bv_page, pool); } - bio_endio(bio_orig, err); + bio_orig->bi_error = bio->bi_error; + bio_endio(bio_orig); bio_put(bio); } -static void bounce_end_io_write(struct bio *bio, int err) +static void bounce_end_io_write(struct bio *bio) { - bounce_end_io(bio, page_pool, err); + bounce_end_io(bio, page_pool); } -static void bounce_end_io_write_isa(struct bio *bio, int err) +static void bounce_end_io_write_isa(struct bio *bio) { - bounce_end_io(bio, isa_page_pool, err); + bounce_end_io(bio, isa_page_pool); } -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!bio->bi_error) copy_to_high_bio_irq(bio_orig, bio); - bounce_end_io(bio, pool, err); -} - -static void bounce_end_io_read(struct bio *bio, int 
err) -{ - __bounce_end_io_read(bio, page_pool, err); + bounce_end_io(bio, pool); } -static void bounce_end_io_read_isa(struct bio *bio, int err) +static void bounce_end_io_read(struct bio *bio) { - __bounce_end_io_read(bio, isa_page_pool, err); + __bounce_end_io_read(bio, page_pool); } -#ifdef CONFIG_NEED_BOUNCE_POOL -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - if (bio_data_dir(bio) != WRITE) - return 0; - - if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) - return 0; - - return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); -} -#else -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +static void bounce_end_io_read_isa(struct bio *bio) { - return 0; + __bounce_end_io_read(bio, isa_page_pool); } -#endif /* CONFIG_NEED_BOUNCE_POOL */ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool, int force) + mempool_t *pool) { struct bio *bio; int rw = bio_data_dir(*bio_orig); @@ -203,8 +188,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, struct bvec_iter iter; unsigned i; - if (force) - goto bounce; bio_for_each_segment(from, *bio_orig, iter) if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) goto bounce; @@ -216,7 +199,7 @@ bounce: bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; - if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) + if (page_to_pfn(page) <= queue_bounce_pfn(q)) continue; to->bv_page = mempool_alloc(pool, q->bounce_gfp); @@ -254,7 +237,6 @@ bounce: void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { - int must_bounce; mempool_t *pool; /* @@ -263,15 +245,13 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) if (!bio_has_data(*bio_orig)) return; - must_bounce = must_snapshot_stable_pages(q, *bio_orig); - /* * for non-isa bounce case, just check if the bounce pfn is equal * to or bigger than the highest pfn in the system -- in that case, * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) + if (queue_bounce_pfn(q) >= blk_max_pfn) return; pool = page_pool; } else { @@ -282,7 +262,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * slow path */ - __blk_queue_bounce(q, bio_orig, pool, must_bounce); + __blk_queue_bounce(q, bio_orig, pool); } EXPORT_SYMBOL(blk_queue_bounce); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c62bb2e65..04de88463 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool; #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* blkio-related constants */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 +#define CFQ_WEIGHT_LEGACY_MIN 10 +#define CFQ_WEIGHT_LEGACY_DFL 500 +#define CFQ_WEIGHT_LEGACY_MAX 1000 struct cfq_ttime { unsigned long last_end_request; @@ -177,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -189,8 +185,6 @@ struct cfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total 
disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -220,7 +214,7 @@ struct cfqg_stats { /* Per-cgroup data */ struct cfq_group_data { /* must be the first member */ - struct blkcg_policy_data pd; + struct blkcg_policy_data cpd; unsigned int weight; unsigned int leaf_weight; @@ -304,7 +298,11 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ + + /* async queue for each priority case */ + struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; + struct cfq_queue *async_idle_cfqq; + }; struct cfq_io_cq { @@ -370,12 +368,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -401,6 +393,7 @@ struct cfq_data { }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); +static void cfq_put_queue(struct cfq_queue *cfqq); static struct cfq_rb_root *st_for(struct cfq_group *cfqg, enum wl_class_t class, @@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) static struct cfq_group_data *cpd_to_cfqgd(struct blkcg_policy_data *cpd) { - return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; + return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; } static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) @@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) blkg_rwstat_add(&cfqg->stats.merged, rw, 1); } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); -} - static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, 
&from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. */ @@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, unsigned long time, unsigned long unaccounted_time) { } static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { } @@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); + struct cfq_io_cq *cic, struct bio *bio); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if 
(blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; } -static void cfq_cpd_init(const struct blkcg *blkcg) +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) { - struct cfq_group_data *cgd = - cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); + struct cfq_group_data *cgd; - if (blkcg == &blkcg_root) { - cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; - } else { - cgd->weight = CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; - } + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + +static void cfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; } -static void cfq_pd_init(struct blkcg_gq *blkg) +static void cfq_cpd_free(struct blkcg_policy_data *cpd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); + kfree(cpd_to_cfqgd(cpd)); +} + +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); + unsigned int weight = on_dfl ? 
CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); +} + +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) +{ + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; cfq_init_cfqg_base(cfqg); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } + + return &cfqg->pd; +} + +static void cfq_pd_init(struct blkg_policy_data *pd) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); + cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { + struct cfq_group *cfqg = pd_to_cfqg(pd); + int i; + + for (i = 0; i < IOPRIO_BE_NR; i++) { + if (cfqg->async_cfqq[0][i]) + cfq_put_queue(cfqg->async_cfqq[0][i]); + if (cfqg->async_cfqq[1][i]) + cfq_put_queue(cfqg->async_cfqq[1][i]); + } + + if (cfqg->async_idle_cfqq) + cfq_put_queue(cfqg->async_idle_cfqq); + /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ - cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); + cfqg_stats_xfer_dead(cfqg); } -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +static void cfq_pd_free(struct blkg_policy_data *pd) { - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; -} - -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat a, b; + struct cfq_group *cfqg = pd_to_cfqg(pd); - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. 
- */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } + struct blkcg_gq *blkg; - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - /* Currently, all async queues are mapped to root group */ - if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; - cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ cfqg_get(cfqg); @@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v) static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, - bool is_leaf_weight) + bool on_dfl, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; - ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out_finish; + } + cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); - if (!cfqg || !cfqgd) - goto err; - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + ret = -ERANGE; + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: cfqgd->weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - -err: +out_finish: blkg_conf_finish(&ctx); return ret ?: nbytes; } @@ -1776,25 +1805,27 @@ err: static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); } static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? 
CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; struct cfq_group_data *cfqgd; int ret = 0; - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) cfqg->new_leaf_weight = cfqgd->leaf_weight; } @@ -1831,13 +1866,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); + return 0; +} + +static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, + false); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", @@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat, + .seq_show = 
cfqg_print_stat_sectors, }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors_recursive", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_sectors_recursive, }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", @@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, true, false, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* @@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq) struct cfq_io_cq *cic = icq_to_cic(icq); struct cfq_data *cfqd = cic_to_cfqd(cic); - if (cic->cfqq[BLK_RW_ASYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; + if (cic_to_cfqq(cic, false)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); + cic_set_cfqq(cic, NULL, false); } - if (cic->cfqq[BLK_RW_SYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; + if (cic_to_cfqq(cic, true)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); + cic_set_cfqq(cic, NULL, true); } } @@ -3572,18 +3680,14 @@ 
static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - cfqq = cic->cfqq[BLK_RW_ASYNC]; + cfqq = cic_to_cfqq(cic, false); if (cfqq) { - struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); + cic_set_cfqq(cic, cfqq, false); } - cfqq = cic->cfqq[BLK_RW_SYNC]; + cfqq = cic_to_cfqq(cic, true); if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); @@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; + struct cfq_queue *cfqq; uint64_t serial_nr; rcu_read_lock(); @@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; - sync_cfqq = cic_to_cfqq(cic, 1); - if (sync_cfqq) { - /* - * Drop reference to sync queue. A new sync queue will be - * assigned in new group upon arrival of a fresh request. - */ - cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, 1); - cfq_put_queue(sync_cfqq); + /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. + */ + cfqq = cic_to_cfqq(cic, false); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, false); + cfq_put_queue(cfqq); + } + + cfqq = cic_to_cfqq(cic, true); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, true); + cfq_put_queue(cfqq); } cic->blkcg_serial_nr = serial_nr; @@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ -static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) -{ - struct blkcg *blkcg; - struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_group *cfqg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - cfqq = cic_to_cfqq(cic, is_sync); - - /* - * Always try a new alloc if we fell back to the OOM cfqq - * originally, since it should just be a temporary situation. 
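The comment being dropped above refers to the retry dance cfq_find_alloc_queue() used to play; the replacement cfq_get_queue() below simply allocates with GFP_NOWAIT under the queue lock and falls back to the permanently pinned oom_cfqq. Reduced to a hypothetical sketch (the names and the cache are invented):

#include <linux/slab.h>

struct example_obj {
	int busy;			/* payload only for illustration */
};

struct example_ctx {
	struct example_obj oom_obj;	/* embedded emergency object, never freed */
};

static struct kmem_cache *example_cache;	/* created elsewhere via kmem_cache_create() */

/* Called under a spinlock, so sleeping allocations are off the table:
 * try an atomic allocation and degrade to the embedded object on failure. */
static struct example_obj *example_get_obj(struct example_ctx *ctx, int node)
{
	struct example_obj *obj;

	obj = kmem_cache_alloc_node(example_cache, GFP_NOWAIT | __GFP_ZERO, node);
	return obj ? obj : &ctx->oom_obj;
}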
- */ - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = NULL; - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - spin_lock_irq(cfqd->queue->queue_lock); - if (new_cfqq) - goto retry; - else - return &cfqd->oom_cfqq; - } else { - cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - } - - if (cfqq) { - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else - cfqq = &cfqd->oom_cfqq; - } -out: - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); - - rcu_read_unlock(); - return cfqq; -} - static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfqg->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfqg->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfqg->async_idle_cfqq; default: BUG(); } @@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq = NULL; + struct cfq_queue *cfqq; + struct cfq_group *cfqg; + + rcu_read_lock(); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, ioprio = task_nice_ioprio(tsk); ioprio_class = task_nice_ioclass(tsk); } - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); cfqq = *async_cfqq; + if (cfqq) + goto out; } - if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); + if (!cfqq) { + cfqq = &cfqd->oom_cfqq; + goto out; + } - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && !(*async_cfqq)) { + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, cic); + cfq_link_cfqq_cfqg(cfqq, cfqg); + cfq_log_cfqq(cfqd, cfqq, "alloced"); + + if (async_cfqq) { + /* a new async queue is created, pin and remember */ cfqq->ref++; *async_cfqq = cfqq; } - +out: cfqq->ref++; + rcu_read_unlock(); return cfqq; } @@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - might_sleep_if(gfp_mask & __GFP_WAIT); - spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); @@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = 
cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); + if (cfqq) + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) cancel_work_sync(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); @@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); + cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; #endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfqd->prio_trees[i] = RB_ROOT; /* - * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. + * Our fallback cfqq if cfq_get_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. oom_cfqq is linked to root_group * but shouldn't hold a reference as it'll never be unlinked. Lose @@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), - .cpd_size = sizeof(struct cfq_group_data), - .cftypes = cfq_blkcg_files, + .dfl_cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, + .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, + + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/block/genhd.c b/block/genhd.c index 6d1003f0d..398dab06d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1112,8 +1112,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); - free_part_stats(&disk->part0); - free_part_info(&disk->part0); + hd_free_part(&disk->part0); if (disk->queue) blk_put_queue(disk->queue); kfree(disk); @@ -1287,7 +1286,11 @@ struct gendisk *alloc_disk_node(int minors, int node_id) * converted to make use of bd_mutex and sequence counters. 
*/ seqcount_init(&disk->part0.nr_sects_seq); - hd_ref_init(&disk->part0); + if (hd_ref_init(&disk->part0)) { + hd_free_part(&disk->part0); + kfree(disk); + return NULL; + } disk->minors = minors; rand_initialize_disk(disk); diff --git a/block/partition-generic.c b/block/partition-generic.c index 0d9e5f97f..e77111332 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -212,8 +212,7 @@ static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); blk_free_devt(dev->devt); - free_part_stats(p); - free_part_info(p); + hd_free_part(p); kfree(p); } @@ -233,8 +232,9 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } -void __delete_partition(struct hd_struct *part) +void __delete_partition(struct percpu_ref *ref) { + struct hd_struct *part = container_of(ref, struct hd_struct, ref); call_rcu(&part->rcu_head, delete_partition_rcu_cb); } @@ -255,7 +255,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - hd_struct_put(part); + hd_struct_kill(part); } static ssize_t whole_disk_show(struct device *dev, @@ -356,8 +356,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - hd_ref_init(p); - return p; + if (!hd_ref_init(p)) + return p; out_free_info: free_part_info(p); diff --git a/block/uuid.c b/block/uuid.c index 722d53b63..4610d7b8f 100644 --- a/block/uuid.c +++ b/block/uuid.c @@ -142,12 +142,12 @@ static int null_uuid(const char *uuid) } -static void uuid_end_bio(struct bio *bio, int err) +static void uuid_end_bio(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if(!test_bit(BIO_UPTODATE, &bio->bi_flags)) - SetPageError(page); + if (bio->bi_error) + SetPageError(page); unlock_page(page); bio_put(bio); |
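
Note on the partition-generic.c hunk above: __delete_partition() now takes the embedded struct percpu_ref and recovers the enclosing struct hd_struct with container_of(), and because the percpu counters behind a percpu_ref are a separate allocation, hd_ref_init() can now fail, which is why alloc_disk_node() and add_partition() grew the error paths shown. The standalone userspace C sketch below illustrates that release-callback pattern only; the toy_ref/toy_part names and the plain integer count standing in for percpu_ref are hypothetical and are not the kernel code.

/*
 * Minimal userspace sketch of the container_of() idiom used by the new
 * __delete_partition(): the release callback receives a pointer to the
 * embedded ref member and recovers the enclosing object from it.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_ref {
	int count;
	void (*release)(struct toy_ref *ref);
};

struct toy_part {
	int partno;
	struct toy_ref ref;	/* embedded, like hd_struct's ref */
};

static void toy_ref_put(struct toy_ref *ref)
{
	/* the callback gets the member pointer, not the parent object */
	if (--ref->count == 0)
		ref->release(ref);
}

static void toy_part_release(struct toy_ref *ref)
{
	/* recover the enclosing object from the embedded member */
	struct toy_part *part = container_of(ref, struct toy_part, ref);

	printf("releasing partition %d\n", part->partno);
	free(part);
}

int main(void)
{
	struct toy_part *part = malloc(sizeof(*part));

	part->partno = 1;
	part->ref.count = 1;
	part->ref.release = toy_part_release;

	toy_ref_put(&part->ref);	/* last reference dropped, release fires */
	return 0;
}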