32 files changed, 2571 insertions, 1631 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 01da733dc..1fc1a4dc5 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -51,14 +51,12 @@ config IOSCHED_BFQ
 	  applications. If compiled built-in (saying Y here), BFQ can
 	  be configured to support hierarchical scheduling.
 
-config CGROUP_BFQIO
+config BFQ_GROUP_IOSCHED
 	bool "BFQ hierarchical scheduling support"
 	depends on CGROUPS && IOSCHED_BFQ=y
 	default n
 	---help---
-	  Enable hierarchical scheduling in BFQ, using the cgroups
-	  filesystem interface.  The name of the subsystem will be
-	  bfqio.
+	  Enable hierarchical scheduling in BFQ, using the blkio controller.
 
 choice
 	prompt "Default I/O scheduler"
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 11e2f1d4e..7a6192007 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -13,254 +13,522 @@
  * file.
  */
 
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 
-static DEFINE_MUTEX(bfqio_mutex);
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+	BFQG_stats_waiting = 0,
+	BFQG_stats_idling,
+	BFQG_stats_empty,
+};
 
-static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
-{
-	return bgrp ? !bgrp->online : false;
-}
+#define BFQG_FLAG_FNS(name)						\
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags |= (1 << BFQG_stats_##name);			\
+}									\
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags &= ~(1 << BFQG_stats_##name);			\
+}									\
+static int bfqg_stats_##name(struct bfqg_stats *stats)		\
+{									\
+	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
+}									\
 
-static struct bfqio_cgroup bfqio_root_cgroup = {
-	.weight = BFQ_DEFAULT_GRP_WEIGHT,
-	.ioprio = BFQ_DEFAULT_GRP_IOPRIO,
-	.ioprio_class = BFQ_DEFAULT_GRP_CLASS,
-};
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
 
-static inline void bfq_init_entity(struct bfq_entity *entity,
-				   struct bfq_group *bfqg)
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 {
-	entity->weight = entity->new_weight;
-	entity->orig_weight = entity->new_weight;
-	entity->ioprio = entity->new_ioprio;
-	entity->ioprio_class = entity->new_ioprio_class;
-	entity->parent = bfqg->my_entity;
-	entity->sched_data = &bfqg->sched_data;
+	unsigned long long now;
+
+	if (!bfqg_stats_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
+	bfqg_stats_clear_waiting(stats);
 }
 
-static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+						 struct bfq_group *curr_bfqg)
 {
-	return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_waiting(stats))
+		return;
+	if (bfqg == curr_bfqg)
+		return;
+	stats->start_group_wait_time = sched_clock();
+	bfqg_stats_mark_waiting(stats);
 }
 
-/*
- * Search the bfq_group for bfqd into the hash table (by now only a list)
- * of bgrp.  Must be called under rcu_read_lock().
- */
-static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
-					    struct bfq_data *bfqd)
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 {
-	struct bfq_group *bfqg;
-	void *key;
+	unsigned long long now;
 
-	hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
-		key = rcu_dereference(bfqg->bfqd);
-		if (key == bfqd)
-			return bfqg;
-	}
+	if (!bfqg_stats_empty(stats))
+		return;
 
-	return NULL;
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
+	bfqg_stats_clear_empty(stats);
 }
 
-static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
-					 struct bfq_group *bfqg)
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
 {
-	struct bfq_entity *entity = &bfqg->entity;
+	blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (blkg_rwstat_total(&stats->queued))
+		return;
 
 	/*
-	 * If the weight of the entity has never been set via the sysfs
-	 * interface, then bgrp->weight == 0. In this case we initialize
-	 * the weight from the current ioprio value. Otherwise, the group
-	 * weight, if set, has priority over the ioprio value.
+	 * group is already marked empty. This can happen if bfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
 	 */
-	if (bgrp->weight == 0) {
-		entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
-		entity->new_ioprio = bgrp->ioprio;
-	} else {
-		if (bgrp->weight < BFQ_MIN_WEIGHT ||
-		    bgrp->weight > BFQ_MAX_WEIGHT) {
-			printk(KERN_CRIT "bfq_group_init_entity: "
-					 "bgrp->weight %d\n", bgrp->weight);
-			BUG();
-		}
-		entity->new_weight = bgrp->weight;
-		entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
+	if (bfqg_stats_empty(stats))
+		return;
+
+	stats->start_empty_time = sched_clock();
+	bfqg_stats_mark_empty(stats);
+}
+
+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_idling(stats)) {
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
+		bfqg_stats_clear_idling(stats);
 	}
-	entity->orig_weight = entity->weight = entity->new_weight;
-	entity->ioprio = entity->new_ioprio;
-	entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
-	entity->my_sched_data = &bfqg->sched_data;
-	bfqg->active_entities = 0;
 }
 
-static inline void bfq_group_set_parent(struct bfq_group *bfqg,
-					struct bfq_group *parent)
+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
 {
-	struct bfq_entity *entity;
+	struct bfqg_stats *stats = &bfqg->stats;
 
-	BUG_ON(parent == NULL);
-	BUG_ON(bfqg == NULL);
+	stats->start_idle_time = sched_clock();
+	bfqg_stats_mark_idling(stats);
+}
 
-	entity = &bfqg->entity;
-	entity->parent = parent->my_entity;
-	entity->sched_data = &parent->sched_data;
+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_total(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	bfqg_stats_update_group_wait_time(stats);
 }
 
-/**
- * bfq_group_chain_alloc - allocate a chain of groups.
- * @bfqd: queue descriptor.
- * @css: the leaf cgroup_subsys_state this chain starts from.
- *
- * Allocate a chain of groups starting from the one belonging to
- * @cgroup up to the root cgroup.  Stop if a cgroup on the chain
- * to the root has already an allocated group on @bfqd.
+static struct blkcg_policy blkcg_policy_bfq;
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
  */
-static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
-					       struct cgroup_subsys_state *css)
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
 {
-	struct bfqio_cgroup *bgrp;
-	struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
+	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
 
-	for (; css != NULL; css = css->parent) {
-		bgrp = css_to_bfqio(css);
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+	return pd_to_blkg(&bfqg->pd);
+}
 
-		bfqg = bfqio_lookup_group(bgrp, bfqd);
-		if (bfqg != NULL) {
-			/*
-			 * All the cgroups in the path from there to the
-			 * root must have a bfq_group for bfqd, so we don't
-			 * need any more allocations.
-			 */
-			break;
-		}
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
+}
 
-		bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
-		if (bfqg == NULL)
-			goto cleanup;
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
 
-		bfq_group_init_entity(bgrp, bfqg);
-		bfqg->my_entity = &bfqg->entity;
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
 
-		if (leaf == NULL) {
-			leaf = bfqg;
-			prev = leaf;
-		} else {
-			bfq_group_set_parent(prev, bfqg);
-			/*
-			 * Build a list of allocated nodes using the bfqd
-			 * filed, that is still unused and will be
-			 * initialized only after the node will be
-			 * connected.
-			 */
-			prev->bfqd = bfqg;
-			prev = bfqg;
-		}
-	}
+	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
 
-	return leaf;
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
 
-cleanup:
-	while (leaf != NULL) {
-		prev = leaf;
-		leaf = leaf->bfqd;
-		kfree(prev);
-	}
+	return group_entity ? container_of(group_entity, struct bfq_group,
+					   entity) :
+			      bfqq->bfqd->root_group;
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
 
-	return NULL;
+static void bfqg_get(struct bfq_group *bfqg)
+{
+	return blkg_get(bfqg_to_blkg(bfqg));
 }
 
-/**
- * bfq_group_chain_link - link an allocated group chain to a cgroup
- *                        hierarchy.
- * @bfqd: the queue descriptor.
- * @css: the leaf cgroup_subsys_state to start from.
- * @leaf: the leaf group (to be associated to @cgroup).
- *
- * Try to link a chain of groups to a cgroup hierarchy, connecting the
- * nodes bottom-up, so we can be sure that when we find a cgroup in the
- * hierarchy that already as a group associated to @bfqd all the nodes
- * in the path to the root cgroup have one too.
- *
- * On locking: the queue lock protects the hierarchy (there is a hierarchy
- * per device) while the bfqio_cgroup lock protects the list of groups
- * belonging to the same cgroup.
+static void bfqg_put(struct bfq_group *bfqg)
+{
+	return blkg_put(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+				     struct bfq_queue *bfqq,
+				     int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, rw, 1);
+	bfqg_stats_end_empty_time(&bfqg->stats);
+	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, rw, -1);
+}
+
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.merged, rw, 1);
+}
+
+static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,
+					      uint64_t bytes, int rw)
+{
+	blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);
+	blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);
+	blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);
+}
+
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time, int rw)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+	unsigned long long now = sched_clock();
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, rw,
+				io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+	if (!stats)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->service_bytes);
+	blkg_rwstat_reset(&stats->serviced);
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+	blkg_stat_reset(&stats->unaccounted_time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+	if (!to || !from)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);
+	blkg_rwstat_add_aux(&to->serviced, &from->serviced);
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&from->time, &from->time);
+	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
  */
-static void bfq_group_chain_link(struct bfq_data *bfqd,
-				 struct cgroup_subsys_state *css,
-				 struct bfq_group *leaf)
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
 {
-	struct bfqio_cgroup *bgrp;
-	struct bfq_group *bfqg, *next, *prev = NULL;
-	unsigned long flags;
+	struct bfq_group *parent;
 
-	assert_spin_locked(bfqd->queue->queue_lock);
+	if (!bfqg) /* root_group */
+		return;
 
-	for (; css != NULL && leaf != NULL; css = css->parent) {
-		bgrp = css_to_bfqio(css);
-		next = leaf->bfqd;
+	parent = bfqg_parent(bfqg);
 
-		bfqg = bfqio_lookup_group(bgrp, bfqd);
-		BUG_ON(bfqg != NULL);
+	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
 
-		spin_lock_irqsave(&bgrp->lock, flags);
+	if (unlikely(!parent))
+		return;
 
-		rcu_assign_pointer(leaf->bfqd, bfqd);
-		hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
-		hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
+	bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);
+	bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);
+	bfqg_stats_reset(&bfqg->stats);
+	bfqg_stats_reset(&bfqg->dead_stats);
+}
 
-		spin_unlock_irqrestore(&bgrp->lock, flags);
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
-		prev = leaf;
-		leaf = next;
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+		bfqg_get(bfqg);
 	}
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+}
 
-	BUG_ON(css == NULL && leaf != NULL);
-	if (css != NULL && prev != NULL) {
-		bgrp = css_to_bfqio(css);
-		bfqg = bfqio_lookup_group(bgrp, bfqd);
-		bfq_group_set_parent(prev, bfqg);
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+	blkg_rwstat_exit(&stats->service_bytes);
+	blkg_rwstat_exit(&stats->serviced);
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
+	blkg_stat_exit(&stats->sectors);
+	blkg_stat_exit(&stats->time);
+	blkg_stat_exit(&stats->unaccounted_time);
+	blkg_stat_exit(&stats->avg_queue_size_sum);
+	blkg_stat_exit(&stats->avg_queue_size_samples);
+	blkg_stat_exit(&stats->dequeue);
+	blkg_stat_exit(&stats->group_wait_time);
+	blkg_stat_exit(&stats->idle_time);
+	blkg_stat_exit(&stats->empty_time);
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||
+	    blkg_rwstat_init(&stats->serviced, gfp) ||
+	    blkg_rwstat_init(&stats->merged, gfp) ||
+	    blkg_rwstat_init(&stats->service_time, gfp) ||
+	    blkg_rwstat_init(&stats->wait_time, gfp) ||
+	    blkg_rwstat_init(&stats->queued, gfp) ||
+	    blkg_stat_init(&stats->sectors, gfp) ||
+	    blkg_stat_init(&stats->time, gfp) ||
+	    blkg_stat_init(&stats->unaccounted_time, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    blkg_stat_init(&stats->dequeue, gfp) ||
+	    blkg_stat_init(&stats->group_wait_time, gfp) ||
+	    blkg_stat_init(&stats->idle_time, gfp) ||
+	    blkg_stat_init(&stats->empty_time, gfp)) {
+		bfqg_stats_exit(stats);
+		return -ENOMEM;
 	}
+
+	return 0;
 }
 
-/**
- * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
- * @bfqd: queue descriptor.
- * @cgroup: cgroup being searched for.
- *
- * Return a group associated to @bfqd in @cgroup, allocating one if
- * necessary.  When a group is returned all the cgroups in the path
- * to the root have a group associated to @bfqd.
- *
- * If the allocation fails, return the root group: this breaks guarantees
- * but is a safe fallback.  If this loss becomes a problem it can be
- * mitigated using the equivalent weight (given by the product of the
- * weights of the groups in the path from @group to the root) in the
- * root scheduler.
- *
- * We allocate all the missing nodes in the path from the leaf cgroup
- * to the root and we connect the nodes only after all the allocations
- * have been successful.
- */
-static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
-					      struct cgroup_subsys_state *css)
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+ {
+	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+ }
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+static void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+	d->weight = BFQ_DEFAULT_GRP_WEIGHT;
+}
+
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
 {
-	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
 	struct bfq_group *bfqg;
 
-	bfqg = bfqio_lookup_group(bgrp, bfqd);
-	if (bfqg != NULL)
-		return bfqg;
+	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+	if (!bfqg)
+		return NULL;
+
+	if (bfqg_stats_init(&bfqg->stats, gfp)) {
+		kfree(bfqg);
+		return NULL;
+	}
+
+	return &bfqg->pd;
+}
+
+static void bfq_pd_init(struct blkg_policy_data *pd)
+{
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
+	struct bfq_entity *entity = &bfqg->entity;
+	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
+
+	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+	entity->my_sched_data = &bfqg->sched_data;
+	bfqg->my_entity = entity; /*
+				   * the root_group's will be set to NULL
+				   * in bfq_init_queue()
+				   */
+	bfqg->bfqd = bfqd;
+	bfqg->active_entities = 0;
+	bfqg->rq_pos_tree = RB_ROOT;
+
+	/* if the root_group does not exist, we are handling it right now */
+	if (bfqd->root_group && bfqg != bfqd->root_group)
+		hlist_add_head(&bfqg->bfqd_node, &bfqd->group_list);
+}
+
+static void bfq_pd_free(struct blkg_policy_data *pd)
+{
+	return kfree(pd_to_bfqg(pd));
+}
+
+/* offset delta from bfqg->stats to bfqg->dead_stats */
+static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -
+					offsetof(struct bfq_group, stats);
+
+/* to be used by recursive prfill, sums live and dead stats recursively */
+static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+	u64 sum = 0;
+
+	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
+	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
+				       off + dead_stats_off_delta);
+	return sum;
+}
+
+/* to be used by recursive prfill, sums live and dead rwstats recursively */
+static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
+						       int off)
+{
+	struct blkg_rwstat a, b;
+
+	a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
+	b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
+				      off + dead_stats_off_delta);
+	blkg_rwstat_add_aux(&a, &b);
+	return a;
+}
+
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_reset(&bfqg->stats);
+	bfqg_stats_reset(&bfqg->dead_stats);
+}
 
-	bfqg = bfq_group_chain_alloc(bfqd, css);
-	if (bfqg != NULL)
-		bfq_group_chain_link(bfqd, css, bfqg);
-	else
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+					struct bfq_group *parent)
+{
+	struct bfq_entity *entity;
+
+	BUG_ON(!parent);
+	BUG_ON(!bfqg);
+	BUG_ON(bfqg == parent);
+
+	entity = &bfqg->entity;
+	entity->parent = parent->my_entity;
+	entity->sched_data = &parent->sched_data;
+}
+
+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
+					      struct blkcg *blkcg)
+{
+	struct request_queue *q = bfqd->queue;
+	struct bfq_group *bfqg = NULL, *parent;
+	struct bfq_entity *entity = NULL;
+
+	assert_spin_locked(bfqd->queue->queue_lock);
+
+	/* avoid lookup for the common case where there's no blkcg */
+	if (blkcg == &blkcg_root) {
 		bfqg = bfqd->root_group;
+	} else {
+		struct blkcg_gq *blkg;
+
+		blkg = blkg_lookup_create(blkcg, q);
+		if (!IS_ERR(blkg))
+			bfqg = blkg_to_bfqg(blkg);
+		else /* fallback to root_group */
+			bfqg = bfqd->root_group;
+	}
+
+	BUG_ON(!bfqg);
+
+	/*
+	 * Update chain of bfq_groups as we might be handling a leaf group
+	 * which, along with some of its relatives, has not been hooked yet
+	 * to the private hierarchy of BFQ.
+	 */
+	entity = &bfqg->entity;
+	for_each_entity(entity) {
+		bfqg = container_of(entity, struct bfq_group, entity);
+		BUG_ON(!bfqg);
+		if (bfqg != bfqd->root_group) {
+			parent = bfqg_parent(bfqg);
+			if (!parent)
+				parent = bfqd->root_group;
+			BUG_ON(!parent);
+			bfq_group_set_parent(bfqg, parent);
+		}
+	}
 
 	return bfqg;
 }
 
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
 /**
  * bfq_bfqq_move - migrate @bfqq to @bfqg.
  * @bfqd: queue descriptor.
@@ -296,6 +564,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 			bfq_deactivate_bfqq(bfqd, bfqq, 0);
 	} else if (entity->on_st)
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	bfqg_put(bfqq_group(bfqq));
 
 	/*
 	 * Here we use a reference to bfqg.  We don't need a refcounter
@@ -304,11 +573,15 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	 */
 	entity->parent = bfqg->my_entity;
 	entity->sched_data = &bfqg->sched_data;
+	bfqg_get(bfqg);
 
-	if (busy && resume)
-		bfq_activate_bfqq(bfqd, bfqq);
+	if (busy) {
+		bfq_pos_tree_add_move(bfqd, bfqq);
+		if (resume)
+			bfq_activate_bfqq(bfqd, bfqq);
+	}
 
-	if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
+	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
 		bfq_schedule_dispatch(bfqd);
 }
 
@@ -316,9 +589,9 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  * __bfq_bic_change_cgroup - move @bic to @cgroup.
  * @bfqd: the queue descriptor.
  * @bic: the bic to move.
- * @cgroup: the cgroup to move to.
+ * @blkcg: the blk-cgroup to move to.
  *
- * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
  * has to make sure that the reference to cgroup is valid across the call.
  *
  * NOTE: an alternative approach might have been to store the current
@@ -327,18 +600,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  */
 static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
 						struct bfq_io_cq *bic,
-						struct cgroup_subsys_state *css)
+						struct blkcg *blkcg)
 {
 	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
 	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
-	struct bfq_entity *entity;
 	struct bfq_group *bfqg;
-	struct bfqio_cgroup *bgrp;
+	struct bfq_entity *entity;
 
-	bgrp = css_to_bfqio(css);
+	lockdep_assert_held(bfqd->queue->queue_lock);
 
-	bfqg = bfq_find_alloc_group(bfqd, css);
-	if (async_bfqq != NULL) {
+	bfqg = bfq_find_alloc_group(bfqd, blkcg);
+	if (async_bfqq) {
 		entity = &async_bfqq->entity;
 
 		if (entity->sched_data != &bfqg->sched_data) {
@@ -350,7 +622,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
 		}
 	}
 
-	if (sync_bfqq != NULL) {
+	if (sync_bfqq) {
 		entity = &sync_bfqq->entity;
 		if (entity->sched_data != &bfqg->sched_data)
 			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
@@ -359,74 +631,39 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
 	return bfqg;
 }
 
-/**
- * bfq_bic_change_cgroup - move @bic to @cgroup.
- * @bic: the bic being migrated.
- * @cgroup: the destination cgroup.
- *
- * When the task owning @bic is moved to @cgroup, @bic is immediately
- * moved into its new parent group.
- */
-static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
-				  struct cgroup_subsys_state *css)
-{
-	struct bfq_data *bfqd;
-	unsigned long uninitialized_var(flags);
-
-	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
-				   &flags);
-	if (bfqd != NULL) {
-		__bfq_bic_change_cgroup(bfqd, bic, css);
-		bfq_put_bfqd_unlock(bfqd, &flags);
-	}
-}
-
-/**
- * bfq_bic_update_cgroup - update the cgroup of @bic.
- * @bic: the @bic to update.
- *
- * Make sure that @bic is enqueued in the cgroup of the current task.
- * We need this in addition to moving bics during the cgroup attach
- * phase because the task owning @bic could be at its first disk
- * access or we may end up in the root cgroup as the result of a
- * memory allocation failure and here we try to move to the right
- * group.
- *
- * Must be called under the queue lock.  It is safe to use the returned
- * value even after the rcu_read_unlock() as the migration/destruction
- * paths act under the queue lock too.  IOW it is impossible to race with
- * group migration/destruction and end up with an invalid group as:
- *   a) here cgroup has not yet been destroyed, nor its destroy callback
- *      has started execution, as current holds a reference to it,
- *   b) if it is destroyed after rcu_read_unlock() [after current is
- *      migrated to a different cgroup] its attach() callback will have
- *      taken care of remove all the references to the old cgroup data.
- */
-static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 {
 	struct bfq_data *bfqd = bic_to_bfqd(bic);
-	struct bfq_group *bfqg;
-	struct cgroup_subsys_state *css;
-
-	BUG_ON(bfqd == NULL);
+	struct blkcg *blkcg;
+	struct bfq_group *bfqg = NULL;
+	uint64_t id;
 
 	rcu_read_lock();
-	css = task_css(current, bfqio_cgrp_id);
-	bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
+	blkcg = bio_blkcg(bio);
+	id = blkcg->css.serial_nr;
 	rcu_read_unlock();
 
-	return bfqg;
+	/*
+	 * Check whether blkcg has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!bfqd) || likely(bic->blkcg_id == id))
+		return;
+
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);
+	BUG_ON(!bfqg);
+	bic->blkcg_id = id;
 }
 
 /**
  * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
  * @st: the service tree being flushed.
  */
-static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
 {
 	struct bfq_entity *entity = st->first_idle;
 
-	for (; entity != NULL; entity = st->first_idle)
+	for (; entity ; entity = st->first_idle)
 		__bfq_deactivate_entity(entity, 0);
 }
 
@@ -435,12 +672,12 @@ static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
  * @bfqd: the device data structure with the root group.
  * @entity: the entity to move.
  */
-static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
-					    struct bfq_entity *entity)
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+				     struct bfq_entity *entity)
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
-	BUG_ON(bfqq == NULL);
+	BUG_ON(!bfqq);
 	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
 	return;
 }
@@ -454,9 +691,9 @@ static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
  *
  * Needs queue_lock to be taken and reference to be valid over the call.
  */
-static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
-						struct bfq_group *bfqg,
-						struct bfq_service_tree *st)
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg,
+					 struct bfq_service_tree *st)
 {
 	struct rb_root *active = &st->active;
 	struct bfq_entity *entity = NULL;
@@ -464,10 +701,10 @@ static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
 	if (!RB_EMPTY_ROOT(&st->active))
 		entity = bfq_entity_of(rb_first(active));
 
-	for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
+	for (; entity ; entity = bfq_entity_of(rb_first(active)))
 		bfq_reparent_leaf_entity(bfqd, entity);
 
-	if (bfqg->sched_data.in_service_entity != NULL)
+	if (bfqg->sched_data.in_service_entity)
 		bfq_reparent_leaf_entity(bfqd,
 			bfqg->sched_data.in_service_entity);
 
@@ -476,20 +713,21 @@ static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
 
 /**
  * bfq_destroy_group - destroy @bfqg.
- * @bgrp: the bfqio_cgroup containing @bfqg.
  * @bfqg: the group being destroyed.
  *
  * Destroy @bfqg, making sure that it is not referenced from its parent.
+ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic
  */
-static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
+static void bfq_pd_offline(struct blkg_policy_data *pd)
 {
-	struct bfq_data *bfqd;
 	struct bfq_service_tree *st;
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	struct bfq_data *bfqd = bfqg->bfqd;
 	struct bfq_entity *entity = bfqg->my_entity;
-	unsigned long uninitialized_var(flags);
 	int i;
 
-	hlist_del(&bfqg->group_node);
+	if (!entity) /* root group */
+		return;
 
 	/*
 	 * Empty all service_trees belonging to this group before
@@ -518,37 +756,19 @@ static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
 		 * There is no need to put the sync queues, as the
 		 * scheduler has taken no reference.
 		 */
-		bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
-		if (bfqd != NULL) {
-			bfq_reparent_active_entities(bfqd, bfqg, st);
-			bfq_put_bfqd_unlock(bfqd, &flags);
-		}
+		bfq_reparent_active_entities(bfqd, bfqg, st);
 		BUG_ON(!RB_EMPTY_ROOT(&st->active));
 		BUG_ON(!RB_EMPTY_ROOT(&st->idle));
 	}
-	BUG_ON(bfqg->sched_data.next_in_service != NULL);
-	BUG_ON(bfqg->sched_data.in_service_entity != NULL);
+	BUG_ON(bfqg->sched_data.next_in_service);
+	BUG_ON(bfqg->sched_data.in_service_entity);
 
-	/*
-	 * We may race with device destruction, take extra care when
-	 * dereferencing bfqg->bfqd.
-	 */
-	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
-	if (bfqd != NULL) {
-		hlist_del(&bfqg->bfqd_node);
-		__bfq_deactivate_entity(entity, 0);
-		bfq_put_async_queues(bfqd, bfqg);
-		bfq_put_bfqd_unlock(bfqd, &flags);
-	}
-	BUG_ON(entity->tree != NULL);
+	hlist_del(&bfqg->bfqd_node);
+	__bfq_deactivate_entity(entity, 0);
+	bfq_put_async_queues(bfqd, bfqg);
+	BUG_ON(entity->tree);
 
-	/*
-	 * No need to defer the kfree() to the end of the RCU grace
-	 * period: we are called from the destroy() callback of our
-	 * cgroup, so we can be sure that no one is a) still using
-	 * this cgroup or b) doing lookups in it.
-	 */
-	kfree(bfqg);
+	bfqg_stats_xfer_dead(bfqg);
 }
 
 static void bfq_end_wr_async(struct bfq_data *bfqd)
@@ -595,312 +815,362 @@ static void bfq_disconnect_groups(struct bfq_data *bfqd)
 	}
 }
 
-static inline void bfq_free_root_group(struct bfq_data *bfqd)
+static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
+				       struct cftype *cftype)
 {
-	struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
-	struct bfq_group *bfqg = bfqd->root_group;
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	int ret = -EINVAL;
 
-	bfq_put_async_queues(bfqd, bfqg);
-
-	spin_lock_irq(&bgrp->lock);
-	hlist_del_rcu(&bfqg->group_node);
-	spin_unlock_irq(&bgrp->lock);
+	spin_lock_irq(&blkcg->lock);
+	ret = bfqgd->weight;
+	spin_unlock_irq(&blkcg->lock);
 
-	/*
-	 * No need to synchronize_rcu() here: since the device is gone
-	 * there cannot be any read-side access to its root_group.
-	 */
-	kfree(bfqg);
+	return ret;
 }
 
-static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
+static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v)
 {
-	struct bfq_group *bfqg;
-	struct bfqio_cgroup *bgrp;
-	int i;
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
 
-	bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
-	if (bfqg == NULL)
-		return NULL;
+	spin_lock_irq(&blkcg->lock);
+	seq_printf(sf, "%u\n", bfqgd->weight);
+	spin_unlock_irq(&blkcg->lock);
 
-	bfqg->entity.parent = NULL;
-	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+	return 0;
+}
 
-	bgrp = &bfqio_root_cgroup;
-	spin_lock_irq(&bgrp->lock);
-	rcu_assign_pointer(bfqg->bfqd, bfqd);
-	hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
-	spin_unlock_irq(&bgrp->lock);
+static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
+					struct cftype *cftype,
+					u64 val)
+{
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	struct blkcg_gq *blkg;
+	int ret = -EINVAL;
+
+	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+		return ret;
+
+	ret = 0;
+	spin_lock_irq(&blkcg->lock);
+	bfqgd->weight = (unsigned short)val;
+	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+		if (!bfqg)
+			continue;
+		/*
+		 * Setting the prio_changed flag of the entity
+		 * to 1 with new_weight == weight would re-set
+		 * the value of the weight to its ioprio mapping.
+		 * Set the flag only if necessary.
+		 */
+		if ((unsigned short)val != bfqg->entity.new_weight) {
+			bfqg->entity.new_weight = (unsigned short)val;
+			/*
+			 * Make sure that the above new value has been
+			 * stored in bfqg->entity.new_weight before
+			 * setting the prio_changed flag. In fact,
+			 * this flag may be read asynchronously (in
+			 * critical sections protected by a different
+			 * lock than that held here), and finding this
+			 * flag set may cause the execution of the code
+			 * for updating parameters whose value may
+			 * depend also on bfqg->entity.new_weight (in
+			 * __bfq_entity_update_weight_prio).
+			 * This barrier makes sure that the new value
+			 * of bfqg->entity.new_weight is correctly
+			 * seen in that code.
+			 */
+			smp_wmb();
+			bfqg->entity.prio_changed = 1;
+		}
+	}
+	spin_unlock_irq(&blkcg->lock);
 
-	return bfqg;
+	return ret;
 }
 
-#define SHOW_FUNCTION(__VAR)						\
-static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
-				       struct cftype *cftype)		\
-{									\
-	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\
-	u64 ret = -ENODEV;						\
-									\
-	mutex_lock(&bfqio_mutex);					\
-	if (bfqio_is_removed(bgrp))					\
-		goto out_unlock;					\
-									\
-	spin_lock_irq(&bgrp->lock);					\
-	ret = bgrp->__VAR;						\
-	spin_unlock_irq(&bgrp->lock);					\
-									\
-out_unlock:								\
-	mutex_unlock(&bfqio_mutex);					\
-	return ret;							\
-}
-
-SHOW_FUNCTION(weight);
-SHOW_FUNCTION(ioprio);
-SHOW_FUNCTION(ioprio_class);
-#undef SHOW_FUNCTION
-
-#define STORE_FUNCTION(__VAR, __MIN, __MAX)				\
-static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
-					struct cftype *cftype,		\
-					u64 val)			\
-{									\
-	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\
-	struct bfq_group *bfqg;						\
-	int ret = -EINVAL;						\
-									\
-	if (val < (__MIN) || val > (__MAX))				\
-		return ret;						\
-									\
-	ret = -ENODEV;							\
-	mutex_lock(&bfqio_mutex);					\
-	if (bfqio_is_removed(bgrp))					\
-		goto out_unlock;					\
-	ret = 0;							\
-									\
-	spin_lock_irq(&bgrp->lock);					\
-	bgrp->__VAR = (unsigned short)val;				\
-	hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) {	\
-		/*							\
-		 * Setting the ioprio_changed flag of the entity        \
-		 * to 1 with new_##__VAR == ##__VAR would re-set        \
-		 * the value of the weight to its ioprio mapping.       \
-		 * Set the flag only if necessary.			\
-		 */							\
-		if ((unsigned short)val != bfqg->entity.new_##__VAR) {  \
-			bfqg->entity.new_##__VAR = (unsigned short)val; \
-			/*						\
-			 * Make sure that the above new value has been	\
-			 * stored in bfqg->entity.new_##__VAR before	\
-			 * setting the ioprio_changed flag. In fact,	\
-			 * this flag may be read asynchronously (in	\
-			 * critical sections protected by a different	\
-			 * lock than that held here), and finding this	\
-			 * flag set may cause the execution of the code	\
-			 * for updating parameters whose value may	\
-			 * depend also on bfqg->entity.new_##__VAR (in	\
-			 * __bfq_entity_update_weight_prio).		\
-			 * This barrier makes sure that the new value	\
-			 * of bfqg->entity.new_##__VAR is correctly	\
-			 * seen in that code.				\
-			 */						\
-			smp_wmb();                                      \
-			bfqg->entity.ioprio_changed = 1;                \
-		}							\
-	}								\
-	spin_unlock_irq(&bgrp->lock);					\
-									\
-out_unlock:								\
-	mutex_unlock(&bfqio_mutex);					\
-	return ret;							\
-}
-
-STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
-STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
-STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
-#undef STORE_FUNCTION
-
-static struct cftype bfqio_files[] = {
-	{
-		.name = "weight",
-		.read_u64 = bfqio_cgroup_weight_read,
-		.write_u64 = bfqio_cgroup_weight_write,
-	},
-	{
-		.name = "ioprio",
-		.read_u64 = bfqio_cgroup_ioprio_read,
-		.write_u64 = bfqio_cgroup_ioprio_write,
-	},
-	{
-		.name = "ioprio_class",
-		.read_u64 = bfqio_cgroup_ioprio_class_read,
-		.write_u64 = bfqio_cgroup_ioprio_class_write,
-	},
-	{ },	/* terminate */
-};
+static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of,
+					     char *buf, size_t nbytes,
+					     loff_t off)
+{
+	/* First unsigned long found in the file is used */
+	return bfqio_cgroup_weight_write(of_css(of), NULL,
+					 simple_strtoull(strim(buf), NULL, 0));
+}
 
-static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
-						*parent_css)
+static int bfqg_print_stat(struct seq_file *sf, void *v)
 {
-	struct bfqio_cgroup *bgrp;
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
+	return 0;
+}
 
-	if (parent_css != NULL) {
-		bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
-		if (bgrp == NULL)
-			return ERR_PTR(-ENOMEM);
-	} else
-		bgrp = &bfqio_root_cgroup;
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
+	return 0;
+}
 
-	spin_lock_init(&bgrp->lock);
-	INIT_HLIST_HEAD(&bgrp->group_data);
-	bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
-	bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 sum = bfqg_stat_pd_recursive_sum(pd, off);
 
-	return &bgrp->css;
+	return __blkg_prfill_u64(sf, pd, sum);
 }
 
-/*
- * We cannot support shared io contexts, as we have no means to support
- * two tasks with the same ioc in two different groups without major rework
- * of the main bic/bfqq data structures.  By now we allow a task to change
- * its cgroup only if it's the only owner of its ioc; the drawback of this
- * behavior is that a group containing a task that forked using CLONE_IO
- * will not be destroyed until the tasks sharing the ioc die.
- */
-static int bfqio_can_attach(struct cgroup_subsys_state *css,
-			    struct cgroup_taskset *tset)
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+					struct blkg_policy_data *pd, int off)
 {
-	struct task_struct *task;
-	struct io_context *ioc;
-	int ret = 0;
+	struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);
 
-	cgroup_taskset_for_each(task, tset) {
-		/*
-		 * task_lock() is needed to avoid races with
-		 * exit_io_context()
-		 */
-		task_lock(task);
-		ioc = task->io_context;
-		if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
-			/*
-			 * ioc == NULL means that the task is either too
-			 * young or exiting: if it has still no ioc the
-			 * ioc can't be shared, if the task is exiting the
-			 * attach will fail anyway, no matter what we
-			 * return here.
-			 */
-			ret = -EINVAL;
-		task_unlock(task);
-		if (ret)
-			break;
-	}
+	return __blkg_prfill_rwstat(sf, pd, &sum);
+}
 
-	return ret;
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, false);
+	return 0;
 }
 
-static void bfqio_attach(struct cgroup_subsys_state *css,
-			 struct cgroup_taskset *tset)
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
 {
-	struct task_struct *task;
-	struct io_context *ioc;
-	struct io_cq *icq;
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, true);
+	return 0;
+}
 
-	/*
-	 * IMPORTANT NOTE: The move of more than one process at a time to a
-	 * new group has not yet been tested.
-	 */
-	cgroup_taskset_for_each(task, tset) {
-		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
-		if (ioc) {
-			/*
-			 * Handle cgroup change here.
-			 */
-			rcu_read_lock();
-			hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
-				if (!strncmp(
-					icq->q->elevator->type->elevator_name,
-					"bfq", ELV_NAME_MAX))
-					bfq_bic_change_cgroup(icq_to_bic(icq),
-							      css);
-			rcu_read_unlock();
-			put_io_context(ioc);
-		}
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 v = 0;
+
+	if (samples) {
+		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+		v = div64_u64(v, samples);
 	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
 }
 
-static void bfqio_destroy(struct cgroup_subsys_state *css)
+/* print avg_queue_size */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
 {
-	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
-	struct hlist_node *tmp;
-	struct bfq_group *bfqg;
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+			  0, false);
+	return 0;
+}
 
-	/*
-	 * Since we are destroying the cgroup, there are no more tasks
-	 * referencing it, and all the RCU grace periods that may have
-	 * referenced it are ended (as the destruction of the parent
-	 * cgroup is RCU-safe); bgrp->group_data will not be accessed by
-	 * anything else and we don't need any synchronization.
-	 */
-	hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
-		bfq_destroy_group(bgrp, bfqg);
+static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	int ret;
 
-	BUG_ON(!hlist_empty(&bgrp->group_data));
+	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+	if (ret)
+		return NULL;
 
-	kfree(bgrp);
+        return blkg_to_bfqg(bfqd->queue->root_blkg);
 }
 
-static int bfqio_css_online(struct cgroup_subsys_state *css)
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
 {
-	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
-
-	mutex_lock(&bfqio_mutex);
-	bgrp->online = true;
-	mutex_unlock(&bfqio_mutex);
+        struct bfq_group_data *bgd;
 
-	return 0;
+        bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
+        if (!bgd)
+                return NULL;
+        return &bgd->pd;
 }
 
-static void bfqio_css_offline(struct cgroup_subsys_state *css)
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
 {
-	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
-
-	mutex_lock(&bfqio_mutex);
-	bgrp->online = false;
-	mutex_unlock(&bfqio_mutex);
+        kfree(cpd_to_bfqgd(cpd));
 }
 
-struct cgroup_subsys bfqio_cgrp_subsys = {
-	.css_alloc = bfqio_create,
-	.css_online = bfqio_css_online,
-	.css_offline = bfqio_css_offline,
-	.can_attach = bfqio_can_attach,
-	.attach = bfqio_attach,
-	.css_free = bfqio_destroy,
-	.legacy_cftypes = bfqio_files,
+static struct cftype bfqio_files_dfl[] = {
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfqio_cgroup_weight_read_dfl,
+		.write = bfqio_cgroup_weight_write_dfl,
+	},
+	{} /* terminate */
 };
+
+static struct cftype bfqio_files[] = {
+	{
+		.name = "bfq.weight",
+		.read_u64 = bfqio_cgroup_weight_read,
+		.write_u64 = bfqio_cgroup_weight_write,
+	},
+	/* statistics, cover only the tasks in the bfqg */
+	{
+		.name = "bfq.time",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.sectors",
+		.private = offsetof(struct bfq_group, stats.sectors),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.io_service_bytes",
+		.private = offsetof(struct bfq_group, stats.service_bytes),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_serviced",
+		.private = offsetof(struct bfq_group, stats.serviced),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_service_time",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_wait_time",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_merged",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_queued",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat,
+	},
+
+	/* the same statictics which cover the bfqg and its descendants */
+	{
+		.name = "bfq.time_recursive",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.sectors_recursive",
+		.private = offsetof(struct bfq_group, stats.sectors),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.io_service_bytes_recursive",
+		.private = offsetof(struct bfq_group, stats.service_bytes),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_serviced_recursive",
+		.private = offsetof(struct bfq_group, stats.serviced),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_service_time_recursive",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_wait_time_recursive",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_merged_recursive",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_queued_recursive",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.avg_queue_size",
+		.seq_show = bfqg_print_avg_queue_size,
+	},
+	{
+		.name = "bfq.group_wait_time",
+		.private = offsetof(struct bfq_group, stats.group_wait_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.idle_time",
+		.private = offsetof(struct bfq_group, stats.idle_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.empty_time",
+		.private = offsetof(struct bfq_group, stats.empty_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.dequeue",
+		.private = offsetof(struct bfq_group, stats.dequeue),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.unaccounted_time",
+		.private = offsetof(struct bfq_group, stats.unaccounted_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{ }	/* terminate */
+};
+
+static struct blkcg_policy blkcg_policy_bfq = {
+       .dfl_cftypes            = bfqio_files_dfl,
+       .legacy_cftypes         = bfqio_files,
+
+       .pd_alloc_fn            = bfq_pd_alloc,
+       .pd_init_fn             = bfq_pd_init,
+       .pd_offline_fn          = bfq_pd_offline,
+       .pd_free_fn             = bfq_pd_free,
+       .pd_reset_stats_fn      = bfq_pd_reset_stats,
+
+       .cpd_alloc_fn           = bfq_cpd_alloc,
+       .cpd_init_fn            = bfq_cpd_init,
+       .cpd_bind_fn	       = bfq_cpd_init,
+       .cpd_free_fn            = bfq_cpd_free,
+
+};
+
 #else
-static inline void bfq_init_entity(struct bfq_entity *entity,
-				   struct bfq_group *bfqg)
+
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
 {
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 	entity->weight = entity->new_weight;
 	entity->orig_weight = entity->new_weight;
-	entity->ioprio = entity->new_ioprio;
-	entity->ioprio_class = entity->new_ioprio_class;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+	}
 	entity->sched_data = &bfqg->sched_data;
 }
 
-static inline struct bfq_group *
-bfq_bic_update_cgroup(struct bfq_io_cq *bic)
+static struct bfq_group *
+bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 {
 	struct bfq_data *bfqd = bic_to_bfqd(bic);
 	return bfqd->root_group;
 }
 
-static inline void bfq_bfqq_move(struct bfq_data *bfqd,
-				 struct bfq_queue *bfqq,
-				 struct bfq_entity *entity,
-				 struct bfq_group *bfqg)
+static void bfq_bfqq_move(struct bfq_data *bfqd,
+			  struct bfq_queue *bfqq,
+			  struct bfq_entity *entity,
+			  struct bfq_group *bfqg)
 {
 }
 
@@ -909,23 +1179,24 @@ static void bfq_end_wr_async(struct bfq_data *bfqd)
 	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
 }
 
-static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
+static void bfq_disconnect_groups(struct bfq_data *bfqd)
 {
 	bfq_put_async_queues(bfqd, bfqd->root_group);
 }
 
-static inline void bfq_free_root_group(struct bfq_data *bfqd)
+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
+                                              struct blkcg *blkcg)
 {
-	kfree(bfqd->root_group);
+	return bfqd->root_group;
 }
 
-static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
+static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
 {
 	struct bfq_group *bfqg;
 	int i;
 
 	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
-	if (bfqg == NULL)
+	if (!bfqg)
 		return NULL;
 
 	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
index 7f6b0004c..fb7bb8f08 100644
--- a/block/bfq-ioc.c
+++ b/block/bfq-ioc.c
@@ -14,7 +14,7 @@
  * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
  * @icq: the iocontext queue.
  */
-static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
 {
 	/* bic->icq is the first member, %NULL will convert to %NULL */
 	return container_of(icq, struct bfq_io_cq, icq);
@@ -27,8 +27,8 @@ static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
  *
  * Queue lock must be held.
  */
-static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
-					       struct io_context *ioc)
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+					struct io_context *ioc)
 {
 	if (ioc)
 		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 71b51c1b4..dbce1f83f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -82,6 +82,9 @@ static const int bfq_back_penalty = 2;
 /* Idling period duration, in jiffies. */
 static int bfq_slice_idle = HZ / 125;
 
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
 /* Default maximum budget values, in sectors and number of requests. */
 static const int bfq_default_max_budget = 16 * 1024;
 static const int bfq_max_budget_async_rq = 4;
@@ -163,38 +166,22 @@ static int device_speed_thresh[2];
 #define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
 #define RQ_BFQQ(rq)		((rq)->elv.priv[1])
 
-static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
+static void bfq_schedule_dispatch(struct bfq_data *bfqd);
 
 #include "bfq-ioc.c"
 #include "bfq-sched.c"
 #include "bfq-cgroup.c"
 
-#define bfq_class_idle(bfqq)	((bfqq)->entity.ioprio_class ==\
-				 IOPRIO_CLASS_IDLE)
-#define bfq_class_rt(bfqq)	((bfqq)->entity.ioprio_class ==\
-				 IOPRIO_CLASS_RT)
+#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
 
 #define bfq_sample_valid(samples)	((samples) > 80)
 
 /*
- * The following macro groups conditions that need to be evaluated when
- * checking if existing queues and groups form a symmetric scenario
- * and therefore idling can be reduced or disabled for some of the
- * queues. See the comment to the function bfq_bfqq_must_not_expire()
- * for further details.
- */
-#ifdef CONFIG_CGROUP_BFQIO
-#define symmetric_scenario	  (!bfqd->active_numerous_groups && \
-				   !bfq_differentiated_weights(bfqd))
-#else
-#define symmetric_scenario	  (!bfq_differentiated_weights(bfqd))
-#endif
-
-/*
  * We regard a request as SYNC, if either it's a read or has the SYNC bit
  * set (in which case it could also be a direct WRITE).
  */
-static inline int bfq_bio_sync(struct bio *bio)
+static int bfq_bio_sync(struct bio *bio)
 {
 	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
 		return 1;
@@ -206,7 +193,7 @@ static inline int bfq_bio_sync(struct bio *bio)
  * Scheduler run of queue, if there are requests pending and no one in the
  * driver that will restart queueing.
  */
-static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
+static void bfq_schedule_dispatch(struct bfq_data *bfqd)
 {
 	if (bfqd->queued != 0) {
 		bfq_log(bfqd, "schedule dispatch");
@@ -230,9 +217,9 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
 #define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
 	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
 
-	if (rq1 == NULL || rq1 == rq2)
+	if (!rq1 || rq1 == rq2)
 		return rq2;
-	if (rq2 == NULL)
+	if (!rq2)
 		return rq1;
 
 	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
@@ -345,17 +332,17 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
 
 	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
 		(long long unsigned)sector,
-		bfqq != NULL ? bfqq->pid : 0);
+		bfqq ? bfqq->pid : 0);
 
 	return bfqq;
 }
 
-static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	struct rb_node **p, *parent;
 	struct bfq_queue *__bfqq;
 
-	if (bfqq->pos_root != NULL) {
+	if (bfqq->pos_root) {
 		rb_erase(&bfqq->pos_node, bfqq->pos_root);
 		bfqq->pos_root = NULL;
 	}
@@ -365,10 +352,10 @@ static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	if (!bfqq->next_rq)
 		return;
 
-	bfqq->pos_root = &bfqd->rq_pos_tree;
+	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
 	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
 			blk_rq_pos(bfqq->next_rq), &parent, &p);
-	if (__bfqq == NULL) {
+	if (!__bfqq) {
 		rb_link_node(&bfqq->pos_node, parent, p);
 		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
 	} else
@@ -378,7 +365,7 @@ static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 /*
  * Tell whether there are active queues or groups with differentiated weights.
  */
-static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
 {
 	/*
 	 * For weights to differ, at least one of the trees must contain
@@ -387,7 +374,7 @@ static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
 	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
 		(bfqd->queue_weights_tree.rb_node->rb_left ||
 		 bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	       ) ||
 	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
 		(bfqd->group_weights_tree.rb_node->rb_left ||
@@ -397,6 +384,40 @@ static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
 }
 
 /*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ *    number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+	return
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		!bfqd->active_numerous_groups &&
+#endif
+		!bfq_differentiated_weights(bfqd);
+}
+
+/*
  * If the weight-counter tree passed as input contains no counter for
  * the weight of the input entity, then add that counter; otherwise just
  * increment the existing counter.
@@ -495,10 +516,10 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
 
 	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
 
-	if (rbprev != NULL)
+	if (rbprev)
 		prev = rb_entry_rq(rbprev);
 
-	if (rbnext != NULL)
+	if (rbnext)
 		next = rb_entry_rq(rbnext);
 	else {
 		rbnext = rb_first(&bfqq->sort_list);
@@ -510,8 +531,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
 }
 
 /* see the definition of bfq_async_charge_factor for details */
-static inline unsigned long bfq_serv_to_charge(struct request *rq,
-					       struct bfq_queue *bfqq)
+static unsigned long bfq_serv_to_charge(struct request *rq,
+					struct bfq_queue *bfqq)
 {
 	return blk_rq_sectors(rq) *
 		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
@@ -537,7 +558,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 	struct request *next_rq = bfqq->next_rq;
 	unsigned long new_budget;
 
-	if (next_rq == NULL)
+	if (!next_rq)
 		return;
 
 	if (bfqq == bfqd->in_service_queue)
@@ -560,7 +581,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 	}
 }
 
-static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
 {
 	u64 dur;
 
@@ -573,13 +594,12 @@ static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
 	return dur;
 }
 
-static inline unsigned
-bfq_bfqq_cooperations(struct bfq_queue *bfqq)
+static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq)
 {
 	return bfqq->bic ? bfqq->bic->cooperations : 0;
 }
 
-static inline void
+static void
 bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 {
 	if (bic->saved_idle_window)
@@ -603,7 +623,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
 		bfqq->wr_cur_max_time = bic->wr_time_left;
 		bfqq->last_wr_start_finish = jiffies;
-		bfqq->entity.ioprio_changed = 1;
+		bfqq->entity.prio_changed = 1;
 	}
 	/*
 	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from
@@ -613,11 +633,12 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 	bic->wr_time_left = 0;
 }
 
-/* Must be called with the queue_lock held. */
 static int bfqq_process_refs(struct bfq_queue *bfqq)
 {
 	int process_refs, io_refs;
 
+	lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
+
 	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
 	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
 	BUG_ON(process_refs < 0);
@@ -625,8 +646,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq)
 }
 
 /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
-static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
-					struct bfq_queue *bfqq)
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	struct bfq_queue *item;
 	struct hlist_node *n;
@@ -858,14 +878,14 @@ static void bfq_add_request(struct request *rq)
 	 */
 	prev = bfqq->next_rq;
 	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
-	BUG_ON(next_rq == NULL);
+	BUG_ON(!next_rq);
 	bfqq->next_rq = next_rq;
 
 	/*
 	 * Adjust priority tree position, if next_rq changes.
 	 */
 	if (prev != bfqq->next_rq)
-		bfq_rq_pos_tree_add(bfqd, bfqq);
+		bfq_pos_tree_add_move(bfqd, bfqq);
 
 	if (!bfq_bfqq_busy(bfqq)) {
 		bool soft_rt, coop_or_in_burst,
@@ -873,6 +893,10 @@ static void bfq_add_request(struct request *rq)
 						bfqq->budget_timeout +
 						bfqd->bfq_wr_min_idle_time);
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
+					 rq->cmd_flags);
+#endif
 		if (bfq_bfqq_sync(bfqq)) {
 			bool already_in_burst =
 			   !hlist_unhashed(&bfqq->burst_list_node) ||
@@ -917,7 +941,7 @@ static void bfq_add_request(struct request *rq)
 			goto add_bfqq_busy;
 
 		if (bfq_bfqq_just_split(bfqq))
-			goto set_ioprio_changed;
+			goto set_prio_changed;
 
 		/*
 		 * If the queue:
@@ -929,7 +953,7 @@ static void bfq_add_request(struct request *rq)
 		 * start a weight-raising period.
 		 */
 		if (old_wr_coeff == 1 && (interactive || soft_rt) &&
-		    (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
+		    (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {
 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
 			if (interactive)
 				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
@@ -1008,9 +1032,9 @@ static void bfq_add_request(struct request *rq)
 					bfqd->bfq_wr_rt_max_time;
 			}
 		}
-set_ioprio_changed:
+set_prio_changed:
 		if (old_wr_coeff != bfqq->wr_coeff)
-			entity->ioprio_changed = 1;
+			entity->prio_changed = 1;
 add_bfqq_busy:
 		bfqq->last_idle_bklogged = jiffies;
 		bfqq->service_from_backlogged = 0;
@@ -1025,7 +1049,7 @@ add_bfqq_busy:
 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
 
 			bfqd->wr_busy_queues++;
-			entity->ioprio_changed = 1;
+			entity->prio_changed = 1;
 			bfq_log_bfqq(bfqd, bfqq,
 			    "non-idle wrais starting at %lu, rais_max_time %u",
 			    jiffies,
@@ -1048,11 +1072,11 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
 	struct bfq_queue *bfqq;
 
 	bic = bfq_bic_lookup(bfqd, tsk->io_context);
-	if (bic == NULL)
+	if (!bic)
 		return NULL;
 
 	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
-	if (bfqq != NULL)
+	if (bfqq)
 		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
 
 	return NULL;
@@ -1068,8 +1092,7 @@ static void bfq_activate_request(struct request_queue *q, struct request *rq)
 		(long long unsigned)bfqd->last_position);
 }
 
-static inline void bfq_deactivate_request(struct request_queue *q,
-					  struct request *rq)
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
 {
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 
@@ -1101,7 +1124,7 @@ static void bfq_remove_request(struct request *rq)
 		/*
 		 * Remove queue from request-position tree as it is empty.
 		 */
-		if (bfqq->pos_root != NULL) {
+		if (bfqq->pos_root) {
 			rb_erase(&bfqq->pos_node, bfqq->pos_root);
 			bfqq->pos_root = NULL;
 		}
@@ -1111,6 +1134,9 @@ static void bfq_remove_request(struct request *rq)
 		BUG_ON(bfqq->meta_pending == 0);
 		bfqq->meta_pending--;
 	}
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
+#endif
 }
 
 static int bfq_merge(struct request_queue *q, struct request **req,
@@ -1120,7 +1146,7 @@ static int bfq_merge(struct request_queue *q, struct request **req,
 	struct request *__rq;
 
 	__rq = bfq_find_rq_fmerge(bfqd, bio);
-	if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
+	if (__rq && elv_rq_merge_ok(__rq, bio)) {
 		*req = __rq;
 		return ELEVATOR_FRONT_MERGE;
 	}
@@ -1147,7 +1173,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,
 		prev = bfqq->next_rq;
 		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
 					 bfqd->last_position);
-		BUG_ON(next_rq == NULL);
+		BUG_ON(!next_rq);
 		bfqq->next_rq = next_rq;
 		/*
 		 * If next_rq changes, update both the queue's budget to
@@ -1156,11 +1182,19 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,
 		 */
 		if (prev != bfqq->next_rq) {
 			bfq_updated_next_req(bfqd, bfqq);
-			bfq_rq_pos_tree_add(bfqd, bfqq);
+			bfq_pos_tree_add_move(bfqd, bfqq);
 		}
 	}
 }
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfq_bio_merged(struct request_queue *q, struct request *req,
+			   struct bio *bio)
+{
+	bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw);
+}
+#endif
+
 static void bfq_merged_requests(struct request_queue *q, struct request *rq,
 				struct request *next)
 {
@@ -1187,18 +1221,21 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq,
 		bfqq->next_rq = rq;
 
 	bfq_remove_request(next);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+#endif
 }
 
 /* Must be called with bfqq != NULL */
-static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
 {
-	BUG_ON(bfqq == NULL);
+	BUG_ON(!bfqq);
 	if (bfq_bfqq_busy(bfqq))
 		bfqq->bfqd->wr_busy_queues--;
 	bfqq->wr_coeff = 1;
 	bfqq->wr_cur_max_time = 0;
 	/* Trigger a weight change on the next activation of the queue */
-	bfqq->entity.ioprio_changed = 1;
+	bfqq->entity.prio_changed = 1;
 }
 
 static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
@@ -1208,9 +1245,9 @@ static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
 
 	for (i = 0; i < 2; i++)
 		for (j = 0; j < IOPRIO_BE_NR; j++)
-			if (bfqg->async_bfqq[i][j] != NULL)
+			if (bfqg->async_bfqq[i][j])
 				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
-	if (bfqg->async_idle_bfqq != NULL)
+	if (bfqg->async_idle_bfqq)
 		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
 }
 
@@ -1229,7 +1266,7 @@ static void bfq_end_wr(struct bfq_data *bfqd)
 	spin_unlock_irq(bfqd->queue->queue_lock);
 }
 
-static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
+static sector_t bfq_io_struct_pos(void *io_struct, bool request)
 {
 	if (request)
 		return blk_rq_pos(io_struct);
@@ -1237,25 +1274,18 @@ static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
 		return ((struct bio *)io_struct)->bi_iter.bi_sector;
 }
 
-static inline sector_t bfq_dist_from(sector_t pos1,
-				     sector_t pos2)
-{
-	if (pos1 >= pos2)
-		return pos1 - pos2;
-	else
-		return pos2 - pos1;
-}
-
-static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
-					 sector_t sector)
+static int bfq_rq_close_to_sector(void *io_struct, bool request,
+				  sector_t sector)
 {
-	return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
+	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
 	       BFQQ_SEEK_THR;
 }
 
-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
+					 struct bfq_queue *bfqq,
+					 sector_t sector)
 {
-	struct rb_root *root = &bfqd->rq_pos_tree;
+	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
 	struct rb_node *parent, *node;
 	struct bfq_queue *__bfqq;
 
@@ -1267,7 +1297,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
 	 * request, choose it.
 	 */
 	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
-	if (__bfqq != NULL)
+	if (__bfqq)
 		return __bfqq;
 
 	/*
@@ -1283,7 +1313,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
 		node = rb_next(&__bfqq->pos_node);
 	else
 		node = rb_prev(&__bfqq->pos_node);
-	if (node == NULL)
+	if (!node)
 		return NULL;
 
 	__bfqq = rb_entry(node, struct bfq_queue, pos_node);
@@ -1293,56 +1323,21 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
 	return NULL;
 }
 
-/*
- * bfqd - obvious
- * cur_bfqq - passed in so that we don't decide that the current queue
- *            is closely cooperating with itself
- * sector - used as a reference point to search for a close queue
- */
-static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
-					      struct bfq_queue *cur_bfqq,
-					      sector_t sector)
+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
+						   struct bfq_queue *cur_bfqq,
+						   sector_t sector)
 {
 	struct bfq_queue *bfqq;
 
-	if (bfq_class_idle(cur_bfqq))
-		return NULL;
-	if (!bfq_bfqq_sync(cur_bfqq))
-		return NULL;
-	if (BFQQ_SEEKY(cur_bfqq))
-		return NULL;
-
-	/* If device has only one backlogged bfq_queue, don't search. */
-	if (bfqd->busy_queues == 1)
-		return NULL;
-
-	/*
-	 * We should notice if some of the queues are cooperating, e.g.
-	 * working closely on the same area of the disk. In that case,
-	 * we can group them together and don't waste time idling.
-	 */
-	bfqq = bfqq_close(bfqd, sector);
-	if (bfqq == NULL || bfqq == cur_bfqq)
-		return NULL;
-
-	/*
-	 * Do not merge queues from different bfq_groups.
-	*/
-	if (bfqq->entity.parent != cur_bfqq->entity.parent)
-		return NULL;
-
 	/*
-	 * It only makes sense to merge sync queues.
+	 * We shall notice if some of the queues are cooperating,
+	 * e.g., working closely on the same area of the device. In
+	 * that case, we can group them together and: 1) don't waste
+	 * time idling, and 2) serve the union of their requests in
+	 * the best possible order for throughput.
 	 */
-	if (!bfq_bfqq_sync(bfqq))
-		return NULL;
-	if (BFQQ_SEEKY(bfqq))
-		return NULL;
-
-	/*
-	 * Do not merge queues of different priority classes.
-	 */
-	if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
+	bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
+	if (!bfqq || bfqq == cur_bfqq)
 		return NULL;
 
 	return bfqq;
@@ -1409,6 +1404,32 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
 	return new_bfqq;
 }
 
+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
+					struct bfq_queue *new_bfqq)
+{
+	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
+	    (bfqq->ioprio_class != new_bfqq->ioprio_class))
+		return false;
+
+	/*
+	 * If either of the queues has already been detected as seeky,
+	 * then merging it with the other queue is unlikely to lead to
+	 * sequential I/O.
+	 */
+	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
+		return false;
+
+	/*
+	 * Interleaved I/O is known to be done by (some) applications
+	 * only for reads, so it does not make sense to merge async
+	 * queues.
+	 */
+	if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
+		return false;
+
+	return true;
+}
+
 /*
  * Attempt to schedule a merge of bfqq with the currently in-service queue
  * or with a close queue among the scheduled queues.
@@ -1430,56 +1451,52 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	if (bfqq->new_bfqq)
 		return bfqq->new_bfqq;
-
 	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
 		return NULL;
+	/* If device has only one backlogged bfq_queue, don't search. */
+	if (bfqd->busy_queues == 1)
+		return NULL;
 
 	in_service_bfqq = bfqd->in_service_queue;
 
-	if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
+	if (!in_service_bfqq || in_service_bfqq == bfqq ||
 	    !bfqd->in_service_bic ||
 	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
 		goto check_scheduled;
 
-	if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
-		goto check_scheduled;
-
-	if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
-		goto check_scheduled;
-
-	if (in_service_bfqq->entity.parent != bfqq->entity.parent)
-		goto check_scheduled;
-
 	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
-	    bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
+	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
+	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
 		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
-		if (new_bfqq != NULL)
-			return new_bfqq; /* Merge with in-service queue */
+		if (new_bfqq)
+			return new_bfqq;
 	}
-
 	/*
 	 * Check whether there is a cooperator among currently scheduled
 	 * queues. The only thing we need is that the bio/request is not
 	 * NULL, as we need it to establish whether a cooperator exists.
 	 */
 check_scheduled:
-	new_bfqq = bfq_close_cooperator(bfqd, bfqq,
-					bfq_io_struct_pos(io_struct, request));
-	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))
+	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
+			bfq_io_struct_pos(io_struct, request));
+
+	BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
+
+	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
+	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
 		return bfq_setup_merge(bfqq, new_bfqq);
 
 	return NULL;
 }
 
-static inline void
-bfq_bfqq_save_state(struct bfq_queue *bfqq)
+static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 {
 	/*
-	 * If bfqq->bic == NULL, the queue is already shared or its requests
+	 * If !bfqq->bic, the queue is already shared or its requests
 	 * have already been redirected to a shared queue; both idle window
 	 * and weight raising state have already been saved. Do nothing.
 	 */
-	if (bfqq->bic == NULL)
+	if (!bfqq->bic)
 		return;
 	if (bfqq->bic->wr_time_left)
 		/*
@@ -1523,8 +1540,7 @@ bfq_bfqq_save_state(struct bfq_queue *bfqq)
 	bfqq->bic->failed_cooperations = 0;
 }
 
-static inline void
-bfq_get_bic_reference(struct bfq_queue *bfqq)
+static void bfq_get_bic_reference(struct bfq_queue *bfqq)
 {
 	/*
 	 * If bfqq->bic has a non-NULL value, the bic to which it belongs
@@ -1572,7 +1588,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 	bfq_put_queue(bfqq);
 }
 
-static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
+static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
 {
 	struct bfq_io_cq *bic = bfqq->bic;
 	struct bfq_data *bfqd = bfqq->bfqd;
@@ -1603,7 +1619,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
 	 * Queue lock is held here.
 	 */
 	bic = bfq_bic_lookup(bfqd, current->io_context);
-	if (bic == NULL)
+	if (!bic)
 		return 0;
 
 	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
@@ -1611,9 +1627,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
 	 * We take advantage of this function to perform an early merge
 	 * of the queues of possible cooperating processes.
 	 */
-	if (bfqq != NULL) {
+	if (bfqq) {
 		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
-		if (new_bfqq != NULL) {
+		if (new_bfqq) {
 			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
 			/*
 			 * If we get here, the bio will be queued in the
@@ -1631,7 +1647,10 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
 static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
 				       struct bfq_queue *bfqq)
 {
-	if (bfqq != NULL) {
+	if (bfqq) {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
+#endif
 		bfq_mark_bfqq_must_alloc(bfqq);
 		bfq_mark_bfqq_budget_new(bfqq);
 		bfq_clear_bfqq_fifo_expire(bfqq);
@@ -1639,7 +1658,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
 		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
 
 		bfq_log_bfqq(bfqd, bfqq,
-			     "set_in_service_queue, cur-budget = %lu",
+			     "set_in_service_queue, cur-budget = %d",
 			     bfqq->entity.budget);
 	}
 
@@ -1662,9 +1681,9 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
  * stored in bfqd, which is dynamically updated according to the
  * estimated disk peak rate; otherwise return the default max budget
  */
-static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
+static int bfq_max_budget(struct bfq_data *bfqd)
 {
-	if (bfqd->budgets_assigned < 194)
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
 		return bfq_default_max_budget;
 	else
 		return bfqd->bfq_max_budget;
@@ -1674,9 +1693,9 @@ static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
  * Return min budget, which is a fraction of the current or default
  * max budget (trying with 1/32)
  */
-static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
+static int bfq_min_budget(struct bfq_data *bfqd)
 {
-	if (bfqd->budgets_assigned < 194)
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
 		return bfq_default_max_budget / 32;
 	else
 		return bfqd->bfq_max_budget / 32;
@@ -1692,7 +1711,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 
 	/* Processes have exited, don't wait. */
 	bic = bfqd->in_service_bic;
-	if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
+	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
 		return;
 
 	bfq_mark_bfqq_wait_request(bfqq);
@@ -1718,12 +1737,15 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
 				  bfq_max_budget(bfqq->bfqd) / 8) ||
 	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
-	    symmetric_scenario)
+	    bfq_symmetric_scenario(bfqd))
 		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
 	else if (bfqq->wr_coeff > 1)
 		sl = sl * 3;
 	bfqd->last_idling_start = ktime_get();
 	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
+#endif
 	bfq_log(bfqd, "arm idle: %u/%u ms",
 		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
 }
@@ -1777,6 +1799,10 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
 
 	if (bfq_bfqq_sync(bfqq))
 		bfqd->sync_flight++;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq),
+				   rq->cmd_flags);
+#endif
 }
 
 /*
@@ -1802,7 +1828,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
 	return rq;
 }
 
-static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
 {
 	struct bfq_entity *entity = &bfqq->entity;
 	return entity->budget - entity->service;
@@ -1836,7 +1862,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		/*
 		 * Resort priority tree of potential close cooperators.
 		 */
-		bfq_rq_pos_tree_add(bfqd, bfqq);
+		bfq_pos_tree_add_move(bfqd, bfqq);
 	}
 }
 
@@ -1846,24 +1872,24 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  * @bfqq: queue to update.
  * @reason: reason for expiration.
  *
- * Handle the feedback on @bfqq budget.  See the body for detailed
- * comments.
+ * Handle the feedback on @bfqq budget at queue expiration.
+ * See the body for detailed comments.
  */
 static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 				     struct bfq_queue *bfqq,
 				     enum bfqq_expiration reason)
 {
 	struct request *next_rq;
-	unsigned long budget, min_budget;
+	int budget, min_budget;
 
 	budget = bfqq->max_budget;
 	min_budget = bfq_min_budget(bfqd);
 
 	BUG_ON(bfqq != bfqd->in_service_queue);
 
-	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
 		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
-	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
 		budget, bfq_min_budget(bfqd));
 	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
 		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
@@ -1940,18 +1966,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 		default:
 			return;
 		}
-	} else /* async queue */
-	    /* async queues get always the maximum possible budget
-	     * (their ability to dispatch is limited by
-	     * @bfqd->bfq_max_budget_async_rq).
-	     */
+	} else
+		/*
+		 * Async queues get always the maximum possible budget
+		 * (their ability to dispatch is limited by
+		 * @bfqd->bfq_max_budget_async_rq).
+		 */
 		budget = bfqd->bfq_max_budget;
 
 	bfqq->max_budget = budget;
 
-	if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
-	    bfqq->max_budget > bfqd->bfq_max_budget)
-		bfqq->max_budget = bfqd->bfq_max_budget;
+	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
+	    !bfqd->bfq_user_max_budget)
+		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
 
 	/*
 	 * Make sure that we have enough budget for the next request.
@@ -1960,14 +1987,14 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 	 * update.
 	 */
 	next_rq = bfqq->next_rq;
-	if (next_rq != NULL)
+	if (next_rq)
 		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
 					    bfq_serv_to_charge(next_rq, bfqq));
 	else
 		bfqq->entity.budget = bfqq->max_budget;
 
-	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
-			next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
+	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
+			next_rq ? blk_rq_sectors(next_rq) : 0,
 			bfqq->entity.budget);
 }
 
@@ -1993,15 +2020,15 @@ static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
  * seeky processes, and hence reduce their chances to lower the
  * throughput. See the code for more details.
  */
-static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				int compensate, enum bfqq_expiration reason)
+static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				 bool compensate, enum bfqq_expiration reason)
 {
 	u64 bw, usecs, expected, timeout;
 	ktime_t delta;
 	int update = 0;
 
 	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
-		return 0;
+		return false;
 
 	if (compensate)
 		delta = bfqd->last_idling_start;
@@ -2012,7 +2039,7 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	/* Don't trust short/unrealistic values. */
 	if (usecs < 100 || usecs >= LONG_MAX)
-		return 0;
+		return false;
 
 	/*
 	 * Calculate the bandwidth for the last slice.  We use a 64 bit
@@ -2061,7 +2088,7 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 				bfqd->bfq_max_budget =
 					bfq_calc_max_budget(bfqd->peak_rate,
 							    timeout);
-				bfq_log(bfqd, "new max_budget=%lu",
+				bfq_log(bfqd, "new max_budget=%d",
 					bfqd->bfq_max_budget);
 			}
 			if (bfqd->device_speed == BFQ_BFQD_FAST &&
@@ -2086,7 +2113,7 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	 * and for the moment return false.
 	 */
 	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
-		return 0;
+		return false;
 
 	/*
 	 * A process is considered ``slow'' (i.e., seeky, so that we
@@ -2161,8 +2188,8 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  * seems to be quite precise also in embedded systems and KVM/QEMU virtual
  * machines.
  */
-static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
-						       struct bfq_queue *bfqq)
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+						struct bfq_queue *bfqq)
 {
 	return max(bfqq->last_idle_bklogged +
 		   HZ * bfqq->service_from_backlogged /
@@ -2175,7 +2202,7 @@ static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
  * the current time will be lower than this time instant according to the macro
  * time_is_before_jiffies().
  */
-static inline unsigned long bfq_infinity_from_now(unsigned long now)
+static unsigned long bfq_infinity_from_now(unsigned long now)
 {
 	return now + ULONG_MAX / 2;
 }
@@ -2212,13 +2239,14 @@ static inline unsigned long bfq_infinity_from_now(unsigned long now)
  */
 static void bfq_bfqq_expire(struct bfq_data *bfqd,
 			    struct bfq_queue *bfqq,
-			    int compensate,
+			    bool compensate,
 			    enum bfqq_expiration reason)
 {
-	int slow;
+	bool slow;
 	BUG_ON(bfqq != bfqd->in_service_queue);
 
-	/* Update disk peak rate for autotuning and check whether the
+	/*
+	 * Update disk peak rate for autotuning and check whether the
 	 * process is slow (see bfq_update_peak_rate).
 	 */
 	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
@@ -2312,12 +2340,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
  * just checked on request arrivals and completions, as well as on
  * idle timer expirations.
  */
-static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
 {
 	if (bfq_bfqq_budget_new(bfqq) ||
 	    time_before(jiffies, bfqq->budget_timeout))
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 /*
@@ -2328,7 +2356,7 @@ static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
  * does not hold, or if the queue is slow enough to deserve only to be
  * kicked off for preserving a high throughput.
 */
-static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
 {
 	bfq_log_bfqq(bfqq->bfqd, bfqq,
 		"may_budget_timeout: wait_request %d left %d timeout %d",
@@ -2343,183 +2371,278 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
 }
 
 /*
- * Device idling is allowed only for the queues for which this function
- * returns true. For this reason, the return value of this function plays a
- * critical role for both throughput boosting and service guarantees. The
- * return value is computed through a logical expression. In this rather
- * long comment, we try to briefly describe all the details and motivations
- * behind the components of this logical expression.
- *
- * First, the expression is false if bfqq is not sync, or if: bfqq happened
- * to become active during a large burst of queue activations, and the
- * pattern of requests bfqq contains boosts the throughput if bfqq is
- * expired. In fact, queues that became active during a large burst benefit
- * only from throughput, as discussed in the comments to bfq_handle_burst.
- * In this respect, expiring bfqq certainly boosts the throughput on NCQ-
- * capable flash-based devices, whereas, on rotational devices, it boosts
- * the throughput only if bfqq contains random requests.
- *
- * On the opposite end, if (a) bfqq is sync, (b) the above burst-related
- * condition does not hold, and (c) bfqq is being weight-raised, then the
- * expression always evaluates to true, as device idling is instrumental
- * for preserving low-latency guarantees (see [1]). If, instead, conditions
- * (a) and (b) do hold, but (c) does not, then the expression evaluates to
- * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and
- * (2) at least one of the following two conditions holds.
- * The first condition is that the device is not performing NCQ, because
- * idling the device most certainly boosts the throughput if this condition
- * holds and bfqq is I/O-bound and has been granted a non-null idle window.
- * The second compound condition is made of the logical AND of two components.
- *
- * The first component is true only if there is no weight-raised busy
- * queue. This guarantees that the device is not idled for a sync non-
- * weight-raised queue when there are busy weight-raised queues. The former
- * is then expired immediately if empty. Combined with the timestamping
- * rules of BFQ (see [1] for details), this causes sync non-weight-raised
- * queues to get a lower number of requests served, and hence to ask for a
- * lower number of requests from the request pool, before the busy weight-
- * raised queues get served again.
- *
- * This is beneficial for the processes associated with weight-raised
- * queues, when the request pool is saturated (e.g., in the presence of
- * write hogs). In fact, if the processes associated with the other queues
- * ask for requests at a lower rate, then weight-raised processes have a
- * higher probability to get a request from the pool immediately (or at
- * least soon) when they need one. Hence they have a higher probability to
- * actually get a fraction of the disk throughput proportional to their
- * high weight. This is especially true with NCQ-capable drives, which
- * enqueue several requests in advance and further reorder internally-
- * queued requests.
- *
- * In the end, mistreating non-weight-raised queues when there are busy
- * weight-raised queues seems to mitigate starvation problems in the
- * presence of heavy write workloads and NCQ, and hence to guarantee a
- * higher application and system responsiveness in these hostile scenarios.
- *
- * If the first component of the compound condition is instead true, i.e.,
- * there is no weight-raised busy queue, then the second component of the
- * compound condition takes into account service-guarantee and throughput
- * issues related to NCQ (recall that the compound condition is evaluated
- * only if the device is detected as supporting NCQ).
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for that queue. As a consequence, since
+ * device idling plays a critical role for both throughput boosting
+ * and service guarantees, the return value of this function plays a
+ * critical role as well.
  *
- * As for service guarantees, allowing the drive to enqueue more than one
- * request at a time, and hence delegating de facto final scheduling
- * decisions to the drive's internal scheduler, causes loss of control on
- * the actual request service order. In this respect, when the drive is
- * allowed to enqueue more than one request at a time, the service
- * distribution enforced by the drive's internal scheduler is likely to
- * coincide with the desired device-throughput distribution only in the
- * following, perfectly symmetric, scenario:
- * 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- *    weight,
- * 3) all active groups at the same level in the groups tree have the same
- *    number of children.
- *
- * Even in such a scenario, sequential I/O may still receive a preferential
- * treatment, but this is not likely to be a big issue with flash-based
- * devices, because of their non-dramatic loss of throughput with random
- * I/O. Things do differ with HDDs, for which additional care is taken, as
- * explained after completing the discussion for flash-based devices.
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
  *
- * Unfortunately, keeping the necessary state for evaluating exactly the
- * above symmetry conditions would be quite complex and time-consuming.
- * Therefore BFQ evaluates instead the following stronger sub-conditions,
- * for which it is much easier to maintain the needed state:
- * 1) all active queues have the same weight,
- * 2) all active groups have the same weight,
- * 3) all active groups have at most one active child each.
- * In particular, the last two conditions are always true if hierarchical
- * support and the cgroups interface are not enabled, hence no state needs
- * to be maintained in this case.
- *
- * According to the above considerations, the second component of the
- * compound condition evaluates to true if any of the above symmetry
- * sub-condition does not hold, or the device is not flash-based. Therefore,
- * if also the first component is true, then idling is allowed for a sync
- * queue. These are the only sub-conditions considered if the device is
- * flash-based, as, for such a device, it is sensible to force idling only
- * for service-guarantee issues. In fact, as for throughput, idling
- * NCQ-capable flash-based devices would not boost the throughput even
- * with sequential I/O; rather it would lower the throughput in proportion
- * to how fast the device is. In the end, (only) if all the three
- * sub-conditions hold and the device is flash-based, the compound
- * condition evaluates to false and therefore no idling is performed.
- *
- * As already said, things change with a rotational device, where idling
- * boosts the throughput with sequential I/O (even with NCQ). Hence, for
- * such a device the second component of the compound condition evaluates
- * to true also if the following additional sub-condition does not hold:
- * the queue is constantly seeky. Unfortunately, this different behavior
- * with respect to flash-based devices causes an additional asymmetry: if
- * some sync queues enjoy idling and some other sync queues do not, then
- * the latter get a low share of the device throughput, simply because the
- * former get many requests served after being set as in service, whereas
- * the latter do not. As a consequence, to guarantee the desired throughput
- * distribution, on HDDs the compound expression evaluates to true (and
- * hence device idling is performed) also if the following last symmetry
- * condition does not hold: no other queue is benefiting from idling. Also
- * this last condition is actually replaced with a simpler-to-maintain and
- * stronger condition: there is no busy queue which is not constantly seeky
- * (and hence may also benefit from idling).
- *
- * To sum up, when all the required symmetry and throughput-boosting
- * sub-conditions hold, the second component of the compound condition
- * evaluates to false, and hence no idling is performed. This helps to
- * keep the drives' internal queues full on NCQ-capable devices, and hence
- * to boost the throughput, without causing 'almost' any loss of service
- * guarantees. The 'almost' follows from the fact that, if the internal
- * queue of one such device is filled while all the sub-conditions hold,
- * but at some point in time some sub-condition stops to hold, then it may
- * become impossible to let requests be served in the new desired order
- * until all the requests already queued in the device have been served.
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * while introducing the variables.
  */
-static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
-#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \
-				   bfqd->busy_in_flight_queues == \
-				   bfqd->const_seeky_busy_in_flight_queues)
+	bool idling_boosts_thr, idling_boosts_thr_without_issues,
+		all_queues_seeky, on_hdd_and_not_all_queues_seeky,
+		idling_needed_for_service_guarantees,
+		asymmetric_scenario;
 
-#define cond_for_expiring_in_burst	(bfq_bfqq_in_large_burst(bfqq) && \
-					 bfqd->hw_tag && \
-					 (blk_queue_nonrot(bfqd->queue) || \
-					  bfq_bfqq_constantly_seeky(bfqq)))
+	/*
+	 * The next variable takes into account the cases where idling
+	 * boosts the throughput.
+	 *
+	 * The value of the variable is computed considering, first, that
+	 * idling is virtually always beneficial for the throughput if:
+	 * (a) the device is not NCQ-capable, or
+	 * (b) regardless of the presence of NCQ, the device is rotational
+	 *     and the request pattern for bfqq is I/O-bound and sequential.
+	 *
+	 * Secondly, and in contrast to the above item (b), idling an
+	 * NCQ-capable flash-based device would not boost the
+	 * throughput even with sequential I/O; rather it would lower
+	 * the throughput in proportion to how fast the device
+	 * is. Accordingly, the next variable is true if any of the
+	 * above conditions (a) and (b) is true, and, in particular,
+	 * happens to be false if bfqd is an NCQ-capable flash-based
+	 * device.
+	 */
+	idling_boosts_thr = !bfqd->hw_tag ||
+		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+		 bfq_bfqq_idle_window(bfqq)) ;
 
-/*
- * Condition for expiring a non-weight-raised queue (and hence not idling
- * the device).
- */
-#define cond_for_expiring_non_wr  (bfqd->hw_tag && \
-				   (bfqd->wr_busy_queues > 0 || \
-				    (blk_queue_nonrot(bfqd->queue) || \
-				      cond_for_seeky_on_ncq_hdd)))
+	/*
+	 * The value of the next variable,
+	 * idling_boosts_thr_without_issues, is equal to that of
+	 * idling_boosts_thr, unless a special case holds. In this
+	 * special case, described below, idling may cause problems to
+	 * weight-raised queues.
+	 *
+	 * When the request pool is saturated (e.g., in the presence
+	 * of write hogs), if the processes associated with
+	 * non-weight-raised queues ask for requests at a lower rate,
+	 * then processes associated with weight-raised queues have a
+	 * higher probability to get a request from the pool
+	 * immediately (or at least soon) when they need one. Thus
+	 * they have a higher probability to actually get a fraction
+	 * of the device throughput proportional to their high
+	 * weight. This is especially true with NCQ-capable drives,
+	 * which enqueue several requests in advance, and further
+	 * reorder internally-queued requests.
+	 *
+	 * For this reason, we force to false the value of
+	 * idling_boosts_thr_without_issues if there are weight-raised
+	 * busy queues. In this case, and if bfqq is not weight-raised,
+	 * this guarantees that the device is not idled for bfqq (if,
+	 * instead, bfqq is weight-raised, then idling will be
+	 * guaranteed by another variable, see below). Combined with
+	 * the timestamping rules of BFQ (see [1] for details), this
+	 * behavior causes bfqq, and hence any sync non-weight-raised
+	 * queue, to get a lower number of requests served, and thus
+	 * to ask for a lower number of requests from the request
+	 * pool, before the busy weight-raised queues get served
+	 * again. This often mitigates starvation problems in the
+	 * presence of heavy write workloads and NCQ, thereby
+	 * guaranteeing a higher application and system responsiveness
+	 * in these hostile scenarios.
+	 */
+	idling_boosts_thr_without_issues = idling_boosts_thr &&
+		bfqd->wr_busy_queues == 0;
+
+	/*
+	 * There are then two cases where idling must be performed not
+	 * for throughput concerns, but to preserve service
+	 * guarantees. In the description of these cases, we say, for
+	 * short, that a queue is sequential/random if the process
+	 * associated to the queue issues sequential/random requests
+	 * (in the second case the queue may be tagged as seeky or
+	 * even constantly_seeky).
+	 *
+	 * To introduce the first case, we note that, since
+	 * bfq_bfqq_idle_window(bfqq) is false if the device is
+	 * NCQ-capable and bfqq is random (see
+	 * bfq_update_idle_window()), then, from the above two
+	 * assignments it follows that
+	 * idling_boosts_thr_without_issues is false if the device is
+	 * NCQ-capable and bfqq is random. Therefore, for this case,
+	 * device idling would never be allowed if we used just
+	 * idling_boosts_thr_without_issues to decide whether to allow
+	 * it. And, beneficially, this would imply that throughput
+	 * would always be boosted also with random I/O on NCQ-capable
+	 * HDDs.
+	 *
+	 * But we must be careful on this point, to avoid an unfair
+	 * treatment for bfqq. In fact, because of the same above
+	 * assignments, idling_boosts_thr_without_issues is, on the
+	 * other hand, true if 1) the device is an HDD and bfqq is
+	 * sequential, and 2) there are no busy weight-raised
+	 * queues. As a consequence, if we used just
+	 * idling_boosts_thr_without_issues to decide whether to idle
+	 * the device, then with an HDD we might easily bump into a
+	 * scenario where queues that are sequential and I/O-bound
+	 * would enjoy idling, whereas random queues would not. The
+	 * latter might then get a low share of the device throughput,
+	 * simply because the former would get many requests served
+	 * after being set as in service, while the latter would not.
+	 *
+	 * To address this issue, we start by setting to true a
+	 * sentinel variable, on_hdd_and_not_all_queues_seeky, if the
+	 * device is rotational and not all queues with pending or
+	 * in-flight requests are constantly seeky (i.e., there are
+	 * active sequential queues, and bfqq might then be mistreated
+	 * if it does not enjoy idling because it is random).
+	 */
+	all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&
+			   bfqd->busy_in_flight_queues ==
+			   bfqd->const_seeky_busy_in_flight_queues;
+
+	on_hdd_and_not_all_queues_seeky =
+		!blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;
+
+	/*
+	 * To introduce the second case where idling needs to be
+	 * performed to preserve service guarantees, we can note that
+	 * allowing the drive to enqueue more than one request at a
+	 * time, and hence delegating de facto final scheduling
+	 * decisions to the drive's internal scheduler, causes loss of
+	 * control on the actual request service order. In particular,
+	 * the critical situation is when requests from different
+	 * processes happens to be present, at the same time, in the
+	 * internal queue(s) of the drive. In such a situation, the
+	 * drive, by deciding the service order of the
+	 * internally-queued requests, does determine also the actual
+	 * throughput distribution among these processes. But the
+	 * drive typically has no notion or concern about per-process
+	 * throughput distribution, and makes its decisions only on a
+	 * per-request basis. Therefore, the service distribution
+	 * enforced by the drive's internal scheduler is likely to
+	 * coincide with the desired device-throughput distribution
+	 * only in a completely symmetric scenario where:
+	 * (i)  each of these processes must get the same throughput as
+	 *      the others;
+	 * (ii) all these processes have the same I/O pattern
+	        (either sequential or random).
+	 * In fact, in such a scenario, the drive will tend to treat
+	 * the requests of each of these processes in about the same
+	 * way as the requests of the others, and thus to provide
+	 * each of these processes with about the same throughput
+	 * (which is exactly the desired throughput distribution). In
+	 * contrast, in any asymmetric scenario, device idling is
+	 * certainly needed to guarantee that bfqq receives its
+	 * assigned fraction of the device throughput (see [1] for
+	 * details).
+	 *
+	 * We address this issue by controlling, actually, only the
+	 * symmetry sub-condition (i), i.e., provided that
+	 * sub-condition (i) holds, idling is not performed,
+	 * regardless of whether sub-condition (ii) holds. In other
+	 * words, only if sub-condition (i) holds, then idling is
+	 * allowed, and the device tends to be prevented from queueing
+	 * many requests, possibly of several processes. The reason
+	 * for not controlling also sub-condition (ii) is that, first,
+	 * in the case of an HDD, the asymmetry in terms of types of
+	 * I/O patterns is already taken in to account in the above
+	 * sentinel variable
+	 * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a
+	 * flash-based device, we prefer however to privilege
+	 * throughput (and idling lowers throughput for this type of
+	 * devices), for the following reasons:
+	 * 1) differently from HDDs, the service time of random
+	 *    requests is not orders of magnitudes lower than the service
+	 *    time of sequential requests; thus, even if processes doing
+	 *    sequential I/O get a preferential treatment with respect to
+	 *    others doing random I/O, the consequences are not as
+	 *    dramatic as with HDDs;
+	 * 2) if a process doing random I/O does need strong
+	 *    throughput guarantees, it is hopefully already being
+	 *    weight-raised, or the user is likely to have assigned it a
+	 *    higher weight than the other processes (and thus
+	 *    sub-condition (i) is likely to be false, which triggers
+	 *    idling).
+	 *
+	 * According to the above considerations, the next variable is
+	 * true (only) if sub-condition (i) holds. To compute the
+	 * value of this variable, we not only use the return value of
+	 * the function bfq_symmetric_scenario(), but also check
+	 * whether bfqq is being weight-raised, because
+	 * bfq_symmetric_scenario() does not take into account also
+	 * weight-raised queues (see comments to
+	 * bfq_weights_tree_add()).
+	 *
+	 * As a side note, it is worth considering that the above
+	 * device-idling countermeasures may however fail in the
+	 * following unlucky scenario: if idling is (correctly)
+	 * disabled in a time period during which all symmetry
+	 * sub-conditions hold, and hence the device is allowed to
+	 * enqueue many requests, but at some later point in time some
+	 * sub-condition stops to hold, then it may become impossible
+	 * to let requests be served in the desired order until all
+	 * the requests already queued in the device have been served.
+	 */
+	asymmetric_scenario = bfqq->wr_coeff > 1 ||
+		!bfq_symmetric_scenario(bfqd);
+
+	/*
+	 * Finally, there is a case where maximizing throughput is the
+	 * best choice even if it may cause unfairness toward
+	 * bfqq. Such a case is when bfqq became active in a burst of
+	 * queue activations. Queues that became active during a large
+	 * burst benefit only from throughput, as discussed in the
+	 * comments to bfq_handle_burst. Thus, if bfqq became active
+	 * in a burst and not idling the device maximizes throughput,
+	 * then the device must no be idled, because not idling the
+	 * device provides bfqq and all other queues in the burst with
+	 * maximum benefit. Combining this and the two cases above, we
+	 * can now establish when idling is actually needed to
+	 * preserve service guarantees.
+	 */
+	idling_needed_for_service_guarantees =
+		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&
+		!bfq_bfqq_in_large_burst(bfqq);
 
+	/*
+	 * We have now all the components we need to compute the return
+	 * value of the function, which is true only if both the following
+	 * conditions hold:
+	 * 1) bfqq is sync, because idling make sense only for sync queues;
+	 * 2) idling either boosts the throughput (without issues), or
+	 *    is necessary to preserve service guarantees.
+	 */
 	return bfq_bfqq_sync(bfqq) &&
-		!cond_for_expiring_in_burst &&
-		(bfqq->wr_coeff > 1 || !symmetric_scenario ||
-		 (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&
-		  !cond_for_expiring_non_wr)
-	);
+		(idling_boosts_thr_without_issues ||
+		 idling_needed_for_service_guarantees);
 }
 
 /*
- * If the in-service queue is empty but sync, and the function
- * bfq_bfqq_must_not_expire returns true, then:
+ * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * returns true, then:
  * 1) the queue must remain in service and cannot be expired, and
- * 2) the disk must be idled to wait for the possible arrival of a new
+ * 2) the device must be idled to wait for the possible arrival of a new
  *    request for the queue.
- * See the comments to the function bfq_bfqq_must_not_expire for the reasons
+ * See the comments to the function bfq_bfqq_may_idle for the reasons
  * why performing device idling is the best choice to boost the throughput
- * and preserve service guarantees when bfq_bfqq_must_not_expire itself
+ * and preserve service guarantees when bfq_bfqq_may_idle itself
  * returns true.
  */
-static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
 
 	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
-	       bfq_bfqq_must_not_expire(bfqq);
+	       bfq_bfqq_may_idle(bfqq);
 }
 
 /*
@@ -2533,7 +2656,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
 
 	bfqq = bfqd->in_service_queue;
-	if (bfqq == NULL)
+	if (!bfqq)
 		goto new_queue;
 
 	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
@@ -2548,7 +2671,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 	 * If bfqq has requests queued and it has enough budget left to
 	 * serve them, keep the queue, otherwise expire it.
 	 */
-	if (next_rq != NULL) {
+	if (next_rq) {
 		if (bfq_serv_to_charge(next_rq, bfqq) >
 			bfq_bfqq_budget_left(bfqq)) {
 			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
@@ -2575,6 +2698,9 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 				 */
 				bfq_clear_bfqq_wait_request(bfqq);
 				del_timer(&bfqd->idle_slice_timer);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+				bfqg_stats_update_idle_time(bfqq_group(bfqq));
+#endif
 			}
 			goto keep_queue;
 		}
@@ -2586,18 +2712,18 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 	 * may idle after their completion, then keep it anyway.
 	 */
 	if (timer_pending(&bfqd->idle_slice_timer) ||
-	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
+	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
 		bfqq = NULL;
 		goto keep_queue;
 	}
 
 	reason = BFQ_BFQQ_NO_MORE_REQUESTS;
 expire:
-	bfq_bfqq_expire(bfqd, bfqq, 0, reason);
+	bfq_bfqq_expire(bfqd, bfqq, false, reason);
 new_queue:
 	bfqq = bfq_set_in_service_queue(bfqd);
 	bfq_log(bfqd, "select_queue: new queue %d returned",
-		bfqq != NULL ? bfqq->pid : 0);
+		bfqq ? bfqq->pid : 0);
 keep_queue:
 	return bfqq;
 }
@@ -2615,7 +2741,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 
 		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
 		       entity->orig_weight * bfqq->wr_coeff);
-		if (entity->ioprio_changed)
+		if (entity->prio_changed)
 			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
 
 		/*
@@ -2659,7 +2785,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
 
 	/* Follow expired path, else get first next available. */
 	rq = bfq_check_fifo(bfqq);
-	if (rq == NULL)
+	if (!rq)
 		rq = bfqq->next_rq;
 	service_to_charge = bfq_serv_to_charge(rq, bfqq);
 
@@ -2695,14 +2821,14 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
 	bfq_update_wr_data(bfqd, bfqq);
 
 	bfq_log_bfqq(bfqd, bfqq,
-			"dispatched %u sec req (%llu), budg left %lu",
+			"dispatched %u sec req (%llu), budg left %d",
 			blk_rq_sectors(rq),
 			(long long unsigned)blk_rq_pos(rq),
 			bfq_bfqq_budget_left(bfqq));
 
 	dispatched++;
 
-	if (bfqd->in_service_bic == NULL) {
+	if (!bfqd->in_service_bic) {
 		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
 		bfqd->in_service_bic = RQ_BIC(rq);
 	}
@@ -2715,7 +2841,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
 	return dispatched;
 
 expire:
-	bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
+	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
 	return dispatched;
 }
 
@@ -2723,7 +2849,7 @@ static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
 {
 	int dispatched = 0;
 
-	while (bfqq->next_rq != NULL) {
+	while (bfqq->next_rq) {
 		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
 		dispatched++;
 	}
@@ -2743,7 +2869,7 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd)
 	int dispatched = 0;
 
 	bfqq = bfqd->in_service_queue;
-	if (bfqq != NULL)
+	if (bfqq)
 		__bfq_bfqq_expire(bfqd, bfqq);
 
 	/*
@@ -2779,7 +2905,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
 		return bfq_forced_dispatch(bfqd);
 
 	bfqq = bfq_select_queue(bfqd);
-	if (bfqq == NULL)
+	if (!bfqq)
 		return 0;
 
 	if (bfq_class_idle(bfqq))
@@ -2819,6 +2945,9 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
 static void bfq_put_queue(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_group *bfqg = bfqq_group(bfqq);
+#endif
 
 	BUG_ON(atomic_read(&bfqq->ref) <= 0);
 
@@ -2827,9 +2956,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
 	if (!atomic_dec_and_test(&bfqq->ref))
 		return;
 
-	BUG_ON(rb_first(&bfqq->sort_list) != NULL);
+	BUG_ON(rb_first(&bfqq->sort_list));
 	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
-	BUG_ON(bfqq->entity.tree != NULL);
+	BUG_ON(bfqq->entity.tree);
 	BUG_ON(bfq_bfqq_busy(bfqq));
 	BUG_ON(bfqd->in_service_queue == bfqq);
 
@@ -2847,6 +2976,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
 	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
 
 	kmem_cache_free(bfq_pool, bfqq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_put(bfqg);
+#endif
 }
 
 static void bfq_put_cooperator(struct bfq_queue *bfqq)
@@ -2883,7 +3015,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	bfq_put_queue(bfqq);
 }
 
-static inline void bfq_init_icq(struct io_cq *icq)
+static void bfq_init_icq(struct io_cq *icq)
 {
 	struct bfq_io_cq *bic = icq_to_bic(icq);
 
@@ -2950,40 +3082,38 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b
 		/*
 		 * No prio set, inherit CPU scheduling settings.
 		 */
-		bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
-		bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
+		bfqq->new_ioprio = task_nice_ioprio(tsk);
+		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
 		break;
 	case IOPRIO_CLASS_RT:
-		bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
-		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
+		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
 		break;
 	case IOPRIO_CLASS_BE:
-		bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
-		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
+		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
 		break;
 	case IOPRIO_CLASS_IDLE:
-		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
-		bfqq->entity.new_ioprio = 7;
+		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
+		bfqq->new_ioprio = 7;
 		bfq_clear_bfqq_idle_window(bfqq);
 		break;
 	}
 
-	if (bfqq->entity.new_ioprio < 0 ||
-	    bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {
+	if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) {
 		printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",
-				 bfqq->entity.new_ioprio);
+				 bfqq->new_ioprio);
 		BUG();
 	}
 
-	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio);
-	bfqq->entity.ioprio_changed = 1;
+	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+	bfqq->entity.prio_changed = 1;
 }
 
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic)
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 {
 	struct bfq_data *bfqd;
 	struct bfq_queue *bfqq, *new_bfqq;
-	struct bfq_group *bfqg;
 	unsigned long uninitialized_var(flags);
 	int ioprio = bic->icq.ioc->ioprio;
 
@@ -2993,18 +3123,16 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic)
 	 * This condition may trigger on a newly created bic, be sure to
 	 * drop the lock before returning.
 	 */
-	if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
+	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
 		goto out;
 
 	bic->ioprio = ioprio;
 
 	bfqq = bic->bfqq[BLK_RW_ASYNC];
-	if (bfqq != NULL) {
-		bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
-				    sched_data);
-		new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
+	if (bfqq) {
+		new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic,
 					 GFP_ATOMIC);
-		if (new_bfqq != NULL) {
+		if (new_bfqq) {
 			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
 			bfq_log_bfqq(bfqd, bfqq,
 				     "check_ioprio_change: bfqq %p %d",
@@ -3014,7 +3142,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic)
 	}
 
 	bfqq = bic->bfqq[BLK_RW_SYNC];
-	if (bfqq != NULL)
+	if (bfqq)
 		bfq_set_next_ioprio_data(bfqq, bic);
 
 out:
@@ -3038,7 +3166,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		if (!bfq_class_idle(bfqq))
 			bfq_mark_bfqq_idle_window(bfqq);
 		bfq_mark_bfqq_sync(bfqq);
-	}
+	} else
+		bfq_clear_bfqq_sync(bfqq);
 	bfq_mark_bfqq_IO_bound(bfqq);
 
 	/* Tentative initial value to trade off between thr and lat */
@@ -3055,14 +3184,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 }
 
 static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
-					      struct bfq_group *bfqg,
-					      int is_sync,
+					      struct bio *bio, int is_sync,
 					      struct bfq_io_cq *bic,
 					      gfp_t gfp_mask)
 {
+	struct bfq_group *bfqg;
 	struct bfq_queue *bfqq, *new_bfqq = NULL;
+	struct blkcg *blkcg;
 
 retry:
+	rcu_read_lock();
+
+	blkcg = bio_blkcg(bio);
+	bfqg = bfq_find_alloc_group(bfqd, blkcg);
 	/* bic always exists here */
 	bfqq = bic_to_bfqq(bic, is_sync);
 
@@ -3070,18 +3204,19 @@ retry:
 	 * Always try a new alloc if we fall back to the OOM bfqq
 	 * originally, since it should just be a temporary situation.
 	 */
-	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
+	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
 		bfqq = NULL;
-		if (new_bfqq != NULL) {
+		if (new_bfqq) {
 			bfqq = new_bfqq;
 			new_bfqq = NULL;
-		} else if (gfp_mask & __GFP_WAIT) {
+		} else if (gfpflags_allow_blocking(gfp_mask)) {
+			rcu_read_unlock();
 			spin_unlock_irq(bfqd->queue->queue_lock);
 			new_bfqq = kmem_cache_alloc_node(bfq_pool,
 					gfp_mask | __GFP_ZERO,
 					bfqd->queue->node);
 			spin_lock_irq(bfqd->queue->queue_lock);
-			if (new_bfqq != NULL)
+			if (new_bfqq)
 				goto retry;
 		} else {
 			bfqq = kmem_cache_alloc_node(bfq_pool,
@@ -3089,7 +3224,7 @@ retry:
 					bfqd->queue->node);
 		}
 
-		if (bfqq != NULL) {
+		if (bfqq) {
 			bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
                                       is_sync);
 			bfq_init_entity(&bfqq->entity, bfqg);
@@ -3100,9 +3235,11 @@ retry:
 		}
 	}
 
-	if (new_bfqq != NULL)
+	if (new_bfqq)
 		kmem_cache_free(bfq_pool, new_bfqq);
 
+	rcu_read_unlock();
+
 	return bfqq;
 }
 
@@ -3126,7 +3263,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
 }
 
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-				       struct bfq_group *bfqg, int is_sync,
+				       struct bio *bio, int is_sync,
 				       struct bfq_io_cq *bic, gfp_t gfp_mask)
 {
 	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
@@ -3135,19 +3272,26 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 	struct bfq_queue *bfqq = NULL;
 
 	if (!is_sync) {
+		struct blkcg *blkcg;
+		struct bfq_group *bfqg;
+
+		rcu_read_lock();
+		blkcg = bio_blkcg(bio);
+		rcu_read_unlock();
+		bfqg = bfq_find_alloc_group(bfqd, blkcg);
 		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
 						  ioprio);
 		bfqq = *async_bfqq;
 	}
 
-	if (bfqq == NULL)
-		bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
+	if (!bfqq)
+		bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask);
 
 	/*
 	 * Pin the queue now that it's allocated, scheduler exit will
 	 * prune it.
 	 */
-	if (!is_sync && *async_bfqq == NULL) {
+	if (!is_sync && !(*async_bfqq)) {
 		atomic_inc(&bfqq->ref);
 		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
 			     bfqq, atomic_read(&bfqq->ref));
@@ -3280,9 +3424,9 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
 	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
-		int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
-				blk_rq_sectors(rq) < 32;
-		int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+				 blk_rq_sectors(rq) < 32;
+		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
 
 		/*
 		 * There is just this request queued: if the request
@@ -3309,6 +3453,9 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		 */
 		bfq_clear_bfqq_wait_request(bfqq);
 		del_timer(&bfqd->idle_slice_timer);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		bfqg_stats_update_idle_time(bfqq_group(bfqq));
+#endif
 
 		/*
 		 * The queue is not empty, because a new request just
@@ -3318,7 +3465,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		 * See [1] for more details.
 		 */
 		if (budget_timeout)
-			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
+			bfq_bfqq_expire(bfqd, bfqq, false,
+					BFQ_BFQQ_BUDGET_TIMEOUT);
 
 		/*
 		 * Let the request rip immediately, or let a new queue be
@@ -3342,7 +3490,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
 	 */
 	if (!in_interrupt()) {
 		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
-		if (new_bfqq != NULL) {
+		if (new_bfqq) {
 			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
 				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
 			/*
@@ -3370,7 +3518,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
 	 * from assigning it a full weight-raising period. See the detailed
 	 * comments about this field in bfq_init_icq().
 	 */
-	if (bfqq->bic != NULL)
+	if (bfqq->bic)
 		bfqq->bic->wr_time_left = 0;
 	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
 	list_add_tail(&rq->queuelist, &bfqq->fifo);
@@ -3418,6 +3566,11 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
 	BUG_ON(!bfqq->dispatched);
 	bfqd->rq_in_driver--;
 	bfqq->dispatched--;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_completion(bfqq_group(bfqq),
+				     rq_start_time_ns(rq),
+				     rq_io_start_time_ns(rq), rq->cmd_flags);
+#endif
 
 	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
 		bfq_weights_tree_remove(bfqd, &bfqq->entity,
@@ -3462,11 +3615,12 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
 			bfq_arm_slice_timer(bfqd);
 			goto out;
 		} else if (bfq_may_expire_for_budg_timeout(bfqq))
-			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
+			bfq_bfqq_expire(bfqd, bfqq, false,
+					BFQ_BFQQ_BUDGET_TIMEOUT);
 		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
 			 (bfqq->dispatched == 0 ||
-			  !bfq_bfqq_must_not_expire(bfqq)))
-			bfq_bfqq_expire(bfqd, bfqq, 0,
+			  !bfq_bfqq_may_idle(bfqq)))
+			bfq_bfqq_expire(bfqd, bfqq, false,
 					BFQ_BFQQ_NO_MORE_REQUESTS);
 	}
 
@@ -3477,7 +3631,7 @@ out:
 	return;
 }
 
-static inline int __bfq_may_queue(struct bfq_queue *bfqq)
+static int __bfq_may_queue(struct bfq_queue *bfqq)
 {
 	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
 		bfq_clear_bfqq_must_alloc(bfqq);
@@ -3501,11 +3655,11 @@ static int bfq_may_queue(struct request_queue *q, int rw)
 	 * 'may queue' if that fails.
 	 */
 	bic = bfq_bic_lookup(bfqd, tsk->io_context);
-	if (bic == NULL)
+	if (!bic)
 		return ELV_MQUEUE_MAY;
 
 	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
-	if (bfqq != NULL)
+	if (bfqq)
 		return __bfq_may_queue(bfqq);
 
 	return ELV_MQUEUE_MAY;
@@ -3518,7 +3672,7 @@ static void bfq_put_request(struct request *rq)
 {
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 
-	if (bfqq != NULL) {
+	if (bfqq) {
 		const int rw = rq_data_dir(rq);
 
 		BUG_ON(!bfqq->allocated[rw]);
@@ -3570,25 +3724,24 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
 	const int rw = rq_data_dir(rq);
 	const int is_sync = rq_is_sync(rq);
 	struct bfq_queue *bfqq;
-	struct bfq_group *bfqg;
 	unsigned long flags;
 	bool split = false;
 
-	might_sleep_if(gfp_mask & __GFP_WAIT);
+	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
 
-	bfq_check_ioprio_change(bic);
+	bfq_check_ioprio_change(bic, bio);
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
-	if (bic == NULL)
+	if (!bic)
 		goto queue_fail;
 
-	bfqg = bfq_bic_update_cgroup(bic);
+	bfq_bic_update_cgroup(bic, bio);
 
 new_queue:
 	bfqq = bic_to_bfqq(bic, is_sync);
-	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
-		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
+	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
+		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);
 		bic_set_bfqq(bic, bfqq, is_sync);
 		if (split && is_sync) {
 			if ((bic->was_in_burst_list && bfqd->large_burst) ||
@@ -3684,7 +3837,7 @@ static void bfq_idle_slice_timer(unsigned long data)
 	 * the in-service queue.  This can hardly happen, but in the worst
 	 * case we just expire a queue too early.
 	 */
-	if (bfqq != NULL) {
+	if (bfqq) {
 		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
 		if (bfq_bfqq_budget_timeout(bfqq))
 			/*
@@ -3704,7 +3857,7 @@ static void bfq_idle_slice_timer(unsigned long data)
 		else
 			goto schedule_dispatch;
 
-		bfq_bfqq_expire(bfqd, bfqq, 1, reason);
+		bfq_bfqq_expire(bfqd, bfqq, true, reason);
 	}
 
 schedule_dispatch:
@@ -3719,14 +3872,14 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
 	cancel_work_sync(&bfqd->unplug_work);
 }
 
-static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
 					struct bfq_queue **bfqq_ptr)
 {
 	struct bfq_group *root_group = bfqd->root_group;
 	struct bfq_queue *bfqq = *bfqq_ptr;
 
 	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
-	if (bfqq != NULL) {
+	if (bfqq) {
 		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
 		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
 			     bfqq, atomic_read(&bfqq->ref));
@@ -3762,7 +3915,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
 
 	spin_lock_irq(q->queue_lock);
 
-	BUG_ON(bfqd->in_service_queue != NULL);
+	BUG_ON(bfqd->in_service_queue);
 	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
 		bfq_deactivate_bfqq(bfqd, bfqq, 0);
 
@@ -3775,22 +3928,39 @@ static void bfq_exit_queue(struct elevator_queue *e)
 
 	BUG_ON(timer_pending(&bfqd->idle_slice_timer));
 
-	bfq_free_root_group(bfqd);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_deactivate_policy(q, &blkcg_policy_bfq);
+#endif
+
 	kfree(bfqd);
 }
 
+static void bfq_init_root_group(struct bfq_group *root_group,
+				struct bfq_data *bfqd)
+{
+	int i;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	root_group->entity.parent = NULL;
+	root_group->my_entity = NULL;
+	root_group->bfqd = bfqd;
+#endif
+	root_group->rq_pos_tree = RB_ROOT;
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+}
+
 static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 {
-	struct bfq_group *bfqg;
 	struct bfq_data *bfqd;
 	struct elevator_queue *eq;
 
 	eq = elevator_alloc(q, e);
-	if (eq == NULL)
+	if (!eq)
 		return -ENOMEM;
 
 	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
-	if (bfqd == NULL) {
+	if (!bfqd) {
 		kobject_put(&eq->kobj);
 		return -ENOMEM;
 	}
@@ -3803,16 +3973,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	 */
 	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
 	atomic_inc(&bfqd->oom_bfqq.ref);
-	bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
-	bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;
+	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
 	bfqd->oom_bfqq.entity.new_weight =
-		bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio);
+		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
 	/*
 	 * Trigger weight initialization, according to ioprio, at the
 	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
 	 * class won't be changed any more.
 	 */
-	bfqd->oom_bfqq.entity.ioprio_changed = 1;
+	bfqd->oom_bfqq.entity.prio_changed = 1;
 
 	bfqd->queue = q;
 
@@ -3820,16 +3990,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	q->elevator = eq;
 	spin_unlock_irq(q->queue_lock);
 
-	bfqg = bfq_alloc_root_group(bfqd, q->node);
-	if (bfqg == NULL) {
-		kfree(bfqd);
-		kobject_put(&eq->kobj);
-		return -ENOMEM;
-	}
-
-	bfqd->root_group = bfqg;
+	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
+	if (!bfqd->root_group)
+		goto out_free;
+	bfq_init_root_group(bfqd->root_group, bfqd);
 	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	bfqd->active_numerous_groups = 0;
 #endif
 
@@ -3837,7 +4003,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
 	bfqd->idle_slice_timer.data = (unsigned long)bfqd;
 
-	bfqd->rq_pos_tree = RB_ROOT;
 	bfqd->queue_weights_tree = RB_ROOT;
 	bfqd->group_weights_tree = RB_ROOT;
 
@@ -3895,18 +4060,23 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->device_speed = BFQ_BFQD_FAST;
 
 	return 0;
+
+out_free:
+	kfree(bfqd);
+	kobject_put(&eq->kobj);
+	return -ENOMEM;
 }
 
 static void bfq_slab_kill(void)
 {
-	if (bfq_pool != NULL)
+	if (bfq_pool)
 		kmem_cache_destroy(bfq_pool);
 }
 
 static int __init bfq_slab_setup(void)
 {
 	bfq_pool = KMEM_CACHE(bfq_queue, 0);
-	if (bfq_pool == NULL)
+	if (!bfq_pool)
 		return -ENOMEM;
 	return 0;
 }
@@ -4051,7 +4221,7 @@ static ssize_t bfq_weights_store(struct elevator_queue *e,
 	return count;
 }
 
-static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
+static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
 {
 	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
 
@@ -4145,6 +4315,9 @@ static struct elevator_type iosched_bfq = {
 		.elevator_merge_fn =		bfq_merge,
 		.elevator_merged_fn =		bfq_merged_request,
 		.elevator_merge_req_fn =	bfq_merged_requests,
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		.elevator_bio_merged_fn =	bfq_bio_merged,
+#endif
 		.elevator_allow_merge_fn =	bfq_allow_merge,
 		.elevator_dispatch_fn =		bfq_dispatch_requests,
 		.elevator_add_req_fn =		bfq_insert_request,
@@ -4170,6 +4343,8 @@ static struct elevator_type iosched_bfq = {
 
 static int __init bfq_init(void)
 {
+	int ret;
+
 	/*
 	 * Can be 0 on HZ < 1000 setups.
 	 */
@@ -4179,8 +4354,15 @@ static int __init bfq_init(void)
 	if (bfq_timeout_async == 0)
 		bfq_timeout_async = 1;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	ret = blkcg_policy_register(&blkcg_policy_bfq);
+	if (ret)
+		return ret;
+#endif
+
+	ret = -ENOMEM;
 	if (bfq_slab_setup())
-		return -ENOMEM;
+		goto err_pol_unreg;
 
 	/*
 	 * Times to load large popular applications for the typical systems
@@ -4199,15 +4381,27 @@ static int __init bfq_init(void)
 	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
 	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
 
-	elv_register(&iosched_bfq);
-	pr_info("BFQ I/O-scheduler: v7r8");
+	ret = elv_register(&iosched_bfq);
+	if (ret)
+		goto err_pol_unreg;
+
+	pr_info("BFQ I/O-scheduler: v7r10");
 
 	return 0;
+
+err_pol_unreg:
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+	return ret;
 }
 
 static void __exit bfq_exit(void)
 {
 	elv_unregister(&iosched_bfq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
 	bfq_slab_kill();
 }
 
diff --git a/block/bfq-sched.c b/block/bfq-sched.c
index d0890c6d4..9328a1f09 100644
--- a/block/bfq-sched.c
+++ b/block/bfq-sched.c
@@ -10,24 +10,27 @@
  * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
  */
 
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 #define for_each_entity(entity)	\
-	for (; entity != NULL; entity = entity->parent)
+	for (; entity ; entity = entity->parent)
 
 #define for_each_entity_safe(entity, parent) \
 	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
 
+
 static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
 						 int extract,
 						 struct bfq_data *bfqd);
 
-static inline void bfq_update_budget(struct bfq_entity *next_in_service)
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+static void bfq_update_budget(struct bfq_entity *next_in_service)
 {
 	struct bfq_entity *bfqg_entity;
 	struct bfq_group *bfqg;
 	struct bfq_sched_data *group_sd;
 
-	BUG_ON(next_in_service == NULL);
+	BUG_ON(!next_in_service);
 
 	group_sd = next_in_service->sched_data;
 
@@ -38,7 +41,7 @@ static inline void bfq_update_budget(struct bfq_entity *next_in_service)
 	 * as it must never become an in-service entity.
 	 */
 	bfqg_entity = bfqg->my_entity;
-	if (bfqg_entity != NULL)
+	if (bfqg_entity)
 		bfqg_entity->budget = next_in_service->budget;
 }
 
@@ -46,7 +49,7 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd)
 {
 	struct bfq_entity *next_in_service;
 
-	if (sd->in_service_entity != NULL)
+	if (sd->in_service_entity)
 		/* will update/requeue at the end of service */
 		return 0;
 
@@ -60,35 +63,35 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd)
 	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
 	sd->next_in_service = next_in_service;
 
-	if (next_in_service != NULL)
+	if (next_in_service)
 		bfq_update_budget(next_in_service);
 
 	return 1;
 }
 
-static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
-					     struct bfq_entity *entity)
+static void bfq_check_next_in_service(struct bfq_sched_data *sd,
+				      struct bfq_entity *entity)
 {
 	BUG_ON(sd->next_in_service != entity);
 }
 #else
 #define for_each_entity(entity)	\
-	for (; entity != NULL; entity = NULL)
+	for (; entity ; entity = NULL)
 
 #define for_each_entity_safe(entity, parent) \
-	for (parent = NULL; entity != NULL; entity = parent)
+	for (parent = NULL; entity ; entity = parent)
 
-static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
 {
 	return 0;
 }
 
-static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
-					     struct bfq_entity *entity)
+static void bfq_check_next_in_service(struct bfq_sched_data *sd,
+				      struct bfq_entity *entity)
 {
 }
 
-static inline void bfq_update_budget(struct bfq_entity *next_in_service)
+static void bfq_update_budget(struct bfq_entity *next_in_service)
 {
 }
 #endif
@@ -109,18 +112,18 @@ static inline void bfq_update_budget(struct bfq_entity *next_in_service)
  *
  * Return @a > @b, dealing with wrapping correctly.
  */
-static inline int bfq_gt(u64 a, u64 b)
+static int bfq_gt(u64 a, u64 b)
 {
 	return (s64)(a - b) > 0;
 }
 
-static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
 {
 	struct bfq_queue *bfqq = NULL;
 
-	BUG_ON(entity == NULL);
+	BUG_ON(!entity);
 
-	if (entity->my_sched_data == NULL)
+	if (!entity->my_sched_data)
 		bfqq = container_of(entity, struct bfq_queue, entity);
 
 	return bfqq;
@@ -132,8 +135,7 @@ static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
  * @service: amount of service.
  * @weight: scale factor (weight of an entity or weight sum).
  */
-static inline u64 bfq_delta(unsigned long service,
-					unsigned long weight)
+static u64 bfq_delta(unsigned long service, unsigned long weight)
 {
 	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
 
@@ -146,8 +148,7 @@ static inline u64 bfq_delta(unsigned long service,
  * @entity: the entity to act upon.
  * @service: the service to be charged to the entity.
  */
-static inline void bfq_calc_finish(struct bfq_entity *entity,
-				   unsigned long service)
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
@@ -156,7 +157,7 @@ static inline void bfq_calc_finish(struct bfq_entity *entity,
 	entity->finish = entity->start +
 		bfq_delta(service, entity->weight);
 
-	if (bfqq != NULL) {
+	if (bfqq) {
 		bfq_log_bfqq(bfqq->bfqd, bfqq,
 			"calc_finish: serv %lu, w %d",
 			service, entity->weight);
@@ -176,11 +177,11 @@ static inline void bfq_calc_finish(struct bfq_entity *entity,
  * conversion mechanism because, e.g., in the tree walking functions,
  * the check for a %NULL value would be redundant.
  */
-static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
+static struct bfq_entity *bfq_entity_of(struct rb_node *node)
 {
 	struct bfq_entity *entity = NULL;
 
-	if (node != NULL)
+	if (node)
 		entity = rb_entry(node, struct bfq_entity, rb_node);
 
 	return entity;
@@ -191,8 +192,7 @@ static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
  * @root: the tree root.
  * @entity: the entity to remove.
  */
-static inline void bfq_extract(struct rb_root *root,
-			       struct bfq_entity *entity)
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
 {
 	BUG_ON(entity->tree != root);
 
@@ -225,7 +225,7 @@ static void bfq_idle_extract(struct bfq_service_tree *st,
 
 	bfq_extract(&st->idle, entity);
 
-	if (bfqq != NULL)
+	if (bfqq)
 		list_del(&bfqq->bfqq_list);
 }
 
@@ -243,9 +243,9 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
 	struct rb_node **node = &root->rb_node;
 	struct rb_node *parent = NULL;
 
-	BUG_ON(entity->tree != NULL);
+	BUG_ON(entity->tree);
 
-	while (*node != NULL) {
+	while (*node) {
 		parent = *node;
 		entry = rb_entry(parent, struct bfq_entity, rb_node);
 
@@ -271,12 +271,11 @@ static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
  * that the subtree rooted at @node (which may be its left or its right
  * child) has a valid min_start value.
  */
-static inline void bfq_update_min(struct bfq_entity *entity,
-				  struct rb_node *node)
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
 {
 	struct bfq_entity *child;
 
-	if (node != NULL) {
+	if (node) {
 		child = rb_entry(node, struct bfq_entity, rb_node);
 		if (bfq_gt(entity->min_start, child->min_start))
 			entity->min_start = child->min_start;
@@ -291,7 +290,7 @@ static inline void bfq_update_min(struct bfq_entity *entity,
  * this function updates its min_start value.  The left and right subtrees
  * are assumed to hold a correct min_start value.
  */
-static inline void bfq_update_active_node(struct rb_node *node)
+static void bfq_update_active_node(struct rb_node *node)
 {
 	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
 
@@ -318,12 +317,12 @@ up:
 	bfq_update_active_node(node);
 
 	parent = rb_parent(node);
-	if (parent == NULL)
+	if (!parent)
 		return;
 
-	if (node == parent->rb_left && parent->rb_right != NULL)
+	if (node == parent->rb_left && parent->rb_right)
 		bfq_update_active_node(parent->rb_right);
-	else if (parent->rb_left != NULL)
+	else if (parent->rb_left)
 		bfq_update_active_node(parent->rb_left);
 
 	node = parent;
@@ -355,7 +354,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 	struct rb_node *node = &entity->rb_node;
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	struct bfq_sched_data *sd = NULL;
 	struct bfq_group *bfqg = NULL;
 	struct bfq_data *bfqd = NULL;
@@ -363,22 +362,22 @@ static void bfq_active_insert(struct bfq_service_tree *st,
 
 	bfq_insert(&st->active, entity);
 
-	if (node->rb_left != NULL)
+	if (node->rb_left)
 		node = node->rb_left;
-	else if (node->rb_right != NULL)
+	else if (node->rb_right)
 		node = node->rb_right;
 
 	bfq_update_active_tree(node);
 
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	sd = entity->sched_data;
 	bfqg = container_of(sd, struct bfq_group, sched_data);
 	BUG_ON(!bfqg);
 	bfqd = (struct bfq_data *)bfqg->bfqd;
 #endif
-	if (bfqq != NULL)
+	if (bfqq)
 		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	else { /* bfq_group */
 		BUG_ON(!bfqd);
 		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
@@ -397,31 +396,32 @@ static void bfq_active_insert(struct bfq_service_tree *st,
  * bfq_ioprio_to_weight - calc a weight from an ioprio.
  * @ioprio: the ioprio value to convert.
  */
-static inline unsigned short bfq_ioprio_to_weight(int ioprio)
+static unsigned short bfq_ioprio_to_weight(int ioprio)
 {
 	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
-	return IOPRIO_BE_NR - ioprio;
+	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio;
 }
 
 /**
  * bfq_weight_to_ioprio - calc an ioprio from a weight.
  * @weight: the weight value to convert.
  *
- * To preserve as mush as possible the old only-ioprio user interface,
+ * To preserve as much as possible the old only-ioprio user interface,
  * 0 is used as an escape ioprio value for weights (numerically) equal or
- * larger than IOPRIO_BE_NR
+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
  */
-static inline unsigned short bfq_weight_to_ioprio(int weight)
+static unsigned short bfq_weight_to_ioprio(int weight)
 {
 	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
-	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
+	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?
+		0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
 }
 
-static inline void bfq_get_entity(struct bfq_entity *entity)
+static void bfq_get_entity(struct bfq_entity *entity)
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
-	if (bfqq != NULL) {
+	if (bfqq) {
 		atomic_inc(&bfqq->ref);
 		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
 			     bfqq, atomic_read(&bfqq->ref));
@@ -441,15 +441,15 @@ static struct rb_node *bfq_find_deepest(struct rb_node *node)
 {
 	struct rb_node *deepest;
 
-	if (node->rb_right == NULL && node->rb_left == NULL)
+	if (!node->rb_right && !node->rb_left)
 		deepest = rb_parent(node);
-	else if (node->rb_right == NULL)
+	else if (!node->rb_right)
 		deepest = node->rb_left;
-	else if (node->rb_left == NULL)
+	else if (!node->rb_left)
 		deepest = node->rb_right;
 	else {
 		deepest = rb_next(node);
-		if (deepest->rb_right != NULL)
+		if (deepest->rb_right)
 			deepest = deepest->rb_right;
 		else if (rb_parent(deepest) != node)
 			deepest = rb_parent(deepest);
@@ -468,7 +468,7 @@ static void bfq_active_extract(struct bfq_service_tree *st,
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 	struct rb_node *node;
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	struct bfq_sched_data *sd = NULL;
 	struct bfq_group *bfqg = NULL;
 	struct bfq_data *bfqd = NULL;
@@ -477,18 +477,18 @@ static void bfq_active_extract(struct bfq_service_tree *st,
 	node = bfq_find_deepest(&entity->rb_node);
 	bfq_extract(&st->active, entity);
 
-	if (node != NULL)
+	if (node)
 		bfq_update_active_tree(node);
 
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	sd = entity->sched_data;
 	bfqg = container_of(sd, struct bfq_group, sched_data);
 	BUG_ON(!bfqg);
 	bfqd = (struct bfq_data *)bfqg->bfqd;
 #endif
-	if (bfqq != NULL)
+	if (bfqq)
 		list_del(&bfqq->bfqq_list);
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	else { /* bfq_group */
 		BUG_ON(!bfqd);
 		bfq_weights_tree_remove(bfqd, entity,
@@ -519,14 +519,14 @@ static void bfq_idle_insert(struct bfq_service_tree *st,
 	struct bfq_entity *first_idle = st->first_idle;
 	struct bfq_entity *last_idle = st->last_idle;
 
-	if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
+	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
 		st->first_idle = entity;
-	if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
+	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
 		st->last_idle = entity;
 
 	bfq_insert(&st->idle, entity);
 
-	if (bfqq != NULL)
+	if (bfqq)
 		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
 }
 
@@ -549,7 +549,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
 
 	entity->on_st = 0;
 	st->wsum -= entity->weight;
-	if (bfqq != NULL) {
+	if (bfqq) {
 		sd = entity->sched_data;
 		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
 			     bfqq, atomic_read(&bfqq->ref));
@@ -581,7 +581,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st)
 	struct bfq_entity *first_idle = st->first_idle;
 	struct bfq_entity *last_idle = st->last_idle;
 
-	if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
+	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
 	    !bfq_gt(last_idle->finish, st->vtime)) {
 		/*
 		 * Forget the whole idle tree, increasing the vtime past
@@ -590,7 +590,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st)
 		st->vtime = last_idle->finish;
 	}
 
-	if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
+	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
 		bfq_put_idle_entity(st, first_idle);
 }
 
@@ -600,19 +600,19 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 {
 	struct bfq_service_tree *new_st = old_st;
 
-	if (entity->ioprio_changed) {
+	if (entity->prio_changed) {
 		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 		unsigned short prev_weight, new_weight;
 		struct bfq_data *bfqd = NULL;
 		struct rb_root *root;
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 		struct bfq_sched_data *sd;
 		struct bfq_group *bfqg;
 #endif
 
-		if (bfqq != NULL)
+		if (bfqq)
 			bfqd = bfqq->bfqd;
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 		else {
 			sd = entity->my_sched_data;
 			bfqg = container_of(sd, struct bfq_group, sched_data);
@@ -634,12 +634,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 				BUG();
 			}
 			entity->orig_weight = entity->new_weight;
-			entity->ioprio =
-				bfq_weight_to_ioprio(entity->orig_weight);
+			if (bfqq)
+				bfqq->ioprio =
+				  bfq_weight_to_ioprio(entity->orig_weight);
 		}
 
-		entity->ioprio_class = entity->new_ioprio_class;
-		entity->ioprio_changed = 0;
+		if (bfqq)
+			bfqq->ioprio_class = bfqq->new_ioprio_class;
+		entity->prio_changed = 0;
 
 		/*
 		 * NOTE: here we may be changing the weight too early,
@@ -652,7 +654,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 
 		prev_weight = entity->weight;
 		new_weight = entity->orig_weight *
-			     (bfqq != NULL ? bfqq->wr_coeff : 1);
+			     (bfqq ? bfqq->wr_coeff : 1);
 		/*
 		 * If the weight of the entity changes, remove the entity
 		 * from its old weight counter (if there is a counter
@@ -683,6 +685,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 	return new_st;
 }
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+#endif
+
 /**
  * bfq_bfqq_served - update the scheduler status after selection for
  *                   service.
@@ -693,7 +699,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
  * are synchronized every time a new bfqq is selected for service.  By now,
  * we keep it to better check consistency.
  */
-static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
 {
 	struct bfq_entity *entity = &bfqq->entity;
 	struct bfq_service_tree *st;
@@ -708,7 +714,10 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
 		st->vtime += bfq_delta(served, st->wsum);
 		bfq_forget_idle(st);
 	}
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+#endif
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
 }
 
 /**
@@ -721,7 +730,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
  * budget.  In this way we should obtain a sort of time-domain
  * fairness among all the seeky/slow queues.
  */
-static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
+static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
 {
 	struct bfq_entity *entity = &bfqq->entity;
 
@@ -746,7 +755,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity)
 	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
 
 	if (entity == sd->in_service_entity) {
-		BUG_ON(entity->tree != NULL);
+		BUG_ON(entity->tree);
 		/*
 		 * If we are requeueing the current entity we have
 		 * to take care of not charging to it service it has
@@ -837,7 +846,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
 	if (!entity->on_st)
 		return 0;
 
-	BUG_ON(was_in_service && entity->tree != NULL);
+	BUG_ON(was_in_service && entity->tree);
 
 	if (was_in_service) {
 		bfq_calc_finish(entity, entity->service);
@@ -846,7 +855,7 @@ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
 		bfq_active_extract(st, entity);
 	else if (entity->tree == &st->idle)
 		bfq_idle_extract(st, entity);
-	else if (entity->tree != NULL)
+	else if (entity->tree)
 		BUG();
 
 	if (was_in_service || sd->next_in_service == entity)
@@ -884,7 +893,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
 			 */
 			break;
 
-		if (sd->next_in_service != NULL)
+		if (sd->next_in_service)
 			/*
 			 * The parent entity is still backlogged and
 			 * the budgets on the path towards the root
@@ -953,7 +962,7 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
 	struct bfq_entity *entry, *first = NULL;
 	struct rb_node *node = st->active.rb_node;
 
-	while (node != NULL) {
+	while (node) {
 		entry = rb_entry(node, struct bfq_entity, rb_node);
 left:
 		if (!bfq_gt(entry->start, st->vtime))
@@ -961,7 +970,7 @@ left:
 
 		BUG_ON(bfq_gt(entry->min_start, st->vtime));
 
-		if (node->rb_left != NULL) {
+		if (node->rb_left) {
 			entry = rb_entry(node->rb_left,
 					 struct bfq_entity, rb_node);
 			if (!bfq_gt(entry->min_start, st->vtime)) {
@@ -969,12 +978,12 @@ left:
 				goto left;
 			}
 		}
-		if (first != NULL)
+		if (first)
 			break;
 		node = node->rb_right;
 	}
 
-	BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
+	BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
 	return first;
 }
 
@@ -1030,13 +1039,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
 	struct bfq_entity *entity;
 	int i = 0;
 
-	BUG_ON(sd->in_service_entity != NULL);
+	BUG_ON(sd->in_service_entity);
 
-	if (bfqd != NULL &&
+	if (bfqd &&
 	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
 		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
 						  true);
-		if (entity != NULL) {
+		if (entity) {
 			i = BFQ_IOPRIO_CLASSES - 1;
 			bfqd->bfq_class_idle_last_service = jiffies;
 			sd->next_in_service = entity;
@@ -1044,7 +1053,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
 	}
 	for (; i < BFQ_IOPRIO_CLASSES; i++) {
 		entity = __bfq_lookup_next_entity(st + i, false);
-		if (entity != NULL) {
+		if (entity) {
 			if (extract) {
 				bfq_check_next_in_service(sd, entity);
 				bfq_active_extract(st + i, entity);
@@ -1067,27 +1076,27 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
 	struct bfq_sched_data *sd;
 	struct bfq_queue *bfqq;
 
-	BUG_ON(bfqd->in_service_queue != NULL);
+	BUG_ON(bfqd->in_service_queue);
 
 	if (bfqd->busy_queues == 0)
 		return NULL;
 
 	sd = &bfqd->root_group->sched_data;
-	for (; sd != NULL; sd = entity->my_sched_data) {
+	for (; sd ; sd = entity->my_sched_data) {
 		entity = bfq_lookup_next_entity(sd, 1, bfqd);
-		BUG_ON(entity == NULL);
+		BUG_ON(!entity);
 		entity->service = 0;
 	}
 
 	bfqq = bfq_entity_to_bfqq(entity);
-	BUG_ON(bfqq == NULL);
+	BUG_ON(!bfqq);
 
 	return bfqq;
 }
 
 static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
 {
-	if (bfqd->in_service_bic != NULL) {
+	if (bfqd->in_service_bic) {
 		put_io_context(bfqd->in_service_bic->icq.ioc);
 		bfqd->in_service_bic = NULL;
 	}
@@ -1114,6 +1123,10 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	bfq_activate_entity(entity);
 }
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+#endif
+
 /*
  * Called when the bfqq no longer has requests pending, remove it from
  * the service tree.
@@ -1147,6 +1160,10 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues--;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_dequeue(bfqq_group(bfqq));
+#endif
+
 	bfq_deactivate_bfqq(bfqd, bfqq, requeue);
 }
 
diff --git a/block/bfq.h b/block/bfq.h
index 93d3f6e95..97a677f8c 100644
--- a/block/bfq.h
+++ b/block/bfq.h
@@ -1,5 +1,5 @@
 /*
- * BFQ-v7r8 for 4.3.0: data structures and common functions prototypes.
+ * BFQ-v7r10 for 4.4.0: data structures and common functions prototypes.
  *
  * Based on ideas and code from CFQ:
  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
@@ -17,12 +17,14 @@
 #include <linux/hrtimer.h>
 #include <linux/ioprio.h>
 #include <linux/rbtree.h>
+#include <linux/blk-cgroup.h>
 
 #define BFQ_IOPRIO_CLASSES	3
 #define BFQ_CL_IDLE_TIMEOUT	(HZ/5)
 
-#define BFQ_MIN_WEIGHT	1
-#define BFQ_MAX_WEIGHT	1000
+#define BFQ_MIN_WEIGHT			1
+#define BFQ_MAX_WEIGHT			1000
+#define BFQ_WEIGHT_CONVERSION_COEFF	10
 
 #define BFQ_DEFAULT_QUEUE_IOPRIO	4
 
@@ -117,12 +119,8 @@ struct bfq_weight_counter {
  * @ioprio: the ioprio in use.
  * @new_weight: when a weight change is requested, the new weight value.
  * @orig_weight: original weight, used to implement weight boosting
- * @new_ioprio: when an ioprio change is requested, the new ioprio value.
- * @ioprio_class: the ioprio_class in use.
- * @new_ioprio_class: when an ioprio_class change is requested, the new
- *                    ioprio_class value.
- * @ioprio_changed: flag, true when the user requested a weight, ioprio or
- *                  ioprio_class change.
+ * @prio_changed: flag, true when the user requested a weight, ioprio or
+ *		  ioprio_class change.
  *
  * A bfq_entity is used to represent either a bfq_queue (leaf node in the
  * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
@@ -134,7 +132,7 @@ struct bfq_weight_counter {
  * allow different weights on different devices, but this
  * functionality is not exported to userspace by now.  Priorities and
  * weights are updated lazily, first storing the new values into the
- * new_* fields, then setting the @ioprio_changed flag.  As soon as
+ * new_* fields, then setting the @prio_changed flag.  As soon as
  * there is a transition in the entity state that allows the priority
  * update to take place the effective and the requested priority
  * values are synchronized.
@@ -161,7 +159,7 @@ struct bfq_entity {
 
 	u64 min_start;
 
-	unsigned long service, budget;
+	int service, budget;
 	unsigned short weight, new_weight;
 	unsigned short orig_weight;
 
@@ -170,10 +168,7 @@ struct bfq_entity {
 	struct bfq_sched_data *my_sched_data;
 	struct bfq_sched_data *sched_data;
 
-	unsigned short ioprio, new_ioprio;
-	unsigned short ioprio_class, new_ioprio_class;
-
-	int ioprio_changed;
+	int prio_changed;
 };
 
 struct bfq_group;
@@ -182,10 +177,14 @@ struct bfq_group;
  * struct bfq_queue - leaf schedulable entity.
  * @ref: reference counter.
  * @bfqd: parent bfq_data.
+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
+ * @ioprio_class: the ioprio_class in use.
+ * @new_ioprio_class: when an ioprio_class change is requested, the new
+ *                    ioprio_class value.
  * @new_bfqq: shared bfq_queue if queue is cooperating with
  *           one or more other queues.
- * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
- * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
+ * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree).
+ * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree).
  * @sort_list: sorted list of pending requests.
  * @next_rq: if fifo isn't expired, next request to serve.
  * @queued: nr of requests queued in @sort_list.
@@ -239,6 +238,9 @@ struct bfq_queue {
 	atomic_t ref;
 	struct bfq_data *bfqd;
 
+	unsigned short ioprio, new_ioprio;
+	unsigned short ioprio_class, new_ioprio_class;
+
 	/* fields for cooperating queues handling */
 	struct bfq_queue *new_bfqq;
 	struct rb_node pos_node;
@@ -253,7 +255,7 @@ struct bfq_queue {
 
 	struct bfq_entity entity;
 
-	unsigned long max_budget;
+	int max_budget;
 	unsigned long budget_timeout;
 
 	int dispatched;
@@ -302,6 +304,8 @@ struct bfq_ttime {
  * @icq: associated io_cq structure
  * @bfqq: array of two process queues, the sync and the async
  * @ttime: associated @bfq_ttime struct
+ * @ioprio: per (request_queue, blkcg) ioprio.
+ * @blkcg_id: id of the blkcg the related io_cq belongs to.
  * @wr_time_left: snapshot of the time left before weight raising ends
  *                for the sync queue associated to this process; this
  *		  snapshot is taken to remember this value while the weight
@@ -329,6 +333,10 @@ struct bfq_io_cq {
 	struct bfq_ttime ttime;
 	int ioprio;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	uint64_t blkcg_id; /* the current blkcg ID */
+#endif
+
 	unsigned int wr_time_left;
 	bool saved_idle_window;
 	bool saved_IO_bound;
@@ -349,9 +357,6 @@ enum bfq_device_speed {
  * struct bfq_data - per device data structure.
  * @queue: request queue for the managed device.
  * @root_group: root bfq_group for the device.
- * @rq_pos_tree: rbtree sorted by next_request position, used when
- *               determining if two or more queues have interleaving
- *               requests (see bfq_close_cooperator()).
  * @active_numerous_groups: number of bfq_groups containing more than one
  *                          active @bfq_entity.
  * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
@@ -485,9 +490,8 @@ struct bfq_data {
 	struct request_queue *queue;
 
 	struct bfq_group *root_group;
-	struct rb_root rq_pos_tree;
 
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	int active_numerous_groups;
 #endif
 
@@ -520,7 +524,7 @@ struct bfq_data {
 	ktime_t last_idling_start;
 	int peak_rate_samples;
 	u64 peak_rate;
-	unsigned long bfq_max_budget;
+	int bfq_max_budget;
 
 	struct hlist_head group_list;
 	struct list_head active_list;
@@ -532,8 +536,8 @@ struct bfq_data {
 	unsigned int bfq_slice_idle;
 	u64 bfq_class_idle_last_service;
 
-	unsigned int bfq_user_max_budget;
-	unsigned int bfq_max_budget_async_rq;
+	int bfq_user_max_budget;
+	int bfq_max_budget_async_rq;
 	unsigned int bfq_timeout[2];
 
 	unsigned int bfq_coop_thresh;
@@ -593,15 +597,15 @@ enum bfqq_state_flags {
 };
 
 #define BFQ_BFQQ_FNS(name)						\
-static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
 {									\
 	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\
 }									\
-static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)	\
+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
 {									\
 	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\
 }									\
-static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
+static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
 {									\
 	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\
 }
@@ -640,14 +644,64 @@ enum bfqq_expiration {
 	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */
 };
 
-#ifdef CONFIG_CGROUP_BFQIO
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+struct bfqg_stats {
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
+	/* number of ios merged */
+	struct blkg_rwstat		merged;
+	/* total time spent on device in ns, may not be accurate w/ queueing */
+	struct blkg_rwstat		service_time;
+	/* total time spent waiting in scheduler queue in ns */
+	struct blkg_rwstat		wait_time;
+	/* number of IOs queued up */
+	struct blkg_rwstat		queued;
+	/* total sectors transferred */
+	struct blkg_stat		sectors;
+	/* total disk time and nr sectors dispatched by this group */
+	struct blkg_stat		time;
+	/* time not charged to this cgroup */
+	struct blkg_stat		unaccounted_time;
+	/* sum of number of ios queued across all samples */
+	struct blkg_stat		avg_queue_size_sum;
+	/* count of samples taken for average */
+	struct blkg_stat		avg_queue_size_samples;
+	/* how many times this group has been removed from service tree */
+	struct blkg_stat		dequeue;
+	/* total time spent waiting for it to be assigned a timeslice. */
+	struct blkg_stat		group_wait_time;
+	/* time spent idling for this blkcg_gq */
+	struct blkg_stat		idle_time;
+	/* total time with empty current active q with other requests queued */
+	struct blkg_stat		empty_time;
+	/* fields after this shouldn't be cleared on stat reset */
+	uint64_t			start_group_wait_time;
+	uint64_t			start_idle_time;
+	uint64_t			start_empty_time;
+	uint16_t			flags;
+};
+
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @ps: @blkcg_policy_storage that this structure inherits
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+	/* must be the first member */
+	struct blkcg_policy_data pd;
+
+	unsigned short weight;
+};
+
 /**
  * struct bfq_group - per (device, cgroup) data structure.
  * @entity: schedulable entity to insert into the parent group sched_data.
  * @sched_data: own sched_data, to contain child entities (they may be
  *              both bfq_queues and bfq_groups).
- * @group_node: node to be inserted into the bfqio_cgroup->group_data
- *              list of the containing cgroup's bfqio_cgroup.
  * @bfqd_node: node to be inserted into the @bfqd->group_list list
  *             of the groups active on the same device; used for cleanup.
  * @bfqd: the bfq_data for the device this group acts upon.
@@ -663,23 +717,26 @@ enum bfqq_expiration {
  *                   are groups with more than one active @bfq_entity
  *                   (see the comments to the function
  *                   bfq_bfqq_must_not_expire()).
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ *               determining if two or more queues have interleaving
+ *               requests (see bfq_find_close_cooperator()).
  *
  * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
  * there is a set of bfq_groups, each one collecting the lower-level
  * entities belonging to the group that are acting on the same device.
  *
  * Locking works as follows:
- *    o @group_node is protected by the bfqio_cgroup lock, and is accessed
- *      via RCU from its readers.
  *    o @bfqd is protected by the queue lock, RCU is used to access it
  *      from the readers.
  *    o All the other fields are protected by the @bfqd queue lock.
  */
 struct bfq_group {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
 	struct bfq_entity entity;
 	struct bfq_sched_data sched_data;
 
-	struct hlist_node group_node;
 	struct hlist_node bfqd_node;
 
 	void *bfqd;
@@ -690,44 +747,33 @@ struct bfq_group {
 	struct bfq_entity *my_entity;
 
 	int active_entities;
-};
 
-/**
- * struct bfqio_cgroup - bfq cgroup data structure.
- * @css: subsystem state for bfq in the containing cgroup.
- * @online: flag marked when the subsystem is inserted.
- * @weight: cgroup weight.
- * @ioprio: cgroup ioprio.
- * @ioprio_class: cgroup ioprio_class.
- * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
- * @group_data: list containing the bfq_group belonging to this cgroup.
- *
- * @group_data is accessed using RCU, with @lock protecting the updates,
- * @ioprio and @ioprio_class are protected by @lock.
- */
-struct bfqio_cgroup {
-	struct cgroup_subsys_state css;
-	bool online;
-
-	unsigned short weight, ioprio, ioprio_class;
+	struct rb_root rq_pos_tree;
 
-	spinlock_t lock;
-	struct hlist_head group_data;
+	struct bfqg_stats stats;
+	struct bfqg_stats dead_stats;	/* stats pushed from dead children */
 };
+
 #else
 struct bfq_group {
 	struct bfq_sched_data sched_data;
 
 	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
 	struct bfq_queue *async_idle_bfqq;
+
+	struct rb_root rq_pos_tree;
 };
 #endif
 
-static inline struct bfq_service_tree *
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+
+static struct bfq_service_tree *
 bfq_entity_service_tree(struct bfq_entity *entity)
 {
 	struct bfq_sched_data *sched_data = entity->sched_data;
-	unsigned int idx = entity->ioprio_class - 1;
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :
+				  BFQ_DEFAULT_GRP_CLASS;
 
 	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
 	BUG_ON(sched_data == NULL);
@@ -735,19 +781,18 @@ bfq_entity_service_tree(struct bfq_entity *entity)
 	return sched_data->service_tree + idx;
 }
 
-static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
-					    bool is_sync)
+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
 {
 	return bic->bfqq[is_sync];
 }
 
-static inline void bic_set_bfqq(struct bfq_io_cq *bic,
-				struct bfq_queue *bfqq, bool is_sync)
+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
+			 bool is_sync)
 {
 	bic->bfqq[is_sync] = bfqq;
 }
 
-static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
 {
 	return bic->icq.q->elevator->elevator_data;
 }
@@ -766,8 +811,7 @@ static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
  * the function returns NULL, with the queue unlocked, otherwise it
  * returns the dereferenced pointer, with the queue locked.
  */
-static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
-						   unsigned long *flags)
+static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags)
 {
 	struct bfq_data *bfqd;
 
@@ -776,7 +820,9 @@ static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
 
 	if (bfqd != NULL) {
 		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
-		if (*ptr == bfqd)
+		if (ptr == NULL)
+			printk(KERN_CRIT "get_bfqd_locked pointer NULL\n");
+		else if (*ptr == bfqd)
 			goto out;
 		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
 	}
@@ -787,17 +833,37 @@ out:
 	return bfqd;
 }
 
-static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
-				       unsigned long *flags)
+static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)
 {
 	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
 }
 
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+
+	if (!group_entity)
+		group_entity = &bfqq->bfqd->root_group->entity;
+
+	return container_of(group_entity, struct bfq_group, entity);
+}
+
+#else
+
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	return bfqq->bfqd->root_group;
+}
+
+#endif
+
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
 static void bfq_put_queue(struct bfq_queue *bfqq);
 static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-				       struct bfq_group *bfqg, int is_sync,
+				       struct bio *bio, int is_sync,
 				       struct bfq_io_cq *bic, gfp_t gfp_mask);
 static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
 				    struct bfq_group *bfqg);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 14b8faf8b..f6325d573 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -32,6 +32,11 @@
 static struct kmem_cache *bip_slab;
 static struct workqueue_struct *kintegrityd_wq;
 
+void blk_flush_integrity(void)
+{
+	flush_workqueue(kintegrityd_wq);
+}
+
 /**
  * bio_integrity_alloc - Allocate integrity payload and attach it to bio
  * @bio:	bio to attach integrity metadata to
@@ -177,11 +182,11 @@ bool bio_integrity_enabled(struct bio *bio)
 	if (bi == NULL)
 		return false;
 
-	if (bio_data_dir(bio) == READ && bi->verify_fn != NULL &&
+	if (bio_data_dir(bio) == READ && bi->profile->verify_fn != NULL &&
 	    (bi->flags & BLK_INTEGRITY_VERIFY))
 		return true;
 
-	if (bio_data_dir(bio) == WRITE && bi->generate_fn != NULL &&
+	if (bio_data_dir(bio) == WRITE && bi->profile->generate_fn != NULL &&
 	    (bi->flags & BLK_INTEGRITY_GENERATE))
 		return true;
 
@@ -202,7 +207,7 @@ EXPORT_SYMBOL(bio_integrity_enabled);
 static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
 						   unsigned int sectors)
 {
-	return sectors >> (ilog2(bi->interval) - 9);
+	return sectors >> (bi->interval_exp - 9);
 }
 
 static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
@@ -229,7 +234,7 @@ static int bio_integrity_process(struct bio *bio,
 		bip->bip_vec->bv_offset;
 
 	iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
-	iter.interval = bi->interval;
+	iter.interval = 1 << bi->interval_exp;
 	iter.seed = bip_get_seed(bip);
 	iter.prot_buf = prot_buf;
 
@@ -340,7 +345,7 @@ int bio_integrity_prep(struct bio *bio)
 
 	/* Auto-generate integrity metadata if this is a write */
 	if (bio_data_dir(bio) == WRITE)
-		bio_integrity_process(bio, bi->generate_fn);
+		bio_integrity_process(bio, bi->profile->generate_fn);
 
 	return 0;
 }
@@ -361,7 +366,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 	struct bio *bio = bip->bip_bio;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 
-	bio->bi_error = bio_integrity_process(bio, bi->verify_fn);
+	bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn);
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
diff --git a/block/bio.c b/block/bio.c
index ad3f276d7..4f184d938 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -211,7 +211,7 @@ fallback:
 		bvl = mempool_alloc(pool, gfp_mask);
 	} else {
 		struct biovec_slab *bvs = bvec_slabs + *idx;
-		gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+		gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
 
 		/*
 		 * Make this allocation restricted and don't dump info on
@@ -221,11 +221,11 @@ fallback:
 		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
 
 		/*
-		 * Try a slab allocation. If this fails and __GFP_WAIT
+		 * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
 		 * is set, retry with the 1-entry mempool
 		 */
 		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
-		if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
+		if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
 			*idx = BIOVEC_MAX_IDX;
 			goto fallback;
 		}
@@ -395,12 +395,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
  *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
  *   backed by the @bs's mempool.
  *
- *   When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
- *   able to allocate a bio. This is due to the mempool guarantees. To make this
- *   work, callers must never allocate more than 1 bio at a time from this pool.
- *   Callers that need to allocate more than 1 bio must always submit the
- *   previously allocated bio for IO before attempting to allocate a new one.
- *   Failure to do so can cause deadlocks under memory pressure.
+ *   When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
+ *   always be able to allocate a bio. This is due to the mempool guarantees.
+ *   To make this work, callers must never allocate more than 1 bio at a time
+ *   from this pool. Callers that need to allocate more than 1 bio must always
+ *   submit the previously allocated bio for IO before attempting to allocate
+ *   a new one. Failure to do so can cause deadlocks under memory pressure.
  *
  *   Note that when running under generic_make_request() (i.e. any block
  *   driver), bios are not submitted until after you return - see the code in
@@ -459,13 +459,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		 * We solve this, and guarantee forward progress, with a rescuer
 		 * workqueue per bio_set. If we go to allocate and there are
 		 * bios on current->bio_list, we first try the allocation
-		 * without __GFP_WAIT; if that fails, we punt those bios we
-		 * would be blocking to the rescuer workqueue before we retry
-		 * with the original gfp_flags.
+		 * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
+		 * bios we would be blocking to the rescuer workqueue before
+		 * we retry with the original gfp_flags.
 		 */
 
 		if (current->bio_list && !bio_list_empty(current->bio_list))
-			gfp_mask &= ~__GFP_WAIT;
+			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
 		if (!p && gfp_mask != saved_gfp) {
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 55512dd62..5a37188b5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -899,6 +899,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 struct cftype blkcg_files[] = {
 	{
 		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = blkcg_print_stat,
 	},
 	{ }	/* terminate */
@@ -1126,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q)
  * of the main cic data structures.  For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkcg_can_attach(struct cgroup_subsys_state *css,
-			    struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
+	struct cgroup_subsys_state *dst_css;
 	struct io_context *ioc;
 	int ret = 0;
 
 	/* task_lock() is needed to avoid races with exit_io_context() */
-	cgroup_taskset_for_each(task, tset) {
+	cgroup_taskset_for_each(task, dst_css, tset) {
 		task_lock(task);
 		ioc = task->io_context;
 		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
diff --git a/block/blk-core.c b/block/blk-core.c
index 48a6cc8df..e8e229c70 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -209,6 +209,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs)
 EXPORT_SYMBOL(blk_delay_queue);
 
 /**
+ * blk_start_queue_async - asynchronously restart a previously stopped queue
+ * @q:    The &struct request_queue in question
+ *
+ * Description:
+ *   blk_start_queue_async() will clear the stop flag on the queue, and
+ *   ensure that the request_fn for the queue is run from an async
+ *   context.
+ **/
+void blk_start_queue_async(struct request_queue *q)
+{
+	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+	blk_run_queue_async(q);
+}
+EXPORT_SYMBOL(blk_start_queue_async);
+
+/**
  * blk_start_queue - restart a previously stopped queue
  * @q:    The &struct request_queue in question
  *
@@ -556,22 +572,23 @@ void blk_cleanup_queue(struct request_queue *q)
 	 * Drain all requests queued before DYING marking. Set DEAD flag to
 	 * prevent that q->request_fn() gets invoked after draining finished.
 	 */
-	if (q->mq_ops) {
-		blk_mq_freeze_queue(q);
-		spin_lock_irq(lock);
-	} else {
-		spin_lock_irq(lock);
+	blk_freeze_queue(q);
+	spin_lock_irq(lock);
+	if (!q->mq_ops)
 		__blk_drain_queue(q, true);
-	}
 	queue_flag_set(QUEUE_FLAG_DEAD, q);
 	spin_unlock_irq(lock);
 
+	/* for synchronous bio-based driver finish in-flight integrity i/o */
+	blk_flush_integrity();
+
 	/* @q won't process any more request, flush async actions */
 	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
 	blk_sync_queue(q);
 
 	if (q->mq_ops)
 		blk_mq_free_queue(q);
+	percpu_ref_exit(&q->q_usage_counter);
 
 	spin_lock_irq(lock);
 	if (q->queue_lock != &q->__queue_lock)
@@ -631,6 +648,40 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
+int blk_queue_enter(struct request_queue *q, gfp_t gfp)
+{
+	while (true) {
+		int ret;
+
+		if (percpu_ref_tryget_live(&q->q_usage_counter))
+			return 0;
+
+		if (!gfpflags_allow_blocking(gfp))
+			return -EBUSY;
+
+		ret = wait_event_interruptible(q->mq_freeze_wq,
+				!atomic_read(&q->mq_freeze_depth) ||
+				blk_queue_dying(q));
+		if (blk_queue_dying(q))
+			return -ENODEV;
+		if (ret)
+			return ret;
+	}
+}
+
+void blk_queue_exit(struct request_queue *q)
+{
+	percpu_ref_put(&q->q_usage_counter);
+}
+
+static void blk_queue_usage_counter_release(struct percpu_ref *ref)
+{
+	struct request_queue *q =
+		container_of(ref, struct request_queue, q_usage_counter);
+
+	wake_up_all(&q->mq_freeze_wq);
+}
+
 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
 	struct request_queue *q;
@@ -692,11 +743,22 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	init_waitqueue_head(&q->mq_freeze_wq);
 
-	if (blkcg_init_queue(q))
+	/*
+	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
+	 * See blk_register_queue() for details.
+	 */
+	if (percpu_ref_init(&q->q_usage_counter,
+				blk_queue_usage_counter_release,
+				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
 		goto fail_bdi;
 
+	if (blkcg_init_queue(q))
+		goto fail_ref;
+
 	return q;
 
+fail_ref:
+	percpu_ref_exit(&q->q_usage_counter);
 fail_bdi:
 	bdi_destroy(&q->backing_dev_info);
 fail_split:
@@ -765,7 +827,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 }
 EXPORT_SYMBOL(blk_init_queue_node);
 
-static void blk_queue_bio(struct request_queue *q, struct bio *bio);
+static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
 
 struct request_queue *
 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
@@ -1162,8 +1224,8 @@ rq_starved:
  * @bio: bio to allocate request for (can be %NULL)
  * @gfp_mask: allocation mask
  *
- * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
- * function keeps retrying under memory pressure and fails iff @q is dead.
+ * Get a free request from @q.  If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+ * this function keeps retrying under memory pressure and fails iff @q is dead.
  *
  * Must be called with @q->queue_lock held and,
  * Returns ERR_PTR on failure, with @q->queue_lock held.
@@ -1183,7 +1245,7 @@ retry:
 	if (!IS_ERR(rq))
 		return rq;
 
-	if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
+	if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
 		blk_put_rl(rl);
 		return rq;
 	}
@@ -1261,11 +1323,11 @@ EXPORT_SYMBOL(blk_get_request);
  * BUG.
  *
  * WARNING: When allocating/cloning a bio-chain, careful consideration should be
- * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
- * anything but the first bio in the chain. Otherwise you risk waiting for IO
- * completion of a bio that hasn't been submitted yet, thus resulting in a
- * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
- * of bio_alloc(), as that avoids the mempool deadlock.
+ * given to how you allocate bios. In particular, you cannot use
+ * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
+ * you risk waiting for IO completion of a bio that hasn't been submitted yet,
+ * thus resulting in a deadlock. Alternatively bios should be allocated using
+ * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
  * If possible a big IO should be split into smaller parts when allocation
  * fails. Partial allocation should not be an error, or you risk a live-lock.
  */
@@ -1531,6 +1593,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
  * @q: request_queue new bio is being queued at
  * @bio: new bio being queued
  * @request_count: out parameter for number of traversed plugged requests
+ * @same_queue_rq: pointer to &struct request that gets filled in when
+ * another request associated with @q is found on the plug list
+ * (optional, may be %NULL)
  *
  * Determine whether @bio being queued on @q can be merged with a request
  * on %current's plugged list.  Returns %true if merge was successful,
@@ -1596,6 +1661,30 @@ out:
 	return ret;
 }
 
+unsigned int blk_plug_queued_count(struct request_queue *q)
+{
+	struct blk_plug *plug;
+	struct request *rq;
+	struct list_head *plug_list;
+	unsigned int ret = 0;
+
+	plug = current->plug;
+	if (!plug)
+		goto out;
+
+	if (q->mq_ops)
+		plug_list = &plug->mq_list;
+	else
+		plug_list = &plug->list;
+
+	list_for_each_entry(rq, plug_list, queuelist) {
+		if (rq->q == q)
+			ret++;
+	}
+out:
+	return ret;
+}
+
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->cmd_type = REQ_TYPE_FS;
@@ -1610,7 +1699,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-static void blk_queue_bio(struct request_queue *q, struct bio *bio)
+static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
 	struct blk_plug *plug;
@@ -1618,8 +1707,6 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	struct request *req;
 	unsigned int request_count = 0;
 
-	blk_queue_split(q, &bio, q->bio_split);
-
 	/*
 	 * low level driver can indicate that it wants pages above a
 	 * certain limit bounced to low memory (ie for highmem, or even
@@ -1627,10 +1714,12 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 */
 	blk_queue_bounce(q, &bio);
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 		bio->bi_error = -EIO;
 		bio_endio(bio);
-		return;
+		return BLK_QC_T_NONE;
 	}
 
 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
@@ -1643,9 +1732,11 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * Check if we can merge with the plugged list before grabbing
 	 * any locks.
 	 */
-	if (!blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
-		return;
+	if (!blk_queue_nomerges(q)) {
+		if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
+			return BLK_QC_T_NONE;
+	} else
+		request_count = blk_plug_queued_count(q);
 
 	spin_lock_irq(q->queue_lock);
 
@@ -1721,6 +1812,8 @@ get_rq:
 out_unlock:
 		spin_unlock_irq(q->queue_lock);
 	}
+
+	return BLK_QC_T_NONE;
 }
 
 /*
@@ -1926,12 +2019,13 @@ end_io:
  * a lower device by calling into generic_make_request recursively, which
  * means the bio should NOT be touched after the call to ->make_request_fn.
  */
-void generic_make_request(struct bio *bio)
+blk_qc_t generic_make_request(struct bio *bio)
 {
 	struct bio_list bio_list_on_stack;
+	blk_qc_t ret = BLK_QC_T_NONE;
 
 	if (!generic_make_request_checks(bio))
-		return;
+		goto out;
 
 	/*
 	 * We only want one ->make_request_fn to be active at a time, else
@@ -1945,7 +2039,7 @@ void generic_make_request(struct bio *bio)
 	 */
 	if (current->bio_list) {
 		bio_list_add(current->bio_list, bio);
-		return;
+		goto out;
 	}
 
 	/* following loop may be a bit non-obvious, and so deserves some
@@ -1968,11 +2062,24 @@ void generic_make_request(struct bio *bio)
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-		q->make_request_fn(q, bio);
+		if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
+
+			ret = q->make_request_fn(q, bio);
+
+			blk_queue_exit(q);
 
-		bio = bio_list_pop(current->bio_list);
+			bio = bio_list_pop(current->bio_list);
+		} else {
+			struct bio *bio_next = bio_list_pop(current->bio_list);
+
+			bio_io_error(bio);
+			bio = bio_next;
+		}
 	} while (bio);
 	current->bio_list = NULL; /* deactivate */
+
+out:
+	return ret;
 }
 EXPORT_SYMBOL(generic_make_request);
 
@@ -1986,7 +2093,7 @@ EXPORT_SYMBOL(generic_make_request);
  * interfaces; @bio must be presetup and ready for I/O.
  *
  */
-void submit_bio(int rw, struct bio *bio)
+blk_qc_t submit_bio(int rw, struct bio *bio)
 {
 	bio->bi_rw |= rw;
 
@@ -2023,12 +2130,13 @@ void submit_bio(int rw, struct bio *bio)
 		}
 	}
 
-	generic_make_request(bio);
+	return generic_make_request(bio);
 }
 EXPORT_SYMBOL(submit_bio);
 
 /**
- * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * blk_cloned_rq_check_limits - Helper function to check a cloned request
+ *                              for new the queue limits
  * @q:  the queue
  * @rq: the request being checked
  *
@@ -2039,20 +2147,13 @@ EXPORT_SYMBOL(submit_bio);
  *    after it is inserted to @q, it should be checked against @q before
  *    the insertion using this generic function.
  *
- *    This function should also be useful for request stacking drivers
- *    in some cases below, so export this function.
  *    Request stacking drivers like request-based dm may change the queue
- *    limits while requests are in the queue (e.g. dm's table swapping).
- *    Such request stacking drivers should check those requests against
- *    the new queue limits again when they dispatch those requests,
- *    although such checkings are also done against the old queue limits
- *    when submitting requests.
+ *    limits when retrying requests on other queues. Those requests need
+ *    to be checked against the new queue limits again during dispatch.
  */
-int blk_rq_check_limits(struct request_queue *q, struct request *rq)
+static int blk_cloned_rq_check_limits(struct request_queue *q,
+				      struct request *rq)
 {
-	if (!rq_mergeable(rq))
-		return 0;
-
 	if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
@@ -2072,7 +2173,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(blk_rq_check_limits);
 
 /**
  * blk_insert_cloned_request - Helper for stacking drivers to submit a request
@@ -2084,7 +2184,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	unsigned long flags;
 	int where = ELEVATOR_INSERT_BACK;
 
-	if (blk_rq_check_limits(q, rq))
+	if (blk_cloned_rq_check_limits(q, rq))
 		return -EIO;
 
 	if (rq->rq_disk &&
@@ -3229,6 +3329,47 @@ void blk_finish_plug(struct blk_plug *plug)
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
+bool blk_poll(struct request_queue *q, blk_qc_t cookie)
+{
+	struct blk_plug *plug;
+	long state;
+
+	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
+	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+		return false;
+
+	plug = current->plug;
+	if (plug)
+		blk_flush_plug_list(plug, false);
+
+	state = current->state;
+	while (!need_resched()) {
+		unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
+		struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
+		int ret;
+
+		hctx->poll_invoked++;
+
+		ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
+		if (ret > 0) {
+			hctx->poll_success++;
+			set_current_state(TASK_RUNNING);
+			return true;
+		}
+
+		if (signal_pending_state(state, current))
+			set_current_state(TASK_RUNNING);
+
+		if (current->state == TASK_RUNNING)
+			return true;
+		if (ret < 0)
+			break;
+		cpu_relax();
+	}
+
+	return false;
+}
+
 #ifdef CONFIG_PM
 /**
  * blk_pm_runtime_init - Block layer runtime PM initialization routine
@@ -3285,6 +3426,9 @@ int blk_pre_runtime_suspend(struct request_queue *q)
 {
 	int ret = 0;
 
+	if (!q->dev)
+		return ret;
+
 	spin_lock_irq(q->queue_lock);
 	if (q->nr_pending) {
 		ret = -EBUSY;
@@ -3312,6 +3456,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend);
  */
 void blk_post_runtime_suspend(struct request_queue *q, int err)
 {
+	if (!q->dev)
+		return;
+
 	spin_lock_irq(q->queue_lock);
 	if (!err) {
 		q->rpm_status = RPM_SUSPENDED;
@@ -3336,6 +3483,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend);
  */
 void blk_pre_runtime_resume(struct request_queue *q)
 {
+	if (!q->dev)
+		return;
+
 	spin_lock_irq(q->queue_lock);
 	q->rpm_status = RPM_RESUMING;
 	spin_unlock_irq(q->queue_lock);
@@ -3358,6 +3508,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume);
  */
 void blk_post_runtime_resume(struct request_queue *q, int err)
 {
+	if (!q->dev)
+		return;
+
 	spin_lock_irq(q->queue_lock);
 	if (!err) {
 		q->rpm_status = RPM_ACTIVE;
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 75f29cf70..d69c5c79f 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -30,10 +30,6 @@
 
 #include "blk.h"
 
-static struct kmem_cache *integrity_cachep;
-
-static const char *bi_unsupported_name = "unsupported";
-
 /**
  * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
  * @q:		request queue
@@ -146,40 +142,40 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
  */
 int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
 {
-	struct blk_integrity *b1 = gd1->integrity;
-	struct blk_integrity *b2 = gd2->integrity;
+	struct blk_integrity *b1 = &gd1->queue->integrity;
+	struct blk_integrity *b2 = &gd2->queue->integrity;
 
-	if (!b1 && !b2)
+	if (!b1->profile && !b2->profile)
 		return 0;
 
-	if (!b1 || !b2)
+	if (!b1->profile || !b2->profile)
 		return -1;
 
-	if (b1->interval != b2->interval) {
+	if (b1->interval_exp != b2->interval_exp) {
 		pr_err("%s: %s/%s protection interval %u != %u\n",
 		       __func__, gd1->disk_name, gd2->disk_name,
-		       b1->interval, b2->interval);
+		       1 << b1->interval_exp, 1 << b2->interval_exp);
 		return -1;
 	}
 
 	if (b1->tuple_size != b2->tuple_size) {
-		printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__,
+		pr_err("%s: %s/%s tuple sz %u != %u\n", __func__,
 		       gd1->disk_name, gd2->disk_name,
 		       b1->tuple_size, b2->tuple_size);
 		return -1;
 	}
 
 	if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
-		printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__,
+		pr_err("%s: %s/%s tag sz %u != %u\n", __func__,
 		       gd1->disk_name, gd2->disk_name,
 		       b1->tag_size, b2->tag_size);
 		return -1;
 	}
 
-	if (strcmp(b1->name, b2->name)) {
-		printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__,
+	if (b1->profile != b2->profile) {
+		pr_err("%s: %s/%s type %s != %s\n", __func__,
 		       gd1->disk_name, gd2->disk_name,
-		       b1->name, b2->name);
+		       b1->profile->name, b2->profile->name);
 		return -1;
 	}
 
@@ -249,8 +245,8 @@ struct integrity_sysfs_entry {
 static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr,
 				   char *page)
 {
-	struct blk_integrity *bi =
-		container_of(kobj, struct blk_integrity, kobj);
+	struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj);
+	struct blk_integrity *bi = &disk->queue->integrity;
 	struct integrity_sysfs_entry *entry =
 		container_of(attr, struct integrity_sysfs_entry, attr);
 
@@ -261,8 +257,8 @@ static ssize_t integrity_attr_store(struct kobject *kobj,
 				    struct attribute *attr, const char *page,
 				    size_t count)
 {
-	struct blk_integrity *bi =
-		container_of(kobj, struct blk_integrity, kobj);
+	struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj);
+	struct blk_integrity *bi = &disk->queue->integrity;
 	struct integrity_sysfs_entry *entry =
 		container_of(attr, struct integrity_sysfs_entry, attr);
 	ssize_t ret = 0;
@@ -275,18 +271,21 @@ static ssize_t integrity_attr_store(struct kobject *kobj,
 
 static ssize_t integrity_format_show(struct blk_integrity *bi, char *page)
 {
-	if (bi != NULL && bi->name != NULL)
-		return sprintf(page, "%s\n", bi->name);
+	if (bi->profile && bi->profile->name)
+		return sprintf(page, "%s\n", bi->profile->name);
 	else
 		return sprintf(page, "none\n");
 }
 
 static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page)
 {
-	if (bi != NULL)
-		return sprintf(page, "%u\n", bi->tag_size);
-	else
-		return sprintf(page, "0\n");
+	return sprintf(page, "%u\n", bi->tag_size);
+}
+
+static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page)
+{
+	return sprintf(page, "%u\n",
+		       bi->interval_exp ? 1 << bi->interval_exp : 0);
 }
 
 static ssize_t integrity_verify_store(struct blk_integrity *bi,
@@ -343,6 +342,11 @@ static struct integrity_sysfs_entry integrity_tag_size_entry = {
 	.show = integrity_tag_size_show,
 };
 
+static struct integrity_sysfs_entry integrity_interval_entry = {
+	.attr = { .name = "protection_interval_bytes", .mode = S_IRUGO },
+	.show = integrity_interval_show,
+};
+
 static struct integrity_sysfs_entry integrity_verify_entry = {
 	.attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
 	.show = integrity_verify_show,
@@ -363,6 +367,7 @@ static struct integrity_sysfs_entry integrity_device_entry = {
 static struct attribute *integrity_attrs[] = {
 	&integrity_format_entry.attr,
 	&integrity_tag_size_entry.attr,
+	&integrity_interval_entry.attr,
 	&integrity_verify_entry.attr,
 	&integrity_generate_entry.attr,
 	&integrity_device_entry.attr,
@@ -374,114 +379,89 @@ static const struct sysfs_ops integrity_ops = {
 	.store	= &integrity_attr_store,
 };
 
-static int __init blk_dev_integrity_init(void)
-{
-	integrity_cachep = kmem_cache_create("blkdev_integrity",
-					     sizeof(struct blk_integrity),
-					     0, SLAB_PANIC, NULL);
-	return 0;
-}
-subsys_initcall(blk_dev_integrity_init);
-
-static void blk_integrity_release(struct kobject *kobj)
-{
-	struct blk_integrity *bi =
-		container_of(kobj, struct blk_integrity, kobj);
-
-	kmem_cache_free(integrity_cachep, bi);
-}
-
 static struct kobj_type integrity_ktype = {
 	.default_attrs	= integrity_attrs,
 	.sysfs_ops	= &integrity_ops,
-	.release	= blk_integrity_release,
 };
 
-bool blk_integrity_is_initialized(struct gendisk *disk)
+static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
 {
-	struct blk_integrity *bi = blk_get_integrity(disk);
-
-	return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0);
+	return 0;
 }
-EXPORT_SYMBOL(blk_integrity_is_initialized);
+
+static struct blk_integrity_profile nop_profile = {
+	.name = "nop",
+	.generate_fn = blk_integrity_nop_fn,
+	.verify_fn = blk_integrity_nop_fn,
+};
 
 /**
  * blk_integrity_register - Register a gendisk as being integrity-capable
  * @disk:	struct gendisk pointer to make integrity-aware
- * @template:	optional integrity profile to register
+ * @template:	block integrity profile to register
  *
- * Description: When a device needs to advertise itself as being able
- * to send/receive integrity metadata it must use this function to
- * register the capability with the block layer.  The template is a
- * blk_integrity struct with values appropriate for the underlying
- * hardware.  If template is NULL the new profile is allocated but
- * not filled out. See Documentation/block/data-integrity.txt.
+ * Description: When a device needs to advertise itself as being able to
+ * send/receive integrity metadata it must use this function to register
+ * the capability with the block layer. The template is a blk_integrity
+ * struct with values appropriate for the underlying hardware. See
+ * Documentation/block/data-integrity.txt.
  */
-int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
+void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 {
-	struct blk_integrity *bi;
+	struct blk_integrity *bi = &disk->queue->integrity;
 
-	BUG_ON(disk == NULL);
+	bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE |
+		template->flags;
+	bi->interval_exp = ilog2(queue_logical_block_size(disk->queue));
+	bi->profile = template->profile ? template->profile : &nop_profile;
+	bi->tuple_size = template->tuple_size;
+	bi->tag_size = template->tag_size;
 
-	if (disk->integrity == NULL) {
-		bi = kmem_cache_alloc(integrity_cachep,
-				      GFP_KERNEL | __GFP_ZERO);
-		if (!bi)
-			return -1;
-
-		if (kobject_init_and_add(&bi->kobj, &integrity_ktype,
-					 &disk_to_dev(disk)->kobj,
-					 "%s", "integrity")) {
-			kmem_cache_free(integrity_cachep, bi);
-			return -1;
-		}
-
-		kobject_uevent(&bi->kobj, KOBJ_ADD);
-
-		bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE;
-		bi->interval = queue_logical_block_size(disk->queue);
-		disk->integrity = bi;
-	} else
-		bi = disk->integrity;
-
-	/* Use the provided profile as template */
-	if (template != NULL) {
-		bi->name = template->name;
-		bi->generate_fn = template->generate_fn;
-		bi->verify_fn = template->verify_fn;
-		bi->tuple_size = template->tuple_size;
-		bi->tag_size = template->tag_size;
-		bi->flags |= template->flags;
-	} else
-		bi->name = bi_unsupported_name;
-
-	disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
-
-	return 0;
+	blk_integrity_revalidate(disk);
 }
 EXPORT_SYMBOL(blk_integrity_register);
 
 /**
- * blk_integrity_unregister - Remove block integrity profile
- * @disk:	disk whose integrity profile to deallocate
+ * blk_integrity_unregister - Unregister block integrity profile
+ * @disk:	disk whose integrity profile to unregister
  *
- * Description: This function frees all memory used by the block
- * integrity profile.  To be called at device teardown.
+ * Description: This function unregisters the integrity capability from
+ * a block device.
  */
 void blk_integrity_unregister(struct gendisk *disk)
 {
-	struct blk_integrity *bi;
+	blk_integrity_revalidate(disk);
+	memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
+}
+EXPORT_SYMBOL(blk_integrity_unregister);
+
+void blk_integrity_revalidate(struct gendisk *disk)
+{
+	struct blk_integrity *bi = &disk->queue->integrity;
 
-	if (!disk || !disk->integrity)
+	if (!(disk->flags & GENHD_FL_UP))
 		return;
 
-	disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES;
+	if (bi->profile)
+		disk->queue->backing_dev_info.capabilities |=
+			BDI_CAP_STABLE_WRITES;
+	else
+		disk->queue->backing_dev_info.capabilities &=
+			~BDI_CAP_STABLE_WRITES;
+}
+
+void blk_integrity_add(struct gendisk *disk)
+{
+	if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
+				 &disk_to_dev(disk)->kobj, "%s", "integrity"))
+		return;
 
-	bi = disk->integrity;
+	kobject_uevent(&disk->integrity_kobj, KOBJ_ADD);
+}
 
-	kobject_uevent(&bi->kobj, KOBJ_REMOVE);
-	kobject_del(&bi->kobj);
-	kobject_put(&bi->kobj);
-	disk->integrity = NULL;
+void blk_integrity_del(struct gendisk *disk)
+{
+	kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE);
+	kobject_del(&disk->integrity_kobj);
+	kobject_put(&disk->integrity_kobj);
 }
-EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 1a27f45ec..381cb50a6 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task,
 {
 	struct io_context *ioc;
 
-	might_sleep_if(gfp_flags & __GFP_WAIT);
+	might_sleep_if(gfpflags_allow_blocking(gfp_flags));
 
 	do {
 		task_lock(task);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 0e5f4fc12..e01405a3e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -11,13 +11,16 @@
 
 static struct bio *blk_bio_discard_split(struct request_queue *q,
 					 struct bio *bio,
-					 struct bio_set *bs)
+					 struct bio_set *bs,
+					 unsigned *nsegs)
 {
 	unsigned int max_discard_sectors, granularity;
 	int alignment;
 	sector_t tmp;
 	unsigned split_sectors;
 
+	*nsegs = 1;
+
 	/* Zero-sector (unknown) and one-sector granularities are the same.  */
 	granularity = max(q->limits.discard_granularity >> 9, 1U);
 
@@ -51,8 +54,11 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 
 static struct bio *blk_bio_write_same_split(struct request_queue *q,
 					    struct bio *bio,
-					    struct bio_set *bs)
+					    struct bio_set *bs,
+					    unsigned *nsegs)
 {
+	*nsegs = 1;
+
 	if (!q->limits.max_write_same_sectors)
 		return NULL;
 
@@ -64,11 +70,15 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
 
 static struct bio *blk_bio_segment_split(struct request_queue *q,
 					 struct bio *bio,
-					 struct bio_set *bs)
+					 struct bio_set *bs,
+					 unsigned *segs)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned seg_size = 0, nsegs = 0, sectors = 0;
+	unsigned front_seg_size = bio->bi_seg_front_size;
+	bool do_split = true;
+	struct bio *new = NULL;
 
 	bio_for_each_segment(bv, bio, iter) {
 		if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q))
@@ -93,6 +103,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 			bvprv = bv;
 			bvprvp = &bvprv;
 			sectors += bv.bv_len >> 9;
+
+			if (nsegs == 1 && seg_size > front_seg_size)
+				front_seg_size = seg_size;
 			continue;
 		}
 new_segment:
@@ -104,26 +117,50 @@ new_segment:
 		bvprvp = &bvprv;
 		seg_size = bv.bv_len;
 		sectors += bv.bv_len >> 9;
+
+		if (nsegs == 1 && seg_size > front_seg_size)
+			front_seg_size = seg_size;
 	}
 
-	return NULL;
+	do_split = false;
 split:
-	return bio_split(bio, sectors, GFP_NOIO, bs);
+	*segs = nsegs;
+
+	if (do_split) {
+		new = bio_split(bio, sectors, GFP_NOIO, bs);
+		if (new)
+			bio = new;
+	}
+
+	bio->bi_seg_front_size = front_seg_size;
+	if (seg_size > bio->bi_seg_back_size)
+		bio->bi_seg_back_size = seg_size;
+
+	return do_split ? new : NULL;
 }
 
 void blk_queue_split(struct request_queue *q, struct bio **bio,
 		     struct bio_set *bs)
 {
-	struct bio *split;
+	struct bio *split, *res;
+	unsigned nsegs;
 
 	if ((*bio)->bi_rw & REQ_DISCARD)
-		split = blk_bio_discard_split(q, *bio, bs);
+		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
 	else if ((*bio)->bi_rw & REQ_WRITE_SAME)
-		split = blk_bio_write_same_split(q, *bio, bs);
+		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
 	else
-		split = blk_bio_segment_split(q, *bio, q->bio_split);
+		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
+
+	/* physical segments can be figured out during splitting */
+	res = split ? split : *bio;
+	res->bi_phys_segments = nsegs;
+	bio_set_flag(res, BIO_SEG_VALID);
 
 	if (split) {
+		/* there isn't chance to merge the splitted bio */
+		split->bi_rw |= REQ_NOMERGE;
+
 		bio_chain(split, *bio);
 		generic_make_request(*bio);
 		*bio = split;
@@ -394,6 +431,12 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	if (sg)
 		sg_mark_end(sg);
 
+	/*
+	 * Something must have been wrong if the figured number of
+	 * segment is bigger than number of req's physical segments
+	 */
+	WARN_ON(nsegs > rq->nr_phys_segments);
+
 	return nsegs;
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 788fffd9b..1cf18784c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -174,6 +174,11 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
 	return ret;
 }
 
+static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success);
+}
+
 static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
 					   char *page)
 {
@@ -295,6 +300,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
 	.attr = {.name = "cpu_list", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_cpus_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
+	.attr = {.name = "io_poll", .mode = S_IRUGO },
+	.show = blk_mq_hw_sysfs_poll_show,
+};
 
 static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_queued.attr,
@@ -304,6 +313,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
 	&blk_mq_hw_sysfs_active.attr,
+	&blk_mq_hw_sysfs_poll.attr,
 	NULL,
 };
 
@@ -413,12 +423,6 @@ static void blk_mq_sysfs_init(struct request_queue *q)
 		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
 }
 
-/* see blk_register_queue() */
-void blk_mq_finish_init(struct request_queue *q)
-{
-	percpu_ref_switch_to_percpu(&q->mq_usage_counter);
-}
-
 int blk_mq_register_disk(struct gendisk *disk)
 {
 	struct device *dev = disk_to_dev(disk);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index ec2d11915..a07ca3488 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -75,6 +75,10 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 	struct blk_mq_bitmap_tags *bt;
 	int i, wake_index;
 
+	/*
+	 * Make sure all changes prior to this are visible from other CPUs.
+	 */
+	smp_mb();
 	bt = &tags->bitmap_tags;
 	wake_index = atomic_read(&bt->wake_index);
 	for (i = 0; i < BT_WAIT_QUEUES; i++) {
@@ -264,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
 	if (tag != -1)
 		return tag;
 
-	if (!(data->gfp & __GFP_WAIT))
+	if (!gfpflags_allow_blocking(data->gfp))
 		return -1;
 
 	bs = bt_wait_ptr(bt, hctx);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 85f014327..6d6f8feb4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -9,6 +9,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/kmemleak.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -77,47 +78,13 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
-static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
-{
-	while (true) {
-		int ret;
-
-		if (percpu_ref_tryget_live(&q->mq_usage_counter))
-			return 0;
-
-		if (!(gfp & __GFP_WAIT))
-			return -EBUSY;
-
-		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!atomic_read(&q->mq_freeze_depth) ||
-				blk_queue_dying(q));
-		if (blk_queue_dying(q))
-			return -ENODEV;
-		if (ret)
-			return ret;
-	}
-}
-
-static void blk_mq_queue_exit(struct request_queue *q)
-{
-	percpu_ref_put(&q->mq_usage_counter);
-}
-
-static void blk_mq_usage_counter_release(struct percpu_ref *ref)
-{
-	struct request_queue *q =
-		container_of(ref, struct request_queue, mq_usage_counter);
-
-	wake_up_all(&q->mq_freeze_wq);
-}
-
 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
 	int freeze_depth;
 
 	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
 	if (freeze_depth == 1) {
-		percpu_ref_kill(&q->mq_usage_counter);
+		percpu_ref_kill(&q->q_usage_counter);
 		blk_mq_run_hw_queues(q, false);
 	}
 }
@@ -125,18 +92,34 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
 
 static void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
-	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
+	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 }
 
 /*
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
  */
-void blk_mq_freeze_queue(struct request_queue *q)
+void blk_freeze_queue(struct request_queue *q)
 {
+	/*
+	 * In the !blk_mq case we are only calling this to kill the
+	 * q_usage_counter, otherwise this increases the freeze depth
+	 * and waits for it to return to zero.  For this reason there is
+	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
+	 * exported to drivers as the only user for unfreeze is blk_mq.
+	 */
 	blk_mq_freeze_queue_start(q);
 	blk_mq_freeze_queue_wait(q);
 }
+
+void blk_mq_freeze_queue(struct request_queue *q)
+{
+	/*
+	 * ...just an alias to keep freeze and unfreeze actions balanced
+	 * in the blk_mq_* namespace
+	 */
+	blk_freeze_queue(q);
+}
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -146,7 +129,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
 	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
 	WARN_ON_ONCE(freeze_depth < 0);
 	if (!freeze_depth) {
-		percpu_ref_reinit(&q->mq_usage_counter);
+		percpu_ref_reinit(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
 }
@@ -255,17 +238,17 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 	struct blk_mq_alloc_data alloc_data;
 	int ret;
 
-	ret = blk_mq_queue_enter(q, gfp);
+	ret = blk_queue_enter(q, gfp);
 	if (ret)
 		return ERR_PTR(ret);
 
 	ctx = blk_mq_get_ctx(q);
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
-	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
 			reserved, ctx, hctx);
 
 	rq = __blk_mq_alloc_request(&alloc_data, rw);
-	if (!rq && (gfp & __GFP_WAIT)) {
+	if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
 		__blk_mq_run_hw_queue(hctx);
 		blk_mq_put_ctx(ctx);
 
@@ -278,7 +261,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 	}
 	blk_mq_put_ctx(ctx);
 	if (!rq) {
-		blk_mq_queue_exit(q);
+		blk_queue_exit(q);
 		return ERR_PTR(-EWOULDBLOCK);
 	}
 	return rq;
@@ -297,7 +280,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
-	blk_mq_queue_exit(q);
+	blk_queue_exit(q);
 }
 
 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
@@ -375,7 +358,7 @@ static void blk_mq_ipi_complete_request(struct request *rq)
 	put_cpu();
 }
 
-void __blk_mq_complete_request(struct request *rq)
+static void __blk_mq_complete_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
@@ -989,18 +972,25 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
-static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
-				    struct request *rq, bool at_head)
+static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
+					    struct blk_mq_ctx *ctx,
+					    struct request *rq,
+					    bool at_head)
 {
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-
 	trace_block_rq_insert(hctx->queue, rq);
 
 	if (at_head)
 		list_add(&rq->queuelist, &ctx->rq_list);
 	else
 		list_add_tail(&rq->queuelist, &ctx->rq_list);
+}
+
+static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
+				    struct request *rq, bool at_head)
+{
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
+	__blk_mq_insert_req_list(hctx, ctx, rq, at_head);
 	blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
@@ -1056,8 +1046,9 @@ static void blk_mq_insert_requests(struct request_queue *q,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		rq->mq_ctx = ctx;
-		__blk_mq_insert_request(hctx, rq, false);
+		__blk_mq_insert_req_list(hctx, ctx, rq, false);
 	}
+	blk_mq_hctx_mark_pending(hctx, ctx);
 	spin_unlock(&ctx->lock);
 
 	blk_mq_run_hw_queue(hctx, from_schedule);
@@ -1139,7 +1130,7 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
 					 struct blk_mq_ctx *ctx,
 					 struct request *rq, struct bio *bio)
 {
-	if (!hctx_allow_merges(hctx)) {
+	if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
 		blk_mq_bio_to_request(rq, bio);
 		spin_lock(&ctx->lock);
 insert_rq:
@@ -1176,11 +1167,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	int rw = bio_data_dir(bio);
 	struct blk_mq_alloc_data alloc_data;
 
-	if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
-		bio_io_error(bio);
-		return NULL;
-	}
-
+	blk_queue_enter_live(q);
 	ctx = blk_mq_get_ctx(q);
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
@@ -1199,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 		ctx = blk_mq_get_ctx(q);
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
 		blk_mq_set_alloc_data(&alloc_data, q,
-				__GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+				__GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
 		rq = __blk_mq_alloc_request(&alloc_data, rw);
 		ctx = alloc_data.ctx;
 		hctx = alloc_data.hctx;
@@ -1211,7 +1198,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
-static int blk_mq_direct_issue_request(struct request *rq)
+static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
 {
 	int ret;
 	struct request_queue *q = rq->q;
@@ -1222,6 +1209,7 @@ static int blk_mq_direct_issue_request(struct request *rq)
 		.list = NULL,
 		.last = 1
 	};
+	blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
 
 	/*
 	 * For OK queue, we are done. For error, kill it. Any other
@@ -1229,18 +1217,21 @@ static int blk_mq_direct_issue_request(struct request *rq)
 	 * would have done
 	 */
 	ret = q->mq_ops->queue_rq(hctx, &bd);
-	if (ret == BLK_MQ_RQ_QUEUE_OK)
+	if (ret == BLK_MQ_RQ_QUEUE_OK) {
+		*cookie = new_cookie;
 		return 0;
-	else {
-		__blk_mq_requeue_request(rq);
+	}
 
-		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
-			rq->errors = -EIO;
-			blk_mq_end_request(rq, rq->errors);
-			return 0;
-		}
-		return -1;
+	__blk_mq_requeue_request(rq);
+
+	if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+		*cookie = BLK_QC_T_NONE;
+		rq->errors = -EIO;
+		blk_mq_end_request(rq, rq->errors);
+		return 0;
 	}
+
+	return -1;
 }
 
 /*
@@ -1248,7 +1239,7 @@ static int blk_mq_direct_issue_request(struct request *rq)
  * but will attempt to bypass the hctx queueing if we can go straight to
  * hardware for SYNC IO.
  */
-static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio->bi_rw);
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
@@ -1257,23 +1248,29 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	unsigned int request_count = 0;
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
+	blk_qc_t cookie;
 
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 		bio_io_error(bio);
-		return;
+		return BLK_QC_T_NONE;
 	}
 
 	blk_queue_split(q, &bio, q->bio_split);
 
-	if (!is_flush_fua && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
-		return;
+	if (!is_flush_fua && !blk_queue_nomerges(q)) {
+		if (blk_attempt_plug_merge(q, bio, &request_count,
+					   &same_queue_rq))
+			return BLK_QC_T_NONE;
+	} else
+		request_count = blk_plug_queued_count(q);
 
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
-		return;
+		return BLK_QC_T_NONE;
+
+	cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
@@ -1294,15 +1291,16 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_bio_to_request(rq, bio);
 
 		/*
-		 * we do limited pluging. If bio can be merged, do merge.
+		 * We do limited pluging. If the bio can be merged, do that.
 		 * Otherwise the existing request in the plug list will be
 		 * issued. So the plug list will have one request at most
 		 */
 		if (plug) {
 			/*
 			 * The plug list might get flushed before this. If that
-			 * happens, same_queue_rq is invalid and plug list is empty
-			 **/
+			 * happens, same_queue_rq is invalid and plug list is
+			 * empty
+			 */
 			if (same_queue_rq && !list_empty(&plug->mq_list)) {
 				old_rq = same_queue_rq;
 				list_del_init(&old_rq->queuelist);
@@ -1312,11 +1310,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 			old_rq = rq;
 		blk_mq_put_ctx(data.ctx);
 		if (!old_rq)
-			return;
-		if (!blk_mq_direct_issue_request(old_rq))
-			return;
+			goto done;
+		if (!blk_mq_direct_issue_request(old_rq, &cookie))
+			goto done;
 		blk_mq_insert_request(old_rq, false, true, true);
-		return;
+		goto done;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1330,13 +1328,15 @@ run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
 	blk_mq_put_ctx(data.ctx);
+done:
+	return cookie;
 }
 
 /*
  * Single hardware queue variant. This will attempt to use any per-process
  * plug for merging and IO deferral.
  */
-static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio->bi_rw);
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
@@ -1344,23 +1344,26 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	unsigned int request_count = 0;
 	struct blk_map_ctx data;
 	struct request *rq;
+	blk_qc_t cookie;
 
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 		bio_io_error(bio);
-		return;
+		return BLK_QC_T_NONE;
 	}
 
 	blk_queue_split(q, &bio, q->bio_split);
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
-		return;
+		return BLK_QC_T_NONE;
 
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
-		return;
+		return BLK_QC_T_NONE;
+
+	cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
@@ -1376,15 +1379,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	plug = current->plug;
 	if (plug) {
 		blk_mq_bio_to_request(rq, bio);
-		if (list_empty(&plug->mq_list))
+		if (!request_count)
 			trace_block_plug(q);
-		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+
+		blk_mq_put_ctx(data.ctx);
+
+		if (request_count >= BLK_MAX_REQUEST_COUNT) {
 			blk_flush_plug_list(plug, false);
 			trace_block_plug(q);
 		}
+
 		list_add_tail(&rq->queuelist, &plug->mq_list);
-		blk_mq_put_ctx(data.ctx);
-		return;
+		return cookie;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1399,6 +1405,7 @@ run_queue:
 	}
 
 	blk_mq_put_ctx(data.ctx);
+	return cookie;
 }
 
 /*
@@ -1430,6 +1437,11 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
 	while (!list_empty(&tags->page_list)) {
 		page = list_first_entry(&tags->page_list, struct page, lru);
 		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
 		__free_pages(page, page->private);
 	}
 
@@ -1502,6 +1514,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 		list_add_tail(&page->lru, &tags->page_list);
 
 		p = page_address(page);
+		/*
+		 * Allow kmemleak to scan these pages as they contain pointers
+		 * to additional allocations like via ops->init_request().
+		 */
+		kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
 		entries_per_page = order_to_size(this_order) / rq_size;
 		to_do = min(entries_per_page, set->queue_depth - i);
 		left -= to_do * rq_size;
@@ -1673,7 +1690,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	INIT_LIST_HEAD(&hctx->dispatch);
 	hctx->queue = q;
 	hctx->queue_num = hctx_idx;
-	hctx->flags = set->flags;
+	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
 
 	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
 					blk_mq_hctx_notify, hctx);
@@ -1860,27 +1877,26 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 	}
 }
 
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
+static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 {
 	struct blk_mq_hw_ctx *hctx;
-	struct request_queue *q;
-	bool shared;
 	int i;
 
-	if (set->tag_list.next == set->tag_list.prev)
-		shared = false;
-	else
-		shared = true;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (shared)
+			hctx->flags |= BLK_MQ_F_TAG_SHARED;
+		else
+			hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+	}
+}
+
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
+{
+	struct request_queue *q;
 
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_freeze_queue(q);
-
-		queue_for_each_hw_ctx(q, hctx, i) {
-			if (shared)
-				hctx->flags |= BLK_MQ_F_TAG_SHARED;
-			else
-				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
-		}
+		queue_set_hctx_shared(q, shared);
 		blk_mq_unfreeze_queue(q);
 	}
 }
@@ -1891,7 +1907,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 
 	mutex_lock(&set->tag_list_lock);
 	list_del_init(&q->tag_set_list);
-	blk_mq_update_tag_set_depth(set);
+	if (list_is_singular(&set->tag_list)) {
+		/* just transitioned to unshared */
+		set->flags &= ~BLK_MQ_F_TAG_SHARED;
+		/* update existing queue */
+		blk_mq_update_tag_set_depth(set, false);
+	}
 	mutex_unlock(&set->tag_list_lock);
 }
 
@@ -1901,8 +1922,17 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 	q->tag_set = set;
 
 	mutex_lock(&set->tag_list_lock);
+
+	/* Check to see if we're transitioning to shared (from 1 to 2 queues). */
+	if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
+		set->flags |= BLK_MQ_F_TAG_SHARED;
+		/* update existing queue */
+		blk_mq_update_tag_set_depth(set, true);
+	}
+	if (set->flags & BLK_MQ_F_TAG_SHARED)
+		queue_set_hctx_shared(q, true);
 	list_add_tail(&q->tag_set_list, &set->tag_list);
-	blk_mq_update_tag_set_depth(set);
+
 	mutex_unlock(&set->tag_list_lock);
 }
 
@@ -1989,14 +2019,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		hctxs[i]->queue_num = i;
 	}
 
-	/*
-	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
-	 * See blk_register_queue() for details.
-	 */
-	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
-			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
-		goto err_hctxs;
-
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
@@ -2077,8 +2099,6 @@ void blk_mq_free_queue(struct request_queue *q)
 
 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
 	blk_mq_free_hw_queues(q, set);
-
-	percpu_ref_exit(&q->mq_usage_counter);
 }
 
 /* Basically redo blk_mq_init_queue with queue frozen */
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4fea7964..713820b47 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -25,12 +25,9 @@ struct blk_mq_ctx {
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 
-void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
-void blk_mq_clone_flush_request(struct request *flush_rq,
-		struct request *orig_rq);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 7d8f129a1..dd4973583 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,7 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->virt_boundary_mask = 0;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
-	lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+	lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors =
+		BLK_SAFE_MAX_SECTORS;
 	lim->chunk_sectors = 0;
 	lim->max_write_same_sectors = 0;
 	lim->max_discard_sectors = 0;
@@ -127,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	lim->max_hw_sectors = UINT_MAX;
 	lim->max_segment_size = UINT_MAX;
 	lim->max_sectors = UINT_MAX;
+	lim->max_dev_sectors = UINT_MAX;
 	lim->max_write_same_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -214,8 +216,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
 /**
- * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request
- * @limits: the queue limits
+ * blk_queue_max_hw_sectors - set max sectors for a request for this queue
+ * @q:  the request queue for the device
  * @max_hw_sectors:  max hardware sectors in the usual 512b unit
  *
  * Description:
@@ -224,13 +226,19 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
  *    the device driver based upon the capabilities of the I/O
  *    controller.
  *
+ *    max_dev_sectors is a hard limit imposed by the storage device for
+ *    READ/WRITE requests. It is set by the disk driver.
+ *
  *    max_sectors is a soft limit imposed by the block layer for
  *    filesystem type requests.  This value can be overridden on a
  *    per-device basis in /sys/block/<device>/queue/max_sectors_kb.
  *    The soft limit can not exceed max_hw_sectors.
  **/
-void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors)
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
 {
+	struct queue_limits *limits = &q->limits;
+	unsigned int max_sectors;
+
 	if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
 		max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
 		printk(KERN_INFO "%s: set to minimum %d\n",
@@ -238,22 +246,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_
 	}
 
 	limits->max_hw_sectors = max_hw_sectors;
-	limits->max_sectors = min_t(unsigned int, max_hw_sectors,
-				    BLK_DEF_MAX_SECTORS);
-}
-EXPORT_SYMBOL(blk_limits_max_hw_sectors);
-
-/**
- * blk_queue_max_hw_sectors - set max sectors for a request for this queue
- * @q:  the request queue for the device
- * @max_hw_sectors:  max hardware sectors in the usual 512b unit
- *
- * Description:
- *    See description for blk_limits_max_hw_sectors().
- **/
-void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
-{
-	blk_limits_max_hw_sectors(&q->limits, max_hw_sectors);
+	max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
+	max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
+	limits->max_sectors = max_sectors;
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
@@ -527,6 +522,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_same_sectors = min(t->max_write_same_sectors,
 					b->max_write_same_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 07b42f5ad..e140cc487 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -205,6 +205,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 	if (ret < 0)
 		return ret;
 
+	max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long)
+					 q->limits.max_dev_sectors >> 1);
+
 	if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
 		return -EINVAL;
 
@@ -317,6 +320,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 	return ret;
 }
 
+static ssize_t queue_poll_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
+}
+
+static ssize_t queue_poll_store(struct request_queue *q, const char *page,
+				size_t count)
+{
+	unsigned long poll_on;
+	ssize_t ret;
+
+	if (!q->mq_ops || !q->mq_ops->poll)
+		return -EINVAL;
+
+	ret = queue_var_store(&poll_on, page, count);
+	if (ret < 0)
+		return ret;
+
+	spin_lock_irq(q->queue_lock);
+	if (poll_on)
+		queue_flag_set(QUEUE_FLAG_POLL, q);
+	else
+		queue_flag_clear(QUEUE_FLAG_POLL, q);
+	spin_unlock_irq(q->queue_lock);
+
+	return ret;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -442,6 +473,12 @@ static struct queue_sysfs_entry queue_random_entry = {
 	.store = queue_store_random,
 };
 
+static struct queue_sysfs_entry queue_poll_entry = {
+	.attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_poll_show,
+	.store = queue_poll_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -466,6 +503,7 @@ static struct attribute *default_attrs[] = {
 	&queue_rq_affinity_entry.attr,
 	&queue_iostats_entry.attr,
 	&queue_random_entry.attr,
+	&queue_poll_entry.attr,
 	NULL,
 };
 
@@ -600,9 +638,8 @@ int blk_register_queue(struct gendisk *disk)
 	 */
 	if (!blk_queue_init_done(q)) {
 		queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
+		percpu_ref_switch_to_percpu(&q->q_usage_counter);
 		blk_queue_bypass_end(q);
-		if (q->mq_ops)
-			blk_mq_finish_init(q);
 	}
 
 	ret = blk_trace_init_sysfs(dev);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c75a2636d..2149a1ddb 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -369,7 +369,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	 * regardless of the position of the group in the hierarchy.
 	 */
 	sq->parent_sq = &td->service_queue;
-	if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
+	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
 }
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 246dfb16c..aa40aa933 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -158,11 +158,13 @@ void blk_abort_request(struct request *req)
 {
 	if (blk_mark_rq_complete(req))
 		return;
-	blk_delete_timer(req);
-	if (req->q->mq_ops)
+
+	if (req->q->mq_ops) {
 		blk_mq_rq_timed_out(req, false);
-	else
+	} else {
+		blk_delete_timer(req);
 		blk_rq_timed_out(req);
+	}
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
diff --git a/block/blk.h b/block/blk.h
index 98614ad37..c43926d3d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -72,6 +72,26 @@ void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 bool __blk_end_bidi_request(struct request *rq, int error,
 			    unsigned int nr_bytes, unsigned int bidi_bytes);
+void blk_freeze_queue(struct request_queue *q);
+
+static inline void blk_queue_enter_live(struct request_queue *q)
+{
+	/*
+	 * Given that running in generic_make_request() context
+	 * guarantees that a live reference against q_usage_counter has
+	 * been established, further references under that same context
+	 * need not check that the queue has been frozen (marked dead).
+	 */
+	percpu_ref_get(&q->q_usage_counter);
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+void blk_flush_integrity(void);
+#else
+static inline void blk_flush_integrity(void)
+{
+}
+#endif
 
 void blk_rq_timed_out_timer(unsigned long data);
 unsigned long blk_rq_timeout(unsigned long timeout);
@@ -86,6 +106,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 			    unsigned int *request_count,
 			    struct request **same_queue_rq);
+unsigned int blk_plug_queued_count(struct request_queue *q);
 
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 04de88463..1f9093e90 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1581,7 +1581,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
 static void cfq_cpd_init(struct blkcg_policy_data *cpd)
 {
 	struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
-	unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
+	unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
 			      CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
 
 	if (cpd_to_blkcg(cpd) == &blkcg_root)
@@ -1599,7 +1599,7 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd)
 static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
 {
 	struct blkcg *blkcg = cpd_to_blkcg(cpd);
-	bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
+	bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
 	unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
 
 	if (blkcg == &blkcg_root)
diff --git a/block/elevator.c b/block/elevator.c
index 84d63943f..c3555c9c6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -420,7 +420,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	 * 	noxmerges: Only simple one-hit cache try
 	 * 	merges:	   All merge tries attempted
 	 */
-	if (blk_queue_nomerges(q))
+	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
 		return ELEVATOR_NO_MERGE;
 
 	/*
diff --git a/block/genhd.c b/block/genhd.c
index 398dab06d..c06731166 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -632,6 +632,7 @@ void add_disk(struct gendisk *disk)
 	WARN_ON(retval);
 
 	disk_add_events(disk);
+	blk_integrity_add(disk);
 }
 EXPORT_SYMBOL(add_disk);
 
@@ -640,6 +641,7 @@ void del_gendisk(struct gendisk *disk)
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 
+	blk_integrity_del(disk);
 	disk_del_events(disk);
 
 	/* invalidate stuff */
diff --git a/block/ioctl.c b/block/ioctl.c
index 8061eba42..0918aed2d 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -7,6 +7,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/blktrace_api.h>
+#include <linux/pr.h>
 #include <asm/uaccess.h>
 
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -193,10 +194,20 @@ int blkdev_reread_part(struct block_device *bdev)
 }
 EXPORT_SYMBOL(blkdev_reread_part);
 
-static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
-			     uint64_t len, int secure)
+static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
+		unsigned long arg, unsigned long flags)
 {
-	unsigned long flags = 0;
+	uint64_t range[2];
+	uint64_t start, len;
+
+	if (!(mode & FMODE_WRITE))
+		return -EBADF;
+
+	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
+		return -EFAULT;
+
+	start = range[0];
+	len = range[1];
 
 	if (start & 511)
 		return -EINVAL;
@@ -207,14 +218,24 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 
 	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
 		return -EINVAL;
-	if (secure)
-		flags |= BLKDEV_DISCARD_SECURE;
 	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
 }
 
-static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
-			     uint64_t len)
+static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
+		unsigned long arg)
 {
+	uint64_t range[2];
+	uint64_t start, len;
+
+	if (!(mode & FMODE_WRITE))
+		return -EBADF;
+
+	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
+		return -EFAULT;
+
+	start = range[0];
+	len = range[1];
+
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
@@ -275,6 +296,96 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
  */
 EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
 
+static int blkdev_pr_register(struct block_device *bdev,
+		struct pr_registration __user *arg)
+{
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	struct pr_registration reg;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!ops || !ops->pr_register)
+		return -EOPNOTSUPP;
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+
+	if (reg.flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+	return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags);
+}
+
+static int blkdev_pr_reserve(struct block_device *bdev,
+		struct pr_reservation __user *arg)
+{
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	struct pr_reservation rsv;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!ops || !ops->pr_reserve)
+		return -EOPNOTSUPP;
+	if (copy_from_user(&rsv, arg, sizeof(rsv)))
+		return -EFAULT;
+
+	if (rsv.flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+	return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags);
+}
+
+static int blkdev_pr_release(struct block_device *bdev,
+		struct pr_reservation __user *arg)
+{
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	struct pr_reservation rsv;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!ops || !ops->pr_release)
+		return -EOPNOTSUPP;
+	if (copy_from_user(&rsv, arg, sizeof(rsv)))
+		return -EFAULT;
+
+	if (rsv.flags)
+		return -EOPNOTSUPP;
+	return ops->pr_release(bdev, rsv.key, rsv.type);
+}
+
+static int blkdev_pr_preempt(struct block_device *bdev,
+		struct pr_preempt __user *arg, bool abort)
+{
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	struct pr_preempt p;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!ops || !ops->pr_preempt)
+		return -EOPNOTSUPP;
+	if (copy_from_user(&p, arg, sizeof(p)))
+		return -EFAULT;
+
+	if (p.flags)
+		return -EOPNOTSUPP;
+	return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort);
+}
+
+static int blkdev_pr_clear(struct block_device *bdev,
+		struct pr_clear __user *arg)
+{
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	struct pr_clear c;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!ops || !ops->pr_clear)
+		return -EOPNOTSUPP;
+	if (copy_from_user(&c, arg, sizeof(c)))
+		return -EFAULT;
+
+	if (c.flags)
+		return -EOPNOTSUPP;
+	return ops->pr_clear(bdev, c.key);
+}
+
 /*
  * Is it an unrecognized ioctl? The correct returns are either
  * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
@@ -295,89 +406,115 @@ static inline int is_unrecognized_ioctl(int ret)
 		ret == -ENOIOCTLCMD;
 }
 
-/*
- * always keep this in sync with compat_blkdev_ioctl()
- */
-int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
-			unsigned long arg)
+static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
+		unsigned cmd, unsigned long arg)
 {
-	struct gendisk *disk = bdev->bd_disk;
-	struct backing_dev_info *bdi;
-	loff_t size;
-	int ret, n;
-	unsigned int max_sectors;
+	int ret;
 
-	switch(cmd) {
-	case BLKFLSBUF:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-
-		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
-		if (!is_unrecognized_ioctl(ret))
-			return ret;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
 
-		fsync_bdev(bdev);
-		invalidate_bdev(bdev);
-		return 0;
+	ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+	if (!is_unrecognized_ioctl(ret))
+		return ret;
 
-	case BLKROSET:
-		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
-		if (!is_unrecognized_ioctl(ret))
-			return ret;
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		if (get_user(n, (int __user *)(arg)))
-			return -EFAULT;
-		set_device_ro(bdev, n);
-		return 0;
+	fsync_bdev(bdev);
+	invalidate_bdev(bdev);
+	return 0;
+}
 
-	case BLKDISCARD:
-	case BLKSECDISCARD: {
-		uint64_t range[2];
+static int blkdev_roset(struct block_device *bdev, fmode_t mode,
+		unsigned cmd, unsigned long arg)
+{
+	int ret, n;
 
-		if (!(mode & FMODE_WRITE))
-			return -EBADF;
+	ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+	if (!is_unrecognized_ioctl(ret))
+		return ret;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (get_user(n, (int __user *)arg))
+		return -EFAULT;
+	set_device_ro(bdev, n);
+	return 0;
+}
 
-		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
-			return -EFAULT;
+static int blkdev_getgeo(struct block_device *bdev,
+		struct hd_geometry __user *argp)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct hd_geometry geo;
+	int ret;
 
-		return blk_ioctl_discard(bdev, range[0], range[1],
-					 cmd == BLKSECDISCARD);
-	}
-	case BLKZEROOUT: {
-		uint64_t range[2];
+	if (!argp)
+		return -EINVAL;
+	if (!disk->fops->getgeo)
+		return -ENOTTY;
+
+	/*
+	 * We need to set the startsect first, the driver may
+	 * want to override it.
+	 */
+	memset(&geo, 0, sizeof(geo));
+	geo.start = get_start_sect(bdev);
+	ret = disk->fops->getgeo(bdev, &geo);
+	if (ret)
+		return ret;
+	if (copy_to_user(argp, &geo, sizeof(geo)))
+		return -EFAULT;
+	return 0;
+}
 
-		if (!(mode & FMODE_WRITE))
-			return -EBADF;
+/* set the logical block size */
+static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
+		int __user *argp)
+{
+	int ret, n;
 
-		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
-			return -EFAULT;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (!argp)
+		return -EINVAL;
+	if (get_user(n, argp))
+		return -EFAULT;
 
-		return blk_ioctl_zeroout(bdev, range[0], range[1]);
+	if (!(mode & FMODE_EXCL)) {
+		bdgrab(bdev);
+		if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
+			return -EBUSY;
 	}
 
-	case HDIO_GETGEO: {
-		struct hd_geometry geo;
+	ret = set_blocksize(bdev, n);
+	if (!(mode & FMODE_EXCL))
+		blkdev_put(bdev, mode | FMODE_EXCL);
+	return ret;
+}
 
-		if (!arg)
-			return -EINVAL;
-		if (!disk->fops->getgeo)
-			return -ENOTTY;
-
-		/*
-		 * We need to set the startsect first, the driver may
-		 * want to override it.
-		 */
-		memset(&geo, 0, sizeof(geo));
-		geo.start = get_start_sect(bdev);
-		ret = disk->fops->getgeo(bdev, &geo);
-		if (ret)
-			return ret;
-		if (copy_to_user((struct hd_geometry __user *)arg, &geo,
-					sizeof(geo)))
-			return -EFAULT;
-		return 0;
-	}
+/*
+ * always keep this in sync with compat_blkdev_ioctl()
+ */
+int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
+			unsigned long arg)
+{
+	struct backing_dev_info *bdi;
+	void __user *argp = (void __user *)arg;
+	loff_t size;
+	unsigned int max_sectors;
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		return blkdev_flushbuf(bdev, mode, cmd, arg);
+	case BLKROSET:
+		return blkdev_roset(bdev, mode, cmd, arg);
+	case BLKDISCARD:
+		return blk_ioctl_discard(bdev, mode, arg, 0);
+	case BLKSECDISCARD:
+		return blk_ioctl_discard(bdev, mode, arg,
+				BLKDEV_DISCARD_SECURE);
+	case BLKZEROOUT:
+		return blk_ioctl_zeroout(bdev, mode, arg);
+	case HDIO_GETGEO:
+		return blkdev_getgeo(bdev, argp);
 	case BLKRAGET:
 	case BLKFRAGET:
 		if (!arg)
@@ -414,28 +551,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
 		return 0;
 	case BLKBSZSET:
-		/* set the logical block size */
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		if (!arg)
-			return -EINVAL;
-		if (get_user(n, (int __user *) arg))
-			return -EFAULT;
-		if (!(mode & FMODE_EXCL)) {
-			bdgrab(bdev);
-			if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
-				return -EBUSY;
-		}
-		ret = set_blocksize(bdev, n);
-		if (!(mode & FMODE_EXCL))
-			blkdev_put(bdev, mode | FMODE_EXCL);
-		return ret;
+		return blkdev_bszset(bdev, mode, argp);
 	case BLKPG:
-		ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
-		break;
+		return blkpg_ioctl(bdev, argp);
 	case BLKRRPART:
-		ret = blkdev_reread_part(bdev);
-		break;
+		return blkdev_reread_part(bdev);
 	case BLKGETSIZE:
 		size = i_size_read(bdev->bd_inode);
 		if ((size >> 9) > ~0UL)
@@ -447,11 +567,21 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKTRACESTOP:
 	case BLKTRACESETUP:
 	case BLKTRACETEARDOWN:
-		ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg);
-		break;
+		return blk_trace_ioctl(bdev, cmd, argp);
+	case IOC_PR_REGISTER:
+		return blkdev_pr_register(bdev, argp);
+	case IOC_PR_RESERVE:
+		return blkdev_pr_reserve(bdev, argp);
+	case IOC_PR_RELEASE:
+		return blkdev_pr_release(bdev, argp);
+	case IOC_PR_PREEMPT:
+		return blkdev_pr_preempt(bdev, argp, false);
+	case IOC_PR_PREEMPT_ABORT:
+		return blkdev_pr_preempt(bdev, argp, true);
+	case IOC_PR_CLEAR:
+		return blkdev_pr_clear(bdev, argp);
 	default:
-		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+		return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 	}
-	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_ioctl);
diff --git a/block/ioprio.c b/block/ioprio.c
index 31666c92b..cc7800e9e 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 				break;
 
 			do_each_thread(g, p) {
-				if (!uid_eq(task_uid(p), uid))
+				if (!uid_eq(task_uid(p), uid) ||
+				    !task_pid_vnr(p))
 					continue;
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
@@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 				break;
 
 			do_each_thread(g, p) {
-				if (!uid_eq(task_uid(p), user->uid))
+				if (!uid_eq(task_uid(p), user->uid) ||
+				    !task_pid_vnr(p))
 					continue;
 				tmpio = get_task_ioprio(p);
 				if (tmpio < 0)
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 3de89d469..a163c487c 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -21,10 +21,10 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq,
 static int noop_dispatch(struct request_queue *q, int force)
 {
 	struct noop_data *nd = q->elevator->elevator_data;
+	struct request *rq;
 
-	if (!list_empty(&nd->queue)) {
-		struct request *rq;
-		rq = list_entry(nd->queue.next, struct request, queuelist);
+	rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
+	if (rq) {
 		list_del_init(&rq->queuelist);
 		elv_dispatch_sort(q, rq);
 		return 1;
@@ -46,7 +46,7 @@ noop_former_request(struct request_queue *q, struct request *rq)
 
 	if (rq->queuelist.prev == &nd->queue)
 		return NULL;
-	return list_entry(rq->queuelist.prev, struct request, queuelist);
+	return list_prev_entry(rq, queuelist);
 }
 
 static struct request *
@@ -56,7 +56,7 @@ noop_latter_request(struct request_queue *q, struct request *rq)
 
 	if (rq->queuelist.next == &nd->queue)
 		return NULL;
-	return list_entry(rq->queuelist.next, struct request, queuelist);
+	return list_next_entry(rq, queuelist);
 }
 
 static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
diff --git a/block/partition-generic.c b/block/partition-generic.c
index e77111332..746935a59 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -397,7 +397,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
 	struct hd_struct *part;
 	int res;
 
-	if (bdev->bd_part_count)
+	if (bdev->bd_part_count || bdev->bd_super)
 		return -EBUSY;
 	res = invalidate_partition(disk, 0);
 	if (res)
@@ -428,6 +428,7 @@ rescan:
 
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
+	blk_integrity_revalidate(disk);
 	check_disk_size_change(disk, bdev);
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index c2c48ec64..621317ac4 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -32,7 +32,7 @@ int mac_partition(struct parsed_partitions *state)
 	Sector sect;
 	unsigned char *data;
 	int slot, blocks_in_map;
-	unsigned secsize;
+	unsigned secsize, datasize, partoffset;
 #ifdef CONFIG_PPC_PMAC
 	int found_root = 0;
 	int found_root_goodness = 0;
@@ -50,10 +50,14 @@ int mac_partition(struct parsed_partitions *state)
 	}
 	secsize = be16_to_cpu(md->block_size);
 	put_dev_sector(sect);
-	data = read_part_sector(state, secsize/512, &sect);
+	datasize = round_down(secsize, 512);
+	data = read_part_sector(state, datasize / 512, &sect);
 	if (!data)
 		return -1;
-	part = (struct mac_partition *) (data + secsize%512);
+	partoffset = secsize % 512;
+	if (partoffset + sizeof(*part) > datasize)
+		return -1;
+	part = (struct mac_partition *) (data + partoffset);
 	if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
 		put_dev_sector(sect);
 		return 0;		/* not a MacOS disk */
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index dda653ce7..077479994 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	}
 
-	rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
+	rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto error_free_buffer;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		break;
 	}
 
-	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
+	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) {
 		err = DRIVER_ERROR << 24;
 		goto error;
 	}
@@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	struct request *rq;
 	int err;
 
-	rq = blk_get_request(q, WRITE, __GFP_WAIT);
+	rq = blk_get_request(q, WRITE, __GFP_RECLAIM);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	blk_rq_set_block_pc(rq);
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 24d6e9715..2c9791233 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -160,38 +160,30 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
 	return t10_pi_verify(iter, t10_pi_ip_fn, 3);
 }
 
-struct blk_integrity t10_pi_type1_crc = {
+struct blk_integrity_profile t10_pi_type1_crc = {
 	.name			= "T10-DIF-TYPE1-CRC",
 	.generate_fn		= t10_pi_type1_generate_crc,
 	.verify_fn		= t10_pi_type1_verify_crc,
-	.tuple_size		= sizeof(struct t10_pi_tuple),
-	.tag_size		= 0,
 };
 EXPORT_SYMBOL(t10_pi_type1_crc);
 
-struct blk_integrity t10_pi_type1_ip = {
+struct blk_integrity_profile t10_pi_type1_ip = {
 	.name			= "T10-DIF-TYPE1-IP",
 	.generate_fn		= t10_pi_type1_generate_ip,
 	.verify_fn		= t10_pi_type1_verify_ip,
-	.tuple_size		= sizeof(struct t10_pi_tuple),
-	.tag_size		= 0,
 };
 EXPORT_SYMBOL(t10_pi_type1_ip);
 
-struct blk_integrity t10_pi_type3_crc = {
+struct blk_integrity_profile t10_pi_type3_crc = {
 	.name			= "T10-DIF-TYPE3-CRC",
 	.generate_fn		= t10_pi_type3_generate_crc,
 	.verify_fn		= t10_pi_type3_verify_crc,
-	.tuple_size		= sizeof(struct t10_pi_tuple),
-	.tag_size		= 0,
 };
 EXPORT_SYMBOL(t10_pi_type3_crc);
 
-struct blk_integrity t10_pi_type3_ip = {
+struct blk_integrity_profile t10_pi_type3_ip = {
 	.name			= "T10-DIF-TYPE3-IP",
 	.generate_fn		= t10_pi_type3_generate_ip,
 	.verify_fn		= t10_pi_type3_verify_ip,
-	.tuple_size		= sizeof(struct t10_pi_tuple),
-	.tag_size		= 0,
 };
 EXPORT_SYMBOL(t10_pi_type3_ip);