Diffstat (limited to 'block')
-rw-r--r--  block/Makefile        |   2
-rw-r--r--  block/bfq.h           |   2
-rw-r--r--  block/bio-integrity.c |   4
-rw-r--r--  block/bio.c           | 114
-rw-r--r--  block/blk-cgroup.c    | 279
-rw-r--r--  block/blk-cgroup.h    | 603
-rw-r--r--  block/blk-core.c      | 119
-rw-r--r--  block/blk-exec.c      |  10
-rw-r--r--  block/blk-integrity.c |   1
-rw-r--r--  block/blk-merge.c     |   3
-rw-r--r--  block/blk-mq-cpumap.c |   2
-rw-r--r--  block/blk-mq-tag.c    |  38
-rw-r--r--  block/blk-mq-tag.h    |   1
-rw-r--r--  block/blk-mq.c        | 160
-rw-r--r--  block/blk-settings.c  |   4
-rw-r--r--  block/blk-sysfs.c     |   3
-rw-r--r--  block/blk-throttle.c  |   2
-rw-r--r--  block/blk.h           |   5
-rw-r--r--  block/bounce.c        |   4
-rw-r--r--  block/cfq-iosched.c   |  127
-rw-r--r--  block/elevator.c      |   4
-rw-r--r--  block/genhd.c         |  82
-rw-r--r--  block/ioctl.c         |  37
-rw-r--r--  block/scsi_ioctl.c    |   4
-rw-r--r--  block/uuid.c          | 509
25 files changed, 627 insertions(+), 1492 deletions(-)
diff --git a/block/Makefile b/block/Makefile index 086be5007..1ed86d58f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - uuid.o genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ partitions/ obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/block/bfq.h b/block/bfq.h index 0ea164d41..96ffbf773 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v7r8 for 4.0.0: data structures and common functions prototypes. + * BFQ-v7r8 for 4.1.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 39ce74d10..719b7152a 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio, error); } /** @@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error) */ if (error) { bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio, error); return; } diff --git a/block/bio.c b/block/bio.c index 4441522ca..d6e5ba339 100644 --- a/block/bio.c +++ b/block/bio.c @@ -270,8 +270,8 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_remaining, 1); - atomic_set(&bio->bi_cnt, 1); + atomic_set(&bio->__bi_remaining, 1); + atomic_set(&bio->__bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -292,8 +292,8 @@ void bio_reset(struct bio *bio) __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); - atomic_set(&bio->bi_remaining, 1); + bio->bi_flags = flags | (1 << BIO_UPTODATE); + atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error) bio_put(bio); } +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /** * bio_chain - chain bio completions * @bio: the target bio @@ -320,7 +331,7 @@ void bio_chain(struct bio *bio, struct bio *parent) bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; - atomic_inc(&parent->bi_remaining); + bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); @@ -524,13 +535,17 @@ EXPORT_SYMBOL(zero_fill_bio); **/ void bio_put(struct bio *bio) { - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) + if (!bio_flagged(bio, BIO_REFFED)) bio_free(bio); + else { + BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } } EXPORT_SYMBOL(bio_put); @@ -1741,6 +1756,25 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. 
+ */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) { + clear_bit(BIO_CHAIN, &bio->bi_flags); + return true; + } + + return false; +} + /** * bio_endio - end I/O on a bio * @bio: bio @@ -1758,15 +1792,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); void bio_endio(struct bio *bio, int error) { while (bio) { - BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - if (!atomic_dec_and_test(&bio->bi_remaining)) - return; + if (unlikely(!bio_remaining_done(bio))) + break; /* * Need to have a real endio function for chained bios, @@ -1790,21 +1822,6 @@ void bio_endio(struct bio *bio, int error) EXPORT_SYMBOL(bio_endio); /** - * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining - * @bio: bio - * @error: error, if any - * - * For code that has saved and restored bi_end_io; thing hard before using this - * function, probably you should've cloned the entire bio. - **/ -void bio_endio_nodec(struct bio *bio, int error) -{ - atomic_inc(&bio->bi_remaining); - bio_endio(bio, error); -} -EXPORT_SYMBOL(bio_endio_nodec); - -/** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio @@ -1980,6 +1997,29 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_ EXPORT_SYMBOL(bioset_create_nobvec); #ifdef CONFIG_BLK_CGROUP + +/** + * bio_associate_blkcg - associate a bio with the specified blkcg + * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. + * + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. 
+ */ +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) +{ + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; +} +EXPORT_SYMBOL_GPL(bio_associate_blkcg); + /** * bio_associate_current - associate a bio with %current * @bio: target bio @@ -1996,28 +2036,20 @@ EXPORT_SYMBOL(bioset_create_nobvec); int bio_associate_current(struct bio *bio) { struct io_context *ioc; - struct cgroup_subsys_state *css; - if (bio->bi_ioc) + if (bio->bi_css) return -EBUSY; ioc = current->io_context; if (!ioc) return -ENOENT; - /* acquire active ref on @ioc and associate */ get_io_context_active(ioc); bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_css(current, blkio_cgrp_id); - if (css && css_tryget_online(css)) - bio->bi_css = css; - rcu_read_unlock(); - + bio->bi_css = task_get_css(current, blkio_cgrp_id); return 0; } +EXPORT_SYMBOL_GPL(bio_associate_current); /** * bio_disassociate_task - undo bio_associate_current() diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6817e2896..d6283b3f5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -9,29 +9,45 @@ * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> + * + * For policy-specific per-blkcg data: + * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> + * Arianna Avanzini <avanzini.arianna@gmail.com> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include "blk.h" #define MAX_KEY_LEN 100 +/* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. + * blkcg_pol_register_mutex nests outside of it and synchronizes entire + * policy [un]register operations including cgroup file additions / + * removals. Putting cgroup file registration outside blkcg_pol_mutex + * allows grabbing it from cgroup callbacks. 
+ */ +static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, - .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); +struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; + static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -179,6 +195,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; + struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -190,22 +207,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } + wb_congested = wb_congested_get_create(&q->backing_dev_info, + blkcg->css.id, GFP_ATOMIC); + if (!wb_congested) { + ret = -ENOMEM; + goto err_put_css; + } + /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_put_congested; } } blkg = new_blkg; + blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -EINVAL; - goto err_put_css; + goto err_put_congested; } blkg_get(blkg->parent); } @@ -235,18 +260,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg->online = true; spin_unlock(&blkcg->lock); - if (!ret) { - if (blkcg == &blkcg_root) { - q->root_blkg = blkg; - q->root_rl.blkg = blkg; - } + if (!ret) return blkg; - } /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); +err_put_congested: + wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -340,15 +362,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* - * If root blkg is destroyed. Just clear the pointer since root_rl - * does not take reference on root blkg. - */ - if (blkcg == &blkcg_root) { - blkg->q->root_blkg = NULL; - blkg->q->root_rl.blkg = NULL; - } - - /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ @@ -402,6 +415,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head) if (blkg->parent) blkg_put(blkg->parent); + wb_congested_put(blkg->wb_congested); + blkg_free(blkg); } EXPORT_SYMBOL_GPL(__blkg_release_rcu); @@ -448,20 +463,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkcg_gq *blkg; int i; - /* - * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex - * which ends up putting cgroup's internal cgroup_tree_mutex under - * it; however, cgroup_tree_mutex is nested above cgroup file - * active protection and grabbing blkcg_pol_mutex from a cgroup - * file operation creates a possible circular dependency. cgroup - * internal locking is planned to go through further simplification - * and this issue should go away soon. For now, let's trylock - * blkcg_pol_mutex and restart the write on failure. 
- * - * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com - */ - if (!mutex_trylock(&blkcg_pol_mutex)) - return restart_syscall(); + mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* @@ -813,20 +815,35 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) } spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); - if (blkcg != &blkcg_root) + mutex_lock(&blkcg_pol_mutex); + list_del(&blkcg->all_blkcgs_node); + mutex_unlock(&blkcg_pol_mutex); + + if (blkcg != &blkcg_root) { + int i; + + for (i = 0; i < BLKCG_MAX_POLS; i++) + kfree(blkcg->pd[i]); kfree(blkcg); + } } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; + struct cgroup_subsys_state *ret; + int i; + + mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; @@ -834,17 +851,54 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) } blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) - return ERR_PTR(-ENOMEM); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } + + for (i = 0; i < BLKCG_MAX_POLS ; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg_policy_data *cpd; + + /* + * If the policy hasn't been attached yet, wait for it + * to be attached before doing anything else. Otherwise, + * check if the policy requires any specific per-cgroup + * data: if it does, allocate and initialize it. + */ + if (!pol || !pol->cpd_size) + continue; + + BUG_ON(blkcg->pd[i]); + cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + if (!cpd) { + ret = ERR_PTR(-ENOMEM); + goto free_pd_blkcg; + } + blkcg->pd[i] = cpd; + cpd->plid = i; + pol->cpd_init_fn(blkcg); + } - blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; - blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&blkcg->blkg_list); +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif + list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); + mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + kfree(blkcg->pd[i]); +free_blkcg: + kfree(blkcg); + mutex_unlock(&blkcg_pol_mutex); + return ret; } /** @@ -859,9 +913,45 @@ done: */ int blkcg_init_queue(struct request_queue *q) { - might_sleep(); + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; + int ret; + + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); + + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code insertion. 
+ */ + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_create(&blkcg_root, q, new_blkg); + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + if (preloaded) + radix_tree_preload_end(); - return blk_throtl_init(q); + if (IS_ERR(blkg)) { + kfree(new_blkg); + return PTR_ERR(blkg); + } + + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + + ret = blk_throtl_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + } + return ret; } /** @@ -962,52 +1052,21 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { LIST_HEAD(pds); - struct blkcg_gq *blkg, *new_blkg; - struct blkg_policy_data *pd, *n; + struct blkcg_gq *blkg; + struct blkg_policy_data *pd, *nd; int cnt = 0, ret; - bool preloaded; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - + /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. - */ spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; - spin_unlock_irq(q->queue_lock); - /* allocate policy_data for all existing blkgs */ + /* allocate per-blkg policy data for all existing blkgs */ while (cnt--) { pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); if (!pd) { @@ -1018,7 +1077,7 @@ int blkcg_activate_policy(struct request_queue *q, } /* - * Install the allocated pds. With @q bypassing, no new blkg + * Install the allocated pds and cpds. With @q bypassing, no new blkg * should have been created while the queue lock was dropped. 
*/ spin_lock_irq(q->queue_lock); @@ -1049,7 +1108,7 @@ out_unlock: spin_unlock_irq(q->queue_lock); out_free: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, n, &pds, alloc_node) + list_for_each_entry_safe(pd, nd, &pds, alloc_node) kfree(pd); return ret; } @@ -1076,10 +1135,6 @@ void blkcg_deactivate_policy(struct request_queue *q, __clear_bit(pol->plid, q->blkcg_pols); - /* if no policy is left, no need for blkgs - shoot them down */ - if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) - blkg_destroy_all(q); - list_for_each_entry(blkg, &q->blkg_list, q_node) { /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); @@ -1109,11 +1164,13 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); */ int blkcg_policy_register(struct blkcg_policy *pol) { + struct blkcg *blkcg; int i, ret; if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) return -EINVAL; + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ @@ -1122,19 +1179,49 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) - goto out_unlock; + goto err_unlock; - /* register and update blkgs */ + /* register @pol */ pol->plid = i; - blkcg_policy[i] = pol; + blkcg_policy[pol->plid] = pol; + + /* allocate and install cpd's */ + if (pol->cpd_size) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + struct blkcg_policy_data *cpd; + + cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + if (!cpd) { + mutex_unlock(&blkcg_pol_mutex); + goto err_free_cpds; + } + + blkcg->pd[pol->plid] = cpd; + cpd->plid = pol->plid; + pol->cpd_init_fn(blkcg); + } + } + + mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->cftypes) WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, pol->cftypes)); - ret = 0; -out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); + return 0; + +err_free_cpds: + if (pol->cpd_size) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + kfree(blkcg->pd[pol->plid]); + blkcg->pd[pol->plid] = NULL; + } + } + blkcg_policy[pol->plid] = NULL; +err_unlock: mutex_unlock(&blkcg_pol_mutex); + mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); @@ -1147,7 +1234,9 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy *pol) { - mutex_lock(&blkcg_pol_mutex); + struct blkcg *blkcg; + + mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; @@ -1156,9 +1245,19 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) if (pol->cftypes) cgroup_rm_cftypes(pol->cftypes); - /* unregister and update blkgs */ + /* remove cpds and unregister */ + mutex_lock(&blkcg_pol_mutex); + + if (pol->cpd_size) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + kfree(blkcg->pd[pol->plid]); + blkcg->pd[pol->plid] = NULL; + } + } blkcg_policy[pol->plid] = NULL; -out_unlock: + mutex_unlock(&blkcg_pol_mutex); +out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h deleted file mode 100644 index c567865b5..000000000 --- a/block/blk-cgroup.h +++ /dev/null @@ -1,603 +0,0 @@ -#ifndef _BLK_CGROUP_H -#define _BLK_CGROUP_H -/* - * Common Block IO controller cgroup interface - * - * Based on ideas and code from CFQ, CFS and BFQ: - * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> - * - * Copyright (C) 2008 Fabio Checconi 
<fabio@gandalf.sssup.it> - * Paolo Valente <paolo.valente@unimore.it> - * - * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> - * Nauman Rafique <nauman@google.com> - */ - -#include <linux/cgroup.h> -#include <linux/u64_stats_sync.h> -#include <linux/seq_file.h> -#include <linux/radix-tree.h> -#include <linux/blkdev.h> -#include <linux/atomic.h> - -/* Max limits for throttle policy */ -#define THROTL_IOPS_MAX UINT_MAX - -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_BLK_CGROUP - -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - -struct blkcg_gq; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - - struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; - struct hlist_head blkg_list; - - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ - unsigned int cfq_leaf_weight; -}; - -struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; -}; - -struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; -}; - -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; - - /* used during policy activation */ - struct list_head alloc_node; -}; - -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - - /* all non-root blkcg_gq's are guaranteed to have access to parent */ - struct blkcg_gq *parent; - - /* request allocation list for this blkcg-q pair */ - struct request_list rl; - - /* reference count */ - atomic_t refcnt; - - /* is this blkg online? 
protected by both blkcg and q locks */ - bool online; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - struct rcu_head rcu_head; -}; - -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); - -struct blkcg_policy { - int plid; - /* policy specific private data size */ - size_t pd_size; - /* cgroup files for the policy */ - struct cftype *cftypes; - - /* operations */ - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; -}; - -extern struct blkcg blkcg_root; - -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); -int blkcg_init_queue(struct request_queue *q); -void blkcg_drain_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat); -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); - -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); - -struct blkg_conf_ctx { - struct gendisk *disk; - struct blkcg_gq *blkg; - u64 v; -}; - -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); - - -static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct blkcg, css) : NULL; -} - -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); -} - -static inline struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); - return task_blkcg(current); -} - -/** - * blkcg_parent - get the parent of a blkcg - * @blkcg: blkcg of interest - * - * Return the parent blkcg of @blkcg. Can be called anytime. - */ -static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) -{ - return css_to_blkcg(blkcg->css.parent); -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? 
blkg->pd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - char *p; - - p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (!p) { - strncpy(buf, "<unavailable>", buflen); - return -ENAMETOOLONG; - } - - memmove(buf, p, buf + buflen - p); - return 0; -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - atomic_inc(&blkg->refcnt); -} - -void __blkg_release_rcu(struct rcu_head *rcu); - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - if (atomic_dec_and_test(&blkg->refcnt)) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); -} - -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. - */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blk_get_rl - get request_list to use - * @q: request_queue of interest - * @bio: bio which will be attached to the allocated request (may be %NULL) - * - * The caller wants to allocate a request from @q to use for @bio. Find - * the request_list to use and obtain a reference on it. Should be called - * under queue_lock. This function is guaranteed to return non-%NULL - * request_list. 
- */ -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) -{ - struct blkcg *blkcg; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - - /* bypass blkg lookup and use @q->root_rl directly for root */ - if (blkcg == &blkcg_root) - goto root_rl; - - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) - goto root_rl; - - blkg_get(blkg); - rcu_read_unlock(); - return &blkg->rl; -root_rl: - rcu_read_unlock(); - return &q->root_rl; -} - -/** - * blk_put_rl - put request_list - * @rl: request_list to put - * - * Put the reference acquired by blk_get_rl(). Should be called under - * queue_lock. - */ -static inline void blk_put_rl(struct request_list *rl) -{ - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) - blkg_put(rl->blkg); -} - -/** - * blk_rq_set_rl - associate a request with a request_list - * @rq: request of interest - * @rl: target request_list - * - * Associate @rq with @rl so that accounting and freeing can know the - * request_list @rq came from. - */ -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) -{ - rq->rl = rl; -} - -/** - * blk_rq_rl - return the request_list a request came from - * @rq: request of interest - * - * Return the request_list @rq is allocated from. - */ -static inline struct request_list *blk_rq_rl(struct request *rq) -{ - return rq->rl; -} - -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q); -/** - * blk_queue_for_each_rl - iterate through all request_lists of a request_queue - * - * Should be used under queue_lock. - */ -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) - -static inline void blkg_stat_init(struct blkg_stat *stat) -{ - u64_stats_init(&stat->syncp); -} - -/** - * blkg_stat_add - add a value to a blkg_stat - * @stat: target blkg_stat - * @val: value to add - * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. - */ -static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) -{ - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); -} - -/** - * blkg_stat_read - read the current value of a blkg_stat - * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. - */ -static inline uint64_t blkg_stat_read(struct blkg_stat *stat) -{ - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; -} - -/** - * blkg_stat_reset - reset a blkg_stat - * @stat: blkg_stat to reset - */ -static inline void blkg_stat_reset(struct blkg_stat *stat) -{ - stat->cnt = 0; -} - -/** - * blkg_stat_merge - merge a blkg_stat into another - * @to: the destination blkg_stat - * @from: the source - * - * Add @from's count to @to. 
- */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) -{ - blkg_stat_add(to, blkg_stat_read(from)); -} - -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) -{ - u64_stats_init(&rwstat->syncp); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @rw: mask of REQ_{WRITE|SYNC} - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int rw, uint64_t val) -{ - u64_stats_update_begin(&rwstat->syncp); - - if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; - else - rwstat->cnt[BLKG_RWSTAT_READ] += val; - if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; - else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; - - u64_stats_update_end(&rwstat->syncp); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. - */ -static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) -{ - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); - - return tmp; -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); -} - -/** - * blkg_rwstat_merge - merge a blkg_rwstat into another - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's counts to @to. 
- */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - struct blkg_rwstat v = blkg_rwstat_read(from); - int i; - - u64_stats_update_begin(&to->syncp); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); -} - -#else /* CONFIG_BLK_CGROUP */ - -struct cgroup; -struct blkcg; - -struct blkg_policy_data { -}; - -struct blkcg_gq { -}; - -struct blkcg_policy { -}; - -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_drain_queue(struct request_queue *q) { } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { } - -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } - -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) { return &q->root_rl; } -static inline void blk_put_rl(struct request_list *rl) { } -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } -static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLK_CGROUP */ -#endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 4c6e9ced9..627ed0c59 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -32,12 +32,12 @@ #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); @@ -48,8 +48,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); -int trap_non_toi_io; - /* * For the allocated request tables */ @@ -65,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static void blk_clear_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + clear_wb_congested(rl->blkg->wb_congested, sync); +#else + /* + * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't + * flip its congestion state for events on other blkcgs. 
+ */ + if (rl == &rl->q->root_rl) + clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + +static void blk_set_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + set_wb_congested(rl->blkg->wb_congested, sync); +#else + /* see blk_clear_congested() */ + if (rl == &rl->q->root_rl) + set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -287,6 +310,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q) q->request_fn(q); q->request_fn_active--; } +EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue @@ -623,8 +647,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = 0; + q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; @@ -847,13 +870,8 @@ static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; - /* - * bdi isn't aware of blkcg yet. As all async IOs end up root - * blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl && - rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, sync); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_congested(rl, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) @@ -886,25 +904,25 @@ static void freed_request(struct request_list *rl, unsigned int flags) int blk_update_nr_requests(struct request_queue *q, unsigned int nr) { struct request_list *rl; + int on_thresh, off_thresh; spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); + on_thresh = queue_congestion_on_threshold(q); + off_thresh = queue_congestion_off_threshold(q); - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_SYNC); - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); + if (rl->count[BLK_RW_ASYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_ASYNC); - blk_queue_for_each_rl(rl, q) { if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_SYNC); } else { @@ -1014,12 +1032,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, } } } - /* - * bdi isn't aware of blkcg yet. As all async IOs end up - * root blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl) - blk_set_queue_congested(q, is_sync); + blk_set_congested(rl, is_sync); } /* @@ -1527,7 +1540,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * Caller must ensure !blk_queue_nomerges(q) beforehand. 
*/ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) + unsigned int *request_count, + struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; @@ -1547,8 +1561,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; - if (rq->q == q) + if (rq->q == q) { (*request_count)++; + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + if (same_queue_rq) + *same_queue_rq = rq; + } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; @@ -1613,7 +1635,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) * any locks. */ if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; spin_lock_irq(q->queue_lock); @@ -1720,8 +1742,6 @@ static void handle_bad_sector(struct bio *bio) bio->bi_rw, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); - - set_bit(BIO_EOF, &bio->bi_flags); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -1968,9 +1988,6 @@ void submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; - if (unlikely(trap_non_toi_io)) - BUG_ON(!(bio->bi_flags & BIO_TOI)); - /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. @@ -3039,21 +3056,20 @@ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); - /* - * If this is a nested plug, don't actually assign it. It will be - * flushed on its own. + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier */ - if (!tsk->plug) { - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; - } + tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); @@ -3200,10 +3216,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { + if (plug != current->plug) + return; blk_flush_plug_list(plug, false); - if (plug == current->plug) - current->plug = NULL; + current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); @@ -3353,7 +3370,7 @@ EXPORT_SYMBOL(blk_post_runtime_resume); int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * - sizeof(((struct request *)0)->cmd_flags)); + FIELD_SIZEOF(struct request, cmd_flags)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", diff --git a/block/blk-exec.c b/block/blk-exec.c index 9924725fa..3fec8a29d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - bool is_pm_resume; WARN_ON(irqs_disabled()); WARN_ON(rq->cmd_type == REQ_TYPE_FS); @@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, return; } - /* - * need to check this before __blk_run_queue(), because rq can - * be freed before that returns. 
- */ - is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; - spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { @@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be run */ - if (is_pm_resume) - __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 79ffb4855..f548b64be 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -21,6 +21,7 @@ */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> diff --git a/block/blk-merge.c b/block/blk-merge.c index fd3fee81c..30a0d9f89 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; - if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { + /* Only check gaps if the bio carries data */ + if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) { struct bio_vec *bprev; bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1]; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 5f13f4d0b..1e28ddb65 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu) { unsigned int ret; - ret = cpumask_first(topology_thread_cpumask(cpu)); + ret = cpumask_first(topology_sibling_cpumask(cpu)); if (ret < nr_cpu_ids) return ret; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index be3290cc0..9b6e28830 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, } } +static void bt_tags_for_each(struct blk_mq_tags *tags, + struct blk_mq_bitmap_tags *bt, unsigned int off, + busy_tag_iter_fn *fn, void *data, bool reserved) +{ + struct request *rq; + int bit, i; + + if (!tags->rqs) + return; + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + for (bit = find_first_bit(&bm->word, bm->depth); + bit < bm->depth; + bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { + rq = blk_mq_tag_to_rq(tags, off + bit); + fn(rq, data, reserved); + } + + off += (1 << bt->bits_per_word); + } +} + +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv) +{ + if (tags->nr_reserved_tags) + bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); +} +EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); + void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, void *priv) { @@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; + if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) { + kfree(tags); + return NULL; + } + tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 90767b370..75893a342 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -44,6 +44,7 @@ struct blk_mq_tags { struct list_head page_list; int alloc_policy; + cpumask_var_t cpumask; }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 2dc1fd6c5..7d842db59 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t 
gfp) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, - !q->mq_freeze_depth || blk_queue_dying(q)); + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; if (ret) @@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) void blk_mq_freeze_queue_start(struct request_queue *q) { - bool freeze; + int freeze_depth; - spin_lock_irq(q->queue_lock); - freeze = !q->mq_freeze_depth++; - spin_unlock_irq(q->queue_lock); - - if (freeze) { + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_hw_queues(q, false); } @@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake; + int freeze_depth; - spin_lock_irq(q->queue_lock); - wake = !--q->mq_freeze_depth; - WARN_ON_ONCE(q->mq_freeze_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) { + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { percpu_ref_reinit(&q->mq_usage_counter); wake_up_all(&q->mq_freeze_wq); } @@ -1237,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static int blk_mq_direct_issue_request(struct request *rq) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) + return 0; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + return -1; + } +} + /* * Multiple hardware queue variant. This will not use per-process plugs, * but will attempt to bypass the hctx queueing if we can go straight to @@ -1248,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; + struct request *same_queue_rq = NULL; blk_queue_bounce(q, &bio); @@ -1256,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + return; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) return; @@ -1266,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } + plug = current->plug; /* * If the driver supports defer issued based on 'last', then * queue it up like normal since we can potentially save some * CPU this way. */ - if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct blk_mq_queue_data bd = { - .rq = rq, - .list = NULL, - .last = 1 - }; - int ret; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done + * we do limited pluging. If bio can be merged, do merge. 
+ * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most */ - ret = q->mq_ops->queue_rq(data.hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) - goto done; - else { - __blk_mq_requeue_request(rq); - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - goto done; + if (plug) { + /* + * The plug list might get flushed before this. If that + * happens, same_queue_rq is invalid and plug list is empty + **/ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; + list_del_init(&old_rq->queuelist); } - } + list_add_tail(&rq->queuelist, &plug->mq_list); + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) + return; + if (!blk_mq_direct_issue_request(old_rq)) + return; + blk_mq_insert_request(old_rq, false, true, true); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1310,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } -done: blk_mq_put_ctx(data.ctx); } @@ -1322,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - unsigned int use_plug, request_count = 0; + struct blk_plug *plug; + unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && !is_sync; - blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { @@ -1339,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; rq = blk_mq_map_request(q, bio, &data); @@ -1358,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) * utilize that to temporarily store requests until the task is * either done or scheduled away. 
*/ - if (use_plug) { - struct blk_plug *plug = current->plug; - - if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (list_empty(&plug->mq_list)) + trace_block_plug(q); + else if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); + blk_mq_put_ctx(data.ctx); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1508,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; fail: @@ -1792,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx = q->mq_ops->map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); + cpumask_set_cpu(i, hctx->tags->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; } @@ -2056,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q) /* Basically redo blk_mq_init_queue with queue frozen */ static void blk_mq_queue_reinit(struct request_queue *q) { - WARN_ON_ONCE(!q->mq_freeze_depth); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); @@ -2173,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } +struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) +{ + return tags->cpumask; +} +EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -2234,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) int i; for (i = 0; i < set->nr_hw_queues; i++) { - if (set->tags[i]) + if (set->tags[i]) { blk_mq_free_rq_map(set, set->tags[i], i); + free_cpumask_var(set->tags[i]->cpumask); + } } kfree(set->tags); diff --git a/block/blk-settings.c b/block/blk-settings.c index 12600bfff..e0057d035 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -241,8 +241,8 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * Description: * Enables a low level driver to set a hard upper limit, * max_hw_sectors, on the size of requests. max_hw_sectors is set by - * the device driver based upon the combined capabilities of I/O - * controller and storage device. + * the device driver based upon the capabilities of the I/O + * controller. * * max_sectors is a soft limit imposed by the block layer for * filesystem type requests. 
This value can be overridden on a diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 2b8fd302f..6264b382d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -6,11 +6,12 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/blk-mq.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" struct queue_sysfs_entry { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5b9c6d5c3..b23193518 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -9,7 +9,7 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include "blk.h" /* Max dispatch from a group in 1 round */ diff --git a/block/blk.h b/block/blk.h index 43b036185..026d95941 100644 --- a/block/blk.h +++ b/block/blk.h @@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count); + unsigned int *request_count, + struct request **same_queue_rq); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); @@ -193,8 +194,6 @@ int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); -void __blk_run_queue_uncond(struct request_queue *q); - int blk_dev_init(void); diff --git a/block/bounce.c b/block/bounce.c index ed9dd8067..b17311227 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -13,6 +13,7 @@ #include <linux/pagemap.h> #include <linux/mempool.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> @@ -128,9 +129,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) struct bio_vec *bvec, *org_vec; int i; - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - /* * free up bounce indirect pages used */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5da8e6e9a..c62bb2e65 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,8 +14,8 @@ #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" /* * tunables @@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool; #define sample_valid(samples) ((samples) > 80) #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) +/* blkio-related constants */ +#define CFQ_WEIGHT_MIN 10 +#define CFQ_WEIGHT_MAX 1000 +#define CFQ_WEIGHT_DEFAULT 500 + struct cfq_ttime { unsigned long last_end_request; @@ -212,6 +217,15 @@ struct cfqg_stats { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ }; +/* Per-cgroup data */ +struct cfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + + unsigned int weight; + unsigned int leaf_weight; +}; + /* This is per cgroup per device grouping structure */ struct cfq_group { /* must be the first member */ @@ -446,16 +460,6 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) -{ - return pd ? 
container_of(pd, struct cfq_group, pd) : NULL; -} - -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) -{ - return pd_to_blkg(&cfqg->pd); -} - #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* cfqg stats flags */ @@ -600,6 +604,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct cfq_group, pd) : NULL; +} + +static struct cfq_group_data +*cpd_to_cfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; +} + +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) +{ + return pd_to_blkg(&cfqg->pd); +} + static struct blkcg_policy blkcg_policy_cfq; static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) @@ -607,6 +627,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); } +static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) +{ + return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); +} + static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; @@ -1544,13 +1569,28 @@ static void cfqg_stats_init(struct cfqg_stats *stats) #endif } +static void cfq_cpd_init(const struct blkcg *blkcg) +{ + struct cfq_group_data *cgd = + cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); + + if (blkcg == &blkcg_root) { + cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; + cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; + } else { + cgd->weight = CFQ_WEIGHT_DEFAULT; + cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; + } +} + static void cfq_pd_init(struct blkcg_gq *blkg) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); cfq_init_cfqg_base(cfqg); - cfqg->weight = blkg->blkcg->cfq_weight; - cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; + cfqg->weight = cgd->weight; + cfqg->leaf_weight = cgd->leaf_weight; cfqg_stats_init(&cfqg->stats); cfqg_stats_init(&cfqg->dead_stats); } @@ -1673,13 +1713,27 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) static int cfq_print_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->weight; + + seq_printf(sf, "%u\n", val); return 0; } static int cfq_print_leaf_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->leaf_weight; + + seq_printf(sf, "%u\n", val); return 0; } @@ -1690,6 +1744,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; + struct cfq_group_data *cfqgd; int ret; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); @@ -1698,17 +1753,22 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, ret = -EINVAL; cfqg = blkg_to_cfqg(ctx.blkg); + cfqgd = blkcg_to_cfqgd(blkcg); + if (!cfqg || !cfqgd) + goto err; + if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { cfqg->dev_weight = 
ctx.v; - cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; + cfqg->new_weight = ctx.v ?: cfqgd->weight; } else { cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; } ret = 0; } +err: blkg_conf_finish(&ctx); return ret ?: nbytes; } @@ -1730,16 +1790,23 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; + struct cfq_group_data *cfqgd; + int ret = 0; if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) return -EINVAL; spin_lock_irq(&blkcg->lock); + cfqgd = blkcg_to_cfqgd(blkcg); + if (!cfqgd) { + ret = -EINVAL; + goto out; + } if (!is_leaf_weight) - blkcg->cfq_weight = val; + cfqgd->weight = val; else - blkcg->cfq_leaf_weight = val; + cfqgd->leaf_weight = val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1749,15 +1816,16 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, if (!is_leaf_weight) { if (!cfqg->dev_weight) - cfqg->new_weight = blkcg->cfq_weight; + cfqg->new_weight = cfqgd->weight; } else { if (!cfqg->dev_leaf_weight) - cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; + cfqg->new_leaf_weight = cfqgd->leaf_weight; } } +out: spin_unlock_irq(&blkcg->lock); - return 0; + return ret; } static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, @@ -4477,6 +4545,18 @@ out_free: return ret; } +static void cfq_registered_queue(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + struct cfq_data *cfqd = e->elevator_data; + + /* + * Default to IOPS mode with no idling for SSDs + */ + if (blk_queue_nonrot(q)) + cfqd->cfq_slice_idle = 0; +} + /* * sysfs parts below --> */ @@ -4592,6 +4672,7 @@ static struct elevator_type iosched_cfq = { .elevator_may_queue_fn = cfq_may_queue, .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, + .elevator_registered_fn = cfq_registered_queue, }, .icq_size = sizeof(struct cfq_io_cq), .icq_align = __alignof__(struct cfq_io_cq), @@ -4603,8 +4684,10 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { .pd_size = sizeof(struct cfq_group), + .cpd_size = sizeof(struct cfq_group_data), .cftypes = cfq_blkcg_files, + .cpd_init_fn = cfq_cpd_init, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, .pd_reset_stats_fn = cfq_pd_reset_stats, diff --git a/block/elevator.c b/block/elevator.c index 8985038f3..84d63943f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -35,11 +35,11 @@ #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -806,6 +806,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; + if (e->type->ops.elevator_registered_fn) + e->type->ops.elevator_registered_fn(q); } return error; } diff --git a/block/genhd.c b/block/genhd.c index d2b7ebfb5..59a1395ee 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -8,6 +8,7 @@ #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> @@ -17,8 +18,6 @@ #include <linux/kobj_map.h> #include <linux/mutex.h> #include 
<linux/idr.h> -#include <linux/ctype.h> -#include <linux/fs_uuid.h> #include <linux/log2.h> #include <linux/pm_runtime.h> @@ -1385,85 +1384,6 @@ int invalidate_partition(struct gendisk *disk, int partno) EXPORT_SYMBOL(invalidate_partition); -dev_t blk_lookup_fs_info(struct fs_info *seek) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - int best_score = 0; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while (best_score < 3 && (dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - - while (best_score < 3 && (part = disk_part_iter_next(&piter))) { - int score = part_matches_fs_info(part, seek); - if (score > best_score) { - devt = part_devt(part); - best_score = score; - } - } - disk_part_iter_exit(&piter); - } - class_dev_iter_exit(&iter); - return devt; -} - -/* Caller uses NULL, key to start. For each match found, we return a bdev on - * which we have done blkdev_get, and we do the blkdev_put on block devices - * that are passed to us. When no more matches are found, we return NULL. - */ -struct block_device *next_bdev_of_type(struct block_device *last, - const char *key) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - struct block_device *next = NULL, *bdev; - int got_last = 0; - - if (!key) - goto out; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while (!devt && (dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - - while ((part = disk_part_iter_next(&piter))) { - bdev = bdget(part_devt(part)); - if (last && !got_last) { - if (last == bdev) - got_last = 1; - continue; - } - - if (blkdev_get(bdev, FMODE_READ, 0)) - continue; - - if (bdev_matches_key(bdev, key)) { - next = bdev; - break; - } - - blkdev_put(bdev, FMODE_READ); - } - disk_part_iter_exit(&piter); - } - class_dev_iter_exit(&iter); -out: - if (last) - blkdev_put(last, FMODE_READ); - return next; -} - /* * Disk events - monitor disk events like media change and eject request. */ diff --git a/block/ioctl.c b/block/ioctl.c index 7d8befde2..8061eba42 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -150,21 +150,48 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } -static int blkdev_reread_part(struct block_device *bdev) +/* + * This is an exported API for the block driver, and will not + * acquire bd_mutex. This API should be used in case that + * caller has held bd_mutex already. + */ +int __blkdev_reread_part(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - int res; if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (!mutex_trylock(&bdev->bd_mutex)) - return -EBUSY; - res = rescan_partitions(disk, bdev); + + lockdep_assert_held(&bdev->bd_mutex); + + return rescan_partitions(disk, bdev); +} +EXPORT_SYMBOL(__blkdev_reread_part); + +/* + * This is an exported API for the block driver, and will + * try to acquire bd_mutex. If bd_mutex has been held already + * in current context, please call __blkdev_reread_part(). 
+ * + * Make sure the held locks in current context aren't required + * in open()/close() handler and I/O path for avoiding ABBA deadlock: + * - bd_mutex is held before calling block driver's open/close + * handler + * - reading partition table may submit I/O to the block device + */ +int blkdev_reread_part(struct block_device *bdev) +{ + int res; + + mutex_lock(&bdev->bd_mutex); + res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); + return res; } +EXPORT_SYMBOL(blkdev_reread_part); static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len, int secure) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 55b6f15da..dda653ce7 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -326,8 +326,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_put_request; } - ret = -EFAULT; - if (blk_fill_sghdr_rq(q, rq, hdr, mode)) + ret = blk_fill_sghdr_rq(q, rq, hdr, mode); + if (ret < 0) goto out_free_cdb; ret = 0; diff --git a/block/uuid.c b/block/uuid.c deleted file mode 100644 index 722d53b63..000000000 --- a/block/uuid.c +++ /dev/null @@ -1,509 +0,0 @@ -#include <linux/blkdev.h> -#include <linux/ctype.h> -#include <linux/fs_uuid.h> -#include <linux/slab.h> -#include <linux/export.h> - -static int debug_enabled; - -#define PRINTK(fmt, args...) do { \ - if (debug_enabled) \ - printk(KERN_DEBUG fmt, ## args); \ - } while(0) - -#define PRINT_HEX_DUMP(v1, v2, v3, v4, v5, v6, v7, v8) \ - do { \ - if (debug_enabled) \ - print_hex_dump(v1, v2, v3, v4, v5, v6, v7, v8); \ - } while(0) - -/* - * Simple UUID translation - */ - -struct uuid_info { - const char *key; - const char *name; - long bkoff; - unsigned sboff; - unsigned sig_len; - const char *magic; - int uuid_offset; - int last_mount_offset; - int last_mount_size; -}; - -/* - * Based on libuuid's blkid_magic array. Note that I don't - * have uuid offsets for all of these yet - mssing ones are 0x0. - * Further information welcome. - * - * Rearranged by page of fs signature for optimisation. 
- */ -static struct uuid_info uuid_list[] = { - { NULL, "oracleasm", 0, 32, 8, "ORCLDISK", 0x0, 0, 0 }, - { "ntfs", "ntfs", 0, 3, 8, "NTFS ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x52, 5, "MSWIN", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x52, 8, "FAT32 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 5, "MSDOS", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 8, "FAT16 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 8, "FAT12 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0, 1, "\353", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0, 1, "\351", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x1fe, 2, "\125\252", 0x0, 0, 0 }, - { "xfs", "xfs", 0, 0, 4, "XFSB", 0x20, 0, 0 }, - { "romfs", "romfs", 0, 0, 8, "-rom1fs-", 0x0, 0, 0 }, - { "bfs", "bfs", 0, 0, 4, "\316\372\173\033", 0, 0, 0 }, - { "cramfs", "cramfs", 0, 0, 4, "E=\315\050", 0x0, 0, 0 }, - { "qnx4", "qnx4", 0, 4, 6, "QNX4FS", 0, 0, 0 }, - { NULL, "crypt_LUKS", 0, 0, 6, "LUKS\xba\xbe", 0x0, 0, 0 }, - { "squashfs", "squashfs", 0, 0, 4, "sqsh", 0, 0, 0 }, - { "squashfs", "squashfs", 0, 0, 4, "hsqs", 0, 0, 0 }, - { "ocfs", "ocfs", 0, 8, 9, "OracleCFS", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 0, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, - { "sysv", "sysv", 0, 0x3f8, 4, "\020~\030\375", 0, 0, 0 }, - { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 }, - { "minix", "minix", 1, 0x10, 2, "\177\023", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\217\023", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\150\044", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\170\044", 0, 0, 0 }, - { "lvm2pv", "lvm2pv", 1, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, - { "vxfs", "vxfs", 1, 0, 4, "\365\374\001\245", 0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "BD", 0x0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "H+", 0x0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "HX", 0x0, 0, 0 }, - { "hfs", "hfs", 1, 0, 2, "BD", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 1, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 0, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 1, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 2, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "swap", "swap", 0, 0xff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "ocfs2", "ocfs2", 4, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 8, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "hpfs", "hpfs", 8, 0, 4, "I\350\225\371", 0, 0, 0 }, - { "reiserfs", "reiserfs", 8, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 8, 20, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "zfs", "zfs", 8, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, - { "zfs", "zfs", 8, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, - { "ufs", "ufs", 8, 0x55c, 4, "T\031\001\000", 0, 0, 0 }, - { "swap", "swap", 0, 0x1ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x1ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr2Fs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr3Fs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "reiser4", "reiser4", 64, 0, 7, "ReIsEr4", 0x100544, 0, 0 }, - { "gfs2", "gfs2", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 
0 }, - { "gfs", "gfs", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, - { "btrfs", "btrfs", 64, 0x40, 8, "_BHRfS_M", 0x0, 0, 0 }, - { "swap", "swap", 0, 0x3ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x3ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "udf", "udf", 32, 1, 5, "BEA01", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "BOOT2", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "CD001", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "CDW02", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "NSR02", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "NSR03", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "TEA01", 0x0, 0, 0 }, - { "iso9660", "iso9660", 32, 1, 5, "CD001", 0x0, 0, 0 }, - { "iso9660", "iso9660", 32, 9, 5, "CDROM", 0x0, 0, 0 }, - { "jfs", "jfs", 32, 0, 4, "JFS1", 0x88, 0, 0 }, - { "swap", "swap", 0, 0x7ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x7ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xfff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xfff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "zfs", "zfs", 264, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, - { "zfs", "zfs", 264, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, - { NULL, NULL, 0, 0, 0, NULL, 0x0, 0, 0 } -}; - -static int null_uuid(const char *uuid) -{ - int i; - - for (i = 0; i < 16 && !uuid[i]; i++); - - return (i == 16); -} - - -static void uuid_end_bio(struct bio *bio, int err) -{ - struct page *page = bio->bi_io_vec[0].bv_page; - - if(!test_bit(BIO_UPTODATE, &bio->bi_flags)) - SetPageError(page); - - unlock_page(page); - bio_put(bio); -} - - -/** - * submit - submit BIO request - * @dev: The block device we're using. - * @page_num: The page we're reading. - * - * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the - * textbook - allocate and initialize the bio. If we're writing, make sure - * the page is marked as dirty. Then submit it and carry on." 
- **/ -static struct page *read_bdev_page(struct block_device *dev, int page_num) -{ - struct bio *bio = NULL; - struct page *page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - - if (!page) { - printk(KERN_ERR "Failed to allocate a page for reading data " - "in UUID checks."); - return NULL; - } - - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = dev; - bio->bi_iter.bi_sector = page_num << 3; - bio->bi_end_io = uuid_end_bio; - bio->bi_flags |= (1 << BIO_TOI); - - PRINTK("Submitting bio on device %lx, page %d using bio %p and page %p.\n", - (unsigned long) dev->bd_dev, page_num, bio, page); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_DEBUG "ERROR: adding page to bio at %d\n", - page_num); - bio_put(bio); - __free_page(page); - printk(KERN_DEBUG "read_bdev_page freed page %p (in error " - "path).\n", page); - return NULL; - } - - lock_page(page); - submit_bio(READ | REQ_SYNC, bio); - - wait_on_page_locked(page); - if (PageError(page)) { - __free_page(page); - page = NULL; - } - return page; -} - -int bdev_matches_key(struct block_device *bdev, const char *key) -{ - unsigned char *data = NULL; - struct page *data_page = NULL; - - int dev_offset, pg_num, pg_off, i; - int last_pg_num = -1; - int result = 0; - char buf[50]; - - if (null_uuid(key)) { - PRINTK("Refusing to find a NULL key.\n"); - return 0; - } - - if (!bdev->bd_disk) { - bdevname(bdev, buf); - PRINTK("bdev %s has no bd_disk.\n", buf); - return 0; - } - - if (!bdev->bd_disk->queue) { - bdevname(bdev, buf); - PRINTK("bdev %s has no queue.\n", buf); - return 0; - } - - for (i = 0; uuid_list[i].name; i++) { - struct uuid_info *dat = &uuid_list[i]; - - if (!dat->key || strcmp(dat->key, key)) - continue; - - dev_offset = (dat->bkoff << 10) + dat->sboff; - pg_num = dev_offset >> 12; - pg_off = dev_offset & 0xfff; - - if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1) - continue; - - if (pg_num != last_pg_num) { - if (data_page) { - kunmap(data_page); - __free_page(data_page); - } - data_page = read_bdev_page(bdev, pg_num); - if (!data_page) - continue; - data = kmap(data_page); - } - - last_pg_num = pg_num; - - if (strncmp(&data[pg_off], dat->magic, dat->sig_len)) - continue; - - result = 1; - break; - } - - if (data_page) { - kunmap(data_page); - __free_page(data_page); - } - - return result; -} - -/* - * part_matches_fs_info - Does the given partition match the details given? - * - * Returns a score saying how good the match is. - * 0 = no UUID match. - * 1 = UUID but last mount time differs. - * 2 = UUID, last mount time but not dev_t - * 3 = perfect match - * - * This lets us cope elegantly with probing resulting in dev_ts changing - * from boot to boot, and with the case where a user copies a partition - * (UUID is non unique), and we need to check the last mount time of the - * correct partition. 
- */ -int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek) -{ - struct block_device *bdev; - struct fs_info *got; - int result = 0; - char buf[50]; - - if (null_uuid((char *) &seek->uuid)) { - PRINTK("Refusing to find a NULL uuid.\n"); - return 0; - } - - bdev = bdget(part_devt(part)); - - PRINTK("part_matches fs info considering %x.\n", part_devt(part)); - - if (blkdev_get(bdev, FMODE_READ, 0)) { - PRINTK("blkdev_get failed.\n"); - return 0; - } - - if (!bdev->bd_disk) { - bdevname(bdev, buf); - PRINTK("bdev %s has no bd_disk.\n", buf); - goto out; - } - - if (!bdev->bd_disk->queue) { - bdevname(bdev, buf); - PRINTK("bdev %s has no queue.\n", buf); - goto out; - } - - got = fs_info_from_block_dev(bdev); - - if (got && !memcmp(got->uuid, seek->uuid, 16)) { - PRINTK(" Have matching UUID.\n"); - PRINTK(" Got: LMS %d, LM %p.\n", got->last_mount_size, got->last_mount); - PRINTK(" Seek: LMS %d, LM %p.\n", seek->last_mount_size, seek->last_mount); - result = 1; - - if (got->last_mount_size == seek->last_mount_size && - got->last_mount && seek->last_mount && - !memcmp(got->last_mount, seek->last_mount, - got->last_mount_size)) { - result = 2; - - PRINTK(" Matching last mount time.\n"); - - if (part_devt(part) == seek->dev_t) { - result = 3; - PRINTK(" Matching dev_t.\n"); - } else - PRINTK("Dev_ts differ (%x vs %x).\n", part_devt(part), seek->dev_t); - } - } - - PRINTK(" Score for %x is %d.\n", part_devt(part), result); - free_fs_info(got); -out: - blkdev_put(bdev, FMODE_READ); - return result; -} - -void free_fs_info(struct fs_info *fs_info) -{ - if (!fs_info || IS_ERR(fs_info)) - return; - - if (fs_info->last_mount) - kfree(fs_info->last_mount); - - kfree(fs_info); -} - -struct fs_info *fs_info_from_block_dev(struct block_device *bdev) -{ - unsigned char *data = NULL; - struct page *data_page = NULL; - - int dev_offset, pg_num, pg_off; - int uuid_pg_num, uuid_pg_off, i; - unsigned char *uuid_data = NULL; - struct page *uuid_data_page = NULL; - - int last_pg_num = -1, last_uuid_pg_num = 0; - char buf[50]; - struct fs_info *fs_info = NULL; - - bdevname(bdev, buf); - - PRINTK("uuid_from_block_dev looking for partition type of %s.\n", buf); - - for (i = 0; uuid_list[i].name; i++) { - struct uuid_info *dat = &uuid_list[i]; - dev_offset = (dat->bkoff << 10) + dat->sboff; - pg_num = dev_offset >> 12; - pg_off = dev_offset & 0xfff; - uuid_pg_num = dat->uuid_offset >> 12; - uuid_pg_off = dat->uuid_offset & 0xfff; - - if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1) - continue; - - /* Ignore partition types with no UUID offset */ - if (!dat->uuid_offset) - continue; - - if (pg_num != last_pg_num) { - if (data_page) { - kunmap(data_page); - __free_page(data_page); - } - data_page = read_bdev_page(bdev, pg_num); - if (!data_page) - continue; - data = kmap(data_page); - } - - last_pg_num = pg_num; - - if (strncmp(&data[pg_off], dat->magic, dat->sig_len)) - continue; - - PRINTK("This partition looks like %s.\n", dat->name); - - fs_info = kzalloc(sizeof(struct fs_info), GFP_KERNEL); - - if (!fs_info) { - PRINTK("Failed to allocate fs_info struct."); - fs_info = ERR_PTR(-ENOMEM); - break; - } - - /* UUID can't be off the end of the disk */ - if ((uuid_pg_num > bdev->bd_part->nr_sects >> 3) || - !dat->uuid_offset) - goto no_uuid; - - if (!uuid_data || uuid_pg_num != last_uuid_pg_num) { - /* No need to reread the page from above */ - if (uuid_pg_num == pg_num && uuid_data) - memcpy(uuid_data, data, PAGE_SIZE); - else { - if (uuid_data_page) { - kunmap(uuid_data_page); - 
__free_page(uuid_data_page); - } - uuid_data_page = read_bdev_page(bdev, uuid_pg_num); - if (!uuid_data_page) - continue; - uuid_data = kmap(uuid_data_page); - } - } - - last_uuid_pg_num = uuid_pg_num; - memcpy(&fs_info->uuid, &uuid_data[uuid_pg_off], 16); - fs_info->dev_t = bdev->bd_dev; - -no_uuid: - PRINT_HEX_DUMP(KERN_EMERG, "fs_info_from_block_dev " - "returning uuid ", DUMP_PREFIX_NONE, 16, 1, - fs_info->uuid, 16, 0); - - if (dat->last_mount_size) { - int pg = dat->last_mount_offset >> 12, sz; - int off = dat->last_mount_offset & 0xfff; - struct page *last_mount = read_bdev_page(bdev, pg); - unsigned char *last_mount_data; - char *ptr; - - if (!last_mount) { - fs_info = ERR_PTR(-ENOMEM); - break; - } - last_mount_data = kmap(last_mount); - sz = dat->last_mount_size; - ptr = kmalloc(sz, GFP_KERNEL); - - if (!ptr) { - printk(KERN_EMERG "fs_info_from_block_dev " - "failed to get memory for last mount " - "timestamp."); - free_fs_info(fs_info); - fs_info = ERR_PTR(-ENOMEM); - } else { - fs_info->last_mount = ptr; - fs_info->last_mount_size = sz; - memcpy(ptr, &last_mount_data[off], sz); - } - - kunmap(last_mount); - __free_page(last_mount); - } - break; - } - - if (data_page) { - kunmap(data_page); - __free_page(data_page); - } - - if (uuid_data_page) { - kunmap(uuid_data_page); - __free_page(uuid_data_page); - } - - return fs_info; -} - -static int __init uuid_debug_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - debug_enabled = value; - - return 1; -} - -__setup("uuid_debug", uuid_debug_setup); |
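For illustration only, and not part of the patch above: a minimal sketch of how a block driver might use the blkdev_reread_part() / __blkdev_reread_part() helpers exported by this change, for example after resizing its backing store. The example_dev structure and example_resize_done() are hypothetical names invented for this sketch; bdget_disk(), set_capacity() and bdput() are assumed to be the usual genhd/fs helpers of this kernel generation, and the header carrying the blkdev_reread_part() prototypes is likewise assumed.

#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/fs.h>	/* assumed home of the blkdev_reread_part() prototypes */

/* Hypothetical per-device structure, for illustration only. */
struct example_dev {
	struct gendisk *disk;
};

/* Called by the (hypothetical) driver once its backing store has grown. */
static void example_resize_done(struct example_dev *edev, sector_t new_sectors)
{
	/* partno 0 yields the whole-disk block_device */
	struct block_device *bdev = bdget_disk(edev->disk, 0);

	if (!bdev)
		return;

	set_capacity(edev->disk, new_sectors);

	/*
	 * The locking variant is safe here: bd_mutex is not held, and no
	 * lock held at this point is needed by the driver's open()/close()
	 * handlers or its I/O path (see the ABBA note added to
	 * block/ioctl.c above).  A caller already running under bd_mutex
	 * would use __blkdev_reread_part() instead.
	 */
	blkdev_reread_part(bdev);
	bdput(bdev);
}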