Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig              |  13
-rw-r--r--  block/Makefile             |   6
-rw-r--r--  block/badblocks.c          | 585
-rw-r--r--  block/bio-integrity.c      |  13
-rw-r--r--  block/bio.c                |   6
-rw-r--r--  block/blk-cgroup.c         |   9
-rw-r--r--  block/blk-core.c           |  34
-rw-r--r--  block/blk-iopoll.c         | 224
-rw-r--r--  block/blk-map.c            |  91
-rw-r--r--  block/blk-merge.c          |  11
-rw-r--r--  block/blk-mq-cpumap.c      |   2
-rw-r--r--  block/blk-mq-tag.c         |  11
-rw-r--r--  block/blk-mq.c             |  50
-rw-r--r--  block/blk-mq.h             |  11
-rw-r--r--  block/blk-sysfs.c          |   5
-rw-r--r--  block/blk-timeout.c        |  17
-rw-r--r--  block/blk.h                |   2
-rw-r--r--  block/deadline-iosched.c   |   3
-rw-r--r--  block/genhd.c              | 113
-rw-r--r--  block/ioctl.c              |  33
-rw-r--r--  block/partition-generic.c  |  18
-rw-r--r--  block/uuid.c               | 509
22 files changed, 830 insertions(+), 936 deletions(-)
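Before the diff body, a brief orientation: the new block/badblocks.c stores each bad range as a packed 64-bit table entry, per the badblocks_check() kernel-doc below (a 9-bit length field holding length-1, i.e. 1-512 sectors, a 54-bit start sector, and an 'acknowledged' flag in the most significant bit). The following minimal userspace sketch illustrates that encoding; the mask values are spelled out here on the assumption that they match the BB_* helpers the patch uses from <linux/badblocks.h>.

/* Userspace sketch of the bad-block entry layout described below.
 * Assumed layout: bit 63 = ack flag, bits 62..9 = start sector,
 * bits 8..0 = length - 1 (so 1..512 sectors per entry).
 */
#include <stdint.h>
#include <stdio.h>

#define BB_MAX_LEN 512
#define BB_OFFSET(x) (((x) & 0x7ffffffffffffe00ULL) >> 9)
#define BB_LEN(x)    (((x) & 0x1ffULL) + 1)
#define BB_ACK(x)    (!!((x) & 0x8000000000000000ULL))
#define BB_MAKE(a, l, ack) \
	(((uint64_t)(a) << 9) | ((l) - 1) | ((uint64_t)(!!(ack)) << 63))

int main(void)
{
	/* Mark 512 sectors starting at sector 123456 as acknowledged-bad. */
	uint64_t entry = BB_MAKE(123456ULL, BB_MAX_LEN, 1);

	printf("start=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(entry),
	       (unsigned long long)BB_LEN(entry),
	       BB_ACK(entry));
	return 0;
}

The 512-sector cap (BB_MAX_LEN) is why badblocks_set() below splits a long range into several maximal entries rather than storing it as one.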
diff --git a/block/Kconfig b/block/Kconfig index 161491d0a..0363cd731 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -88,6 +88,19 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_DAX + bool "Block device DAX support" + depends on FS_DAX + depends on BROKEN + help + When DAX support is available (CONFIG_FS_DAX) raw block + devices can also support direct userspace access to the + storage capacity via MMAP(2) similar to a file on a + DAX-enabled filesystem. However, the DAX I/O-path disables + some standard I/O-statistics, and the MMAP(2) path has some + operational differences due to bypassing the page + cache. If in doubt, say N. + config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y diff --git a/block/Makefile b/block/Makefile index 086be5007..4a3668393 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,10 +5,10 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - uuid.o genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ - partitions/ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + badblocks.o partitions/ obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o diff --git a/block/badblocks.c b/block/badblocks.c new file mode 100644 index 000000000..7be53cb1c --- /dev/null +++ b/block/badblocks.c @@ -0,0 +1,585 @@ +/* + * Bad block management + * + * - Heavily based on MD badblocks code from Neil Brown + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/badblocks.h> +#include <linux/seqlock.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/slab.h> + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. 
+ * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad blocks which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + /* This could still be the one, earlier ranges + * could not. + */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all ranges that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: whether to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it.
+ * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 0; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range + */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' + */ + if (bb->count >= MAX_BADBLOCKS) { + /* No room for more */ + rv = 1; + break; + } else { + int this_sectors = sectors; + + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks from the table.
+ * @bb: the badblocks structure that holds all badblock information + * @s: first sector to clear + * @sectors: number of sectors to clear + * + * This may involve extending the table if we split a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * -ENOSPC: failed to clear badblocks (no room to split a range) + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better to think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_clear); + +/** + * ack_all_badblocks() - Acknowledge all bad blocks in a list. + * @bb: the badblocks structure that holds all badblock information + * + * This only succeeds if ->changed is clear.
It is used by + * in-kernel metadata updates. + */ +void ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(ack_all_badblocks); + +/** + * badblocks_show() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @unack: whether to show unacknowledged badblocks + * + * Return: + * Length of returned data + */ +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} +EXPORT_SYMBOL_GPL(badblocks_show); + +/** + * badblocks_store() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @len: length of data received from sysfs + * @unack: whether to store unacknowledged badblocks + * + * Return: + * Length of the buffer processed or -ve error.
+ */ +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack) +{ + unsigned long long sector; + int length; + char newline; + + switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (badblocks_set(bb, sector, length, !unack)) + return -ENOSPC; + else + return len; +} +EXPORT_SYMBOL_GPL(badblocks_store); + +static int __badblocks_init(struct device *dev, struct badblocks *bb, + int enable) +{ + bb->dev = dev; + bb->count = 0; + if (enable) + bb->shift = 0; + else + bb->shift = -1; + if (dev) + bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); + else + bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->page) { + bb->shift = -1; + return -ENOMEM; + } + seqlock_init(&bb->lock); + + return 0; +} + +/** + * badblocks_init() - initialize the badblocks structure + * @bb: the badblocks structure that holds all badblock information + * @enable: whether to enable badblocks accounting + * + * Return: + * 0: success + * -ve errno: on error + */ +int badblocks_init(struct badblocks *bb, int enable) +{ + return __badblocks_init(NULL, bb, enable); +} +EXPORT_SYMBOL_GPL(badblocks_init); + +int devm_init_badblocks(struct device *dev, struct badblocks *bb) +{ + if (!bb) + return -EINVAL; + return __badblocks_init(dev, bb, 1); +} +EXPORT_SYMBOL_GPL(devm_init_badblocks); + +/** + * badblocks_exit() - free the badblocks structure + * @bb: the badblocks structure that holds all badblock information + */ +void badblocks_exit(struct badblocks *bb) +{ + if (!bb) + return; + if (bb->dev) + devm_kfree(bb->dev, bb->page); + else + kfree(bb->page); + bb->page = NULL; +} +EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f6325d573..711e4d8de 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, } if (unlikely(!bip)) - return NULL; + return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); @@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: mempool_free(bip, bs->bio_integrity_pool); - return NULL; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio) /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { + if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - return -EIO; + return PTR_ERR(bip); } bip->bip_flags |= BIP_BLOCK_INTEGRITY; @@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; + if (IS_ERR(bip)) + return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); diff --git a/block/bio.c b/block/bio.c index d4d144363..cf7591551 100644 --- a/block/bio.c +++ b/block/bio.c @@ -874,7 +874,7 @@ int submit_bio_wait(int rw, struct bio *bio) bio->bi_private = &ret; bio->bi_end_io = submit_bio_wait_endio; submit_bio(rw, bio); - wait_for_completion(&ret.event); + wait_for_completion_io(&ret.event); return ret.error; } @@ -1128,7 +1128,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int i, ret; int nr_pages = 0; unsigned int len = iter->count; -
unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; for (i = 0; i < iter->nr_segs; i++) { unsigned long uaddr; @@ -1307,7 +1307,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, goto out_unmap; } - offset = uaddr & ~PAGE_MASK; + offset = offset_in_page(uaddr); for (j = cur_page; j < page_limit; j++) { unsigned int bytes = PAGE_SIZE - offset; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5a37188b5..66e6f1aae 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, { struct gendisk *disk; struct blkcg_gq *blkg; + struct module *owner; unsigned int major, minor; int key_len, part, ret; char *body; @@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (!disk) return -ENODEV; if (part) { + owner = disk->fops->owner; put_disk(disk); + module_put(owner); return -ENODEV; } @@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ret = PTR_ERR(blkg); rcu_read_unlock(); spin_unlock_irq(disk->queue->queue_lock); + owner = disk->fops->owner; put_disk(disk); + module_put(owner); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(ctx->disk->queue->queue_lock) __releases(rcu) { + struct module *owner; + spin_unlock_irq(ctx->disk->queue->queue_lock); rcu_read_unlock(); + owner = ctx->disk->fops->owner; put_disk(ctx->disk); + module_put(owner); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/blk-core.c b/block/blk-core.c index e8e229c70..b83d29755 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -48,12 +48,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); -int trap_non_toi_io; - /* * For the allocated request tables */ -struct kmem_cache *request_cachep = NULL; +struct kmem_cache *request_cachep; /* * For queue allocation @@ -648,7 +646,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, gfp_t gfp) +int blk_queue_enter(struct request_queue *q, bool nowait) { while (true) { int ret; @@ -656,7 +654,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp) if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; - if (!gfpflags_allow_blocking(gfp)) + if (nowait) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, @@ -682,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -843,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; @@ -1294,7 +1300,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return 
blk_mq_alloc_request(q, rw, + (gfp_mask & __GFP_DIRECT_RECLAIM) ? + 0 : BLK_MQ_REQ_NOWAIT); else return blk_old_get_request(q, rw, gfp_mask); } @@ -2062,8 +2070,7 @@ blk_qc_t generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { - + if (likely(blk_queue_enter(q, false) == 0)) { ret = q->make_request_fn(q, bio); blk_queue_exit(q); @@ -2097,9 +2104,6 @@ blk_qc_t submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; - if (unlikely(trap_non_toi_io)) - BUG_ON(!bio_flagged(bio, BIO_TOI)); - /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. @@ -2451,14 +2455,16 @@ struct request *blk_peek_request(struct request_queue *q) rq = NULL; break; - } else if (ret == BLKPREP_KILL) { + } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { + int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; + rq->cmd_flags |= REQ_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. */ blk_start_request(rq); - __blk_end_request_all(rq, -EIO); + __blk_end_request_all(rq, err); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; @@ -3539,7 +3545,7 @@ int __init blk_dev_init(void) request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("blkdev_queue", + blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0; diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c deleted file mode 100644 index 0736729d6..000000000 --- a/block/blk-iopoll.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Functions related to interrupt-poll handling in the block layer. This - * is similar to NAPI for network devices. - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/blk-iopoll.h> -#include <linux/delay.h> - -#include "blk.h" - -static unsigned int blk_iopoll_budget __read_mostly = 256; - -static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); - -/** - * blk_iopoll_sched - Schedule a run of the iopoll handler - * @iop: The parent iopoll structure - * - * Description: - * Add this blk_iopoll structure to the pending poll list and trigger the - * raise of the blk iopoll softirq. The driver must already have gotten a - * successful return from blk_iopoll_sched_prep() before calling this. - **/ -void blk_iopoll_sched(struct blk_iopoll *iop) -{ - unsigned long flags; - - local_irq_save(flags); - list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_iopoll_sched); - -/** - * __blk_iopoll_complete - Mark this @iop as un-polled again - * @iop: The parent iopoll structure - * - * Description: - * See blk_iopoll_complete(). This function must be called with interrupts - * disabled. 
- **/ -void __blk_iopoll_complete(struct blk_iopoll *iop) -{ - list_del(&iop->list); - smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(__blk_iopoll_complete); - -/** - * blk_iopoll_complete - Mark this @iop as un-polled again - * @iop: The parent iopoll structure - * - * Description: - * If a driver consumes less than the assigned budget in its run of the - * iopoll handler, it'll end the polled mode by calling this function. The - * iopoll handler will not be invoked again before blk_iopoll_sched_prep() - * is called. - **/ -void blk_iopoll_complete(struct blk_iopoll *iop) -{ - unsigned long flags; - - local_irq_save(flags); - __blk_iopoll_complete(iop); - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_iopoll_complete); - -static void blk_iopoll_softirq(struct softirq_action *h) -{ - struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = blk_iopoll_budget; - unsigned long start_time = jiffies; - - local_irq_disable(); - - while (!list_empty(list)) { - struct blk_iopoll *iop; - int work, weight; - - /* - * If softirq window is exhausted then punt. - */ - if (budget <= 0 || time_after(jiffies, start_time)) { - rearm = 1; - break; - } - - local_irq_enable(); - - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - iop = list_entry(list->next, struct blk_iopoll, list); - - weight = iop->weight; - work = 0; - if (test_bit(IOPOLL_F_SCHED, &iop->state)) - work = iop->poll(iop, weight); - - budget -= work; - - local_irq_disable(); - - /* - * Drivers must not modify the iopoll state, if they - * consume their assigned weight (or more, some drivers can't - * easily just stop processing, they have to complete an - * entire mask of commands).In such cases this code - * still "owns" the iopoll instance and therefore can - * move the instance around on the list at-will. - */ - if (work >= weight) { - if (blk_iopoll_disable_pending(iop)) - __blk_iopoll_complete(iop); - else - list_move_tail(&iop->list, list); - } - } - - if (rearm) - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - - local_irq_enable(); -} - -/** - * blk_iopoll_disable - Disable iopoll on this @iop - * @iop: The parent iopoll structure - * - * Description: - * Disable io polling and wait for any pending callbacks to have completed. - **/ -void blk_iopoll_disable(struct blk_iopoll *iop) -{ - set_bit(IOPOLL_F_DISABLE, &iop->state); - while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) - msleep(1); - clear_bit(IOPOLL_F_DISABLE, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_disable); - -/** - * blk_iopoll_enable - Enable iopoll on this @iop - * @iop: The parent iopoll structure - * - * Description: - * Enable iopoll on this @iop. Note that the handler run will not be - * scheduled, it will only mark it as active. - **/ -void blk_iopoll_enable(struct blk_iopoll *iop) -{ - BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); - smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_enable); - -/** - * blk_iopoll_init - Initialize this @iop - * @iop: The parent iopoll structure - * @weight: The default weight (or command completion budget) - * @poll_fn: The handler to invoke - * - * Description: - * Initialize this blk_iopoll structure. Before being actively used, the - * driver must call blk_iopoll_enable(). 
- **/ -void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) -{ - memset(iop, 0, sizeof(*iop)); - INIT_LIST_HEAD(&iop->list); - iop->weight = weight; - iop->poll = poll_fn; - set_bit(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_init); - -static int blk_iopoll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block blk_iopoll_cpu_notifier = { - .notifier_call = blk_iopoll_cpu_notify, -}; - -static __init int blk_iopoll_setup(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); - - open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); - register_hotcpu_notifier(&blk_iopoll_cpu_notifier); - return 0; -} -subsys_initcall(blk_iopoll_setup); diff --git a/block/blk-map.c b/block/blk-map.c index f565e11f4..a54f0543b 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } +static int __blk_rq_map_user_iov(struct request *rq, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask, bool copy) +{ + struct request_queue *q = rq->q; + struct bio *bio, *orig_bio; + int ret; + + if (copy) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); + else + bio = bio_map_user_iov(q, iter, gfp_mask); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + + iov_iter_advance(iter, bio->bi_iter.bi_size); + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + orig_bio = bio; + blk_queue_bounce(q, &bio); + + /* + * We link the bounce buffer in and could have to traverse it + * later so we have to get a ref to prevent it from being freed + */ + bio_get(bio); + + ret = blk_rq_append_bio(q, rq, bio); + if (ret) { + bio_endio(bio); + __blk_rq_unmap_user(orig_bio); + bio_put(bio); + return ret; + } + + return 0; +} + /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted @@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct bio *bio; - int unaligned = 0; - struct iov_iter i; struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; + bool copy = (q->dma_pad_mask & iter->count) || map_data; + struct bio *bio = NULL; + struct iov_iter i; + int ret; if (!iter || !iter->count) return -EINVAL; @@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, */ if ((uaddr & queue_dma_alignment(q)) || iovec_gap_to_prv(q, &prv, &iov)) - unaligned = 1; + copy = true; prv.iov_base = iov.iov_base; prv.iov_len = iov.iov_len; } - if (unaligned || (q->dma_pad_mask & iter->count) || map_data) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - - if (bio->bi_iter.bi_size != iter->count) { - /* - * Grab an extra reference to 
this bio, as bio_unmap_user() - * expects to be able to drop it twice as it happens on the - * normal IO completion path - */ - bio_get(bio); - bio_endio(bio); - __blk_rq_unmap_user(bio); - return -EINVAL; - } + i = *iter; + do { + ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (ret) + goto unmap_rq; + if (!bio) + bio = rq->bio; + } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - - blk_queue_bounce(q, &bio); - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); return 0; + +unmap_rq: + __blk_rq_unmap_user(bio); + rq->bio = NULL; + return -EINVAL; } EXPORT_SYMBOL(blk_rq_map_user_iov); diff --git a/block/blk-merge.c b/block/blk-merge.c index b966db8f3..261353166 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,6 +7,8 @@ #include <linux/blkdev.h> #include <linux/scatterlist.h> +#include <trace/events/block.h> + #include "blk.h" static struct bio *blk_bio_discard_split(struct request_queue *q, @@ -187,6 +189,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, split->bi_rw |= REQ_NOMERGE; bio_chain(split, *bio); + trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); *bio = split; } @@ -301,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { struct bio_vec end_bv = { NULL }, nxt_bv; - struct bvec_iter iter; if (!blk_queue_cluster(q)) return 0; @@ -313,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, if (!bio_has_data(bio)) return 1; - bio_for_each_segment(end_bv, bio, iter) - if (end_bv.bv_len == iter.bi_size) - break; - - nxt_bv = bio_iovec(nxt); + bio_get_last_bvec(bio, &end_bv); + bio_get_first_bvec(nxt, &nxt_bv); if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) return 0; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 8764c241e..d0634bcf3 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -113,7 +113,7 @@ int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) for_each_possible_cpu(i) { if (index == mq_map[i]) - return cpu_to_node(i); + return local_memory_node(cpu_to_node(i)); } return NUMA_NO_NODE; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index a07ca3488..abdbb4740 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!gfpflags_allow_blocking(data->gfp)) + if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; bs = bt_wait_ptr(bt, hctx); @@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data, data->ctx = blk_mq_get_ctx(data->q); data->hctx = data->q->mq_ops->map_queue(data->q, data->ctx->cpu); - if (data->reserved) { + if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { last_tag = &data->ctx->last_tag; @@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { - if (!data->reserved) - return __blk_mq_get_tag(data); - - return __blk_mq_get_reserved_tag(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + return __blk_mq_get_reserved_tag(data); + return __blk_mq_get_tag(data); } static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6d6f8feb4..56c0a726b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -229,8 +229,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) return NULL; } -struct 
request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, - bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + unsigned int flags) { struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; @@ -238,24 +238,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_queue_enter(q, gfp); + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, - reserved, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { + if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; } @@ -601,12 +599,12 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) - blk_mq_complete_request(rq, -EIO); + if (unlikely(blk_queue_dying(rq->q))) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + } return; } - if (rq->cmd_flags & REQ_NO_TIMEOUT) - return; if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) @@ -617,15 +615,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, } } -static void blk_mq_rq_timer(unsigned long priv) +static void blk_mq_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *)priv; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, }; int i; + if (blk_queue_enter(q, true)) + return; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { @@ -640,6 +642,7 @@ static void blk_mq_rq_timer(unsigned long priv) blk_mq_tag_idle(hctx); } } + blk_queue_exit(q); } /* @@ -1175,8 +1178,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, rw |= REQ_SYNC; trace_block_getrq(q, bio, rw); - blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); if (unlikely(!rq)) { __blk_mq_run_hw_queue(hctx); @@ -1185,8 +1187,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, - __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; @@ -1794,7 +1795,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, * not, we remain on the home node of the device */ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = cpu_to_node(i); + hctx->numa_node = local_memory_node(cpu_to_node(i)); } } @@ -1854,6 +1855,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); + cpumask_copy(hctx->tags->cpumask, 
hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping @@ -1867,14 +1869,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } - - queue_for_each_ctx(q, ctx, i) { - if (!cpumask_test_cpu(i, online_mask)) - continue; - - hctx = q->mq_ops->map_queue(q, i); - cpumask_set_cpu(i, hctx->tags->cpumask); - } } static void queue_set_hctx_shared(struct request_queue *q, bool shared) @@ -2019,7 +2013,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; diff --git a/block/blk-mq.h b/block/blk-mq.h index 713820b47..eaede8e45 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -96,8 +96,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - gfp_t gfp; - bool reserved; + unsigned int flags; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -105,13 +104,11 @@ struct blk_mq_alloc_data { }; static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, - struct request_queue *q, gfp_t gfp, bool reserved, - struct blk_mq_ctx *ctx, - struct blk_mq_hw_ctx *hctx) + struct request_queue *q, unsigned int flags, + struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) { data->q = q; - data->gfp = gfp; - data->reserved = reserved; + data->flags = flags; data->ctx = ctx; data->hctx = hctx; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e140cc487..dd9376305 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -147,10 +147,9 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) { - unsigned long long val; - val = q->limits.max_hw_discard_sectors << 9; - return sprintf(page, "%llu\n", val); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_hw_discard_sectors << 9); } static ssize_t queue_discard_max_show(struct request_queue *q, char *page) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index aa40aa933..a30441a20 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout } } -void blk_rq_timed_out_timer(unsigned long data) +void blk_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); unsigned long flags, next = 0; struct request *rq, *tmp; int next_set = 0; + if (blk_queue_enter(q, true)) + return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_exit(q); } /** @@ -186,15 +190,13 @@ unsigned long blk_rq_timeout(unsigned long timeout) * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. + * Queue lock must be held for the non-mq case, mq case doesn't care. 
*/ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; - if (req->cmd_flags & REQ_NO_TIMEOUT) - return; - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; @@ -209,6 +211,11 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; + + /* + * Only the non-mq case needs to add the request to a protected list. + * For the mq case we simply scan the tag map. + */ if (!q->mq_ops) list_add_tail(&req->timeout_list, &req->q->timeout_list); diff --git a/block/blk.h b/block/blk.h index c43926d3d..70e4aee9c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void) } #endif -void blk_rq_timed_out_timer(unsigned long data); +void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index a753df2b3..d0dd7882d 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -39,7 +39,6 @@ struct deadline_data { */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ - sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ /* @@ -210,8 +209,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) dd->next_rq[WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); - dd->last_sector = rq_end_sector(rq); - /* * take it off the sort and fifo list, move * to dispatch queue diff --git a/block/genhd.c b/block/genhd.c index c06731166..9f42526b4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -18,10 +18,9 @@ #include <linux/kobj_map.h> #include <linux/mutex.h> #include <linux/idr.h> -#include <linux/ctype.h> -#include <linux/fs_uuid.h> #include <linux/log2.h> #include <linux/pm_runtime.h> +#include <linux/badblocks.h> #include "blk.h" @@ -666,7 +665,6 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -674,6 +672,31 @@ void del_gendisk(struct gendisk *disk) } EXPORT_SYMBOL(del_gendisk); +/* sysfs access to bad-blocks list. 
*/ +static ssize_t disk_badblocks_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return sprintf(page, "\n"); + + return badblocks_show(disk->bb, page, 0); +} + +static ssize_t disk_badblocks_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t len) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return -ENXIO; + + return badblocks_store(disk->bb, page, len, 0); +} + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -992,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, + disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -1013,6 +1038,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_badblocks.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1391,85 +1417,6 @@ int invalidate_partition(struct gendisk *disk, int partno) EXPORT_SYMBOL(invalidate_partition); -dev_t blk_lookup_fs_info(struct fs_info *seek) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - int best_score = 0; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while (best_score < 3 && (dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - - while (best_score < 3 && (part = disk_part_iter_next(&piter))) { - int score = part_matches_fs_info(part, seek); - if (score > best_score) { - devt = part_devt(part); - best_score = score; - } - } - disk_part_iter_exit(&piter); - } - class_dev_iter_exit(&iter); - return devt; -} - -/* Caller uses NULL, key to start. For each match found, we return a bdev on - * which we have done blkdev_get, and we do the blkdev_put on block devices - * that are passed to us. When no more matches are found, we return NULL. - */ -struct block_device *next_bdev_of_type(struct block_device *last, - const char *key) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - struct block_device *next = NULL, *bdev; - int got_last = 0; - - if (!key) - goto out; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while (!devt && (dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - - while ((part = disk_part_iter_next(&piter))) { - bdev = bdget(part_devt(part)); - if (last && !got_last) { - if (last == bdev) - got_last = 1; - continue; - } - - if (blkdev_get(bdev, FMODE_READ, 0)) - continue; - - if (bdev_matches_key(bdev, key)) { - next = bdev; - break; - } - - blkdev_put(bdev, FMODE_READ); - } - disk_part_iter_exit(&piter); - } - class_dev_iter_exit(&iter); -out: - if (last) - blkdev_put(last, FMODE_READ); - return next; -} - /* * Disk events - monitor disk events like media change and eject request. 
*/ @@ -1502,7 +1449,7 @@ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs = 0; +static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { diff --git a/block/ioctl.c b/block/ioctl.c index 0918aed2d..d8996bbd7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,6 +4,7 @@ #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> +#include <linux/badblocks.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> @@ -406,6 +407,35 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } +#ifdef CONFIG_FS_DAX +bool blkdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + + if (!disk->fops->direct_access) + return false; + + /* + * If the partition is not aligned on a page boundary, we can't + * do dax I/O to it. + */ + if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) + || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} +#endif + static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -568,6 +598,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKDAXGET: + return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); + break; case IOC_PR_REGISTER: return blkdev_pr_register(bdev, argp); case IOC_PR_RESERVE: diff --git a/block/partition-generic.c b/block/partition-generic.c index 746935a59..fefd01b49 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -16,6 +16,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/dax.h> #include <linux/blktrace_api.h> #include "partitions/check.h" @@ -550,13 +551,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) { struct address_space *mapping = bdev->bd_inode->i_mapping; + + return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - NULL); + /* don't populate page cache for dax capable devices */ + if (IS_DAX(bdev->bd_inode)) + page = read_dax_sector(bdev, n); + else + page = read_pagecache_sector(bdev, n); + if (!IS_ERR(page)) { if (PageError(page)) goto fail; diff --git a/block/uuid.c b/block/uuid.c deleted file mode 100644 index 4610d7b8f..000000000 --- a/block/uuid.c +++ /dev/null @@ -1,509 +0,0 @@ -#include <linux/blkdev.h> -#include <linux/ctype.h> -#include <linux/fs_uuid.h> -#include <linux/slab.h> -#include <linux/export.h> - -static int debug_enabled; - -#define PRINTK(fmt, args...) 
do { \ - if (debug_enabled) \ - printk(KERN_DEBUG fmt, ## args); \ - } while(0) - -#define PRINT_HEX_DUMP(v1, v2, v3, v4, v5, v6, v7, v8) \ - do { \ - if (debug_enabled) \ - print_hex_dump(v1, v2, v3, v4, v5, v6, v7, v8); \ - } while(0) - -/* - * Simple UUID translation - */ - -struct uuid_info { - const char *key; - const char *name; - long bkoff; - unsigned sboff; - unsigned sig_len; - const char *magic; - int uuid_offset; - int last_mount_offset; - int last_mount_size; -}; - -/* - * Based on libuuid's blkid_magic array. Note that I don't - * have uuid offsets for all of these yet - mssing ones are 0x0. - * Further information welcome. - * - * Rearranged by page of fs signature for optimisation. - */ -static struct uuid_info uuid_list[] = { - { NULL, "oracleasm", 0, 32, 8, "ORCLDISK", 0x0, 0, 0 }, - { "ntfs", "ntfs", 0, 3, 8, "NTFS ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x52, 5, "MSWIN", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x52, 8, "FAT32 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 5, "MSDOS", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 8, "FAT16 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 8, "FAT12 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0, 1, "\353", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0, 1, "\351", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x1fe, 2, "\125\252", 0x0, 0, 0 }, - { "xfs", "xfs", 0, 0, 4, "XFSB", 0x20, 0, 0 }, - { "romfs", "romfs", 0, 0, 8, "-rom1fs-", 0x0, 0, 0 }, - { "bfs", "bfs", 0, 0, 4, "\316\372\173\033", 0, 0, 0 }, - { "cramfs", "cramfs", 0, 0, 4, "E=\315\050", 0x0, 0, 0 }, - { "qnx4", "qnx4", 0, 4, 6, "QNX4FS", 0, 0, 0 }, - { NULL, "crypt_LUKS", 0, 0, 6, "LUKS\xba\xbe", 0x0, 0, 0 }, - { "squashfs", "squashfs", 0, 0, 4, "sqsh", 0, 0, 0 }, - { "squashfs", "squashfs", 0, 0, 4, "hsqs", 0, 0, 0 }, - { "ocfs", "ocfs", 0, 8, 9, "OracleCFS", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 0, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, - { "sysv", "sysv", 0, 0x3f8, 4, "\020~\030\375", 0, 0, 0 }, - { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 }, - { "minix", "minix", 1, 0x10, 2, "\177\023", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\217\023", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\150\044", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\170\044", 0, 0, 0 }, - { "lvm2pv", "lvm2pv", 1, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, - { "vxfs", "vxfs", 1, 0, 4, "\365\374\001\245", 0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "BD", 0x0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "H+", 0x0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "HX", 0x0, 0, 0 }, - { "hfs", "hfs", 1, 0, 2, "BD", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 1, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 0, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 1, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 2, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "swap", "swap", 0, 0xff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "ocfs2", "ocfs2", 4, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 8, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "hpfs", "hpfs", 8, 0, 4, "I\350\225\371", 0, 0, 0 }, - { "reiserfs", "reiserfs", 8, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 8, 20, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "zfs", "zfs", 8, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, - { "zfs", "zfs", 8, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, - { "ufs", 
"ufs", 8, 0x55c, 4, "T\031\001\000", 0, 0, 0 }, - { "swap", "swap", 0, 0x1ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x1ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr2Fs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr3Fs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "reiser4", "reiser4", 64, 0, 7, "ReIsEr4", 0x100544, 0, 0 }, - { "gfs2", "gfs2", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, - { "gfs", "gfs", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, - { "btrfs", "btrfs", 64, 0x40, 8, "_BHRfS_M", 0x0, 0, 0 }, - { "swap", "swap", 0, 0x3ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x3ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "udf", "udf", 32, 1, 5, "BEA01", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "BOOT2", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "CD001", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "CDW02", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "NSR02", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "NSR03", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "TEA01", 0x0, 0, 0 }, - { "iso9660", "iso9660", 32, 1, 5, "CD001", 0x0, 0, 0 }, - { "iso9660", "iso9660", 32, 9, 5, "CDROM", 0x0, 0, 0 }, - { "jfs", "jfs", 32, 0, 4, "JFS1", 0x88, 0, 0 }, - { "swap", "swap", 0, 0x7ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x7ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xfff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xfff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "zfs", "zfs", 264, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, - { "zfs", "zfs", 264, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, - { NULL, NULL, 0, 0, 0, NULL, 0x0, 0, 0 } -}; - -static int null_uuid(const char *uuid) -{ - int i; - - for (i = 0; i < 16 && !uuid[i]; i++); - - return (i == 16); -} - - -static void uuid_end_bio(struct bio *bio) -{ - struct page *page = bio->bi_io_vec[0].bv_page; - - if (bio->bi_error) - SetPageError(page); - - unlock_page(page); - bio_put(bio); -} - - -/** - * submit - submit BIO request - * @dev: The block device we're using. - * @page_num: The page we're reading. - * - * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the - * textbook - allocate and initialize the bio. If we're writing, make sure - * the page is marked as dirty. Then submit it and carry on." 
-
-int bdev_matches_key(struct block_device *bdev, const char *key)
-{
-        unsigned char *data = NULL;
-        struct page *data_page = NULL;
-
-        int dev_offset, pg_num, pg_off, i;
-        int last_pg_num = -1;
-        int result = 0;
-        char buf[50];
-
-        if (null_uuid(key)) {
-                PRINTK("Refusing to find a NULL key.\n");
-                return 0;
-        }
-
-        if (!bdev->bd_disk) {
-                bdevname(bdev, buf);
-                PRINTK("bdev %s has no bd_disk.\n", buf);
-                return 0;
-        }
-
-        if (!bdev->bd_disk->queue) {
-                bdevname(bdev, buf);
-                PRINTK("bdev %s has no queue.\n", buf);
-                return 0;
-        }
-
-        for (i = 0; uuid_list[i].name; i++) {
-                struct uuid_info *dat = &uuid_list[i];
-
-                if (!dat->key || strcmp(dat->key, key))
-                        continue;
-
-                dev_offset = (dat->bkoff << 10) + dat->sboff;
-                pg_num = dev_offset >> 12;
-                pg_off = dev_offset & 0xfff;
-
-                if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1)
-                        continue;
-
-                if (pg_num != last_pg_num) {
-                        if (data_page) {
-                                kunmap(data_page);
-                                __free_page(data_page);
-                        }
-                        data_page = read_bdev_page(bdev, pg_num);
-                        if (!data_page)
-                                continue;
-                        data = kmap(data_page);
-                }
-
-                last_pg_num = pg_num;
-
-                if (strncmp(&data[pg_off], dat->magic, dat->sig_len))
-                        continue;
-
-                result = 1;
-                break;
-        }
-
-        if (data_page) {
-                kunmap(data_page);
-                __free_page(data_page);
-        }
-
-        return result;
-}
-
-/*
- * part_matches_fs_info - Does the given partition match the details given?
- *
- * Returns a score saying how good the match is.
- * 0 = no UUID match.
- * 1 = UUID but last mount time differs.
- * 2 = UUID, last mount time but not dev_t
- * 3 = perfect match
- *
- * This lets us cope elegantly with probing resulting in dev_ts changing
- * from boot to boot, and with the case where a user copies a partition
- * (UUID is non-unique), and we need to check the last mount time of the
- * correct partition.
- */
-int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek)
-{
-        struct block_device *bdev;
-        struct fs_info *got;
-        int result = 0;
-        char buf[50];
-
-        if (null_uuid((char *) &seek->uuid)) {
-                PRINTK("Refusing to find a NULL uuid.\n");
-                return 0;
-        }
-
-        bdev = bdget(part_devt(part));
-
-        PRINTK("part_matches fs info considering %x.\n", part_devt(part));
-
-        if (blkdev_get(bdev, FMODE_READ, 0)) {
-                PRINTK("blkdev_get failed.\n");
-                return 0;
-        }
-
-        if (!bdev->bd_disk) {
-                bdevname(bdev, buf);
-                PRINTK("bdev %s has no bd_disk.\n", buf);
-                goto out;
-        }
-
-        if (!bdev->bd_disk->queue) {
-                bdevname(bdev, buf);
-                PRINTK("bdev %s has no queue.\n", buf);
-                goto out;
-        }
-
-        got = fs_info_from_block_dev(bdev);
-
-        if (got && !memcmp(got->uuid, seek->uuid, 16)) {
-                PRINTK(" Have matching UUID.\n");
-                PRINTK(" Got: LMS %d, LM %p.\n", got->last_mount_size, got->last_mount);
-                PRINTK(" Seek: LMS %d, LM %p.\n", seek->last_mount_size, seek->last_mount);
-                result = 1;
-
-                if (got->last_mount_size == seek->last_mount_size &&
-                    got->last_mount && seek->last_mount &&
-                    !memcmp(got->last_mount, seek->last_mount,
-                            got->last_mount_size)) {
-                        result = 2;
-
-                        PRINTK(" Matching last mount time.\n");
-
-                        if (part_devt(part) == seek->dev_t) {
-                                result = 3;
-                                PRINTK(" Matching dev_t.\n");
-                        } else
-                                PRINTK("Dev_ts differ (%x vs %x).\n", part_devt(part), seek->dev_t);
-                }
-        }
-
-        PRINTK(" Score for %x is %d.\n", part_devt(part), result);
-        free_fs_info(got);
-out:
-        blkdev_put(bdev, FMODE_READ);
-        return result;
-}
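The 0-3 ladder documented above is the whole contract: every further matching detail raises the score by one, so a caller can probe all candidate partitions and keep the highest scorer. A plain-C model of the ladder (the struct and field names here are illustrative stand-ins, not the kernel's fs_info):

#include <stdio.h>
#include <string.h>

struct id {
        unsigned char uuid[16];
        char last_mount[8];
        unsigned dev;
};

static int score(const struct id *got, const struct id *seek)
{
        if (memcmp(got->uuid, seek->uuid, 16))
                return 0;       /* no UUID match */
        if (memcmp(got->last_mount, seek->last_mount, 8))
                return 1;       /* UUID only: could be a copied partition */
        if (got->dev != seek->dev)
                return 2;       /* UUID + mount time, but dev_t moved */
        return 3;               /* perfect match */
}

int main(void)
{
        struct id seek = { { 1 }, "stamp", 0x801 };
        struct id copy = { { 1 }, "other", 0x811 };     /* cloned UUID */

        printf("copy scores %d, original scores %d\n",
               score(&copy, &seek), score(&seek, &seek));
        return 0;
}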
-
-void free_fs_info(struct fs_info *fs_info)
-{
-        if (!fs_info || IS_ERR(fs_info))
-                return;
-
-        if (fs_info->last_mount)
-                kfree(fs_info->last_mount);
-
-        kfree(fs_info);
-}
-
-struct fs_info *fs_info_from_block_dev(struct block_device *bdev)
-{
-        unsigned char *data = NULL;
-        struct page *data_page = NULL;
-
-        int dev_offset, pg_num, pg_off;
-        int uuid_pg_num, uuid_pg_off, i;
-        unsigned char *uuid_data = NULL;
-        struct page *uuid_data_page = NULL;
-
-        int last_pg_num = -1, last_uuid_pg_num = 0;
-        char buf[50];
-        struct fs_info *fs_info = NULL;
-
-        bdevname(bdev, buf);
-
-        PRINTK("uuid_from_block_dev looking for partition type of %s.\n", buf);
-
-        for (i = 0; uuid_list[i].name; i++) {
-                struct uuid_info *dat = &uuid_list[i];
-                dev_offset = (dat->bkoff << 10) + dat->sboff;
-                pg_num = dev_offset >> 12;
-                pg_off = dev_offset & 0xfff;
-                uuid_pg_num = dat->uuid_offset >> 12;
-                uuid_pg_off = dat->uuid_offset & 0xfff;
-
-                if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1)
-                        continue;
-
-                /* Ignore partition types with no UUID offset */
-                if (!dat->uuid_offset)
-                        continue;
-
-                if (pg_num != last_pg_num) {
-                        if (data_page) {
-                                kunmap(data_page);
-                                __free_page(data_page);
-                        }
-                        data_page = read_bdev_page(bdev, pg_num);
-                        if (!data_page)
-                                continue;
-                        data = kmap(data_page);
-                }
-
-                last_pg_num = pg_num;
-
-                if (strncmp(&data[pg_off], dat->magic, dat->sig_len))
-                        continue;
-
-                PRINTK("This partition looks like %s.\n", dat->name);
-
-                fs_info = kzalloc(sizeof(struct fs_info), GFP_KERNEL);
-
-                if (!fs_info) {
-                        PRINTK("Failed to allocate fs_info struct.");
-                        fs_info = ERR_PTR(-ENOMEM);
-                        break;
-                }
-
-                /* UUID can't be off the end of the disk */
-                if ((uuid_pg_num > bdev->bd_part->nr_sects >> 3) ||
-                                !dat->uuid_offset)
-                        goto no_uuid;
-
-                if (!uuid_data || uuid_pg_num != last_uuid_pg_num) {
-                        /* No need to reread the page from above */
-                        if (uuid_pg_num == pg_num && uuid_data)
-                                memcpy(uuid_data, data, PAGE_SIZE);
-                        else {
-                                if (uuid_data_page) {
-                                        kunmap(uuid_data_page);
-                                        __free_page(uuid_data_page);
-                                }
-                                uuid_data_page = read_bdev_page(bdev, uuid_pg_num);
-                                if (!uuid_data_page)
-                                        continue;
-                                uuid_data = kmap(uuid_data_page);
-                        }
-                }
-
-                last_uuid_pg_num = uuid_pg_num;
-                memcpy(&fs_info->uuid, &uuid_data[uuid_pg_off], 16);
-                fs_info->dev_t = bdev->bd_dev;
-
-no_uuid:
-                PRINT_HEX_DUMP(KERN_EMERG, "fs_info_from_block_dev "
-                                "returning uuid ", DUMP_PREFIX_NONE, 16, 1,
-                                fs_info->uuid, 16, 0);
-
-                if (dat->last_mount_size) {
-                        int pg = dat->last_mount_offset >> 12, sz;
-                        int off = dat->last_mount_offset & 0xfff;
-                        struct page *last_mount = read_bdev_page(bdev, pg);
-                        unsigned char *last_mount_data;
-                        char *ptr;
-
-                        if (!last_mount) {
-                                fs_info = ERR_PTR(-ENOMEM);
-                                break;
-                        }
-                        last_mount_data = kmap(last_mount);
-                        sz = dat->last_mount_size;
-                        ptr = kmalloc(sz, GFP_KERNEL);
-
-                        if (!ptr) {
-                                printk(KERN_EMERG "fs_info_from_block_dev "
-                                        "failed to get memory for last mount "
-                                        "timestamp.");
-                                free_fs_info(fs_info);
-                                fs_info = ERR_PTR(-ENOMEM);
-                        } else {
-                                fs_info->last_mount = ptr;
-                                fs_info->last_mount_size = sz;
-                                memcpy(ptr, &last_mount_data[off], sz);
-                        }
-
-                        kunmap(last_mount);
-                        __free_page(last_mount);
-                }
-                break;
-        }
-
-        if (data_page) {
-                kunmap(data_page);
-                __free_page(data_page);
-        }
-
-        if (uuid_data_page) {
-                kunmap(uuid_data_page);
-                __free_page(uuid_data_page);
-        }
-
-        return fs_info;
-}
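fs_info_from_block_dev() reads up to two pages per table entry, one for the signature and one for the UUID, and skips the second device read when both land in the same page; it also caches the last signature page across entries, which pays off because the table is sorted by signature page. The caching idea in isolation, as a small standalone sketch (fetch() stands in for read_bdev_page(); the values are illustrative):

#include <stdio.h>

/* Remember the most recently fetched page, so consecutive lookups
 * that hit the same page do not trigger a second device read. */
static int last_pg = -1;
static int fetches;

static void fetch(int pg)
{
        if (pg != last_pg) {
                fetches++;      /* stands in for read_bdev_page() */
                last_pg = pg;
        }
}

int main(void)
{
        int offsets[] = { 0x438, 0x468, 0x2000 };       /* magic, UUID, elsewhere */
        int i;

        for (i = 0; i < 3; i++)
                fetch(offsets[i] >> 12);
        printf("%d device reads for 3 lookups\n", fetches);     /* prints 2 */
        return 0;
}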
-
-static int __init uuid_debug_setup(char *str)
-{
-        int value;
-
-        if (sscanf(str, "=%d", &value))
-                debug_enabled = value;
-
-        return 1;
-}
-
-__setup("uuid_debug", uuid_debug_setup);
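For reference, all of the PRINTK/PRINT_HEX_DUMP output removed above defaulted to off and was gated on this boot parameter; it was enabled by appending something like the following to the kernel command line (illustrative fragment):

        root=/dev/sda1 uuid_debug=1

With the file removed, the uuid_debug= parameter disappears along with it.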