27 files changed, 2270 insertions, 1193 deletions
diff --git a/lib/Kconfig b/lib/Kconfig
index 3cca12225..c585e4c40 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -362,6 +362,9 @@ config INTERVAL_TREE
 
 	  for more information.
 
+config RADIX_TREE_MULTIORDER
+	bool
+
 config ASSOCIATIVE_ARRAY
 	bool
 	help
@@ -523,6 +526,13 @@ config SG_SPLIT
 	 a scatterlist. This should be selected by a driver or an API which
 	 whishes to split a scatterlist amongst multiple DMA channels.
 
+config SG_POOL
+	def_bool n
+	help
+	 Provides a helper to allocate chained scatterlists. This should be
+	 selected by a driver or an API which wishes to allocate chained
+	 scatterlists.
+
 #
 # sg chaining option
 #
@@ -540,4 +550,7 @@ config STACKDEPOT
 	bool
 	select STACKTRACE
 
+config WBT
+	bool
+
 endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b4e287cde..42d3d798c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -257,6 +257,7 @@ config PAGE_OWNER
 
 config DEBUG_FS
 	bool "Debug Filesystem"
+	select SRCU
 	help
 	  debugfs is a virtual file system that kernel developers use to put
 	  debugging files into.  Enable this option to be able to read and
@@ -1289,6 +1290,39 @@ config TORTURE_TEST
 	tristate
 	default n
 
+config RCU_PERF_TEST
+	tristate "performance tests for RCU"
+	depends on DEBUG_KERNEL
+	select TORTURE_TEST
+	select SRCU
+	select TASKS_RCU
+	default n
+	help
+	  This option provides a kernel module that runs performance
+	  tests on the RCU infrastructure.  The kernel module may be built
+	  after the fact on the running kernel to be tested, if desired.
+
+	  Say Y here if you want RCU performance tests to be built into
+	  the kernel.
+	  Say M if you want the RCU performance tests to build as a module.
+	  Say N if you are unsure.
+
+config RCU_PERF_TEST_RUNNABLE
+	bool "performance tests for RCU runnable by default"
+	depends on RCU_PERF_TEST = y
+	default n
+	help
+	  This option provides a way to build the RCU performance tests
+	  directly into the kernel without them starting up at boot time.
+	  You can use /sys/module to manually override this setting.
+	  This /sys/module entry is available only when the RCU performance
+	  tests have been built into the kernel.
+
+	  Say Y here if you want the RCU performance tests to start during
+	  boot (you probably don't).
+	  Say N here if you want the RCU performance tests to start only
+	  after being manually enabled via /sys/module.
+
 config RCU_TORTURE_TEST
 	tristate "torture tests for RCU"
 	depends on DEBUG_KERNEL && !SCHED_BFS
@@ -1808,6 +1842,9 @@ config TEST_BITMAP
 
 	  If unsure, say N.
 
+config TEST_UUID
+	tristate "Test functions located in the uuid module at runtime"
+
 config TEST_RHASHTABLE
 	tristate "Perform selftest on resizable hash table"
 	default n
@@ -1816,6 +1853,17 @@ config TEST_RHASHTABLE
 
 	  If unsure, say N.
 
+config TEST_HASH
+	tristate "Perform selftest on hash functions"
+	default n
+	help
+	  Enable this option to test the kernel's integer (<linux/hash.h>)
+	  and string (<linux/stringhash.h>) hash functions on boot
+	  (or module load).
+
+	  This is intended to help people writing architecture-specific
+	  optimized versions.  If unsure, say N.
+
 endmenu # runtime tests
 
 config PROVIDE_OHCI1394_DMA_INIT
diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb
index c635a107a..533f91263 100644
--- a/lib/Kconfig.kgdb
+++ b/lib/Kconfig.kgdb
@@ -22,7 +22,7 @@ config KGDB_SERIAL_CONSOLE
 	tristate "KGDB: use kgdb over the serial console"
 	select CONSOLE_POLL
 	select MAGIC_SYSRQ
-	depends on TTY
+	depends on TTY && HW_CONSOLE
 	default y
 	help
 	  Share a serial console with kgdb. Sysrq-g must be used
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd436..5b5506e3b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -23,9 +23,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o \
 	 sha1.o md5.o irq_regs.o argv_split.o \
-	 proportions.o flex_proportions.o ratelimit.o show_mem.o \
+	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o nmi_backtrace.o
+	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
@@ -48,6 +48,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
+obj-$(CONFIG_TEST_HASH) += test_hash.o
 obj-$(CONFIG_TEST_KASAN) += test_kasan.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_LKM) += test_module.o
@@ -57,6 +58,7 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
 obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
 obj-$(CONFIG_TEST_PRINTF) += test_printf.o
 obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
+obj-$(CONFIG_TEST_UUID) += test_uuid.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
@@ -178,8 +180,10 @@ obj-$(CONFIG_GENERIC_STRNLEN_USER) += strnlen_user.o
 obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o
 
 obj-$(CONFIG_SG_SPLIT) += sg_split.o
+obj-$(CONFIG_SG_POOL) += sg_pool.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
 obj-$(CONFIG_IRQ_POLL) += irq_poll.o
+obj-$(CONFIG_WBT) += wbt.o
 
 obj-$(CONFIG_STACKDEPOT) += stackdepot.o
 KASAN_SANITIZE_stackdepot.o := n
diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c
index 554522934..0bd8a611e 100644
--- a/lib/asn1_decoder.c
+++ b/lib/asn1_decoder.c
@@ -12,6 +12,7 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
+#include <linux/module.h>
 #include <linux/asn1_decoder.h>
 #include <linux/asn1_ber_bytecode.h>
 
@@ -506,3 +507,5 @@ error:
 	return -EBADMSG;
 }
 EXPORT_SYMBOL_GPL(asn1_ber_decoder);
+
+MODULE_LICENSE("GPL");
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 519b5a10f..a8e12601e 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
  * Try to repair the damage, so we have a better chance to get useful
  * debug output.
  */
-static int
-debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
+static bool
+debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state),
 		   void * addr, enum debug_obj_state state)
 {
-	int fixed = 0;
-
-	if (fixup)
-		fixed = fixup(addr, state);
-	debug_objects_fixups += fixed;
-	return fixed;
+	if (fixup && fixup(addr, state)) {
+		debug_objects_fixups++;
+		return true;
+	}
+	return false;
 }
 
 static void debug_object_is_on_stack(void *addr, int onstack)
@@ -416,7 +415,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
 			state = obj->state;
 			raw_spin_unlock_irqrestore(&db->lock, flags);
 			ret = debug_object_fixup(descr->fixup_activate, addr, state);
-			return ret ? -EINVAL : 0;
+			return ret ? 0 : -EINVAL;
 
 		case ODEBUG_STATE_DESTROYED:
 			debug_print_object(obj, "activate");
@@ -432,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
 
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 	/*
-	 * This happens when a static object is activated. We
-	 * let the type specific code decide whether this is
-	 * true or not.
+	 * We are here when a static object is activated. We
+	 * let the type specific code confirm whether this is
+	 * true or not. If true, we just make sure that the
+	 * static object is tracked in the object tracker. If
+	 * not, this must be a bug, so we try to fix it up.
 	 */
-	if (debug_object_fixup(descr->fixup_activate, addr,
-			   ODEBUG_STATE_NOTAVAILABLE)) {
+	if (descr->is_static_object && descr->is_static_object(addr)) {
+		/* track this static object */
+		debug_object_init(addr, descr);
+		debug_object_activate(addr, descr);
+	} else {
 		debug_print_object(&o, "activate");
-		return -EINVAL;
+		ret = debug_object_fixup(descr->fixup_activate, addr,
+					ODEBUG_STATE_NOTAVAILABLE);
+		return ret ? 0 : -EINVAL;
 	}
 	return 0;
 }
@@ -603,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
 
 		raw_spin_unlock_irqrestore(&db->lock, flags);
 		/*
-		 * Maybe the object is static.  Let the type specific
-		 * code decide what to do.
+		 * Maybe the object is static, and we let the type specific
+		 * code confirm. Track this static object if true, else invoke
+		 * fixup.
 		 */
-		if (debug_object_fixup(descr->fixup_assert_init, addr,
-				       ODEBUG_STATE_NOTAVAILABLE))
+		if (descr->is_static_object && descr->is_static_object(addr)) {
+			/* Track this static object */
+			debug_object_init(addr, descr);
+		} else {
 			debug_print_object(&o, "assert_init");
+			debug_object_fixup(descr->fixup_assert_init, addr,
+					   ODEBUG_STATE_NOTAVAILABLE);
+		}
 		return;
 	}
 
@@ -793,11 +805,18 @@ struct self_test {
 
 static __initdata struct debug_obj_descr descr_type_test;
 
+static bool __init is_static_object(void *addr)
+{
+	struct self_test *obj = addr;
+
+	return obj->static_init;
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
  */
-static int __init fixup_init(void *addr, enum debug_obj_state state)
+static bool __init fixup_init(void *addr, enum debug_obj_state state)
 {
 	struct self_test *obj = addr;
 
@@ -805,37 +824,31 @@ static int __init fixup_init(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_init(obj, &descr_type_test);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
-static int __init fixup_activate(void *addr, enum debug_obj_state state)
+static bool __init fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct self_test *obj = addr;
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (obj->static_init == 1) {
-			debug_object_init(obj, &descr_type_test);
-			debug_object_activate(obj, &descr_type_test);
-			return 0;
-		}
-		return 1;
-
+		return true;
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_activate(obj, &descr_type_test);
-		return 1;
+		return true;
 
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -843,7 +856,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state)
  * fixup_destroy is called when:
  * - an active object is destroyed
  */
-static int __init fixup_destroy(void *addr, enum debug_obj_state state)
+static bool __init fixup_destroy(void *addr, enum debug_obj_state state)
 {
 	struct self_test *obj = addr;
 
@@ -851,9 +864,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_destroy(obj, &descr_type_test);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -861,7 +874,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
  * fixup_free is called when:
  * - an active object is freed
  */
-static int __init fixup_free(void *addr, enum debug_obj_state state)
+static bool __init fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct self_test *obj = addr;
 
@@ -869,9 +882,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_free(obj, &descr_type_test);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -917,6 +930,7 @@ out:
 
 static __initdata struct debug_obj_descr descr_type_test = {
 	.name			= "selftest",
+	.is_static_object	= is_static_object,
 	.fixup_init		= fixup_init,
 	.fixup_activate		= fixup_activate,
 	.fixup_destroy		= fixup_destroy,
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f129d..135ee6407 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
+	if (b == 1)
+		return r & -r;
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == 1)
+			return r & -r;
+		if (a == b)
+			return a << __ffs(r);
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
-	return b;
 }
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(b & r))
+		b >>= 1;
+	if (b == r)
+		return r;
+
+	for (;;) {
+		while (!(a & r))
+			a >>= 1;
+		if (a == r)
+			return r;
+		if (a == b)
+			return a;
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+		a >>= 1;
+		if (a & r)
+			a += b;
+		a >>= 1;
+	}
+}
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index ca5316e00..0cd522753 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -99,40 +99,44 @@
 }
 
 #define iterate_and_advance(i, n, v, I, B, K) {			\
-	size_t skip = i->iov_offset;				\
-	if (unlikely(i->type & ITER_BVEC)) {			\
-		const struct bio_vec *bvec;			\
-		struct bio_vec v;				\
-		iterate_bvec(i, n, v, bvec, skip, (B))		\
-		if (skip == bvec->bv_len) {			\
-			bvec++;					\
-			skip = 0;				\
-		}						\
-		i->nr_segs -= bvec - i->bvec;			\
-		i->bvec = bvec;					\
-	} else if (unlikely(i->type & ITER_KVEC)) {		\
-		const struct kvec *kvec;			\
-		struct kvec v;					\
-		iterate_kvec(i, n, v, kvec, skip, (K))		\
-		if (skip == kvec->iov_len) {			\
-			kvec++;					\
-			skip = 0;				\
-		}						\
-		i->nr_segs -= kvec - i->kvec;			\
-		i->kvec = kvec;					\
-	} else {						\
-		const struct iovec *iov;			\
-		struct iovec v;					\
-		iterate_iovec(i, n, v, iov, skip, (I))		\
-		if (skip == iov->iov_len) {			\
-			iov++;					\
-			skip = 0;				\
+	if (unlikely(i->count < n))				\
+		n = i->count;					\
+	if (i->count) {						\
+		size_t skip = i->iov_offset;			\
+		if (unlikely(i->type & ITER_BVEC)) {		\
+			const struct bio_vec *bvec;		\
+			struct bio_vec v;			\
+			iterate_bvec(i, n, v, bvec, skip, (B))	\
+			if (skip == bvec->bv_len) {		\
+				bvec++;				\
+				skip = 0;			\
+			}					\
+			i->nr_segs -= bvec - i->bvec;		\
+			i->bvec = bvec;				\
+		} else if (unlikely(i->type & ITER_KVEC)) {	\
+			const struct kvec *kvec;		\
+			struct kvec v;				\
+			iterate_kvec(i, n, v, kvec, skip, (K))	\
+			if (skip == kvec->iov_len) {		\
+				kvec++;				\
+				skip = 0;			\
+			}					\
+			i->nr_segs -= kvec - i->kvec;		\
+			i->kvec = kvec;				\
+		} else {					\
+			const struct iovec *iov;		\
+			struct iovec v;				\
+			iterate_iovec(i, n, v, iov, skip, (I))	\
+			if (skip == iov->iov_len) {		\
+				iov++;				\
+				skip = 0;			\
+			}					\
+			i->nr_segs -= iov - i->iov;		\
+			i->iov = iov;				\
 		}						\
-		i->nr_segs -= iov - i->iov;			\
-		i->iov = iov;					\
+		i->count -= n;					\
+		i->iov_offset = skip;				\
 	}							\
-	i->count -= n;						\
-	i->iov_offset = skip;					\
 }
 
 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
@@ -386,12 +390,6 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	const char *from = addr;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
 	iterate_and_advance(i, bytes, v,
 		__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
 			       v.iov_len),
@@ -407,12 +405,6 @@ EXPORT_SYMBOL(copy_to_iter);
 size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
 	char *to = addr;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
 	iterate_and_advance(i, bytes, v,
 		__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
 				 v.iov_len),
@@ -428,12 +420,6 @@ EXPORT_SYMBOL(copy_from_iter);
 size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
 	char *to = addr;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
 	iterate_and_advance(i, bytes, v,
 		__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
 					 v.iov_base, v.iov_len),
@@ -474,12 +460,6 @@ EXPORT_SYMBOL(copy_page_from_iter);
 
 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 {
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
 	iterate_and_advance(i, bytes, v,
 		__clear_user(v.iov_base, v.iov_len),
 		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
@@ -685,12 +665,6 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 	char *to = addr;
 	__wsum sum, next;
 	size_t off = 0;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
 	sum = *csum;
 	iterate_and_advance(i, bytes, v, ({
 		int err = 0;
@@ -729,12 +703,6 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
 	const char *from = addr;
 	__wsum sum, next;
 	size_t off = 0;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
 	sum = *csum;
 	iterate_and_advance(i, bytes, v, ({
 		int err = 0;
diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c
index eb15e7dc7..747606f9e 100644
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -20,6 +20,8 @@
 
 #include <linux/bitops.h>
 #include <linux/count_zeros.h>
+#include <linux/byteorder/generic.h>
+#include <linux/string.h>
 #include "mpi-internal.h"
 
 #define MAX_EXTERN_MPI_BITS 16384
@@ -163,7 +165,13 @@ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
 		    int *sign)
 {
 	uint8_t *p;
-	mpi_limb_t alimb;
+#if BYTES_PER_MPI_LIMB == 4
+	__be32 alimb;
+#elif BYTES_PER_MPI_LIMB == 8
+	__be64 alimb;
+#else
+#error please implement for this limb size.
+#endif
 	unsigned int n = mpi_get_size(a);
 	int i, lzeros;
 
@@ -183,38 +191,19 @@ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
 	p = buf;
 	*nbytes = n - lzeros;
 
-	for (i = a->nlimbs - 1; i >= 0; i--) {
-		alimb = a->d[i];
+	for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB,
+			lzeros %= BYTES_PER_MPI_LIMB;
+		i >= 0; i--) {
 #if BYTES_PER_MPI_LIMB == 4
-		*p++ = alimb >> 24;
-		*p++ = alimb >> 16;
-		*p++ = alimb >> 8;
-		*p++ = alimb;
+		alimb = cpu_to_be32(a->d[i]);
 #elif BYTES_PER_MPI_LIMB == 8
-		*p++ = alimb >> 56;
-		*p++ = alimb >> 48;
-		*p++ = alimb >> 40;
-		*p++ = alimb >> 32;
-		*p++ = alimb >> 24;
-		*p++ = alimb >> 16;
-		*p++ = alimb >> 8;
-		*p++ = alimb;
+		alimb = cpu_to_be64(a->d[i]);
 #else
 #error please implement for this limb size.
 #endif
-
-		if (lzeros > 0) {
-			if (lzeros >= sizeof(alimb)) {
-				p -= sizeof(alimb);
-			} else {
-				mpi_limb_t *limb1 = (void *)p - sizeof(alimb);
-				mpi_limb_t *limb2 = (void *)p - sizeof(alimb)
-							+ lzeros;
-				*limb1 = *limb2;
-				p -= lzeros;
-			}
-			lzeros -= sizeof(alimb);
-		}
+		memcpy(p, (u8 *)&alimb + lzeros, BYTES_PER_MPI_LIMB - lzeros);
+		p += BYTES_PER_MPI_LIMB - lzeros;
+		lzeros = 0;
 	}
 	return 0;
 }
@@ -359,7 +348,13 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes,
 		     int *sign)
 {
 	u8 *p, *p2;
-	mpi_limb_t alimb, alimb2;
+#if BYTES_PER_MPI_LIMB == 4
+	__be32 alimb;
+#elif BYTES_PER_MPI_LIMB == 8
+	__be64 alimb;
+#else
+#error please implement for this limb size.
+#endif
 	unsigned int n = mpi_get_size(a);
 	int i, x, y = 0, lzeros, buf_len;
 
@@ -380,42 +375,22 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes,
 	buf_len = sgl->length;
 	p2 = sg_virt(sgl);
 
-	for (i = a->nlimbs - 1; i >= 0; i--) {
-		alimb = a->d[i];
-		p = (u8 *)&alimb2;
+	for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB,
+			lzeros %= BYTES_PER_MPI_LIMB;
+		i >= 0; i--) {
 #if BYTES_PER_MPI_LIMB == 4
-		*p++ = alimb >> 24;
-		*p++ = alimb >> 16;
-		*p++ = alimb >> 8;
-		*p++ = alimb;
+		alimb = cpu_to_be32(a->d[i]);
 #elif BYTES_PER_MPI_LIMB == 8
-		*p++ = alimb >> 56;
-		*p++ = alimb >> 48;
-		*p++ = alimb >> 40;
-		*p++ = alimb >> 32;
-		*p++ = alimb >> 24;
-		*p++ = alimb >> 16;
-		*p++ = alimb >> 8;
-		*p++ = alimb;
+		alimb = cpu_to_be64(a->d[i]);
 #else
 #error please implement for this limb size.
 #endif
-		if (lzeros > 0) {
-			if (lzeros >= sizeof(alimb)) {
-				p -= sizeof(alimb);
-				continue;
-			} else {
-				mpi_limb_t *limb1 = (void *)p - sizeof(alimb);
-				mpi_limb_t *limb2 = (void *)p - sizeof(alimb)
-							+ lzeros;
-				*limb1 = *limb2;
-				p -= lzeros;
-				y = lzeros;
-			}
-			lzeros -= sizeof(alimb);
+		if (lzeros) {
+			y = lzeros;
+			lzeros = 0;
 		}
 
-		p = p - (sizeof(alimb) - y);
+		p = (u8 *)&alimb + y;
 
 		for (x = 0; x < sizeof(alimb) - y; x++) {
 			if (!buf_len) {
@@ -443,15 +418,15 @@ EXPORT_SYMBOL_GPL(mpi_write_to_sgl);
  * a new MPI and reads the content of the sgl to the MPI.
  *
  * @sgl:	scatterlist to read from
- * @len:	number of bytes to read
+ * @nbytes:	number of bytes to read
  *
  * Return:	Pointer to a new MPI or NULL on error
  */
-MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
+MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes)
 {
 	struct scatterlist *sg;
 	int x, i, j, z, lzeros, ents;
-	unsigned int nbits, nlimbs, nbytes;
+	unsigned int nbits, nlimbs;
 	mpi_limb_t a;
 	MPI val = NULL;
 
@@ -472,16 +447,12 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
 			break;
 
 		ents--;
+		nbytes -= lzeros;
 		lzeros = 0;
 	}
 
 	sgl = sg;
-
-	if (!ents)
-		nbytes = 0;
-	else
-		nbytes = len - lzeros;
-
+	nbytes -= lzeros;
 	nbits = nbytes * 8;
 	if (nbits > MAX_EXTERN_MPI_BITS) {
 		pr_info("MPI: mpi too large (%u bits)\n", nbits);
@@ -489,9 +460,8 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
 	}
 
 	if (nbytes > 0)
-		nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros));
-	else
-		nbits = 0;
+		nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros)) -
+			(BITS_PER_LONG - 8);
 
 	nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB);
 	val = mpi_alloc(nlimbs);
@@ -507,19 +477,14 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
 
 	j = nlimbs - 1;
 	a = 0;
-	z = 0;
-	x = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
-	x %= BYTES_PER_MPI_LIMB;
+	z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
+	z %= BYTES_PER_MPI_LIMB;
 
 	for_each_sg(sgl, sg, ents, i) {
 		const u8 *buffer = sg_virt(sg) + lzeros;
 		int len = sg->length - lzeros;
-		int buf_shift = x;
-
-		if  (sg_is_last(sg) && (len % BYTES_PER_MPI_LIMB))
-			len += BYTES_PER_MPI_LIMB - (len % BYTES_PER_MPI_LIMB);
 
-		for (; x < len + buf_shift; x++) {
+		for (x = 0; x < len; x++) {
 			a <<= 8;
 			a |= *buffer++;
 			if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) {
@@ -528,7 +493,6 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
 			}
 		}
 		z += x;
-		x = 0;
 		lzeros = 0;
 	}
 	return val;
diff --git a/lib/nlattr.c b/lib/nlattr.c
index f5907d232..fce1e9afc 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -355,6 +355,30 @@ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
 EXPORT_SYMBOL(__nla_reserve);
 
 /**
+ * __nla_reserve_64bit - reserve room for attribute on the skb and align it
+ * @skb: socket buffer to reserve room on
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @padattr: attribute type for the padding
+ *
+ * Adds a netlink attribute header to a socket buffer and reserves
+ * room for the payload but does not copy it. It also ensures that this
+ * attribute will have a 64-bit aligned nla_data() area.
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the attribute header and payload.
+ */
+struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
+				   int attrlen, int padattr)
+{
+	if (nla_need_padding_for_64bit(skb))
+		nla_align_64bit(skb, padattr);
+
+	return __nla_reserve(skb, attrtype, attrlen);
+}
+EXPORT_SYMBOL(__nla_reserve_64bit);
+
+/**
  * __nla_reserve_nohdr - reserve room for attribute without header
  * @skb: socket buffer to reserve room on
  * @attrlen: length of attribute payload
@@ -397,6 +421,36 @@ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
 EXPORT_SYMBOL(nla_reserve);
 
 /**
+ * nla_reserve_64bit - reserve room for attribute on the skb and align it
+ * @skb: socket buffer to reserve room on
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @padattr: attribute type for the padding
+ *
+ * Adds a netlink attribute header to a socket buffer and reserves
+ * room for the payload but does not copy it. It also ensures that this
+ * attribute will have a 64-bit aligned nla_data() area.
+ *
+ * Returns NULL if the tailroom of the skb is insufficient to store
+ * the attribute header and payload.
+ */
+struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen,
+				 int padattr)
+{
+	size_t len;
+
+	if (nla_need_padding_for_64bit(skb))
+		len = nla_total_size_64bit(attrlen);
+	else
+		len = nla_total_size(attrlen);
+	if (unlikely(skb_tailroom(skb) < len))
+		return NULL;
+
+	return __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
+}
+EXPORT_SYMBOL(nla_reserve_64bit);
+
+/**
  * nla_reserve_nohdr - reserve room for attribute without header
  * @skb: socket buffer to reserve room on
  * @attrlen: length of attribute payload
@@ -436,6 +490,27 @@ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
 EXPORT_SYMBOL(__nla_put);
 
 /**
+ * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ * @padattr: attribute type for the padding
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the attribute header and payload.
+ */
+void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
+		     const void *data, int padattr)
+{
+	struct nlattr *nla;
+
+	nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
+	memcpy(nla_data(nla), data, attrlen);
+}
+EXPORT_SYMBOL(__nla_put_64bit);
+
+/**
  * __nla_put_nohdr - Add a netlink attribute without header
  * @skb: socket buffer to add attribute to
  * @attrlen: length of attribute payload
@@ -474,6 +549,34 @@ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
 EXPORT_SYMBOL(nla_put);
 
 /**
+ * nla_put_64bit - Add a netlink attribute to a socket buffer and align it
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ * @padattr: attribute type for the padding
+ *
+ * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
+ * the attribute header and payload.
+ */
+int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
+		  const void *data, int padattr)
+{
+	size_t len;
+
+	if (nla_need_padding_for_64bit(skb))
+		len = nla_total_size_64bit(attrlen);
+	else
+		len = nla_total_size(attrlen);
+	if (unlikely(skb_tailroom(skb) < len))
+		return -EMSGSIZE;
+
+	__nla_put_64bit(skb, attrtype, attrlen, data, padattr);
+	return 0;
+}
+EXPORT_SYMBOL(nla_put_64bit);
+
+/**
  * nla_put_nohdr - Add a netlink attribute without header
  * @skb: socket buffer to add attribute to
  * @attrlen: length of attribute payload
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 6019c53c6..26caf51cc 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,33 +16,14 @@
 #include <linux/delay.h>
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
-#include <linux/seq_buf.h>
 
 #ifdef arch_trigger_all_cpu_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE		4096
-
-struct nmi_seq_buf {
-	unsigned char		buffer[NMI_BUF_SIZE];
-	struct seq_buf		seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
 
 /* "in progress" flag of arch_trigger_all_cpu_backtrace */
 static unsigned long backtrace_flag;
 
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
-	const char *buf = s->buffer + start;
-
-	printk("%.*s", (end - start) + 1, buf);
-}
-
 /*
  * When raise() is called it will be is passed a pointer to the
  * backtrace_mask. Architectures that call nmi_cpu_backtrace()
@@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
 void nmi_trigger_all_cpu_backtrace(bool include_self,
 				   void (*raise)(cpumask_t *mask))
 {
-	struct nmi_seq_buf *s;
-	int i, cpu, this_cpu = get_cpu();
+	int i, this_cpu = get_cpu();
 
 	if (test_and_set_bit(0, &backtrace_flag)) {
 		/*
@@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
 	if (!include_self)
 		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
 
-	cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-
-	/*
-	 * Set up per_cpu seq_buf buffers that the NMIs running on the other
-	 * CPUs will write to.
-	 */
-	for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
-		s = &per_cpu(nmi_print_seq, cpu);
-		seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
-	}
-
 	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
 		pr_info("Sending NMI to %s CPUs:\n",
 			(include_self ? "all" : "other"));
@@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
 	}
 
 	/*
-	 * Now that all the NMIs have triggered, we can dump out their
-	 * back traces safely to the console.
+	 * Force flush any remote buffers that might be stuck in IRQ context
+	 * and therefore could not run their irq_work.
 	 */
-	for_each_cpu(cpu, &printtrace_mask) {
-		int len, last_i = 0;
+	printk_nmi_flush();
 
-		s = &per_cpu(nmi_print_seq, cpu);
-		len = seq_buf_used(&s->seq);
-		if (!len)
-			continue;
-
-		/* Print line by line. */
-		for (i = 0; i < len; i++) {
-			if (s->buffer[i] == '\n') {
-				print_seq_line(s, last_i, i);
-				last_i = i + 1;
-			}
-		}
-		/* Check if there was a partial line. */
-		if (last_i < len) {
-			print_seq_line(s, last_i, len - 1);
-			pr_cont("\n");
-		}
-	}
-
-	clear_bit(0, &backtrace_flag);
-	smp_mb__after_atomic();
+	clear_bit_unlock(0, &backtrace_flag);
 	put_cpu();
 }
 
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
-	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	unsigned int len = seq_buf_used(&s->seq);
-
-	seq_buf_vprintf(&s->seq, fmt, args);
-	return seq_buf_used(&s->seq) - len;
-}
-
 bool nmi_cpu_backtrace(struct pt_regs *regs)
 {
 	int cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		printk_func_t printk_func_save = this_cpu_read(printk_func);
-
-		/* Replace printk to write into the NMI seq */
-		this_cpu_write(printk_func, nmi_vprintk);
 		pr_warn("NMI backtrace for cpu %d\n", cpu);
 		if (regs)
 			show_regs(regs);
 		else
 			dump_stack();
-		this_cpu_write(printk_func, printk_func_save);
-
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return true;
 	}
diff --git a/lib/nodemask.c b/lib/nodemask.c
new file mode 100644
index 000000000..e42a5bf44
--- /dev/null
+++ b/lib/nodemask.c
@@ -0,0 +1,30 @@
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+int __next_node_in(int node, const nodemask_t *srcp)
+{
+	int ret = __next_node(node, srcp);
+
+	if (ret == MAX_NUMNODES)	/* presumably: nothing after @node */
+		ret = __first_node(srcp);	/* restart from the first node */
+	return ret;
+}
+EXPORT_SYMBOL(__next_node_in);
+
+#ifdef CONFIG_NUMA
+/*
+ * Return the bit number of a randomly chosen set bit in *@maskp, or
+ * NUMA_NO_NODE if the nodemask is empty (nodes_weight() is zero).
+ */
+int node_random(const nodemask_t *maskp)
+{
+	int w, bit = NUMA_NO_NODE;
+
+	w = nodes_weight(*maskp);
+	if (w)		/* also keeps the "% w" below from dividing by zero */
+		bit = bitmap_ord_to_pos(maskp->bits,
+			get_random_int() % w, MAX_NUMNODES);
+	return bit;
+}
+#endif /* CONFIG_NUMA */
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index f051d69f0..72d36113c 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -19,7 +19,7 @@ static DEFINE_SPINLOCK(percpu_counters_lock);
 
 static struct debug_obj_descr percpu_counter_debug_descr;
 
-static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
+static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct percpu_counter *fbc = addr;
 
@@ -27,9 +27,9 @@ static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		percpu_counter_destroy(fbc);
 		debug_object_free(fbc, &percpu_counter_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
diff --git a/lib/proportions.c b/lib/proportions.c
deleted file mode 100644
index efa54f259..000000000
--- a/lib/proportions.c
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * Floating proportions
- *
- *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
- *
- * Description:
- *
- * The floating proportion is a time derivative with an exponentially decaying
- * history:
- *
- *   p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
- *
- * Where j is an element from {prop_local}, x_{j} is j's number of events,
- * and i the time period over which the differential is taken. So d/dt_{-i} is
- * the differential over the i-th last period.
- *
- * The decaying history gives smooth transitions. The time differential carries
- * the notion of speed.
- *
- * The denominator is 2^(1+i) because we want the series to be normalised, ie.
- *
- *   \Sum_{i=0} 1/2^(1+i) = 1
- *
- * Further more, if we measure time (t) in the same events as x; so that:
- *
- *   t = \Sum_{j} x_{j}
- *
- * we get that:
- *
- *   \Sum_{j} p_{j} = 1
- *
- * Writing this in an iterative fashion we get (dropping the 'd's):
- *
- *   if (++x_{j}, ++t > period)
- *     t /= 2;
- *     for_each (j)
- *       x_{j} /= 2;
- *
- * so that:
- *
- *   p_{j} = x_{j} / t;
- *
- * We optimize away the '/= 2' for the global time delta by noting that:
- *
- *   if (++t > period) t /= 2:
- *
- * Can be approximated by:
- *
- *   period/2 + (++t % period/2)
- *
- * [ Furthermore, when we choose period to be 2^n it can be written in terms of
- *   binary operations and wraparound artefacts disappear. ]
- *
- * Also note that this yields a natural counter of the elapsed periods:
- *
- *   c = t / (period/2)
- *
- * [ Its monotonic increasing property can be applied to mitigate the wrap-
- *   around issue. ]
- *
- * This allows us to do away with the loop over all prop_locals on each period
- * expiration. By remembering the period count under which it was last accessed
- * as c_{j}, we can obtain the number of 'missed' cycles from:
- *
- *   c - c_{j}
- *
- * We can then lazily catch up to the global period count every time we are
- * going to use x_{j}, by doing:
- *
- *   x_{j} /= 2^(c - c_{j}), c_{j} = c
- */
-
-#include <linux/proportions.h>
-#include <linux/rcupdate.h>
-
-int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp)
-{
-	int err;
-
-	if (shift > PROP_MAX_SHIFT)
-		shift = PROP_MAX_SHIFT;
-
-	pd->index = 0;
-	pd->pg[0].shift = shift;
-	mutex_init(&pd->mutex);
-	err = percpu_counter_init(&pd->pg[0].events, 0, gfp);
-	if (err)
-		goto out;
-
-	err = percpu_counter_init(&pd->pg[1].events, 0, gfp);
-	if (err)
-		percpu_counter_destroy(&pd->pg[0].events);
-
-out:
-	return err;
-}
-
-/*
- * We have two copies, and flip between them to make it seem like an atomic
- * update. The update is not really atomic wrt the events counter, but
- * it is internally consistent with the bit layout depending on shift.
- *
- * We copy the events count, move the bits around and flip the index.
- */
-void prop_change_shift(struct prop_descriptor *pd, int shift)
-{
-	int index;
-	int offset;
-	u64 events;
-	unsigned long flags;
-
-	if (shift > PROP_MAX_SHIFT)
-		shift = PROP_MAX_SHIFT;
-
-	mutex_lock(&pd->mutex);
-
-	index = pd->index ^ 1;
-	offset = pd->pg[pd->index].shift - shift;
-	if (!offset)
-		goto out;
-
-	pd->pg[index].shift = shift;
-
-	local_irq_save(flags);
-	events = percpu_counter_sum(&pd->pg[pd->index].events);
-	if (offset < 0)
-		events <<= -offset;
-	else
-		events >>= offset;
-	percpu_counter_set(&pd->pg[index].events, events);
-
-	/*
-	 * ensure the new pg is fully written before the switch
-	 */
-	smp_wmb();
-	pd->index = index;
-	local_irq_restore(flags);
-
-	synchronize_rcu();
-
-out:
-	mutex_unlock(&pd->mutex);
-}
-
-/*
- * wrap the access to the data in an rcu_read_lock() section;
- * this is used to track the active references.
- */
-static struct prop_global *prop_get_global(struct prop_descriptor *pd)
-__acquires(RCU)
-{
-	int index;
-
-	rcu_read_lock();
-	index = pd->index;
-	/*
-	 * match the wmb from vcd_flip()
-	 */
-	smp_rmb();
-	return &pd->pg[index];
-}
-
-static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
-__releases(RCU)
-{
-	rcu_read_unlock();
-}
-
-static void
-prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
-{
-	int offset = *pl_shift - new_shift;
-
-	if (!offset)
-		return;
-
-	if (offset < 0)
-		*pl_period <<= -offset;
-	else
-		*pl_period >>= offset;
-
-	*pl_shift = new_shift;
-}
-
-/*
- * PERCPU
- */
-
-#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
-
-int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp)
-{
-	raw_spin_lock_init(&pl->lock);
-	pl->shift = 0;
-	pl->period = 0;
-	return percpu_counter_init(&pl->events, 0, gfp);
-}
-
-void prop_local_destroy_percpu(struct prop_local_percpu *pl)
-{
-	percpu_counter_destroy(&pl->events);
-}
-
-/*
- * Catch up with missed period expirations.
- *
- *   until (c_{j} == c)
- *     x_{j} -= x_{j}/2;
- *     c_{j}++;
- */
-static
-void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
-{
-	unsigned long period = 1UL << (pg->shift - 1);
-	unsigned long period_mask = ~(period - 1);
-	unsigned long global_period;
-	unsigned long flags;
-
-	global_period = percpu_counter_read(&pg->events);
-	global_period &= period_mask;
-
-	/*
-	 * Fast path - check if the local and global period count still match
-	 * outside of the lock.
-	 */
-	if (pl->period == global_period)
-		return;
-
-	raw_spin_lock_irqsave(&pl->lock, flags);
-	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
-
-	/*
-	 * For each missed period, we half the local counter.
-	 * basically:
-	 *   pl->events >> (global_period - pl->period);
-	 */
-	period = (global_period - pl->period) >> (pg->shift - 1);
-	if (period < BITS_PER_LONG) {
-		s64 val = percpu_counter_read(&pl->events);
-
-		if (val < (nr_cpu_ids * PROP_BATCH))
-			val = percpu_counter_sum(&pl->events);
-
-		__percpu_counter_add(&pl->events, -val + (val >> period),
-					PROP_BATCH);
-	} else
-		percpu_counter_set(&pl->events, 0);
-
-	pl->period = global_period;
-	raw_spin_unlock_irqrestore(&pl->lock, flags);
-}
-
-/*
- *   ++x_{j}, ++t
- */
-void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
-{
-	struct prop_global *pg = prop_get_global(pd);
-
-	prop_norm_percpu(pg, pl);
-	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
-	percpu_counter_add(&pg->events, 1);
-	prop_put_global(pd, pg);
-}
-
-/*
- * identical to __prop_inc_percpu, except that it limits this pl's fraction to
- * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded.
- */
-void __prop_inc_percpu_max(struct prop_descriptor *pd,
-			   struct prop_local_percpu *pl, long frac)
-{
-	struct prop_global *pg = prop_get_global(pd);
-
-	prop_norm_percpu(pg, pl);
-
-	if (unlikely(frac != PROP_FRAC_BASE)) {
-		unsigned long period_2 = 1UL << (pg->shift - 1);
-		unsigned long counter_mask = period_2 - 1;
-		unsigned long global_count;
-		long numerator, denominator;
-
-		numerator = percpu_counter_read_positive(&pl->events);
-		global_count = percpu_counter_read(&pg->events);
-		denominator = period_2 + (global_count & counter_mask);
-
-		if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT))
-			goto out_put;
-	}
-
-	percpu_counter_add(&pl->events, 1);
-	percpu_counter_add(&pg->events, 1);
-
-out_put:
-	prop_put_global(pd, pg);
-}
-
-/*
- * Obtain a fraction of this proportion
- *
- *   p_{j} = x_{j} / (period/2 + t % period/2)
- */
-void prop_fraction_percpu(struct prop_descriptor *pd,
-		struct prop_local_percpu *pl,
-		long *numerator, long *denominator)
-{
-	struct prop_global *pg = prop_get_global(pd);
-	unsigned long period_2 = 1UL << (pg->shift - 1);
-	unsigned long counter_mask = period_2 - 1;
-	unsigned long global_count;
-
-	prop_norm_percpu(pg, pl);
-	*numerator = percpu_counter_read_positive(&pl->events);
-
-	global_count = percpu_counter_read(&pg->events);
-	*denominator = period_2 + (global_count & counter_mask);
-
-	prop_put_global(pd, pg);
-}
-
-/*
- * SINGLE
- */
-
-int prop_local_init_single(struct prop_local_single *pl)
-{
-	raw_spin_lock_init(&pl->lock);
-	pl->shift = 0;
-	pl->period = 0;
-	pl->events = 0;
-	return 0;
-}
-
-void prop_local_destroy_single(struct prop_local_single *pl)
-{
-}
-
-/*
- * Catch up with missed period expirations.
- */
-static
-void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl)
-{
-	unsigned long period = 1UL << (pg->shift - 1);
-	unsigned long period_mask = ~(period - 1);
-	unsigned long global_period;
-	unsigned long flags;
-
-	global_period = percpu_counter_read(&pg->events);
-	global_period &= period_mask;
-
-	/*
-	 * Fast path - check if the local and global period count still match
-	 * outside of the lock.
-	 */
-	if (pl->period == global_period)
-		return;
-
-	raw_spin_lock_irqsave(&pl->lock, flags);
-	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
-	/*
-	 * For each missed period, we half the local counter.
-	 */
-	period = (global_period - pl->period) >> (pg->shift - 1);
-	if (likely(period < BITS_PER_LONG))
-		pl->events >>= period;
-	else
-		pl->events = 0;
-	pl->period = global_period;
-	raw_spin_unlock_irqrestore(&pl->lock, flags);
-}
-
-/*
- *   ++x_{j}, ++t
- */
-void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
-{
-	struct prop_global *pg = prop_get_global(pd);
-
-	prop_norm_single(pg, pl);
-	pl->events++;
-	percpu_counter_add(&pg->events, 1);
-	prop_put_global(pd, pg);
-}
-
-/*
- * Obtain a fraction of this proportion
- *
- *   p_{j} = x_{j} / (period/2 + t % period/2)
- */
-void prop_fraction_single(struct prop_descriptor *pd,
-	       	struct prop_local_single *pl,
-		long *numerator, long *denominator)
-{
-	struct prop_global *pg = prop_get_global(pd);
-	unsigned long period_2 = 1UL << (pg->shift - 1);
-	unsigned long counter_mask = period_2 - 1;
-	unsigned long global_count;
-
-	prop_norm_single(pg, pl);
-	*numerator = pl->events;
-
-	global_count = percpu_counter_read(&pg->events);
-	*denominator = period_2 + (global_count & counter_mask);
-
-	prop_put_global(pd, pg);
-}
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 1624c4117..bc7852f95 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -4,6 +4,8 @@
  * Copyright (C) 2005 SGI, Christoph Lameter
  * Copyright (C) 2006 Nick Piggin
  * Copyright (C) 2012 Konstantin Khlebnikov
+ * Copyright (C) 2016 Intel, Matthew Wilcox
+ * Copyright (C) 2016 Intel, Ross Zwisler
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -37,12 +39,6 @@
 
 
 /*
- * The height_to_maxindex array needs to be one deeper than the maximum
- * path as height 0 holds only 1 entry.
- */
-static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
-
-/*
  * Radix tree node cache.
  */
 static struct kmem_cache *radix_tree_node_cachep;
@@ -64,20 +60,58 @@ static struct kmem_cache *radix_tree_node_cachep;
  * Per-cpu pool of preloaded nodes
  */
 struct radix_tree_preload {
-	int nr;
+	unsigned nr;
 	/* nodes->private_data points to next preallocated node */
 	struct radix_tree_node *nodes;
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
-static inline void *ptr_to_indirect(void *ptr)
+static inline void *node_to_entry(void *ptr)
+{
+	return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
+}
+
+#define RADIX_TREE_RETRY	node_to_entry(NULL)
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+/* Sibling slots point directly to another slot in the same node */
+static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+{
+	void **ptr = node;
+	return (parent->slots <= ptr) &&
+			(ptr < parent->slots + RADIX_TREE_MAP_SIZE);
+}
+#else
+static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+{
+	return false;
+}
+#endif
+
+static inline unsigned long get_slot_offset(struct radix_tree_node *parent,
+						 void **slot)
 {
-	return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
+	return slot - parent->slots;
 }
 
-static inline void *indirect_to_ptr(void *ptr)
+static unsigned int radix_tree_descend(struct radix_tree_node *parent,
+			struct radix_tree_node **nodep, unsigned long index)
 {
-	return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
+	unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
+	void **entry = rcu_dereference_raw(parent->slots[offset]);
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	if (radix_tree_is_internal_node(entry)) {
+		unsigned long siboff = get_slot_offset(parent, entry);
+		if (siboff < RADIX_TREE_MAP_SIZE) {	/* sibling entry */
+			offset = siboff;	/* use the slot it points at */
+			entry = rcu_dereference_raw(parent->slots[offset]);
+		}
+	}
+#endif
+
+	*nodep = (void *)entry;		/* whatever that slot holds */
+	return offset;			/* slot index actually used in @parent */
 }
 
 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
@@ -108,7 +142,7 @@ static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
 	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
 }
 
-static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
 {
 	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
 }
@@ -120,7 +154,12 @@ static inline void root_tag_clear_all(struct radix_tree_root *root)
 
 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
 {
-	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+	return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline unsigned root_tags_get(struct radix_tree_root *root)
+{
+	return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT;
 }
 
 /*
@@ -129,7 +168,7 @@ static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
  */
 static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
 {
-	int idx;
+	unsigned idx;
 	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
 		if (node->tags[tag][idx])
 			return 1;
@@ -173,38 +212,45 @@ radix_tree_find_next_bit(const unsigned long *addr,
 	return size;
 }
 
-#if 0
-static void dump_node(void *slot, int height, int offset)
+#ifndef __KERNEL__
+static void dump_node(struct radix_tree_node *node, unsigned long index)
 {
-	struct radix_tree_node *node;
-	int i;
+	unsigned long i;
 
-	if (!slot)
-		return;
+	pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
+		node, node->offset,
+		node->tags[0][0], node->tags[1][0], node->tags[2][0],
+		node->shift, node->count, node->parent);
 
-	if (height == 0) {
-		pr_debug("radix entry %p offset %d\n", slot, offset);
-		return;
+	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+		unsigned long first = index | (i << node->shift);
+		unsigned long last = first | ((1UL << node->shift) - 1);
+		void *entry = node->slots[i];
+		if (!entry)
+			continue;
+		if (is_sibling_entry(node, entry)) {
+			pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n",
+					entry, i,
+					*(void **)entry_to_node(entry),
+					first, last);
+		} else if (!radix_tree_is_internal_node(entry)) {
+			pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
+					entry, i, first, last);
+		} else {
+			dump_node(entry_to_node(entry), first);
+		}
 	}
-
-	node = indirect_to_ptr(slot);
-	pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n",
-		slot, offset, node->tags[0][0], node->tags[1][0],
-		node->tags[2][0], node->path, node->count, node->parent);
-
-	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
-		dump_node(node->slots[i], height - 1, i);
 }
 
 /* For debug */
 static void radix_tree_dump(struct radix_tree_root *root)
 {
-	pr_debug("radix root: %p height %d rnode %p tags %x\n",
-			root, root->height, root->rnode,
+	pr_debug("radix root: %p rnode %p tags %x\n",
+			root, root->rnode,
 			root->gfp_mask >> __GFP_BITS_SHIFT);
-	if (!radix_tree_is_indirect_ptr(root->rnode))
+	if (!radix_tree_is_internal_node(root->rnode))
 		return;
-	dump_node(root->rnode, root->height, 0);
+	dump_node(entry_to_node(root->rnode), 0);
 }
 #endif
 
@@ -219,19 +265,20 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 	gfp_t gfp_mask = root_gfp_mask(root);
 
 	/*
-	 * Preload code isn't irq safe and it doesn't make sence to use
-	 * preloading in the interrupt anyway as all the allocations have to
-	 * be atomic. So just do normal allocation when in interrupt.
+	 * Preload code isn't irq safe and it doesn't make sense to use
+	 * preloading during an interrupt anyway as all the allocations have
+	 * to be atomic. So just do normal allocation when in interrupt.
 	 */
 	if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
 		struct radix_tree_preload *rtp;
 
 		/*
 		 * Even if the caller has preloaded, try to allocate from the
-		 * cache first for the new node to get accounted.
+		 * cache first for the new node to get accounted to the memory
+		 * cgroup.
 		 */
 		ret = kmem_cache_alloc(radix_tree_node_cachep,
-				       gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN);
+				       gfp_mask | __GFP_NOWARN);
 		if (ret)
 			goto out;
 
@@ -254,10 +301,9 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 		kmemleak_update_trace(ret);
 		goto out;
 	}
-	ret = kmem_cache_alloc(radix_tree_node_cachep,
-			       gfp_mask | __GFP_ACCOUNT);
+	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 out:
-	BUG_ON(radix_tree_is_indirect_ptr(ret));
+	BUG_ON(radix_tree_is_internal_node(ret));
 	return ret;
 }
 
@@ -302,6 +348,12 @@ static int __radix_tree_preload(gfp_t gfp_mask)
 	struct radix_tree_node *node;
 	int ret = -ENOMEM;
 
+	/*
+	 * Nodes preloaded by one cgroup can be used by another cgroup, so
+	 * they should never be accounted to any particular memory cgroup.
+	 */
+	gfp_mask &= ~__GFP_ACCOUNT;
+
 	preempt_disable();
 	rtp = this_cpu_ptr(&radix_tree_preloads);
 	while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
@@ -357,38 +409,58 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
 EXPORT_SYMBOL(radix_tree_maybe_preload);
 
 /*
- *	Return the maximum key which can be store into a
- *	radix tree with height HEIGHT.
+ * The maximum index which can be stored under a node with the given shift
  */
-static inline unsigned long radix_tree_maxindex(unsigned int height)
+static inline unsigned long shift_maxindex(unsigned int shift)
+{
+	return (RADIX_TREE_MAP_SIZE << shift) - 1;
+}
+
+static inline unsigned long node_maxindex(struct radix_tree_node *node)
+{
+	return shift_maxindex(node->shift);
+}
+
+static unsigned radix_tree_load_root(struct radix_tree_root *root,
+		struct radix_tree_node **nodep, unsigned long *maxindex)
 {
-	return height_to_maxindex[height];
+	struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
+
+	*nodep = node;
+
+	if (likely(radix_tree_is_internal_node(node))) {
+		node = entry_to_node(node);
+		*maxindex = node_maxindex(node);
+		return node->shift + RADIX_TREE_MAP_SHIFT;
+	}
+
+	*maxindex = 0;
+	return 0;
 }
 
 /*
  *	Extend a radix tree so it can store key @index.
  */
 static int radix_tree_extend(struct radix_tree_root *root,
-				unsigned long index, unsigned order)
+				unsigned long index, unsigned int shift)
 {
-	struct radix_tree_node *node;
 	struct radix_tree_node *slot;
-	unsigned int height;
+	unsigned int maxshift;
 	int tag;
 
-	/* Figure out what the height should be.  */
-	height = root->height + 1;
-	while (index > radix_tree_maxindex(height))
-		height++;
+	/* Figure out what the shift should be.  */
+	maxshift = shift;
+	while (index > shift_maxindex(maxshift))
+		maxshift += RADIX_TREE_MAP_SHIFT;
 
-	if ((root->rnode == NULL) && (order == 0)) {
-		root->height = height;
+	slot = root->rnode;
+	if (!slot)
 		goto out;
-	}
 
 	do {
-		unsigned int newheight;
-		if (!(node = radix_tree_node_alloc(root)))
+		struct radix_tree_node *node = radix_tree_node_alloc(root);
+
+		if (!node)
 			return -ENOMEM;
 
 		/* Propagate the aggregated tag info into the new root */
@@ -397,25 +469,20 @@ static int radix_tree_extend(struct radix_tree_root *root,
 				tag_set(node, tag, 0);
 		}
 
-		/* Increase the height.  */
-		newheight = root->height+1;
-		BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
-		node->path = newheight;
+		BUG_ON(shift > BITS_PER_LONG);
+		node->shift = shift;
+		node->offset = 0;
 		node->count = 1;
 		node->parent = NULL;
-		slot = root->rnode;
-		if (radix_tree_is_indirect_ptr(slot) && newheight > 1) {
-			slot = indirect_to_ptr(slot);
-			slot->parent = node;
-			slot = ptr_to_indirect(slot);
-		}
+		if (radix_tree_is_internal_node(slot))
+			entry_to_node(slot)->parent = node;
 		node->slots[0] = slot;
-		node = ptr_to_indirect(node);
-		rcu_assign_pointer(root->rnode, node);
-		root->height = newheight;
-	} while (height > root->height);
+		slot = node_to_entry(node);
+		rcu_assign_pointer(root->rnode, slot);
+		shift += RADIX_TREE_MAP_SHIFT;
+	} while (shift <= maxshift);
 out:
-	return 0;
+	return maxshift + RADIX_TREE_MAP_SHIFT;
 }
 
 /**
@@ -439,71 +506,70 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			unsigned order, struct radix_tree_node **nodep,
 			void ***slotp)
 {
-	struct radix_tree_node *node = NULL, *slot;
-	unsigned int height, shift, offset;
-	int error;
+	struct radix_tree_node *node = NULL, *child;
+	void **slot = (void **)&root->rnode;
+	unsigned long maxindex;
+	unsigned int shift, offset = 0;
+	unsigned long max = index | ((1UL << order) - 1);
 
-	BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT));
+	shift = radix_tree_load_root(root, &child, &maxindex);
 
 	/* Make sure the tree is high enough.  */
-	if (index > radix_tree_maxindex(root->height)) {
-		error = radix_tree_extend(root, index, order);
-		if (error)
+	if (max > maxindex) {
+		int error = radix_tree_extend(root, max, shift);
+		if (error < 0)
 			return error;
+		shift = error;
+		child = root->rnode;
+		if (order == shift)
+			shift += RADIX_TREE_MAP_SHIFT;
 	}
 
-	slot = root->rnode;
-
-	height = root->height;
-	shift = height * RADIX_TREE_MAP_SHIFT;
-
-	offset = 0;			/* uninitialised var warning */
 	while (shift > order) {
-		if (slot == NULL) {
+		shift -= RADIX_TREE_MAP_SHIFT;
+		if (child == NULL) {
 			/* Have to add a child node.  */
-			if (!(slot = radix_tree_node_alloc(root)))
+			child = radix_tree_node_alloc(root);
+			if (!child)
 				return -ENOMEM;
-			slot->path = height;
-			slot->parent = node;
-			if (node) {
-				rcu_assign_pointer(node->slots[offset],
-							ptr_to_indirect(slot));
+			child->shift = shift;
+			child->offset = offset;
+			child->parent = node;
+			rcu_assign_pointer(*slot, node_to_entry(child));
+			if (node)
 				node->count++;
-				slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
-			} else
-				rcu_assign_pointer(root->rnode,
-							ptr_to_indirect(slot));
-		} else if (!radix_tree_is_indirect_ptr(slot))
+		} else if (!radix_tree_is_internal_node(child))
 			break;
 
 		/* Go a level down */
-		height--;
-		shift -= RADIX_TREE_MAP_SHIFT;
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		node = indirect_to_ptr(slot);
-		slot = node->slots[offset];
+		node = entry_to_node(child);
+		offset = radix_tree_descend(node, &child, index);
+		slot = &node->slots[offset];
 	}
 
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
 	/* Insert pointers to the canonical entry */
-	if ((shift - order) > 0) {
-		int i, n = 1 << (shift - order);
+	if (order > shift) {
+		unsigned i, n = 1 << (order - shift);
 		offset = offset & ~(n - 1);
-		slot = ptr_to_indirect(&node->slots[offset]);
+		slot = &node->slots[offset];
+		child = node_to_entry(slot);
 		for (i = 0; i < n; i++) {
-			if (node->slots[offset + i])
+			if (slot[i])
 				return -EEXIST;
 		}
 
 		for (i = 1; i < n; i++) {
-			rcu_assign_pointer(node->slots[offset + i], slot);
+			rcu_assign_pointer(slot[i], child);
 			node->count++;
 		}
 	}
+#endif
 
 	if (nodep)
 		*nodep = node;
 	if (slotp)
-		*slotp = node ? node->slots + offset : (void **)&root->rnode;
+		*slotp = slot;
 	return 0;
 }
 
@@ -523,7 +589,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 	void **slot;
 	int error;
 
-	BUG_ON(radix_tree_is_indirect_ptr(item));
+	BUG_ON(radix_tree_is_internal_node(item));
 
 	error = __radix_tree_create(root, index, order, &node, &slot);
 	if (error)
@@ -533,12 +599,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 	rcu_assign_pointer(*slot, item);
 
 	if (node) {
+		unsigned offset = get_slot_offset(node, slot);
 		node->count++;
-		BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK));
-		BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK));
+		BUG_ON(tag_get(node, 0, offset));
+		BUG_ON(tag_get(node, 1, offset));
+		BUG_ON(tag_get(node, 2, offset));
 	} else {
-		BUG_ON(root_tag_get(root, 0));
-		BUG_ON(root_tag_get(root, 1));
+		BUG_ON(root_tags_get(root));
 	}
 
 	return 0;
@@ -563,44 +630,25 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 			  struct radix_tree_node **nodep, void ***slotp)
 {
 	struct radix_tree_node *node, *parent;
-	unsigned int height, shift;
+	unsigned long maxindex;
 	void **slot;
 
-	node = rcu_dereference_raw(root->rnode);
-	if (node == NULL)
+ restart:
+	parent = NULL;
+	slot = (void **)&root->rnode;
+	radix_tree_load_root(root, &node, &maxindex);
+	if (index > maxindex)
 		return NULL;
 
-	if (!radix_tree_is_indirect_ptr(node)) {
-		if (index > 0)
-			return NULL;
+	while (radix_tree_is_internal_node(node)) {
+		unsigned offset;
 
-		if (nodep)
-			*nodep = NULL;
-		if (slotp)
-			*slotp = (void **)&root->rnode;
-		return node;
+		if (node == RADIX_TREE_RETRY)
+			goto restart;
+		parent = entry_to_node(node);
+		offset = radix_tree_descend(parent, &node, index);
+		slot = parent->slots + offset;
 	}
-	node = indirect_to_ptr(node);
-
-	height = node->path & RADIX_TREE_HEIGHT_MASK;
-	if (index > radix_tree_maxindex(height))
-		return NULL;
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
-	do {
-		parent = node;
-		slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK);
-		node = rcu_dereference_raw(*slot);
-		if (node == NULL)
-			return NULL;
-		if (!radix_tree_is_indirect_ptr(node))
-			break;
-		node = indirect_to_ptr(node);
-
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	} while (height > 0);
 
 	if (nodep)
 		*nodep = parent;
@@ -654,59 +702,72 @@ EXPORT_SYMBOL(radix_tree_lookup);
  *	radix_tree_tag_set - set a tag on a radix tree node
  *	@root:		radix tree root
  *	@index:		index key
- *	@tag: 		tag index
+ *	@tag:		tag index
  *
  *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
  *	corresponding to @index in the radix tree.  From
  *	the root all the way down to the leaf node.
  *
- *	Returns the address of the tagged item.   Setting a tag on a not-present
+ *	Returns the address of the tagged item.  Setting a tag on a not-present
  *	item is a bug.
  */
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
 {
-	unsigned int height, shift;
-	struct radix_tree_node *slot;
+	struct radix_tree_node *node, *parent;
+	unsigned long maxindex;
 
-	height = root->height;
-	BUG_ON(index > radix_tree_maxindex(height));
+	radix_tree_load_root(root, &node, &maxindex);
+	BUG_ON(index > maxindex);
 
-	slot = indirect_to_ptr(root->rnode);
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	while (radix_tree_is_internal_node(node)) {
+		unsigned offset;
 
-	while (height > 0) {
-		int offset;
+		parent = entry_to_node(node);
+		offset = radix_tree_descend(parent, &node, index);
+		BUG_ON(!node);
 
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!tag_get(slot, tag, offset))
-			tag_set(slot, tag, offset);
-		slot = slot->slots[offset];
-		BUG_ON(slot == NULL);
-		if (!radix_tree_is_indirect_ptr(slot))
-			break;
-		slot = indirect_to_ptr(slot);
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
+		if (!tag_get(parent, tag, offset))
+			tag_set(parent, tag, offset);
 	}
 
 	/* set the root's tag bit */
-	if (slot && !root_tag_get(root, tag))
+	if (!root_tag_get(root, tag))
 		root_tag_set(root, tag);
 
-	return slot;
+	return node;
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
 
+static void node_tag_clear(struct radix_tree_root *root,
+				struct radix_tree_node *node,
+				unsigned int tag, unsigned int offset)
+{
+	while (node) {			/* walk up via ->parent */
+		if (!tag_get(node, tag, offset))
+			return;		/* bit already clear at this level */
+		tag_clear(node, tag, offset);
+		if (any_tag_set(node, tag))
+			return;		/* node still has @tag bits set */
+		/* node has no @tag bits left: clear its parent's bit too */
+		offset = node->offset;
+		node = node->parent;
+	}
+
+	/* clear the root's tag bit */
+	if (root_tag_get(root, tag))
+		root_tag_clear(root, tag);
+}
+
 /**
  *	radix_tree_tag_clear - clear a tag on a radix tree node
  *	@root:		radix tree root
  *	@index:		index key
- *	@tag: 		tag index
+ *	@tag:		tag index
  *
  *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
- *	corresponding to @index in the radix tree.  If
- *	this causes the leaf node to have no tags set then clear the tag in the
+ *	corresponding to @index in the radix tree.  If this causes
+ *	the leaf node to have no tags set then clear the tag in the
  *	next-to-leaf node, etc.
  *
  *	Returns the address of the tagged item on success, else NULL.  ie:
@@ -715,52 +776,25 @@ EXPORT_SYMBOL(radix_tree_tag_set);
 void *radix_tree_tag_clear(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
 {
-	struct radix_tree_node *node = NULL;
-	struct radix_tree_node *slot = NULL;
-	unsigned int height, shift;
+	struct radix_tree_node *node, *parent;
+	unsigned long maxindex;
 	int uninitialized_var(offset);
 
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		goto out;
-
-	shift = height * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
+	radix_tree_load_root(root, &node, &maxindex);
+	if (index > maxindex)
+		return NULL;
 
-	while (shift) {
-		if (slot == NULL)
-			goto out;
-		if (!radix_tree_is_indirect_ptr(slot))
-			break;
-		slot = indirect_to_ptr(slot);
+	parent = NULL;
 
-		shift -= RADIX_TREE_MAP_SHIFT;
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		node = slot;
-		slot = slot->slots[offset];
+	while (radix_tree_is_internal_node(node)) {
+		parent = entry_to_node(node);
+		offset = radix_tree_descend(parent, &node, index);
 	}
 
-	if (slot == NULL)
-		goto out;
+	if (node)
+		node_tag_clear(root, parent, tag, offset);
 
-	while (node) {
-		if (!tag_get(node, tag, offset))
-			goto out;
-		tag_clear(node, tag, offset);
-		if (any_tag_set(node, tag))
-			goto out;
-
-		index >>= RADIX_TREE_MAP_SHIFT;
-		offset = index & RADIX_TREE_MAP_MASK;
-		node = node->parent;
-	}
-
-	/* clear the root's tag bit */
-	if (root_tag_get(root, tag))
-		root_tag_clear(root, tag);
-
-out:
-	return slot;
+	return node;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
 
@@ -768,7 +802,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
  * radix_tree_tag_get - get a tag on a radix tree node
  * @root:		radix tree root
  * @index:		index key
- * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
+ * @tag:		tag index (< RADIX_TREE_MAX_TAGS)
  *
  * Return values:
  *
@@ -782,48 +816,44 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
 int radix_tree_tag_get(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
 {
-	unsigned int height, shift;
-	struct radix_tree_node *node;
+	struct radix_tree_node *node, *parent;
+	unsigned long maxindex;
 
-	/* check the root's tag bit */
 	if (!root_tag_get(root, tag))
 		return 0;
 
-	node = rcu_dereference_raw(root->rnode);
-	if (node == NULL)
+	radix_tree_load_root(root, &node, &maxindex);
+	if (index > maxindex)
 		return 0;
-
-	if (!radix_tree_is_indirect_ptr(node))
-		return (index == 0);
-	node = indirect_to_ptr(node);
-
-	height = node->path & RADIX_TREE_HEIGHT_MASK;
-	if (index > radix_tree_maxindex(height))
+	if (node == NULL)
 		return 0;
 
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	while (radix_tree_is_internal_node(node)) {
+		unsigned offset;
 
-	for ( ; ; ) {
-		int offset;
+		parent = entry_to_node(node);
+		offset = radix_tree_descend(parent, &node, index);
 
-		if (node == NULL)
+		if (!node)
 			return 0;
-		node = indirect_to_ptr(node);
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!tag_get(node, tag, offset))
+		if (!tag_get(parent, tag, offset))
 			return 0;
-		if (height == 1)
-			return 1;
-		node = rcu_dereference_raw(node->slots[offset]);
-		if (!radix_tree_is_indirect_ptr(node))
-			return 1;
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
+		if (node == RADIX_TREE_RETRY)
+			break;
 	}
+
+	return 1;
 }
 EXPORT_SYMBOL(radix_tree_tag_get);
 
+static inline void __set_iter_shift(struct radix_tree_iter *iter,
+					unsigned int shift)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	iter->shift = shift;
+#endif
+}
+
 /**
  * radix_tree_next_chunk - find next chunk of slots for iteration
  *
@@ -835,9 +865,9 @@ EXPORT_SYMBOL(radix_tree_tag_get);
 void **radix_tree_next_chunk(struct radix_tree_root *root,
 			     struct radix_tree_iter *iter, unsigned flags)
 {
-	unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
-	struct radix_tree_node *rnode, *node;
-	unsigned long index, offset, height;
+	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
+	struct radix_tree_node *node, *child;
+	unsigned long index, offset, maxindex;
 
 	if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
 		return NULL;
@@ -855,33 +885,28 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 	if (!index && iter->index)
 		return NULL;
 
-	rnode = rcu_dereference_raw(root->rnode);
-	if (radix_tree_is_indirect_ptr(rnode)) {
-		rnode = indirect_to_ptr(rnode);
-	} else if (rnode && !index) {
+ restart:
+	radix_tree_load_root(root, &child, &maxindex);
+	if (index > maxindex)
+		return NULL;
+	if (!child)
+		return NULL;
+
+	if (!radix_tree_is_internal_node(child)) {
 		/* Single-slot tree */
-		iter->index = 0;
-		iter->next_index = 1;
+		iter->index = index;
+		iter->next_index = maxindex + 1;
 		iter->tags = 1;
+		__set_iter_shift(iter, 0);
 		return (void **)&root->rnode;
-	} else
-		return NULL;
-
-restart:
-	height = rnode->path & RADIX_TREE_HEIGHT_MASK;
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	offset = index >> shift;
+	}
 
-	/* Index outside of the tree */
-	if (offset >= RADIX_TREE_MAP_SIZE)
-		return NULL;
+	do {
+		node = entry_to_node(child);
+		offset = radix_tree_descend(node, &child, index);
 
-	node = rnode;
-	while (1) {
-		struct radix_tree_node *slot;
 		if ((flags & RADIX_TREE_ITER_TAGGED) ?
-				!test_bit(offset, node->tags[tag]) :
-				!node->slots[offset]) {
+				!tag_get(node, tag, offset) : !child) {
 			/* Hole detected */
 			if (flags & RADIX_TREE_ITER_CONTIG)
 				return NULL;
@@ -893,35 +918,30 @@ restart:
 						offset + 1);
 			else
 				while (++offset	< RADIX_TREE_MAP_SIZE) {
-					if (node->slots[offset])
+					void *slot = node->slots[offset];
+					if (is_sibling_entry(node, slot))
+						continue;
+					if (slot)
 						break;
 				}
-			index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1);
-			index += offset << shift;
+			index &= ~node_maxindex(node);
+			index += offset << node->shift;
 			/* Overflow after ~0UL */
 			if (!index)
 				return NULL;
 			if (offset == RADIX_TREE_MAP_SIZE)
 				goto restart;
+			child = rcu_dereference_raw(node->slots[offset]);
 		}
 
-		/* This is leaf-node */
-		if (!shift)
-			break;
-
-		slot = rcu_dereference_raw(node->slots[offset]);
-		if (slot == NULL)
+		if ((child == NULL) || (child == RADIX_TREE_RETRY))
 			goto restart;
-		if (!radix_tree_is_indirect_ptr(slot))
-			break;
-		node = indirect_to_ptr(slot);
-		shift -= RADIX_TREE_MAP_SHIFT;
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-	}
+	} while (radix_tree_is_internal_node(child));
 
 	/* Update the iterator state */
-	iter->index = index;
-	iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1;
+	iter->index = (index &~ node_maxindex(node)) | (offset << node->shift);
+	iter->next_index = (index | node_maxindex(node)) + 1;
+	__set_iter_shift(iter, node->shift);
 
 	/* Construct iter->tags bit-mask from node->tags[tag] array */
 	if (flags & RADIX_TREE_ITER_TAGGED) {
@@ -967,7 +987,7 @@ EXPORT_SYMBOL(radix_tree_next_chunk);
  * set is outside the range we are scanning. This reults in dangling tags and
  * can lead to problems with later tag operations (e.g. livelocks on lookups).
  *
- * The function returns number of leaves where the tag was set and sets
+ * The function returns the number of leaves where the tag was set and sets
  * *first_indexp to the first unscanned index.
  * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
  * be prepared to handle that.
@@ -977,14 +997,13 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
 		unsigned long nr_to_tag,
 		unsigned int iftag, unsigned int settag)
 {
-	unsigned int height = root->height;
-	struct radix_tree_node *node = NULL;
-	struct radix_tree_node *slot;
-	unsigned int shift;
+	struct radix_tree_node *parent, *node, *child;
+	unsigned long maxindex;
 	unsigned long tagged = 0;
 	unsigned long index = *first_indexp;
 
-	last_index = min(last_index, radix_tree_maxindex(height));
+	radix_tree_load_root(root, &child, &maxindex);
+	last_index = min(last_index, maxindex);
 	if (index > last_index)
 		return 0;
 	if (!nr_to_tag)
@@ -993,80 +1012,62 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
 		*first_indexp = last_index + 1;
 		return 0;
 	}
-	if (height == 0) {
+	if (!radix_tree_is_internal_node(child)) {
 		*first_indexp = last_index + 1;
 		root_tag_set(root, settag);
 		return 1;
 	}
 
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = indirect_to_ptr(root->rnode);
+	node = entry_to_node(child);
 
 	for (;;) {
-		unsigned long upindex;
-		int offset;
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!slot->slots[offset])
+		unsigned offset = radix_tree_descend(node, &child, index);
+		if (!child)
 			goto next;
-		if (!tag_get(slot, iftag, offset))
+		if (!tag_get(node, iftag, offset))
 			goto next;
-		if (shift) {
-			node = slot;
-			slot = slot->slots[offset];
-			if (radix_tree_is_indirect_ptr(slot)) {
-				slot = indirect_to_ptr(slot);
-				shift -= RADIX_TREE_MAP_SHIFT;
-				continue;
-			} else {
-				slot = node;
-				node = node->parent;
-			}
+		/* Sibling slots never have tags set on them */
+		if (radix_tree_is_internal_node(child)) {
+			node = entry_to_node(child);
+			continue;
 		}
 
 		/* tag the leaf */
-		tagged += 1 << shift;
-		tag_set(slot, settag, offset);
+		tagged++;
+		tag_set(node, settag, offset);
 
 		/* walk back up the path tagging interior nodes */
-		upindex = index;
-		while (node) {
-			upindex >>= RADIX_TREE_MAP_SHIFT;
-			offset = upindex & RADIX_TREE_MAP_MASK;
-
+		parent = node;
+		for (;;) {
+			offset = parent->offset;
+			parent = parent->parent;
+			if (!parent)
+				break;
 			/* stop if we find a node with the tag already set */
-			if (tag_get(node, settag, offset))
+			if (tag_get(parent, settag, offset))
 				break;
-			tag_set(node, settag, offset);
-			node = node->parent;
+			tag_set(parent, settag, offset);
 		}
-
-		/*
-		 * Small optimization: now clear that node pointer.
-		 * Since all of this slot's ancestors now have the tag set
-		 * from setting it above, we have no further need to walk
-		 * back up the tree setting tags, until we update slot to
-		 * point to another radix_tree_node.
-		 */
-		node = NULL;
-
-next:
-		/* Go to next item at level determined by 'shift' */
-		index = ((index >> shift) + 1) << shift;
+ next:
+		/* Go to next entry in node */
+		index = ((index >> node->shift) + 1) << node->shift;
 		/* Overflow can happen when last_index is ~0UL... */
 		if (index > last_index || !index)
 			break;
-		if (tagged >= nr_to_tag)
-			break;
-		while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
+		offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
+		while (offset == 0) {
 			/*
 			 * We've fully scanned this node. Go up. Because
 			 * last_index is guaranteed to be in the tree, what
 			 * we do below cannot wander astray.
 			 */
-			slot = slot->parent;
-			shift += RADIX_TREE_MAP_SHIFT;
+			node = node->parent;
+			offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
 		}
+		if (is_sibling_entry(node, node->slots[offset]))
+			goto next;
+		if (tagged >= nr_to_tag)
+			break;
 	}
 	/*
 	 * We need not to tag the root tag if there is no tag which is set with
@@ -1095,9 +1096,10 @@ EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
  *
  *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
  *	rcu_read_lock. In this case, rather than the returned results being
- *	an atomic snapshot of the tree at a single point in time, the semantics
- *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
- *	have been issued in individual locks, and results stored in 'results'.
+ *	an atomic snapshot of the tree at a single point in time, the
+ *	semantics of an RCU protected gang lookup are as though multiple
+ *	radix_tree_lookups have been issued in individual locks, and results
+ *	stored in 'results'.
  */
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
@@ -1114,7 +1116,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 		results[ret] = rcu_dereference_raw(*slot);
 		if (!results[ret])
 			continue;
-		if (radix_tree_is_indirect_ptr(results[ret])) {
+		if (radix_tree_is_internal_node(results[ret])) {
 			slot = radix_tree_iter_retry(&iter);
 			continue;
 		}
@@ -1197,7 +1199,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		results[ret] = rcu_dereference_raw(*slot);
 		if (!results[ret])
 			continue;
-		if (radix_tree_is_indirect_ptr(results[ret])) {
+		if (radix_tree_is_internal_node(results[ret])) {
 			slot = radix_tree_iter_retry(&iter);
 			continue;
 		}
@@ -1247,58 +1249,48 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
 #if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
 #include <linux/sched.h> /* for cond_resched() */
 
+struct locate_info {
+	unsigned long found_index;
+	bool stop;
+};
+
 /*
  * This linear search is at present only useful to shmem_unuse_inode().
  */
 static unsigned long __locate(struct radix_tree_node *slot, void *item,
-			      unsigned long index, unsigned long *found_index)
+			      unsigned long index, struct locate_info *info)
 {
-	unsigned int shift, height;
 	unsigned long i;
 
-	height = slot->path & RADIX_TREE_HEIGHT_MASK;
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
-	for ( ; height > 1; height--) {
-		i = (index >> shift) & RADIX_TREE_MAP_MASK;
-		for (;;) {
-			if (slot->slots[i] != NULL)
-				break;
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-			i++;
-			if (i == RADIX_TREE_MAP_SIZE)
+	do {
+		unsigned int shift = slot->shift;
+
+		for (i = (index >> shift) & RADIX_TREE_MAP_MASK;
+		     i < RADIX_TREE_MAP_SIZE;
+		     i++, index += (1UL << shift)) {
+			struct radix_tree_node *node =
+					rcu_dereference_raw(slot->slots[i]);
+			if (node == RADIX_TREE_RETRY)
 				goto out;
-		}
-
-		slot = rcu_dereference_raw(slot->slots[i]);
-		if (slot == NULL)
-			goto out;
-		if (!radix_tree_is_indirect_ptr(slot)) {
-			if (slot == item) {
-				*found_index = index + i;
-				index = 0;
-			} else {
-				index += shift;
+			if (!radix_tree_is_internal_node(node)) {
+				if (node == item) {
+					info->found_index = index;
+					info->stop = true;
+					goto out;
+				}
+				continue;
 			}
-			goto out;
+			node = entry_to_node(node);
+			if (is_sibling_entry(slot, node))
+				continue;
+			slot = node;
+			break;
 		}
-		slot = indirect_to_ptr(slot);
-		shift -= RADIX_TREE_MAP_SHIFT;
-	}
+	} while (i < RADIX_TREE_MAP_SIZE);
 
-	/* Bottom level: check items */
-	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
-		if (slot->slots[i] == item) {
-			*found_index = index + i;
-			index = 0;
-			goto out;
-		}
-	}
-	index += RADIX_TREE_MAP_SIZE;
 out:
+	if ((index == 0) && (i == RADIX_TREE_MAP_SIZE))
+		info->stop = true;
 	return index;
 }
 
@@ -1316,32 +1308,35 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 	struct radix_tree_node *node;
 	unsigned long max_index;
 	unsigned long cur_index = 0;
-	unsigned long found_index = -1;
+	struct locate_info info = {
+		.found_index = -1,
+		.stop = false,
+	};
 
 	do {
 		rcu_read_lock();
 		node = rcu_dereference_raw(root->rnode);
-		if (!radix_tree_is_indirect_ptr(node)) {
+		if (!radix_tree_is_internal_node(node)) {
 			rcu_read_unlock();
 			if (node == item)
-				found_index = 0;
+				info.found_index = 0;
 			break;
 		}
 
-		node = indirect_to_ptr(node);
-		max_index = radix_tree_maxindex(node->path &
-						RADIX_TREE_HEIGHT_MASK);
+		node = entry_to_node(node);
+
+		max_index = node_maxindex(node);
 		if (cur_index > max_index) {
 			rcu_read_unlock();
 			break;
 		}
 
-		cur_index = __locate(node, item, cur_index, &found_index);
+		cur_index = __locate(node, item, cur_index, &info);
 		rcu_read_unlock();
 		cond_resched();
-	} while (cur_index != 0 && cur_index <= max_index);
+	} while (!info.stop && cur_index <= max_index);
 
-	return found_index;
+	return info.found_index;
 }
 #else
 unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
@@ -1351,47 +1346,45 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 #endif /* CONFIG_SHMEM && CONFIG_SWAP */
 
 /**
- *	radix_tree_shrink    -    shrink height of a radix tree to minimal
+ *	radix_tree_shrink    -    shrink radix tree to minimum height
  *	@root		radix tree root
  */
-static inline void radix_tree_shrink(struct radix_tree_root *root)
+static inline bool radix_tree_shrink(struct radix_tree_root *root)
 {
-	/* try to shrink tree height */
-	while (root->height > 0) {
-		struct radix_tree_node *to_free = root->rnode;
-		struct radix_tree_node *slot;
+	bool shrunk = false;
 
-		BUG_ON(!radix_tree_is_indirect_ptr(to_free));
-		to_free = indirect_to_ptr(to_free);
+	for (;;) {
+		struct radix_tree_node *node = root->rnode;
+		struct radix_tree_node *child;
+
+		if (!radix_tree_is_internal_node(node))
+			break;
+		node = entry_to_node(node);
 
 		/*
 		 * The candidate node has more than one child, or its child
-		 * is not at the leftmost slot, or it is a multiorder entry,
-		 * we cannot shrink.
+		 * is not at the leftmost slot, or the child is a multiorder
+		 * entry, we cannot shrink.
 		 */
-		if (to_free->count != 1)
+		if (node->count != 1)
+			break;
+		child = node->slots[0];
+		if (!child)
 			break;
-		slot = to_free->slots[0];
-		if (!slot)
+		if (!radix_tree_is_internal_node(child) && node->shift)
 			break;
 
+		if (radix_tree_is_internal_node(child))
+			entry_to_node(child)->parent = NULL;
+
 		/*
 		 * We don't need rcu_assign_pointer(), since we are simply
 		 * moving the node from one part of the tree to another: if it
 		 * was safe to dereference the old pointer to it
-		 * (to_free->slots[0]), it will be safe to dereference the new
+		 * (node->slots[0]), it will be safe to dereference the new
 		 * one (root->rnode) as far as dependent read barriers go.
 		 */
-		if (root->height > 1) {
-			if (!radix_tree_is_indirect_ptr(slot))
-				break;
-
-			slot = indirect_to_ptr(slot);
-			slot->parent = NULL;
-			slot = ptr_to_indirect(slot);
-		}
-		root->rnode = slot;
-		root->height--;
+		root->rnode = child;
 
 		/*
 		 * We have a dilemma here. The node's slot[0] must not be
@@ -1403,7 +1396,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 		 * their slot to become empty sooner or later.
 		 *
 		 * For example, lockless pagecache will look up a slot, deref
-		 * the page pointer, and if the page is 0 refcount it means it
+		 * the page pointer, and if the page has 0 refcount it means it
 		 * was concurrently deleted from pagecache so try the deref
 		 * again. Fortunately there is already a requirement for logic
 		 * to retry the entire slot lookup -- the indirect pointer
@@ -1411,12 +1404,14 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 		 * also results in a stale slot). So tag the slot as indirect
 		 * to force callers to retry.
 		 */
-		if (root->height == 0)
-			*((unsigned long *)&to_free->slots[0]) |=
-						RADIX_TREE_INDIRECT_PTR;
+		if (!radix_tree_is_internal_node(child))
+			node->slots[0] = RADIX_TREE_RETRY;
 
-		radix_tree_node_free(to_free);
+		radix_tree_node_free(node);
+		shrunk = true;
 	}
+
+	return shrunk;
 }
 
 /**
@@ -1439,24 +1434,17 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
 		struct radix_tree_node *parent;
 
 		if (node->count) {
-			if (node == indirect_to_ptr(root->rnode)) {
-				radix_tree_shrink(root);
-				if (root->height == 0)
-					deleted = true;
-			}
+			if (node == entry_to_node(root->rnode))
+				deleted |= radix_tree_shrink(root);
 			return deleted;
 		}
 
 		parent = node->parent;
 		if (parent) {
-			unsigned int offset;
-
-			offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
-			parent->slots[offset] = NULL;
+			parent->slots[node->offset] = NULL;
 			parent->count--;
 		} else {
 			root_tag_clear_all(root);
-			root->height = 0;
 			root->rnode = NULL;
 		}
 
@@ -1469,6 +1457,20 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
 	return deleted;
 }
 
+static inline void delete_sibling_entries(struct radix_tree_node *node,
+					void *ptr, unsigned offset)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	int i;
+	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+		if (node->slots[offset + i] != ptr)
+			break;
+		node->slots[offset + i] = NULL;
+		node->count--;
+	}
+#endif
+}
+
 /**
  *	radix_tree_delete_item    -    delete an item from a radix tree
  *	@root:		radix tree root
@@ -1484,7 +1486,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 			     unsigned long index, void *item)
 {
 	struct radix_tree_node *node;
-	unsigned int offset, i;
+	unsigned int offset;
 	void **slot;
 	void *entry;
 	int tag;
@@ -1502,24 +1504,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 		return entry;
 	}
 
-	offset = index & RADIX_TREE_MAP_MASK;
+	offset = get_slot_offset(node, slot);
 
-	/*
-	 * Clear all tags associated with the item to be deleted.
-	 * This way of doing it would be inefficient, but seldom is any set.
-	 */
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (tag_get(node, tag, offset))
-			radix_tree_tag_clear(root, index, tag);
-	}
+	/* Clear all tags associated with the item to be deleted.  */
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		node_tag_clear(root, node, tag, offset);
 
-	/* Delete any sibling slots pointing to this slot */
-	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
-		if (node->slots[offset + i] != ptr_to_indirect(slot))
-			break;
-		node->slots[offset + i] = NULL;
-		node->count--;
-	}
+	delete_sibling_entries(node, node_to_entry(slot), offset);
 	node->slots[offset] = NULL;
 	node->count--;
 
@@ -1544,6 +1535,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
+struct radix_tree_node *radix_tree_replace_clear_tags(
+			struct radix_tree_root *root,
+			unsigned long index, void *entry)
+{
+	struct radix_tree_node *node;
+	void **slot;
+
+	__radix_tree_lookup(root, index, &node, &slot);
+
+	if (node) {
+		unsigned int tag, offset = get_slot_offset(node, slot);
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			node_tag_clear(root, node, tag, offset);
+	} else {
+		/* Clear root node tags */
+		root->gfp_mask &= __GFP_BITS_MASK;
+	}
+
+	radix_tree_replace_slot(slot, entry);
+	return node;
+}
+
 /**
  *	radix_tree_tagged - test whether any items in the tree are tagged
  *	@root:		radix tree root
@@ -1564,45 +1577,24 @@ radix_tree_node_ctor(void *arg)
 	INIT_LIST_HEAD(&node->private_list);
 }
 
-static __init unsigned long __maxindex(unsigned int height)
-{
-	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
-	int shift = RADIX_TREE_INDEX_BITS - width;
-
-	if (shift < 0)
-		return ~0UL;
-	if (shift >= BITS_PER_LONG)
-		return 0UL;
-	return ~0UL >> shift;
-}
-
-static __init void radix_tree_init_maxindex(void)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
-		height_to_maxindex[i] = __maxindex(i);
-}
-
 static int radix_tree_callback(struct notifier_block *nfb,
-                            unsigned long action,
-                            void *hcpu)
+				unsigned long action, void *hcpu)
 {
-       int cpu = (long)hcpu;
-       struct radix_tree_preload *rtp;
-       struct radix_tree_node *node;
-
-       /* Free per-cpu pool of perloaded nodes */
-       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-               rtp = &per_cpu(radix_tree_preloads, cpu);
-               while (rtp->nr) {
+	int cpu = (long)hcpu;
+	struct radix_tree_preload *rtp;
+	struct radix_tree_node *node;
+
+	/* Free per-cpu pool of preloaded nodes */
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		rtp = &per_cpu(radix_tree_preloads, cpu);
+		while (rtp->nr) {
 			node = rtp->nodes;
 			rtp->nodes = node->private_data;
 			kmem_cache_free(radix_tree_node_cachep, node);
 			rtp->nr--;
-               }
-       }
-       return NOTIFY_OK;
+		}
+	}
+	return NOTIFY_OK;
 }
 
 void __init radix_tree_init(void)
@@ -1611,6 +1603,5 @@ void __init radix_tree_init(void)
 			sizeof(struct radix_tree_node), 0,
 			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
 			radix_tree_node_ctor);
-	radix_tree_init_maxindex();
 	hotcpu_notifier(radix_tree_callback, 0);
 }
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index cc808707d..5d845ffd7 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -487,6 +487,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * rhashtable_walk_init - Initialise an iterator
  * @ht:		Table to walk over
  * @iter:	Hash table Iterator
+ * @gfp:	GFP flags for allocations
  *
  * This function prepares a hash table walk.
  *
@@ -504,14 +505,15 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * You must call rhashtable_walk_exit if this function returns
  * successfully.
  */
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter)
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
+			 gfp_t gfp)
 {
 	iter->ht = ht;
 	iter->p = NULL;
 	iter->slot = 0;
 	iter->skip = 0;
 
-	iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL);
+	iter->walker = kmalloc(sizeof(*iter->walker), gfp);
 	if (!iter->walker)
 		return -ENOMEM;
 
diff --git a/lib/sg_pool.c b/lib/sg_pool.c
new file mode 100644
index 000000000..6dd30615a
--- /dev/null
+++ b/lib/sg_pool.c
@@ -0,0 +1,172 @@
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/mempool.h>
+#include <linux/slab.h>
+
+#define SG_MEMPOOL_NR		ARRAY_SIZE(sg_pools)
+#define SG_MEMPOOL_SIZE		2
+
+struct sg_pool {
+	size_t		size;
+	char		*name;
+	struct kmem_cache	*slab;
+	mempool_t	*pool;
+};
+
+#define SP(x) { .size = x, "sgpool-" __stringify(x) }
+#if (SG_CHUNK_SIZE < 32)
+#error SG_CHUNK_SIZE is too small (must be 32 or greater)
+#endif
+static struct sg_pool sg_pools[] = {
+	SP(8),
+	SP(16),
+#if (SG_CHUNK_SIZE > 32)
+	SP(32),
+#if (SG_CHUNK_SIZE > 64)
+	SP(64),
+#if (SG_CHUNK_SIZE > 128)
+	SP(128),
+#if (SG_CHUNK_SIZE > 256)
+#error SG_CHUNK_SIZE is too large (256 MAX)
+#endif
+#endif
+#endif
+#endif
+	SP(SG_CHUNK_SIZE)
+};
+#undef SP
+
+static inline unsigned int sg_pool_index(unsigned short nents)
+{
+	unsigned int index;
+
+	BUG_ON(nents > SG_CHUNK_SIZE);
+
+	if (nents <= 8)
+		index = 0;
+	else
+		index = get_count_order(nents) - 3;
+
+	return index;
+}
+
+static void sg_pool_free(struct scatterlist *sgl, unsigned int nents)
+{
+	struct sg_pool *sgp;
+
+	sgp = sg_pools + sg_pool_index(nents);
+	mempool_free(sgl, sgp->pool);
+}
+
+static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask)
+{
+	struct sg_pool *sgp;
+
+	sgp = sg_pools + sg_pool_index(nents);
+	return mempool_alloc(sgp->pool, gfp_mask);
+}
+
+/**
+ * sg_free_table_chained - Free a previously mapped sg table
+ * @table:	The sg table header to use
+ * @first_chunk: was first_chunk not NULL in sg_alloc_table_chained?
+ *
+ *  Description:
+ *    Free an sg table previously allocated and set up with
+ *    sg_alloc_table_chained().
+ *
+ **/
+void sg_free_table_chained(struct sg_table *table, bool first_chunk)
+{
+	if (first_chunk && table->orig_nents <= SG_CHUNK_SIZE)
+		return;
+	__sg_free_table(table, SG_CHUNK_SIZE, first_chunk, sg_pool_free);
+}
+EXPORT_SYMBOL_GPL(sg_free_table_chained);
+
+/**
+ * sg_alloc_table_chained - Allocate and chain SGLs in an sg table
+ * @table:	The sg table header to use
+ * @nents:	Number of entries in sg list
+ * @first_chunk: first SGL
+ *
+ *  Description:
+ *    Allocate and chain SGLs in an sg table. If @nents is larger than
+ *    SG_CHUNK_SIZE a chained sg table will be set up.
+ *
+ **/
+int sg_alloc_table_chained(struct sg_table *table, int nents,
+		struct scatterlist *first_chunk)
+{
+	int ret;
+
+	BUG_ON(!nents);
+
+	if (first_chunk) {
+		if (nents <= SG_CHUNK_SIZE) {
+			table->nents = table->orig_nents = nents;
+			sg_init_table(table->sgl, nents);
+			return 0;
+		}
+	}
+
+	ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE,
+			       first_chunk, GFP_ATOMIC, sg_pool_alloc);
+	if (unlikely(ret))
+		sg_free_table_chained(table, (bool)first_chunk);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sg_alloc_table_chained);
+
+static __init int sg_pool_init(void)
+{
+	int i;
+
+	for (i = 0; i < SG_MEMPOOL_NR; i++) {
+		struct sg_pool *sgp = sg_pools + i;
+		int size = sgp->size * sizeof(struct scatterlist);
+
+		sgp->slab = kmem_cache_create(sgp->name, size, 0,
+				SLAB_HWCACHE_ALIGN, NULL);
+		if (!sgp->slab) {
+			printk(KERN_ERR "SG_POOL: can't init sg slab %s\n",
+					sgp->name);
+			goto cleanup_sdb;
+		}
+
+		sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
+						     sgp->slab);
+		if (!sgp->pool) {
+			printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n",
+					sgp->name);
+			goto cleanup_sdb;
+		}
+	}
+
+	return 0;
+
+cleanup_sdb:
+	for (i = 0; i < SG_MEMPOOL_NR; i++) {
+		struct sg_pool *sgp = sg_pools + i;
+		if (sgp->pool)
+			mempool_destroy(sgp->pool);
+		if (sgp->slab)
+			kmem_cache_destroy(sgp->slab);
+	}
+
+	return -ENOMEM;
+}
+
+static __exit void sg_pool_exit(void)
+{
+	int i;
+
+	for (i = 0; i < SG_MEMPOOL_NR; i++) {
+		struct sg_pool *sgp = sg_pools + i;
+		mempool_destroy(sgp->pool);
+		kmem_cache_destroy(sgp->slab);
+	}
+}
+
+module_init(sg_pool_init);
+module_exit(sg_pool_exit);
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 5c88204b6..ecaac2c05 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -10,6 +10,10 @@
 #include <linux/export.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/string_helpers.h>
 
@@ -534,3 +538,91 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 	return p - dst;
 }
 EXPORT_SYMBOL(string_escape_mem);
+
+/*
+ * Return an allocated string that has been escaped of special characters
+ * and double quotes, making it safe to log in quotes.
+ */
+char *kstrdup_quotable(const char *src, gfp_t gfp)
+{
+	size_t slen, dlen;
+	char *dst;
+	const int flags = ESCAPE_HEX;
+	const char esc[] = "\f\n\r\t\v\a\e\\\"";
+
+	if (!src)
+		return NULL;
+	slen = strlen(src);
+
+	dlen = string_escape_mem(src, slen, NULL, 0, flags, esc);
+	dst = kmalloc(dlen + 1, gfp);
+	if (!dst)
+		return NULL;
+
+	WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen);
+	dst[dlen] = '\0';
+
+	return dst;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable);
+
+/*
+ * Returns allocated NUL-terminated string containing process
+ * command line, with inter-argument NULs replaced with spaces,
+ * and other special characters escaped.
+ */
+char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp)
+{
+	char *buffer, *quoted;
+	int i, res;
+
+	buffer = kmalloc(PAGE_SIZE, GFP_TEMPORARY);
+	if (!buffer)
+		return NULL;
+
+	res = get_cmdline(task, buffer, PAGE_SIZE - 1);
+	buffer[res] = '\0';
+
+	/* Collapse trailing NULs, leave res pointing to last non-NUL. */
+	while (--res >= 0 && buffer[res] == '\0')
+		;
+
+	/* Replace inter-argument NULs. */
+	for (i = 0; i <= res; i++)
+		if (buffer[i] == '\0')
+			buffer[i] = ' ';
+
+	/* Make sure result is printable. */
+	quoted = kstrdup_quotable(buffer, gfp);
+	kfree(buffer);
+	return quoted;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline);
+
+/*
+ * Returns allocated NUL-terminated string containing pathname,
+ * with special characters escaped, able to be safely logged. If
+ * there is an error, the leading character will be "<".
+ */
+char *kstrdup_quotable_file(struct file *file, gfp_t gfp)
+{
+	char *temp, *pathname;
+
+	if (!file)
+		return kstrdup("<unknown>", gfp);
+
+	/* We add 11 bytes of room for ' (deleted)' to be appended */
+	temp = kmalloc(PATH_MAX + 11, GFP_TEMPORARY);
+	if (!temp)
+		return kstrdup("<no_memory>", gfp);
+
+	pathname = file_path(file, temp, PATH_MAX + 11);
+	if (IS_ERR(pathname))
+		pathname = kstrdup("<too_long>", gfp);
+	else
+		pathname = kstrdup_quotable(pathname, gfp);
+
+	kfree(temp);
+	return pathname;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable_file);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 338403241..33f655ef4 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,5 +1,6 @@
 #include <linux/compiler.h>
 #include <linux/export.h>
+#include <linux/kasan-checks.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
@@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 		unsigned long max = max_addr - src_addr;
 		long retval;
 
+		kasan_check_write(dst, count);
 		user_access_begin();
 		retval = do_strncpy_from_user(dst, src, count, max);
 		user_access_end();
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 8f22fbedc..93f45011a 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5621,7 +5621,10 @@ static struct bpf_prog *generate_filter(int which, int *err)
 		fp->type = BPF_PROG_TYPE_SOCKET_FILTER;
 		memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn));
 
-		bpf_prog_select_runtime(fp);
+		/* We cannot error here as we don't need type compatibility
+		 * checks.
+		 */
+		fp = bpf_prog_select_runtime(fp, err);
 		break;
 	}
 
diff --git a/lib/test_hash.c b/lib/test_hash.c
new file mode 100644
index 000000000..c9549c8b4
--- /dev/null
+++ b/lib/test_hash.c
@@ -0,0 +1,250 @@
+/*
+ * Test cases for <linux/hash.h> and <linux/stringhash.h>
+ * This just verifies that various ways of computing a hash
+ * produce the same thing and, for cases where a k-bit hash
+ * value is requested, is of the requested size.
+ *
+ * We fill a buffer with a 255-byte null-terminated string,
+ * and use both full_name_hash() and hashlen_string() to hash the
+ * substrings from i to j, where 0 <= i < j < 256.
+ *
+ * The returned values are used to check that __hash_32() and
+ * __hash_32_generic() compute the same thing.  Likewise hash_32()
+ * and hash_64().
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n"
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/stringhash.h>
+#include <linux/printk.h>
+
+/* One step of a 32-bit xorshift generator (13/17/5).  Never seed with 0. */
+static u32 __init __attribute_const__
+xorshift(u32 seed)
+{
+	seed = seed ^ (seed << 13);
+	seed = seed ^ (seed >> 17);
+	seed = seed ^ (seed << 5);
+	return seed;
+}
+
+/* Fold a non-zero x down to a byte in 1..255 (x mod 255, with 0 -> 255). */
+static u8 __init __attribute_const__
+mod255(u32 x)
+{
+	x = (x & 0xffff) + (x >> 16);	/* 1 <= x <= 0x1fffe */
+	x = (x & 0xff) + (x >> 8);	/* 1 <= x <= 0x2fd */
+	x = (x & 0xff) + (x >> 8);	/* 1 <= x <= 0x100 */
+	x = (x & 0xff) + (x >> 8);	/* 1 <= x <= 0xff */
+	return x;
+}
+
+/* Write len pseudo-random bytes, none of them zero, into buf. */
+static void __init
+fill_buf(char *buf, size_t len, u32 seed)
+{
+	size_t i = 0;
+
+	while (i < len) {
+		seed = xorshift(seed);
+		buf[i++] = mod255(seed);
+	}
+}
+
+/*
+ * Test the various integer hash functions on h64 (or its low-order bits).
+ * hash_or accumulates the OR of the hash values, checked later for full
+ * bit coverage.  Returns false, after logging, as soon as a check fails.
+ *
+ * Because these functions (as opposed to the string hashes) are all
+ * inline, the code being tested is actually in the module, and you can
+ * recompile and re-test the module without rebooting.
+ */
+static bool __init
+test_int_hash(unsigned long long h64, u32 hash_or[2][33])
+{
+	int k;
+	u32 h0 = (u32)h64, h1, h2;
+
+	/* Test __hash_32 */
+	hash_or[0][0] |= h1 = __hash_32(h0);
+#ifdef HAVE_ARCH__HASH_32
+	hash_or[1][0] |= h2 = __hash_32_generic(h0);
+#if HAVE_ARCH__HASH_32 == 1
+	if (h1 != h2) {
+		pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x",
+			h0, h1, h2);
+		return false;
+	}
+#endif
+#endif
+
+	/* Test k = 1..32 bits */
+	for (k = 1; k <= 32; k++) {
+		u32 const m = ((u32)2 << (k-1)) - 1;	/* Low k bits set */
+
+		/* Test hash_32 */
+		hash_or[0][k] |= h1 = hash_32(h0, k);
+		if (h1 > m) {
+			pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m);
+			return false;
+		}
+#ifdef HAVE_ARCH_HASH_32
+		h2 = hash_32_generic(h0, k);
+#if HAVE_ARCH_HASH_32 == 1
+		if (h1 != h2) {
+			pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() "
+				" = %#x", h0, k, h1, h2);
+			return false;
+		}
+#else	/* results may differ: only range-check the generic value h2 */
+		if (h2 > m) {
+			pr_err("hash_32_generic(%#x, %d) = %#x > %#x",
+				h0, k, h2, m);
+			return false;
+		}
+#endif
+#endif
+		/* Test hash_64 */
+		hash_or[1][k] |= h1 = hash_64(h64, k);
+		if (h1 > m) {
+			pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m);
+			return false;
+		}
+#ifdef HAVE_ARCH_HASH_64
+		h2 = hash_64_generic(h64, k);
+#if HAVE_ARCH_HASH_64 == 1
+		if (h1 != h2) {
+			pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() "
+				"= %#x", h64, k, h1, h2);
+			return false;
+		}
+#else	/* results may differ: only range-check the generic value h2 */
+		if (h2 > m) {
+			pr_err("hash_64_generic(%#llx, %d) = %#x > %#x",
+				h64, k, h2, m);
+			return false;
+		}
+#endif
+#endif
+	}
+
+	(void)h2;	/* Suppress unused variable warning */
+	return true;
+}
+
+#define SIZE 256	/* Run time is cubic in SIZE */
+
+static int __init
+test_hash_init(void)
+{
+	char buf[SIZE+1];	/* SIZE filled bytes plus room for a terminator */
+	u32 string_or = 0, hash_or[2][33] = { 0 };
+	unsigned tests = 0;
+	unsigned long long h64 = 0;
+	int i, j;
+
+	fill_buf(buf, SIZE, 1);
+
+	/* Test every substring buf[i..j), including the empty one (i == j). */
+	for (j = SIZE; j > 0; --j) {
+		buf[j] = '\0';	/* strings starting at any i now end at j */
+
+		for (i = 0; i <= j; i++) {
+			u64 hashlen = hashlen_string(buf+i);
+			u32 h0 = full_name_hash(buf+i, j-i);
+
+			/* Check that hashlen_string gets the length right */
+			if (hashlen_len(hashlen) != j-i) {
+				pr_err("hashlen_string(%d..%d) returned length"
+					" %u, expected %d",
+					i, j, hashlen_len(hashlen), j-i);
+				return -EINVAL;
+			}
+			/* Check that the hashes match */
+			if (hashlen_hash(hashlen) != h0) {
+				pr_err("hashlen_string(%d..%d) = %08x != "
+					"full_name_hash() = %08x",
+					i, j, hashlen_hash(hashlen), h0);
+				return -EINVAL;
+			}
+
+			string_or |= h0;
+			h64 = h64 << 32 | h0;	/* For use with hash_64 */
+			if (!test_int_hash(h64, hash_or))
+				return -EINVAL;
+			tests++;
+		} /* i */
+	} /* j */
+
+	/* The OR of all the hash values should cover all the bits */
+	if (~string_or) {
+		pr_err("OR of all string hash results = %#x != %#x",
+			string_or, -1u);
+		return -EINVAL;
+	}
+	if (~hash_or[0][0]) {
+		pr_err("OR of all __hash_32 results = %#x != %#x",
+			hash_or[0][0], -1u);
+		return -EINVAL;
+	}
+#ifdef HAVE_ARCH__HASH_32
+#if HAVE_ARCH__HASH_32 != 1	/* Test is pointless if results match */
+	if (~hash_or[1][0]) {
+		pr_err("OR of all __hash_32_generic results = %#x != %#x",
+			hash_or[1][0], -1u);
+		return -EINVAL;
+	}
+#endif
+#endif
+
+	/* Likewise for all the i-bit hash values: exactly the low i bits */
+	for (i = 1; i <= 32; i++) {
+		u32 const m = ((u32)2 << (i-1)) - 1;	/* Low i bits set */
+
+		if (hash_or[0][i] != m) {
+			pr_err("OR of all hash_32(%d) results = %#x "
+				"(%#x expected)", i, hash_or[0][i], m);
+			return -EINVAL;
+		}
+		if (hash_or[1][i] != m) {
+			pr_err("OR of all hash_64(%d) results = %#x "
+				"(%#x expected)", i, hash_or[1][i], m);
+			return -EINVAL;
+		}
+	}
+
+	/* Issue notices about skipped tests. */
+#ifndef HAVE_ARCH__HASH_32
+	pr_info("__hash_32() has no arch implementation to test.");
+#elif HAVE_ARCH__HASH_32 != 1
+	pr_info("__hash_32() is arch-specific; not compared to generic.");
+#endif
+#ifndef HAVE_ARCH_HASH_32
+	pr_info("hash_32() has no arch implementation to test.");
+#elif HAVE_ARCH_HASH_32 != 1
+	pr_info("hash_32() is arch-specific; not compared to generic.");
+#endif
+#ifndef HAVE_ARCH_HASH_64
+	pr_info("hash_64() has no arch implementation to test.");
+#elif HAVE_ARCH_HASH_64 != 1
+	pr_info("hash_64() is arch-specific; not compared to generic.");
+#endif
+
+	pr_notice("%u tests passed.", tests);
+
+	return 0;
+}
+
+static void __exit test_hash_exit(void)
+{
+}
+
+module_init(test_hash_init);	/* Runs every test at load time */
+module_exit(test_hash_exit);	/* Empty: nothing is done on unload */
+
+MODULE_LICENSE("GPL");
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 82169fbf2..5e51872b3 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -12,9 +12,12 @@
 #define pr_fmt(fmt) "kasan test: %s " fmt, __func__
 
 #include <linux/kernel.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/uaccess.h>
 #include <linux/module.h>
 
 static noinline void __init kmalloc_oob_right(void)
@@ -344,6 +347,70 @@ static noinline void __init kasan_stack_oob(void)
 	*(volatile char *)p;
 }
 
+static noinline void __init ksize_unpoisons_memory(void)
+{
+	char *ptr;
+	size_t size = 123, real_size = size;
+
+	pr_info("ksize() unpoisons the whole allocated chunk\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+	real_size = ksize(ptr);
+	/* Just past the requested size: expected to be fine after ksize(). */
+	ptr[size] = 'x';
+	/* Index ksize() reported as the size: expected to be reported. */
+	ptr[real_size] = 'y';
+	kfree(ptr);
+}
+
+static noinline void __init copy_user_test(void)
+{
+	char *kmem;
+	char __user *usermem;
+	size_t size = 10;	/* kmem is allocated with exactly this size */
+	int unused;		/* results are stored but never examined */
+
+	kmem = kmalloc(size, GFP_KERNEL);
+	if (!kmem)
+		return;
+
+	usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
+			    PROT_READ | PROT_WRITE | PROT_EXEC,
+			    MAP_ANONYMOUS | MAP_PRIVATE, 0);
+	if (IS_ERR(usermem)) {
+		pr_err("Failed to allocate user memory\n");
+		kfree(kmem);
+		return;
+	}
+
+	/* Every call below moves size + 1 bytes, one more than kmem holds. */
+	pr_info("out-of-bounds in copy_from_user()\n");
+	unused = copy_from_user(kmem, usermem, size + 1);
+
+	pr_info("out-of-bounds in copy_to_user()\n");
+	unused = copy_to_user(usermem, kmem, size + 1);
+
+	pr_info("out-of-bounds in __copy_from_user()\n");
+	unused = __copy_from_user(kmem, usermem, size + 1);
+
+	pr_info("out-of-bounds in __copy_to_user()\n");
+	unused = __copy_to_user(usermem, kmem, size + 1);
+
+	pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
+	unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
+
+	pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
+	unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
+
+	pr_info("out-of-bounds in strncpy_from_user()\n");
+	unused = strncpy_from_user(kmem, usermem, size + 1);
+
+	vm_munmap((unsigned long)usermem, PAGE_SIZE);
+	kfree(kmem);
+}
+
 static int __init kmalloc_tests_init(void)
 {
 	kmalloc_oob_right();
@@ -367,6 +434,8 @@ static int __init kmalloc_tests_init(void)
 	kmem_cache_oob();
 	kasan_stack_oob();
 	kasan_global_oob();
+	ksize_unpoisons_memory();
+	copy_user_test();
 	return -EAGAIN;
 }
 
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 270bf7289..297fdb5e7 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -143,7 +143,7 @@ static void test_bucket_stats(struct rhashtable *ht)
 	struct rhashtable_iter hti;
 	struct rhash_head *pos;
 
-	err = rhashtable_walk_init(ht, &hti);
+	err = rhashtable_walk_init(ht, &hti, GFP_KERNEL);
 	if (err) {
 		pr_warn("Test failed: allocation error");
 		return;
diff --git a/lib/test_uuid.c b/lib/test_uuid.c
new file mode 100644
index 000000000..547d3127a
--- /dev/null
+++ b/lib/test_uuid.c
@@ -0,0 +1,133 @@
+/*
+ * Test cases for lib/uuid.c module.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/uuid.h>
+
+struct test_uuid_data {
+	const char *uuid;	/* textual form handed to the parsers */
+	uuid_le le;		/* value uuid_le_to_bin() must produce */
+	uuid_be be;		/* value uuid_be_to_bin() must produce */
+};
+
+static const struct test_uuid_data test_uuid_test_data[] = {
+	{
+		.uuid = "c33f4995-3701-450e-9fbf-206a2e98e576",
+		.le = UUID_LE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76),
+		.be = UUID_BE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76),
+	},
+	{
+		.uuid = "64b4371c-77c1-48f9-8221-29f054fc023b",
+		.le = UUID_LE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b),
+		.be = UUID_BE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b),
+	},
+	{
+		.uuid = "0cb4ddff-a545-4401-9d06-688af53e7f84",
+		.le = UUID_LE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84),
+		.be = UUID_BE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84),
+	},
+};
+
+static const char * const test_uuid_wrong_data[] = {
+	"c33f4995-3701-450e-9fbf206a2e98e576 ",	/* 4th hyphen missing */
+	"64b4371c-77c1-48f9-8221-29f054XX023b",	/* non-hex character(s) */
+	"0cb4ddff-a545-4401-9d06-688af53e",	/* too short: 32 characters */
+};
+
+static unsigned total_tests __initdata;	/* checks performed so far */
+static unsigned failed_tests __initdata;	/* test_uuid_failed() calls */
+
+static void __init test_uuid_failed(const char *prefix, bool wrong, bool be,
+				    const char *data, const char *actual)
+{
+	const char *verdict = wrong ? "passed on wrong" : "failed on";
+	const char *order = be ? "BE" : "LE";
+
+	/* Log the failing case, numbered by the running test counter. */
+	pr_err("%s test #%u %s %s data: '%s'\n",
+	       prefix, total_tests, verdict, order, data);
+
+	/* Show what was actually produced, when the caller supplied it. */
+	if (actual && *actual)
+		pr_err("%s test #%u actual data: '%s'\n",
+		       prefix, total_tests, actual);
+	failed_tests++;
+}
+
+static void __init test_uuid_test(const struct test_uuid_data *data)
+{
+	uuid_le le;
+	uuid_be be;
+	char buf[48];	/* holds one UUID formatted with %pUl / %pUb */
+
+	/* LE: the string must parse ... */
+	total_tests++;
+	if (uuid_le_to_bin(data->uuid, &le))
+		test_uuid_failed("conversion", false, false, data->uuid, NULL);
+	/* NOTE(review): le is compared even if the conversion above failed */
+	total_tests++;
+	if (uuid_le_cmp(data->le, le)) {
+		sprintf(buf, "%pUl", &le);
+		test_uuid_failed("cmp", false, false, data->uuid, buf);
+	}
+
+	/* BE: same two checks with the big-endian layout */
+	total_tests++;
+	if (uuid_be_to_bin(data->uuid, &be))
+		test_uuid_failed("conversion", false, true, data->uuid, NULL);
+
+	total_tests++;
+	if (uuid_be_cmp(data->be, be)) {
+		sprintf(buf, "%pUb", &be);
+		test_uuid_failed("cmp", false, true, data->uuid, buf);
+	}
+}
+
+static void __init test_uuid_wrong(const char *data)
+{
+	uuid_le le;
+	uuid_be be;
+
+	/* LE: a zero (success) return on malformed input is the failure */
+	total_tests++;
+	if (!uuid_le_to_bin(data, &le))
+		test_uuid_failed("negative", true, false, data, NULL);
+
+	/* BE: likewise for the big-endian parser */
+	total_tests++;
+	if (!uuid_be_to_bin(data, &be))
+		test_uuid_failed("negative", true, true, data, NULL);
+}
+
+static int __init test_uuid_init(void)
+{
+	unsigned int n = 0;
+
+	while (n < ARRAY_SIZE(test_uuid_test_data))
+		test_uuid_test(&test_uuid_test_data[n++]);
+
+	for (n = 0; n < ARRAY_SIZE(test_uuid_wrong_data); n++)
+		test_uuid_wrong(test_uuid_wrong_data[n]);
+
+	if (failed_tests) {
+		pr_err("failed %u out of %u tests\n", failed_tests, total_tests);
+		return -EINVAL;
+	}
+	pr_info("all %u tests passed\n", total_tests);
+	return 0;
+}
+module_init(test_uuid_init);
+
+static void __exit test_uuid_exit(void)
+{
+	/* intentionally empty: nothing is done on unload */
+}
+module_exit(test_uuid_exit);
+
+MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/uuid.c b/lib/uuid.c
index 398821e4d..37687af77 100644
--- a/lib/uuid.c
+++ b/lib/uuid.c
@@ -1,7 +1,7 @@
 /*
  * Unified UUID/GUID definition
  *
- * Copyright (C) 2009, Intel Corp.
+ * Copyright (C) 2009, 2016 Intel Corp.
  *	Huang Ying <ying.huang@intel.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -12,17 +12,40 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/uuid.h>
 #include <linux/random.h>
 
+const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
+EXPORT_SYMBOL(uuid_le_index);
+const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+EXPORT_SYMBOL(uuid_be_index);
+
+/***************************************************************
+ * Random UUID interface
+ *
+ * Used here for a Boot ID, but can be useful for other kernel
+ * drivers.
+ ***************************************************************/
+
+/*
+ * Fill uuid[] with random bytes, then mark it as a version 4, DCE UUID
+ */
+void generate_random_uuid(unsigned char uuid[16])
+{
+	get_random_bytes(uuid, 16);
+	/* Set UUID version to 4 --- truly random generation */
+	uuid[6] = (uuid[6] & 0x0F) | 0x40;
+	/* Set the UUID variant to DCE */
+	uuid[8] = (uuid[8] & 0x3F) | 0x80;
+}
+EXPORT_SYMBOL(generate_random_uuid);
+
 static void __uuid_gen_common(__u8 b[16])
 {
 	prandom_bytes(b, 16);
@@ -45,3 +68,61 @@ void uuid_be_gen(uuid_be *bu)
 	bu->b[6] = (bu->b[6] & 0x0F) | 0x40;
 }
 EXPORT_SYMBOL_GPL(uuid_be_gen);
+
+/**
+ * uuid_is_valid - checks if UUID string valid
+ * @uuid:	UUID string to check
+ *
+ * Description:
+ * It checks if the first UUID_STRING_LEN characters follow the format:
+ *	xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ * where x is a hex digit. Anything after those characters is not examined.
+ *
+ * Return: true if input is valid UUID string.
+ */
+bool uuid_is_valid(const char *uuid)
+{
+	unsigned int i;
+
+	for (i = 0; i < UUID_STRING_LEN; i++) {
+		if (i == 8 || i == 13 || i == 18 || i == 23) {
+			if (uuid[i] != '-')
+				return false;
+		} else if (!isxdigit(uuid[i])) {
+			return false;	/* also stops at an early NUL */
+		}
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(uuid_is_valid);
+
+static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16])
+{
+	static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34};
+	unsigned int i;
+
+	if (!uuid_is_valid(uuid))
+		return -EINVAL;	/* b[] is left untouched in this case */
+	/* si[i] = text offset of hex pair i, ei[i] = its index in b[] */
+	for (i = 0; i < 16; i++) {
+		int hi = hex_to_bin(uuid[si[i] + 0]);
+		int lo = hex_to_bin(uuid[si[i] + 1]);
+
+		b[ei[i]] = (hi << 4) | lo;	/* digits were validated above */
+	}
+
+	return 0;
+}
+
+int uuid_le_to_bin(const char *uuid, uuid_le *u)
+{
+	return __uuid_to_bin(uuid, u->b, uuid_le_index);	/* 0 or -EINVAL */
+}
+EXPORT_SYMBOL(uuid_le_to_bin);
+
+int uuid_be_to_bin(const char *uuid, uuid_be *u)
+{
+	return __uuid_to_bin(uuid, u->b, uuid_be_index);	/* 0 or -EINVAL */
+}
+EXPORT_SYMBOL(uuid_be_to_bin);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index ccb664b54..0967771d8 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -30,6 +30,7 @@
 #include <linux/ioport.h>
 #include <linux/dcache.h>
 #include <linux/cred.h>
+#include <linux/uuid.h>
 #include <net/addrconf.h>
 #ifdef CONFIG_BLOCK
 #include <linux/blkdev.h>
@@ -1304,19 +1305,17 @@ static noinline_for_stack
 char *uuid_string(char *buf, char *end, const u8 *addr,
 		  struct printf_spec spec, const char *fmt)
 {
-	char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
+	char uuid[UUID_STRING_LEN + 1];
 	char *p = uuid;
 	int i;
-	static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-	static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
-	const u8 *index = be;
+	const u8 *index = uuid_be_index;
 	bool uc = false;
 
 	switch (*(++fmt)) {
 	case 'L':
 		uc = true;		/* fall-through */
 	case 'l':
-		index = le;
+		index = uuid_le_index;
 		break;
 	case 'B':
 		uc = true;
@@ -1324,7 +1323,10 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
 	}
 
 	for (i = 0; i < 16; i++) {
-		p = hex_byte_pack(p, addr[index[i]]);
+		if (uc)
+			p = hex_byte_pack_upper(p, addr[index[i]]);
+		else
+			p = hex_byte_pack(p, addr[index[i]]);
 		switch (i) {
 		case 3:
 		case 5:
@@ -1337,13 +1339,6 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
 
 	*p = 0;
 
-	if (uc) {
-		p = uuid;
-		do {
-			*p = toupper(*p);
-		} while (*(++p));
-	}
-
 	return string(buf, end, uuid, spec);
 }
 
diff --git a/lib/wbt.c b/lib/wbt.c
new file mode 100644
index 000000000..cc5a24270
--- /dev/null
+++ b/lib/wbt.c
@@ -0,0 +1,569 @@
+/*
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
+ * packets for IO scheduling, so the logic is something like this:
+ *
+ * - Monitor latencies in a defined window of time.
+ * - If the minimum latency in the above window exceeds some target, increment
+ *   scaling step and scale down queue depth by a factor of 2x. The monitoring
+ *   window is then shrunk to 100 / sqrt(scaling step + 1).
+ * - For any window where we don't have solid data on what the latencies
+ *   look like, retain status quo.
+ * - If latencies look good, decrement scaling step.
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ * Things that (may) need changing:
+ *
+ *	- Different scaling of background/normal/high priority writeback.
+ *	  We may have to violate guarantees for max.
+ *	- We can have mismatches between the stat window and our window.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/wbt.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/wbt.h>
+
+enum {
+	/*
+	 * Depth cap and initial queue depth. Might need to be higher
+	 */
+	RWB_MAX_DEPTH	= 64,
+
+	/*
+	 * Default monitoring window: 100msec, in nanoseconds
+	 */
+	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,
+
+	/*
+	 * Disregard stats, if we don't meet these minimums
+	 */
+	RWB_MIN_WRITE_SAMPLES	= 3,
+	RWB_MIN_READ_SAMPLES	= 1,
+
+	/*
+	 * If we have this number of consecutive windows with not enough
+	 * information to scale up or down, scale up.
+	 */
+	RWB_UNKNOWN_BUMP	= 5,
+};
+
+static inline bool rwb_enabled(struct rq_wb *rwb)
+{
+	return rwb != NULL && rwb->wb_normal;
+}
+
+/*
+ * Atomically bump 'v' by one, but only while it is still below 'below'.
+ * Returns true if the increment happened, false if 'v' had already
+ * reached 'below' (i.e. 'v' + 1 would be bigger than 'below').
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+	int seen = atomic_read(v);
+
+	while (seen < below) {
+		int prev = atomic_cmpxchg(v, seen, seen + 1);
+
+		/* Nobody raced with us: the increment is in place. */
+		if (prev == seen)
+			return true;
+		/* Lost the race; retry against the value we observed. */
+		seen = prev;
+	}
+
+	return false;
+}
+
+static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
+{
+	if (rwb_enabled(rwb)) {	/* no-op for a NULL or disabled rwb */
+		const unsigned long cur = jiffies;
+
+		if (cur != *var)	/* store only if the value changed */
+			*var = cur;
+	}
+}
+
+void __wbt_done(struct rq_wb *rwb)
+{
+	int inflight, limit;
+
+	inflight = atomic_dec_return(&rwb->inflight);	/* count after drop */
+
+	/*
+	 * wbt got disabled with IO in flight. Wake up any potential
+	 * waiters, we don't have to do more than that.
+	 */
+	if (unlikely(!rwb_enabled(rwb))) {
+		wake_up_all(&rwb->wait);
+		return;
+	}
+
+	/*
+	 * If the device does write back caching and wb.dirty_sleeping reads
+	 * zero, drop all the way down (limit 0) before we wake people up.
+	 */
+	if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
+		limit = 0;
+	else
+		limit = rwb->wb_normal;
+
+	/*
+	 * Don't wake anyone up while IO is in flight at or above the limit.
+	 */
+	if (inflight && inflight >= limit)
+		return;
+
+	if (waitqueue_active(&rwb->wait)) {
+		int diff = limit - inflight;	/* room left under the limit */
+
+		if (!inflight || diff >= rwb->wb_background / 2)
+			wake_up_nr(&rwb->wait, 1);
+	}
+}
+
+/*
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged, when the request gets freed. A NULL rwb is a no-op.
+ */
+void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+	if (!rwb)
+		return;
+
+	if (!wbt_tracked(stat)) {
+		if (rwb->sync_cookie == stat) {	/* stop timing this one */
+			rwb->sync_issue = 0;
+			rwb->sync_cookie = NULL;
+		}
+
+		wb_timestamp(rwb, &rwb->last_comp);
+	} else {
+		WARN_ON_ONCE(stat == rwb->sync_cookie);
+		__wbt_done(rwb);	/* drops the inflight count */
+		wbt_clear_tracked(stat);
+	}
+}
+
+static void calc_wb_limits(struct rq_wb *rwb)
+{
+	unsigned int depth;
+
+	if (!rwb->min_lat_nsec) {	/* no latency target: all limits 0 */
+		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
+		return;
+	}
+
+	/*
+	 * For QD=1 devices, this is a special case. It's important for those
+	 * to have one request ready when one completes, so force a depth of
+	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+	 * since the device can't have more than that in flight. If we're
+	 * scaling down, then keep a setting of 1/1/1.
+	 */
+	if (rwb->queue_depth == 1) {
+		if (rwb->scale_step)
+			rwb->wb_max = rwb->wb_normal = 1;
+		else
+			rwb->wb_max = rwb->wb_normal = 2;
+		rwb->wb_background = 1;
+	} else {
+		depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
+
+		/*
+		 * Set our max/normal/bg queue depths based on how far
+		 * we have scaled down (->scale_step): max halves per step,
+		 * normal is half of max and bg a quarter, both rounded up.
+		 */
+		rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
+		rwb->wb_normal = (rwb->wb_max + 1) / 2;
+		rwb->wb_background = (rwb->wb_max + 3) / 4;
+	}
+}
+
+static inline bool stat_sample_valid(struct blk_rq_stat *stat)
+{
+	/*
+	 * We need at least RWB_MIN_READ_SAMPLES read samples (stat[0]) and
+	 * RWB_MIN_WRITE_SAMPLES write samples (stat[1]). We require some
+	 * write samples to know that it's writes impacting us, and not
+	 * just some sole read on a device that is in a lower power state.
+	 */
+	return stat[0].nr_samples >= RWB_MIN_READ_SAMPLES &&
+		stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+}
+
+static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
+{
+	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+
+	if (!issue || !rwb->sync_cookie)	/* no sync IO being timed */
+		return 0;
+
+	now = ktime_to_ns(ktime_get());
+	return now - issue;	/* elapsed; assumes sync_issue is in ns too */
+}
+
+enum {
+	LAT_OK,		/* enough samples, stat[0].min within the target */
+	LAT_UNKNOWN,	/* too few samples to decide */
+	LAT_EXCEEDED,	/* target, or the sync IO window, was exceeded */
+};
+
+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+{
+	u64 thislat;
+
+	/*
+	 * If our stored sync issue exceeds the window size, or it
+	 * exceeds our min target AND we haven't logged any entries,
+	 * flag the latency as exceeded. wbt works off completion latencies,
+	 * but for a flooded device, a single sync IO can take a long time
+	 * to complete after being issued. If this time exceeds our
+	 * monitoring window AND we didn't see any other completions in that
+	 * window, then count that sync IO as a violation of the latency.
+	 */
+	thislat = rwb_sync_issue_lat(rwb);	/* 0 if nothing is being timed */
+	if (thislat > rwb->cur_win_nsec ||
+	    (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
+		trace_wbt_lat(rwb->bdi, thislat);
+		return LAT_EXCEEDED;
+	}
+
+	if (!stat_sample_valid(stat))
+		return LAT_UNKNOWN;
+
+	/*
+	 * If the 'min' latency exceeds our target, step down.
+	 */
+	if (stat[0].min > rwb->min_lat_nsec) {
+		trace_wbt_lat(rwb->bdi, stat[0].min);
+		trace_wbt_stat(rwb->bdi, stat);
+		return LAT_EXCEEDED;
+	}
+
+	if (rwb->scale_step)	/* only trace good stats while scaled down */
+		trace_wbt_stat(rwb->bdi, stat);
+
+	return LAT_OK;
+}
+
+static int latency_exceeded(struct rq_wb *rwb)
+{
+	struct blk_rq_stat stat[2];	/* filled in by stat_ops->get() */
+
+	rwb->stat_ops->get(rwb->ops_data, stat);
+	return __latency_exceeded(rwb, stat);	/* one of the LAT_* values */
+}
+
+static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
+{
+	trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
+			rwb->wb_background, rwb->wb_normal, rwb->wb_max);	/* limits */
+}
+
+static void scale_up(struct rq_wb *rwb)
+{
+	/*
+	 * If we're at 0, we can't go lower.
+	 */
+	if (!rwb->scale_step)
+		return;
+
+	rwb->scale_step--;	/* one step less throttled */
+	rwb->unknown_cnt = 0;	/* restart the LAT_UNKNOWN count */
+	rwb->stat_ops->clear(rwb->ops_data);
+	calc_wb_limits(rwb);
+
+	if (waitqueue_active(&rwb->wait))	/* limits changed: re-check */
+		wake_up_all(&rwb->wait);
+
+	rwb_trace_step(rwb, "step up");
+}
+
+static void scale_down(struct rq_wb *rwb)
+{
+	/*
+	 * Stop scaling down when we've hit the limit. This also prevents
+	 * ->scale_step from going to crazy values, if the device can't
+	 * keep up.
+	 */
+	if (rwb->wb_max == 1)
+		return;
+
+	rwb->scale_step++;	/* one step more throttled */
+	rwb->unknown_cnt = 0;	/* restart the LAT_UNKNOWN count */
+	rwb->stat_ops->clear(rwb->ops_data);
+	calc_wb_limits(rwb);
+	rwb_trace_step(rwb, "step down");
+}
+
+static void rwb_arm_timer(struct rq_wb *rwb)
+{
+	unsigned long expires;
+
+	/*
+	 * cur_win = win / sqrt(scale_step + 1); both operands carry a factor
+	 * of 16 (<< 4, and the sqrt of << 8). A fast integer inverse square
+	 * root would be quicker, but this only runs when arming the timer.
+	 */
+	rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+					int_sqrt((rwb->scale_step + 1) << 8));
+	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
+	mod_timer(&rwb->window_timer, expires);
+}
+
+static void wb_timer_fn(unsigned long data)
+{
+	struct rq_wb *rwb = (struct rq_wb *) data;
+	int status;
+
+	/*
+	 * If we exceeded the latency target, step down. If we did not,
+	 * step one level up. If we don't know enough to say either exceeded
+	 * or ok, only step up once RWB_UNKNOWN_BUMP such windows were seen.
+	 */
+	status = latency_exceeded(rwb);
+	switch (status) {
+	case LAT_EXCEEDED:
+		scale_down(rwb);
+		break;
+	case LAT_OK:
+		scale_up(rwb);
+		break;
+	case LAT_UNKNOWN:
+		/*
+		 * We lacked the samples to judge, start bumping up the
+		 * write depth slowly
+		 */
+		if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP)
+			scale_up(rwb);
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Re-arm timer, if we are scaled down or have IO in flight
+	 */
+	if (rwb->scale_step || atomic_read(&rwb->inflight))
+		rwb_arm_timer(rwb);
+}
+
+void wbt_update_limits(struct rq_wb *rwb)
+{
+	rwb->scale_step = 0;	/* recompute from the unscaled depth */
+	calc_wb_limits(rwb);
+
+	if (waitqueue_active(&rwb->wait))	/* sleepers re-check */
+		wake_up_all(&rwb->wait);
+}
+
+static bool close_io(struct rq_wb *rwb)
+{
+	const unsigned long stamp = jiffies, span = HZ / 10;
+
+	if (time_before(stamp, rwb->last_issue + span))
+		return true;
+	return time_before(stamp, rwb->last_comp + span);
+}
+
+#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)
+
+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+{
+	unsigned int limit;
+
+	/*
+	 * At this point we know it's a buffered write. If REQ_SYNC is
+	 * set, then it's WB_SYNC_ALL writeback, and we'll use the max
+	 * limit for that. If the write is marked as a background write,
+	 * then use the idle limit, or go to normal if we haven't had
+	 * competing IO for a bit.
+	 */
+	if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping))
+		limit = rwb->wb_max;
+	else if ((rw & REQ_BG) || close_io(rwb)) {
+		/*
+		 * Background write, or less than HZ / 10 since other IO was
+		 * issued or completed: limit us to the background depth.
+		 */
+		limit = rwb->wb_background;
+	} else
+		limit = rwb->wb_normal;
+
+	return limit;
+}
+
+static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
+{
+	if (rwb_enabled(rwb))
+		return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw));
+
+	/*
+	 * Throttling is off, but still account the request as in flight,
+	 * since we'll dec it at completion. This only happens if the
+	 * task was sleeping in __wbt_wait(), and someone turned wbt
+	 * off at the same time.
+	 */
+	atomic_inc(&rwb->inflight);
+	return true;
+}
+
+/*
+ * Block until may_queue() lets us in. If @lock is non-NULL it is dropped
+ * around each sleep and re-taken afterwards.
+ */
+static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+{
+	DEFINE_WAIT(wait);
+
+	if (may_queue(rwb, rw))	/* room under the limit: no need to sleep */
+		return;
+
+	do {
+		prepare_to_wait_exclusive(&rwb->wait, &wait,
+						TASK_UNINTERRUPTIBLE);
+
+		if (may_queue(rwb, rw))	/* re-check once on the wait queue */
+			break;
+
+		if (lock)
+			spin_unlock_irq(lock);
+
+		io_schedule();
+
+		if (lock)
+			spin_lock_irq(lock);
+	} while (1);
+
+	finish_wait(&rwb->wait, &wait);
+}
+
+static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
+{
+	/*
+	 * If not a WRITE, or if it is a discard, do nothing
+	 */
+	if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD))
+		return false;
+
+	/*
+	 * Don't throttle WRITE_ODIRECT (REQ_SYNC set without REQ_NOIDLE)
+	 */
+	if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
+		return false;
+
+	return true;
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep, if we have exceeded the writeback limits. Caller can pass
+ * in an irq held spinlock, if it holds one when calling this function.
+ * If we do sleep, we'll release and re-grab it.
+ */
+bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+{
+	if (!rwb_enabled(rwb))
+		return false;
+
+	if (!wbt_should_throttle(rwb, rw)) {
+		wb_timestamp(rwb, &rwb->last_issue);	/* read by close_io() */
+		return false;
+	}
+
+	__wbt_wait(rwb, rw, lock);	/* inflight has been bumped on return */
+
+	if (!timer_pending(&rwb->window_timer))	/* start monitoring */
+		rwb_arm_timer(rwb);
+
+	return true;
+}
+
+void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+	if (!rwb_enabled(rwb))
+		return;
+
+	wbt_issue_stat_set_time(stat);
+
+	/*
+	 * Track sync issue, in case it takes a long time to complete. Allows
+	 * us to react quicker, if a sync IO takes a long time to complete.
+	 * Note that this is just a hint. 'stat' can go away when the
+	 * request completes, so it's important we never dereference it. We
+	 * only use the address to compare with, which is why we store the
+	 * sync_issue time locally.
+	 */
+	if (!wbt_tracked(stat) && !rwb->sync_issue) {	/* one at a time */
+		rwb->sync_cookie = stat;
+		rwb->sync_issue = wbt_issue_stat_get_time(stat);
+	}
+}
+
+void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+	if (!rwb_enabled(rwb) || stat != rwb->sync_cookie)
+		return;
+
+	/* The request being timed was requeued: forget its issue time. */
+	rwb->sync_issue = 0;
+	rwb->sync_cookie = NULL;
+}
+
+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+	if (!rwb)
+		return;
+	rwb->queue_depth = depth;
+	wbt_update_limits(rwb);
+}
+
+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+{
+	if (rwb)
+		rwb->wc = write_cache_on;	/* consulted by __wbt_done() */
+}
+
+void wbt_disable(struct rq_wb *rwb)
+{
+	del_timer_sync(&rwb->window_timer);	/* NOTE(review): no NULL check */
+	rwb->win_nsec = rwb->min_lat_nsec = 0;
+	wbt_update_limits(rwb);	/* min_lat_nsec == 0 zeroes every limit */
+}
+EXPORT_SYMBOL_GPL(wbt_disable);
+
+struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
+		       void *ops_data)
+{
+	struct rq_wb *rwb;
+
+	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+	if (!rwb)
+		return ERR_PTR(-ENOMEM);	/* never returns NULL */
+
+	atomic_set(&rwb->inflight, 0);
+	init_waitqueue_head(&rwb->wait);
+	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
+	rwb->wc = 1;	/* default until wbt_set_write_cache() is called */
+	rwb->queue_depth = RWB_MAX_DEPTH;
+	rwb->last_comp = rwb->last_issue = jiffies;
+	rwb->bdi = bdi;
+	rwb->win_nsec = RWB_WINDOW_NSEC;
+	rwb->stat_ops = ops;
+	rwb->ops_data = ops_data;
+	wbt_update_limits(rwb);	/* min_lat_nsec is 0 here: limits start at 0 */
+	return rwb;
+}
+
+void wbt_exit(struct rq_wb *rwb)
+{
+	if (!rwb)
+		return;
+	del_timer_sync(&rwb->window_timer);
+	kfree(rwb);
+}