192 files changed, 7059 insertions, 30427 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 08561f1ac..ebdb00432 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER
        def_bool y
        depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
 
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
 	bool
 
-config QUEUE_RWLOCK
-	def_bool y if ARCH_USE_QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
+	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+	depends on SMP
+
+config ARCH_USE_QUEUED_RWLOCKS
+	bool
+
+config QUEUED_RWLOCKS
+	def_bool y if ARCH_USE_QUEUED_RWLOCKS
 	depends on SMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 60c302cfb..43c4c920f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -137,7 +137,7 @@ endif
 
 ifneq ($(wildcard $(obj)/.x509.list),)
 ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(info X.509 certificate list changed)
+$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)")
 $(shell rm $(obj)/.x509.list)
 endif
 endif
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c13e4267..f9e606534 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1904,7 +1904,7 @@ EXPORT_SYMBOL(audit_log_task_info);
 
 /**
  * audit_log_link_denied - report a link restriction denial
- * @operation: specific link opreation
+ * @operation: specific link operation
  * @link: the path that triggered the restriction
  */
 void audit_log_link_denied(const char *operation, struct path *link)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9fb9d1cb8..e85bdfd15 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -599,9 +599,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 				result = match_tree_refs(ctx, rule->tree);
 			break;
 		case AUDIT_LOGINUID:
-			result = 0;
-			if (ctx)
-				result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+			result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
 			break;
 		case AUDIT_LOGINUID_SET:
 			result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
@@ -1023,8 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 	 * for strings that are too long, we should not have created
 	 * any.
 	 */
-	if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) {
-		WARN_ON(1);
+	if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
 		send_sig(SIGKILL, current, 0);
 		return -1;
 	}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8a6616583..cb31229a6 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-
-struct bpf_array {
-	struct bpf_map map;
-	u32 elem_size;
-	char value[0] __aligned(8);
-};
+#include <linux/filter.h>
 
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
 	return 0;
 }
 late_initcall(register_array_map);
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	/* values are bpf_prog file descriptors, hence exactly u32-sized */
+	if (attr->value_size == sizeof(u32))
+		return array_map_alloc(attr);
+	return ERR_PTR(-EINVAL);
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	synchronize_rcu(); /* let in-flight RCU readers of the array finish */
+
+	/* every slot must already be NULL here; a leftover prog is a bug */
+	for (i = 0; i < array->map.max_entries; i++)
+		BUG_ON(array->prog[i] != NULL);
+	kvfree(array);
+}
+
+static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL; /* lookups never hand out the stored prog pointers */
+}
+
+/* only called from syscall: store the prog behind fd *value at slot *key */
+static int prog_array_map_update_elem(struct bpf_map *map, void *key,
+				      void *value, u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog, *old_prog;
+	u32 index = *(u32 *)key, ufd;
+
+	if (map_flags != BPF_ANY) /* no other update flag is accepted */
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	ufd = *(u32 *)value;
+	prog = bpf_prog_get(ufd); /* presumably takes a ref; put again below on mismatch */
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	old_prog = xchg(array->prog + index, prog); /* swap the slot in one step */
+	if (old_prog)
+		bpf_prog_put_rcu(old_prog);
+
+	return 0;
+}
+
+static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 slot = *(u32 *)key;
+	struct bpf_prog *prev;
+
+	if (slot >= array->map.max_entries)
+		return -E2BIG;
+
+	/* empty the slot in one step; a NULL result means it held nothing */
+	prev = xchg(&array->prog[slot], NULL);
+	if (!prev)
+		return -ENOENT;
+	/* release the program that was stored in the slot */
+	bpf_prog_put_rcu(prev);
+	return 0;
+}
+
+/* empty every slot, dropping the refcnt held on each stored bpf_prog */
+void bpf_prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		prog_array_map_delete_elem(map, &i); /* -ENOENT on empty slots is ignored */
+}
+
+static const struct bpf_map_ops prog_array_ops = { /* ops for BPF_MAP_TYPE_PROG_ARRAY */
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = prog_array_map_lookup_elem, /* always returns NULL */
+	.map_update_elem = prog_array_map_update_elem,
+	.map_delete_elem = prog_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+	.ops = &prog_array_ops,
+	.type = BPF_MAP_TYPE_PROG_ARRAY, /* map type the ops above are bound to */
+};
+
+static int __init register_prog_array_map(void)
+{
+	bpf_register_map_type(&prog_array_type); /* registration result is not checked */
+	return 0;
+}
+late_initcall(register_prog_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 54f0e7fcd..c5bedc82b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -26,9 +26,10 @@
 #include <linux/vmalloc.h>
 #include <linux/random.h>
 #include <linux/moduleloader.h>
-#include <asm/unaligned.h>
 #include <linux/bpf.h>
 
+#include <asm/unaligned.h>
+
 /* Registers */
 #define BPF_R0	regs[BPF_REG_0]
 #define BPF_R1	regs[BPF_REG_1]
@@ -62,6 +63,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
 		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
 	else if (k >= SKF_LL_OFF)
 		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+
 	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
 		return ptr;
 
@@ -244,6 +246,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +289,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
 		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;
 
@@ -431,6 +435,30 @@ select_insn:
 						       BPF_R4, BPF_R5);
 		CONT;
 
+	JMP_TAIL_CALL: { /* R2 = prog array map, R3 = slot index */
+		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+		struct bpf_prog *prog;
+		u64 index = BPF_R3;
+
+		if (unlikely(index >= array->map.max_entries))
+			goto out;
+
+		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) /* bound chain length */
+			goto out;
+
+		tail_call_cnt++;
+
+		prog = READ_ONCE(array->prog[index]); /* slot can be xchg()ed concurrently */
+		if (unlikely(!prog))
+			goto out;
+
+		ARG1 = BPF_R1;
+		insn = prog->insnsi; /* resume at the target program's first insn */
+		goto select_insn;
+out:
+		CONT; /* on any failure just continue with the next instruction */
+	}
 	/* JMP */
 	JMP_JA:
 		insn += insn->off;
@@ -615,25 +643,63 @@ load_byte:
 		return 0;
 }
 
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+bool bpf_prog_array_compatible(struct bpf_array *array,
+			       const struct bpf_prog *fp)
 {
+	if (!array->owner_prog_type) {
+		/* There's no owner yet where we could check for
+		 * compatibility, so @fp's type and jited flag are recorded.
+		 */
+		array->owner_prog_type = fp->type;
+		array->owner_jited = fp->jited;
+
+		return true;
+	}
+
+	return array->owner_prog_type == fp->type &&
+	       array->owner_jited == fp->jited;
+}
+
+static int bpf_check_tail_call(const struct bpf_prog *fp)
+{
+	struct bpf_prog_aux *aux = fp->aux;
+	int i;
+
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		struct bpf_map *map = aux->used_maps[i];
+		struct bpf_array *array;
+
+		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+			continue; /* only prog arrays are checked */
+
+		array = container_of(map, struct bpf_array, map);
+		if (!bpf_prog_array_compatible(array, fp))
+			return -EINVAL; /* type or jited flag differs from the array's owner */
+	}
+
+	return 0;
 }
 
 /**
- *	bpf_prog_select_runtime - select execution runtime for BPF program
+ *	bpf_prog_select_runtime - select exec runtime for BPF program
  *	@fp: bpf_prog populated with internal BPF program
  *
- * try to JIT internal BPF program, if JIT is not available select interpreter
- * BPF program will be executed via BPF_PROG_RUN() macro
+ * Try to JIT eBPF program, if JIT is not available, use interpreter.
+ * The BPF program will be executed via BPF_PROG_RUN() macro.
  */
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
-	/* Probe if internal BPF can be JITed */
 	bpf_int_jit_compile(fp);
-	/* Lock whole bpf_prog as read-only */
 	bpf_prog_lock_ro(fp);
+
+	/* The tail call compatibility check can only be done at
+	 * this late stage as we need to determine, if we deal
+	 * with JITed or non JITed program concatenations and not
+	 * all eBPF JITs might immediately support all features.
+	 */
+	return bpf_check_tail_call(fp);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
@@ -663,6 +729,29 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
+const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
+{
+	return NULL; /* weak fallback: no proto unless a strong definition is linked */
+}
+
+/* Always built-in helper functions. */
+const struct bpf_func_proto bpf_tail_call_proto = {
+	.func		= NULL, /* no C body: fixup_bpf_calls() turns the call into its own opcode */
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
 
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bd7f5988e..1447ec094 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -13,6 +13,9 @@
 #include <linux/rcupdate.h>
 #include <linux/random.h>
 #include <linux/smp.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/uidgid.h>
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
  * inside its own verifier_ops->get_func_proto() callback it should return
@@ -44,11 +47,11 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 }
 
 const struct bpf_func_proto bpf_map_lookup_elem_proto = {
-	.func = bpf_map_lookup_elem,
-	.gpl_only = false,
-	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
-	.arg1_type = ARG_CONST_MAP_PTR,
-	.arg2_type = ARG_PTR_TO_MAP_KEY,
+	.func		= bpf_map_lookup_elem,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
 static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -63,13 +66,13 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 }
 
 const struct bpf_func_proto bpf_map_update_elem_proto = {
-	.func = bpf_map_update_elem,
-	.gpl_only = false,
-	.ret_type = RET_INTEGER,
-	.arg1_type = ARG_CONST_MAP_PTR,
-	.arg2_type = ARG_PTR_TO_MAP_KEY,
-	.arg3_type = ARG_PTR_TO_MAP_VALUE,
-	.arg4_type = ARG_ANYTHING,
+	.func		= bpf_map_update_elem,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_MAP_KEY,
+	.arg3_type	= ARG_PTR_TO_MAP_VALUE,
+	.arg4_type	= ARG_ANYTHING,
 };
 
 static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -83,11 +86,11 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 }
 
 const struct bpf_func_proto bpf_map_delete_elem_proto = {
-	.func = bpf_map_delete_elem,
-	.gpl_only = false,
-	.ret_type = RET_INTEGER,
-	.arg1_type = ARG_CONST_MAP_PTR,
-	.arg2_type = ARG_PTR_TO_MAP_KEY,
+	.func		= bpf_map_delete_elem,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
 static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -111,3 +114,71 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 };
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* NMI safe access to clock monotonic; r1-r5 are ignored */
+	return ktime_get_mono_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+	.func		= bpf_ktime_get_ns,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+};
+
+static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+
+	if (!task)
+		return -EINVAL; /* converted to u64 by the return type */
+
+	return (u64) task->tgid << 32 | task->pid; /* tgid in high 32 bits, pid in low */
+}
+
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
+	.func		= bpf_get_current_pid_tgid,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
+static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+	kuid_t uid;
+	kgid_t gid;
+
+	if (!task)
+		return -EINVAL; /* converted to u64 by the return type */
+
+	current_uid_gid(&uid, &gid);
+	return (u64) from_kgid(&init_user_ns, gid) << 32 | /* gid: high 32 bits */
+		from_kuid(&init_user_ns, uid); /* uid: low bits; both mapped via init_user_ns */
+}
+
+const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
+	.func		= bpf_get_current_uid_gid,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
+static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+	char *buf = (char *) (long) r1; /* r1: destination buffer, @size bytes */
+
+	if (!task)
+		return -EINVAL;
+	/* strlcpy() stops at comm's NUL and always terminates buf: no junk bytes */
+	strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+	return 0;
+}
+
+const struct bpf_func_proto bpf_get_current_comm_proto = {
+	.func		= bpf_get_current_comm,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK, /* r1: buffer the comm is copied into */
+	.arg2_type	= ARG_CONST_STACK_SIZE, /* r2: size passed for that buffer */
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3bae6c591..a1b14d197 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_map *map = filp->private_data;
 
+	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+		/* prog_array stores refcnt-ed bpf_prog pointers
+		 * release them all when user space closes prog_array_fd
+		 */
+		bpf_prog_array_map_clear(map);
+
 	bpf_map_put(map);
 	return 0;
 }
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			 */
 			BUG_ON(!prog->aux->ops->get_func_proto);
 
+			if (insn->imm == BPF_FUNC_tail_call) {
+				/* mark bpf_tail_call as different opcode
+				 * to avoid conditional branch in
+				 * interpreter for every normal call
+				 * and to prevent accidental JITing by
+				 * JIT compiler that doesn't support
+				 * bpf_tail_call yet
+				 */
+				insn->imm = 0;
+				insn->code |= BPF_X;
+				continue;
+			}
+
 			fn = prog->aux->ops->get_func_proto(insn->imm);
 			/* all functions that have prototype and verifier allowed
 			 * programs to call them, must be real in-kernel functions
@@ -413,6 +432,23 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 	kfree(aux->used_maps);
 }
 
+static void __prog_put_rcu(struct rcu_head *rcu)
+{
+	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+	free_used_maps(aux);
+	bpf_prog_free(aux->prog); /* aux->prog was filled in by bpf_prog_put_rcu() */
+}
+
+/* like bpf_prog_put(), but teardown is deferred past an RCU grace period */
+void bpf_prog_put_rcu(struct bpf_prog *prog)
+{
+	if (!atomic_dec_and_test(&prog->aux->refcnt))
+		return;
+	prog->aux->prog = prog;
+	call_rcu(&prog->aux->rcu, __prog_put_rcu);
+}
+
 void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
@@ -426,7 +462,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_prog *prog = filp->private_data;
 
-	bpf_prog_put(prog);
+	bpf_prog_put_rcu(prog);
 	return 0;
 }
 
@@ -532,7 +568,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	bpf_prog_select_runtime(prog);
+	err = bpf_prog_select_runtime(prog);
+	if (err < 0)
+		goto free_used_maps;
 
 	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
 	if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47dcd3aa6..039d866fd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
 			fn->ret_type, func_id);
 		return -EINVAL;
 	}
+
+	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+	    func_id != BPF_FUNC_tail_call)
+		/* prog_array map type needs extra care:
+		 * only allow to pass it into bpf_tail_call() for now.
+		 * bpf_map_delete_elem() can be allowed in the future,
+		 * while bpf_map_update_elem() must only be done via syscall
+		 */
+		return -EINVAL;
+
+	if (func_id == BPF_FUNC_tail_call &&
+	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+		/* don't allow any other map type to be passed into
+		 * bpf_tail_call()
+		 */
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -1675,6 +1692,8 @@ static int do_check(struct verifier_env *env)
 			}
 
 		} else if (class == BPF_STX) {
+			enum bpf_reg_type dst_reg_type;
+
 			if (BPF_MODE(insn->code) == BPF_XADD) {
 				err = check_xadd(env, insn);
 				if (err)
@@ -1683,11 +1702,6 @@ static int do_check(struct verifier_env *env)
 				continue;
 			}
 
-			if (BPF_MODE(insn->code) != BPF_MEM ||
-			    insn->imm != 0) {
-				verbose("BPF_STX uses reserved fields\n");
-				return -EINVAL;
-			}
 			/* check src1 operand */
 			err = check_reg_arg(regs, insn->src_reg, SRC_OP);
 			if (err)
@@ -1697,6 +1711,8 @@ static int do_check(struct verifier_env *env)
 			if (err)
 				return err;
 
+			dst_reg_type = regs[insn->dst_reg].type;
+
 			/* check that memory (dst_reg + off) is writeable */
 			err = check_mem_access(env, insn->dst_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_WRITE,
@@ -1704,6 +1720,15 @@ static int do_check(struct verifier_env *env)
 			if (err)
 				return err;
 
+			if (insn->imm == 0) {
+				insn->imm = dst_reg_type;
+			} else if (dst_reg_type != insn->imm &&
+				   (dst_reg_type == PTR_TO_CTX ||
+				    insn->imm == PTR_TO_CTX)) {
+				verbose("same insn cannot be used with different pointers\n");
+				return -EINVAL;
+			}
+
 		} else if (class == BPF_ST) {
 			if (BPF_MODE(insn->code) != BPF_MEM ||
 			    insn->src_reg != BPF_REG_0) {
@@ -1822,12 +1847,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (BPF_CLASS(insn->code) == BPF_LDX &&
-		    (BPF_MODE(insn->code) != BPF_MEM ||
-		     insn->imm != 0)) {
+		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
 			verbose("BPF_LDX uses reserved fields\n");
 			return -EINVAL;
 		}
 
+		if (BPF_CLASS(insn->code) == BPF_STX &&
+		    ((BPF_MODE(insn->code) != BPF_MEM &&
+		      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
+			verbose("BPF_STX uses reserved fields\n");
+			return -EINVAL;
+		}
+
 		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
 			struct bpf_map *map;
 			struct fd f;
@@ -1950,12 +1981,17 @@ static int convert_ctx_accesses(struct verifier_env *env)
 	struct bpf_prog *new_prog;
 	u32 cnt;
 	int i;
+	enum bpf_access_type type;
 
 	if (!env->prog->aux->ops->convert_ctx_access)
 		return 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
+			type = BPF_READ;
+		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
+			type = BPF_WRITE;
+		else
 			continue;
 
 		if (insn->imm != PTR_TO_CTX) {
@@ -1965,7 +2001,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
 		}
 
 		cnt = env->prog->aux->ops->
-			convert_ctx_access(insn->dst_reg, insn->src_reg,
+			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
 					   insn->off, insn_buf);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e8a5491be..f89d9292e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
 			   lockdep_is_held(&cgroup_mutex),		\
@@ -156,7 +159,7 @@ static bool cgrp_dfl_root_visible;
 static bool cgroup_legacy_files_on_dfl;
 
 /* some controllers are not supported in the default hierarchy */
-static unsigned int cgrp_dfl_root_inhibit_ss_mask;
+static unsigned long cgrp_dfl_root_inhibit_ss_mask;
 
 /* The list of hierarchy roots */
 
@@ -175,18 +178,19 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
  */
 static u64 css_serial_nr_next = 1;
 
-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
  */
-static int need_forkexit_callback __read_mostly;
+static unsigned long have_fork_callback __read_mostly;
+static unsigned long have_exit_callback __read_mostly;
 
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
-			     unsigned int ss_mask);
+			     unsigned long ss_mask);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		      bool visible);
@@ -261,7 +265,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
  *
- * Similar to cgroup_css() but returns the effctive css, which is defined
+ * Similar to cgroup_css() but returns the effective css, which is defined
  * as the matching css of the nearest ancestor including self which has @ss
  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
  * function is guaranteed to return non-NULL css.
@@ -409,6 +413,24 @@ static int notify_on_release(const struct cgroup *cgrp)
 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
 	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 
+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_maskp: a pointer to the bitmask, as taken by for_each_set_bit()
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * the mask that @ss_maskp points to is set to 1.
+ */
+#define for_each_subsys_which(ss, ssid, ss_maskp)			\
+	if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */	\
+		(ssid) = 0;						\
+	else								\
+		for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)	\
+			if (((ss) = cgroup_subsys[ssid]) && false)	\
+				break;					\
+			else
+
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
 	list_for_each_entry((root), &cgroup_roots, root_list)
@@ -882,7 +904,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 static void cgroup_free_root(struct cgroup_root *root)
 {
 	if (root) {
-		/* hierarhcy ID shoulid already have been released */
+		/* hierarchy ID should already have been released */
 		WARN_ON_ONCE(root->hierarchy_id);
 
 		idr_destroy(&root->cgroup_idr);
@@ -998,7 +1020,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -1068,11 +1090,11 @@ static void cgroup_put(struct cgroup *cgrp)
  * @subtree_control is to be applied to @cgrp.  The returned mask is always
  * a superset of @subtree_control and follows the usual hierarchy rules.
  */
-static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
-						  unsigned int subtree_control)
+static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+						  unsigned long subtree_control)
 {
 	struct cgroup *parent = cgroup_parent(cgrp);
-	unsigned int cur_ss_mask = subtree_control;
+	unsigned long cur_ss_mask = subtree_control;
 	struct cgroup_subsys *ss;
 	int ssid;
 
@@ -1082,11 +1104,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
 		return cur_ss_mask;
 
 	while (true) {
-		unsigned int new_ss_mask = cur_ss_mask;
+		unsigned long new_ss_mask = cur_ss_mask;
 
-		for_each_subsys(ss, ssid)
-			if (cur_ss_mask & (1 << ssid))
-				new_ss_mask |= ss->depends_on;
+		for_each_subsys_which(ss, ssid, &cur_ss_mask)
+			new_ss_mask |= ss->depends_on;
 
 		/*
 		 * Mask out subsystems which aren't available.  This can
@@ -1200,7 +1221,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
  * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
 	int i;
@@ -1215,18 +1236,16 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
 	}
 }
 
-static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root,
+			     unsigned long ss_mask)
 {
 	struct cgroup_subsys *ss;
-	unsigned int tmp_ss_mask;
+	unsigned long tmp_ss_mask;
 	int ssid, i, ret;
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	for_each_subsys(ss, ssid) {
-		if (!(ss_mask & (1 << ssid)))
-			continue;
-
+	for_each_subsys_which(ss, ssid, &ss_mask) {
 		/* if @ss has non-root csses attached to it, can't move */
 		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
 			return -EBUSY;
@@ -1253,7 +1272,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
 		 * Just warn about it and continue.
 		 */
 		if (cgrp_dfl_root_visible) {
-			pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+			pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
 				ret, ss_mask);
 			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
 		}
@@ -1263,18 +1282,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
 	 * Nothing can fail from this point on.  Remove files for the
 	 * removed subsystems and rebind each subsystem.
 	 */
-	for_each_subsys(ss, ssid)
-		if (ss_mask & (1 << ssid))
-			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+	for_each_subsys_which(ss, ssid, &ss_mask)
+		cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
 
-	for_each_subsys(ss, ssid) {
+	for_each_subsys_which(ss, ssid, &ss_mask) {
 		struct cgroup_root *src_root;
 		struct cgroup_subsys_state *css;
 		struct css_set *cset;
 
-		if (!(ss_mask & (1 << ssid)))
-			continue;
-
 		src_root = ss->root;
 		css = cgroup_css(&src_root->cgrp, ss);
 
@@ -1338,7 +1353,7 @@ static int cgroup_show_options(struct seq_file *seq,
 }
 
 struct cgroup_sb_opts {
-	unsigned int subsys_mask;
+	unsigned long subsys_mask;
 	unsigned int flags;
 	char *release_agent;
 	bool cpuset_clone_children;
@@ -1351,7 +1366,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
-	unsigned int mask = -1U;
+	unsigned long mask = -1UL;
 	struct cgroup_subsys *ss;
 	int nr_opts = 0;
 	int i;
@@ -1495,7 +1510,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 	int ret = 0;
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 	struct cgroup_sb_opts opts;
-	unsigned int added_mask, removed_mask;
+	unsigned long added_mask, removed_mask;
 
 	if (root == &cgrp_dfl_root) {
 		pr_err("remount is not allowed\n");
@@ -1641,7 +1656,7 @@ static void init_cgroup_root(struct cgroup_root *root,
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
@@ -2050,9 +2065,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	lockdep_assert_held(&css_set_rwsem);
 
 	/*
-	 * We are synchronized through threadgroup_lock() against PF_EXITING
-	 * setting such that we can't race against cgroup_exit() changing the
-	 * css_set to init_css_set and dropping the old one.
+	 * We are synchronized through cgroup_threadgroup_rwsem against
+	 * PF_EXITING setting such that we can't race against cgroup_exit()
+	 * changing the css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
@@ -2109,10 +2124,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process.  Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process.  Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2215,7 +2231,7 @@ err:
  * @threadgroup: whether @leader points to the whole process or a single task
  *
  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding threadgroup_lock of @leader.  The
+ * process, the caller must be holding cgroup_threadgroup_rwsem.  The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
@@ -2343,7 +2359,7 @@ out_release_tset:
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2374,6 +2390,47 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	return ret;
 }
 
+static int cgroup_procs_write_permission(struct task_struct *task,
+					 struct cgroup *dst_cgrp,
+					 struct kernfs_open_file *of)
+{
+	const struct cred *cred = current_cred();
+	const struct cred *tcred = get_task_cred(task);
+	int ret = 0;
+
+	/*
+	 * even if we're attaching all tasks in the thread group, we only
+	 * need to check permissions on one of them.
+	 */
+	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+	    !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->euid, tcred->suid))
+		ret = -EACCES;
+
+	if (!ret && cgroup_on_dfl(dst_cgrp)) {
+		struct super_block *sb = of->file->f_path.dentry->d_sb;
+		struct cgroup *cgrp;
+		struct inode *inode;
+
+		down_read(&css_set_rwsem);
+		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+		up_read(&css_set_rwsem);
+
+		while (!cgroup_is_descendant(dst_cgrp, cgrp))
+			cgrp = cgroup_parent(cgrp);
+
+		ret = -ENOMEM;
+		inode = kernfs_get_inode(sb, cgrp->procs_kn);
+		if (inode) {
+			ret = inode_permission(inode, MAY_WRITE);
+			iput(inode);
+		}
+	}
+
+	put_cred(tcred);
+	return ret;
+}
+
 /*
  * Find the task_struct of the task to attach by vpid and pass it along to the
  * function to attach either it or all tasks in its threadgroup. Will lock
@@ -2383,7 +2440,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 				    size_t nbytes, loff_t off, bool threadgroup)
 {
 	struct task_struct *tsk;
-	const struct cred *cred = current_cred(), *tcred;
 	struct cgroup *cgrp;
 	pid_t pid;
 	int ret;
@@ -2395,29 +2451,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	if (!cgrp)
 		return -ENODEV;
 
-retry_find_task:
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	rcu_read_lock();
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
-			rcu_read_unlock();
 			ret = -ESRCH;
-			goto out_unlock_cgroup;
+			goto out_unlock_rcu;
 		}
-		/*
-		 * even if we're attaching all tasks in the thread group, we
-		 * only need to check permissions on one of them.
-		 */
-		tcred = __task_cred(tsk);
-		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-		    !uid_eq(cred->euid, tcred->uid) &&
-		    !uid_eq(cred->euid, tcred->suid)) {
-			rcu_read_unlock();
-			ret = -EACCES;
-			goto out_unlock_cgroup;
-		}
-	} else
+	} else {
 		tsk = current;
+	}
 
 	if (threadgroup)
 		tsk = tsk->group_leader;
@@ -2429,35 +2473,23 @@ retry_find_task:
 	 */
 	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
-		rcu_read_unlock();
-		goto out_unlock_cgroup;
+		goto out_unlock_rcu;
 	}
 
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	threadgroup_lock(tsk);
-	if (threadgroup) {
-		if (!thread_group_leader(tsk)) {
-			/*
-			 * a race with de_thread from another thread's exec()
-			 * may strip us of our leadership, if this happens,
-			 * there is no choice but to throw this task away and
-			 * try again; this is
-			 * "double-double-toil-and-trouble-check locking".
-			 */
-			threadgroup_unlock(tsk);
-			put_task_struct(tsk);
-			goto retry_find_task;
-		}
-	}
-
-	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
-
-	threadgroup_unlock(tsk);
+	ret = cgroup_procs_write_permission(tsk, cgrp, of);
+	if (!ret)
+		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
 	put_task_struct(tsk);
-out_unlock_cgroup:
+	goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+	rcu_read_unlock();
+out_unlock_threadgroup:
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cgroup_kn_unlock(of->kn);
 	return ret ?: nbytes;
 }
@@ -2540,19 +2572,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
 {
 	struct cgroup_subsys *ss;
 	bool printed = false;
 	int ssid;
 
-	for_each_subsys(ss, ssid) {
-		if (ss_mask & (1 << ssid)) {
-			if (printed)
-				seq_putc(seq, ' ');
-			seq_printf(seq, "%s", ss->name);
-			printed = true;
-		}
+	for_each_subsys_which(ss, ssid, &ss_mask) {
+		if (printed)
+			seq_putc(seq, ' ');
+		seq_printf(seq, "%s", ss->name);
+		printed = true;
 	}
 	if (printed)
 		seq_putc(seq, '\n');
@@ -2604,6 +2634,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
 	/* look up all csses currently attached to @cgrp's subtree */
 	down_read(&css_set_rwsem);
 	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2659,17 +2691,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 				goto out_finish;
 			last_task = task;
 
-			threadgroup_lock(task);
-			/* raced against de_thread() from another thread? */
-			if (!thread_group_leader(task)) {
-				threadgroup_unlock(task);
-				put_task_struct(task);
-				continue;
-			}
-
 			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
-			threadgroup_unlock(task);
 			put_task_struct(task);
 
 			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2679,6 +2702,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 out_finish:
 	cgroup_migrate_finish(&preloaded_csets);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
 
@@ -2687,8 +2711,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 					    char *buf, size_t nbytes,
 					    loff_t off)
 {
-	unsigned int enable = 0, disable = 0;
-	unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+	unsigned long enable = 0, disable = 0;
+	unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
 	struct cgroup *cgrp, *child;
 	struct cgroup_subsys *ss;
 	char *tok;
@@ -2700,11 +2724,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	 */
 	buf = strstrip(buf);
 	while ((tok = strsep(&buf, " "))) {
+		unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
+
 		if (tok[0] == '\0')
 			continue;
-		for_each_subsys(ss, ssid) {
-			if (ss->disabled || strcmp(tok + 1, ss->name) ||
-			    ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+		for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+			if (ss->disabled || strcmp(tok + 1, ss->name))
 				continue;
 
 			if (*tok == '+') {
@@ -2791,10 +2816,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	 * still around.  In such cases, wait till it's gone using
 	 * offline_waitq.
 	 */
-	for_each_subsys(ss, ssid) {
-		if (!(css_enable & (1 << ssid)))
-			continue;
-
+	for_each_subsys_which(ss, ssid, &css_enable) {
 		cgroup_for_each_live_child(child, cgrp) {
 			DEFINE_WAIT(wait);
 
@@ -3085,7 +3107,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 		return ret;
 	}
 
-	if (cft->seq_show == cgroup_populated_show)
+	if (cft->write == cgroup_procs_write)
+		cgrp->procs_kn = kn;
+	else if (cft->seq_show == cgroup_populated_show)
 		cgrp->populated_kn = kn;
 	return 0;
 }
@@ -4320,7 +4344,7 @@ static struct cftype cgroup_legacy_base_files[] = {
  *
  * On failure, no file is added.
  */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
 	int i, ret = 0;
@@ -4929,7 +4953,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 	 * init_css_set is in the subsystem's root cgroup. */
 	init_css_set.subsys[ss->id] = css;
 
-	need_forkexit_callback |= ss->fork || ss->exit;
+	have_fork_callback |= (bool)ss->fork << ss->id;
+	have_exit_callback |= (bool)ss->exit << ss->id;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -4987,6 +5012,7 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int ssid, err;
 
+	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
@@ -5239,11 +5265,8 @@ void cgroup_post_fork(struct task_struct *child)
 	 * css_set; otherwise, @child might change state between ->fork()
 	 * and addition to css_set.
 	 */
-	if (need_forkexit_callback) {
-		for_each_subsys(ss, i)
-			if (ss->fork)
-				ss->fork(child);
-	}
+	for_each_subsys_which(ss, i, &have_fork_callback)
+		ss->fork(child);
 }
 
 /**
@@ -5287,16 +5310,12 @@ void cgroup_exit(struct task_struct *tsk)
 	cset = task_css_set(tsk);
 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
-	if (need_forkexit_callback) {
-		/* see cgroup_post_fork() for details */
-		for_each_subsys(ss, i) {
-			if (ss->exit) {
-				struct cgroup_subsys_state *old_css = cset->subsys[i];
-				struct cgroup_subsys_state *css = task_css(tsk, i);
+	/* see cgroup_post_fork() for details */
+	for_each_subsys_which(ss, i, &have_exit_callback) {
+		struct cgroup_subsys_state *old_css = cset->subsys[i];
+		struct cgroup_subsys_state *css = task_css(tsk, i);
 
-				ss->exit(css, old_css, tsk);
-			}
-		}
+		ss->exit(css, old_css, tsk);
 	}
 
 	if (put_cset)
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
new file mode 100644
index 000000000..ff756221f
--- /dev/null
+++ b/kernel/configs/xen.config
@@ -0,0 +1,48 @@
+# global stuff - these enable us to allow some
+# of the not so generic stuff below for xen
+CONFIG_PARAVIRT=y
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_WATCHDOG=y
+CONFIG_TARGET_CORE=y
+CONFIG_SCSI=y
+CONFIG_FB=y
+CONFIG_INPUT_MISC=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_TTY=y
+# Technically not required but otherwise produces
+# pretty useless systems starting from allnoconfig
+# You want TCP/IP and ELF binaries, right?
+CONFIG_INET=y
+CONFIG_BINFMT_ELF=y
+# generic config
+CONFIG_XEN=y
+CONFIG_XEN_DOM0=y
+# backend drivers
+CONFIG_XEN_BACKEND=y
+CONFIG_XEN_BLKDEV_BACKEND=m
+CONFIG_XEN_NETDEV_BACKEND=m
+CONFIG_HVC_XEN=y
+CONFIG_XEN_WDT=m
+CONFIG_XEN_SCSI_BACKEND=m
+# frontend drivers
+CONFIG_XEN_FBDEV_FRONTEND=m
+CONFIG_HVC_XEN_FRONTEND=y
+CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
+CONFIG_XEN_SCSI_FRONTEND=m
+# others
+CONFIG_XEN_BALLOON=y
+CONFIG_XEN_SCRUB_PAGES=y
+CONFIG_XEN_DEV_EVTCHN=m
+CONFIG_XEN_BLKDEV_FRONTEND=m
+CONFIG_XEN_NETDEV_FRONTEND=m
+CONFIG_XENFS=m
+CONFIG_XEN_COMPAT_XENFS=y
+CONFIG_XEN_SYS_HYPERVISOR=y
+CONFIG_XEN_XENBUS_FRONTEND=y
+CONFIG_XEN_GNTDEV=m
+CONFIG_XEN_GRANT_DEV_ALLOC=m
+CONFIG_SWIOTLB_XEN=y
+CONFIG_XEN_PRIVCMD=m
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 72d59a1a6..0a495ab35 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -30,12 +30,23 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled);
 DEFINE_PER_CPU(struct context_tracking, context_tracking);
 EXPORT_SYMBOL_GPL(context_tracking);
 
-void context_tracking_cpu_set(int cpu)
+static bool context_tracking_recursion_enter(void)
 {
-	if (!per_cpu(context_tracking.active, cpu)) {
-		per_cpu(context_tracking.active, cpu) = true;
-		static_key_slow_inc(&context_tracking_enabled);
-	}
+	int recursion;
+
+	recursion = __this_cpu_inc_return(context_tracking.recursion);
+	if (recursion == 1)
+		return true;
+
+	WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
+	__this_cpu_dec(context_tracking.recursion);
+
+	return false;
+}
+
+static void context_tracking_recursion_exit(void)
+{
+	__this_cpu_dec(context_tracking.recursion);
 }
 
 /**
@@ -75,6 +86,9 @@ void context_tracking_enter(enum ctx_state state)
 	WARN_ON_ONCE(!current->mm);
 
 	local_irq_save(flags);
+	if (!context_tracking_recursion_enter())
+		goto out_irq_restore;
+
 	if ( __this_cpu_read(context_tracking.state) != state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
@@ -105,6 +119,8 @@ void context_tracking_enter(enum ctx_state state)
 		 */
 		__this_cpu_write(context_tracking.state, state);
 	}
+	context_tracking_recursion_exit();
+out_irq_restore:
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_enter);
@@ -139,6 +155,9 @@ void context_tracking_exit(enum ctx_state state)
 		return;
 
 	local_irq_save(flags);
+	if (!context_tracking_recursion_enter())
+		goto out_irq_restore;
+
 	if (__this_cpu_read(context_tracking.state) == state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
@@ -153,6 +172,8 @@ void context_tracking_exit(enum ctx_state state)
 		}
 		__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
 	}
+	context_tracking_recursion_exit();
+out_irq_restore:
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_exit);
@@ -164,24 +185,26 @@ void context_tracking_user_exit(void)
 }
 NOKPROBE_SYMBOL(context_tracking_user_exit);
 
-/**
- * __context_tracking_task_switch - context switch the syscall callbacks
- * @prev: the task that is being switched out
- * @next: the task that is being switched in
- *
- * The context tracking uses the syscall slow path to implement its user-kernel
- * boundaries probes on syscalls. This way it doesn't impact the syscall fast
- * path on CPUs that don't do context tracking.
- *
- * But we need to clear the flag on the previous task because it may later
- * migrate to some CPU that doesn't do the context tracking. As such the TIF
- * flag may not be desired there.
- */
-void __context_tracking_task_switch(struct task_struct *prev,
-				    struct task_struct *next)
+void __init context_tracking_cpu_set(int cpu)
 {
-	clear_tsk_thread_flag(prev, TIF_NOHZ);
-	set_tsk_thread_flag(next, TIF_NOHZ);
+	static __initdata bool initialized = false;
+
+	if (!per_cpu(context_tracking.active, cpu)) {
+		per_cpu(context_tracking.active, cpu) = true;
+		static_key_slow_inc(&context_tracking_enabled);
+	}
+
+	if (initialized)
+		return;
+
+	/*
+	 * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork
+	 * This assumes that init is the only task at this early boot stage.
+	 */
+	set_tsk_thread_flag(&init_task, TIF_NOHZ);
+	WARN_ON_ONCE(!tasklist_empty());
+
+	initialized = true;
 }
 
 #ifdef CONFIG_CONTEXT_TRACKING_FORCE
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 94bbe4695..5644ec558 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,7 @@
 #include <linux/suspend.h>
 #include <linux/lockdep.h>
 #include <linux/tick.h>
+#include <linux/irq.h>
 #include <trace/events/power.h>
 
 #include "smpboot.h"
@@ -392,14 +393,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	smpboot_park_threads(cpu);
 
 	/*
-	 * So now all preempt/rcu users must observe !cpu_active().
+	 * Prevent irq alloc/free while the dying cpu reorganizes the
+	 * interrupt affinities.
 	 */
+	irq_lock_sparse();
 
+	/*
+	 * So now all preempt/rcu users must observe !cpu_active().
+	 */
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		smpboot_unpark_threads(cpu);
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+		irq_unlock_sparse();
 		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
@@ -416,6 +422,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
 	per_cpu(cpu_dead_idle, cpu) = false;
 
+	/* Interrupts are moved away from the dying cpu, reenable alloc/free */
+	irq_unlock_sparse();
+
 	hotplug_cpu__broadcast_tick_pull(cpu);
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
@@ -463,6 +472,7 @@ static int smpboot_thread_call(struct notifier_block *nfb,
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 
+	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
 		smpboot_unpark_threads(cpu);
 		break;
@@ -479,7 +489,7 @@ static struct notifier_block smpboot_thread_notifier = {
 	.priority = CPU_PRI_SMPBOOT,
 };
 
-void __cpuinit smpboot_thread_init(void)
+void smpboot_thread_init(void)
 {
 	register_cpu_notifier(&smpboot_thread_notifier);
 }
@@ -519,6 +529,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 
 	/* Arch-specific enabling code. */
 	ret = __cpu_up(cpu, idle);
+
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee14e3a35..f0acff0f6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->mems_allowed as a temp variable */
-	update_nodemasks_hier(cs, &cs->mems_allowed);
+	update_nodemasks_hier(cs, &trialcs->mems_allowed);
 done:
 	return retval;
 }
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index d12807d40..ef90b04d7 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk_seruntime(tsk);
+	t3 = tsk->se.sum_exec_runtime;
 
 	d->cpu_count += t1;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0ceb38677..e6feb5114 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,7 +36,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/cgroup.h>
 #include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
 #include <linux/module.h>
@@ -51,9 +51,11 @@
 
 static struct workqueue_struct *perf_wq;
 
+typedef int (*remote_function_f)(void *);
+
 struct remote_function_call {
 	struct task_struct	*p;
-	int			(*func)(void *info);
+	remote_function_f	func;
 	void			*info;
 	int			ret;
 };
@@ -86,7 +88,7 @@ static void remote_function(void *data)
  *	    -EAGAIN - when the process moved away
  */
 static int
-task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+task_function_call(struct task_struct *p, remote_function_f func, void *info)
 {
 	struct remote_function_call data = {
 		.p	= p,
@@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  *
  * returns: @func return value or -ENXIO when the cpu is offline
  */
-static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+static int cpu_function_call(int cpu, remote_function_f func, void *info)
 {
 	struct remote_function_call data = {
 		.p	= NULL,
@@ -747,62 +749,31 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 /*
  * function must be called with interrupts disbled
  */
-static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
 	struct perf_cpu_context *cpuctx;
-	enum hrtimer_restart ret = HRTIMER_NORESTART;
 	int rotations = 0;
 
 	WARN_ON(!irqs_disabled());
 
 	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
 	rotations = perf_rotate_context(cpuctx);
 
-	/*
-	 * arm timer if needed
-	 */
-	if (rotations) {
+	raw_spin_lock(&cpuctx->hrtimer_lock);
+	if (rotations)
 		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
-		ret = HRTIMER_RESTART;
-	}
-
-	return ret;
-}
-
-/* CPU is going down */
-void perf_cpu_hrtimer_cancel(int cpu)
-{
-	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
-	unsigned long flags;
-
-	if (WARN_ON(cpu != smp_processor_id()))
-		return;
-
-	local_irq_save(flags);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-		if (pmu->task_ctx_nr == perf_sw_context)
-			continue;
-
-		hrtimer_cancel(&cpuctx->hrtimer);
-	}
-
-	rcu_read_unlock();
+	else
+		cpuctx->hrtimer_active = 0;
+	raw_spin_unlock(&cpuctx->hrtimer_lock);
 
-	local_irq_restore(flags);
+	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 }
 
-static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 {
-	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
-	int timer;
+	u64 interval;
 
 	/* no multiplexing needed for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
@@ -812,31 +783,36 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 	 * check default is sane, if not set then force to
 	 * default interval (1/tick)
 	 */
-	timer = pmu->hrtimer_interval_ms;
-	if (timer < 1)
-		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+	interval = pmu->hrtimer_interval_ms;
+	if (interval < 1)
+		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 
-	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
-	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
-	hr->function = perf_cpu_hrtimer_handler;
+	raw_spin_lock_init(&cpuctx->hrtimer_lock);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	timer->function = perf_mux_hrtimer_handler;
 }
 
-static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
 {
-	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
+	unsigned long flags;
 
 	/* not for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
-		return;
+		return 0;
 
-	if (hrtimer_active(hr))
-		return;
+	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+	if (!cpuctx->hrtimer_active) {
+		cpuctx->hrtimer_active = 1;
+		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+	}
+	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
 
-	if (!hrtimer_callback_running(hr))
-		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
-					 0, HRTIMER_MODE_REL_PINNED, 0);
+	return 0;
 }
 
 void perf_pmu_disable(struct pmu *pmu)
@@ -1526,11 +1502,17 @@ static int __init perf_workqueue_init(void)
 
 core_initcall(perf_workqueue_init);
 
+static inline int pmu_filter_match(struct perf_event *event)
+{
+	struct pmu *pmu = event->pmu;
+	return pmu->filter_match ? pmu->filter_match(event) : 1;
+}
+
 static inline int
 event_filter_match(struct perf_event *event)
 {
 	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event);
+	    && perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
@@ -1886,8 +1868,6 @@ event_sched_in(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
-	event->tstamp_running += tstamp - event->tstamp_stopped;
-
 	perf_set_shadow_time(event, ctx, tstamp);
 
 	perf_log_itrace_start(event);
@@ -1899,6 +1879,8 @@ event_sched_in(struct perf_event *event,
 		goto out;
 	}
 
+	event->tstamp_running += tstamp - event->tstamp_stopped;
+
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	if (!ctx->nr_active++)
@@ -1935,7 +1917,7 @@ group_sched_in(struct perf_event *group_event,
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
-		perf_cpu_hrtimer_restart(cpuctx);
+		perf_mux_hrtimer_restart(cpuctx);
 		return -EAGAIN;
 	}
 
@@ -1982,7 +1964,7 @@ group_error:
 
 	pmu->cancel_txn(pmu);
 
-	perf_cpu_hrtimer_restart(cpuctx);
+	perf_mux_hrtimer_restart(cpuctx);
 
 	return -EAGAIN;
 }
@@ -2255,7 +2237,7 @@ static int __perf_event_enable(void *info)
 		 */
 		if (leader != event) {
 			group_sched_out(leader, cpuctx, ctx);
-			perf_cpu_hrtimer_restart(cpuctx);
+			perf_mux_hrtimer_restart(cpuctx);
 		}
 		if (leader->attr.pinned) {
 			update_group_times(leader);
@@ -3976,28 +3958,21 @@ static void perf_event_for_each(struct perf_event *event,
 		perf_event_for_each_child(sibling, func);
 }
 
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
-{
-	struct perf_event_context *ctx = event->ctx;
-	int ret = 0, active;
+struct period_event {
+	struct perf_event *event;
 	u64 value;
+};
 
-	if (!is_sampling_event(event))
-		return -EINVAL;
-
-	if (copy_from_user(&value, arg, sizeof(value)))
-		return -EFAULT;
-
-	if (!value)
-		return -EINVAL;
+static int __perf_event_period(void *info)
+{
+	struct period_event *pe = info;
+	struct perf_event *event = pe->event;
+	struct perf_event_context *ctx = event->ctx;
+	u64 value = pe->value;
+	bool active;
 
-	raw_spin_lock_irq(&ctx->lock);
+	raw_spin_lock(&ctx->lock);
 	if (event->attr.freq) {
-		if (value > sysctl_perf_event_sample_rate) {
-			ret = -EINVAL;
-			goto unlock;
-		}
-
 		event->attr.sample_freq = value;
 	} else {
 		event->attr.sample_period = value;
@@ -4016,11 +3991,75 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
 		event->pmu->start(event, PERF_EF_RELOAD);
 		perf_pmu_enable(ctx->pmu);
 	}
+	raw_spin_unlock(&ctx->lock);
 
-unlock:
+	return 0;
+}
+
+/*
+ * Read a new sample period (or frequency, for attr.freq events) from @arg
+ * and apply it to @event.  The update is run through cpu_function_call()
+ * or task_function_call(); if the task call fails and the context is not
+ * active, the fields are written directly under ctx->lock.
+ *
+ * Returns 0 on success, -EFAULT if @arg cannot be read, or -EINVAL for a
+ * non-sampling event, a zero value, or a frequency above
+ * sysctl_perf_event_sample_rate.
+ */
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+	struct period_event pe = { .event = event, };
+	struct perf_event_context *ctx = event->ctx;
+	struct task_struct *task;
+	u64 value;
+
+	if (!is_sampling_event(event))
+		return -EINVAL;
+
+	if (copy_from_user(&value, arg, sizeof(value)))
+		return -EFAULT;
+
+	if (!value)
+		return -EINVAL;
+
+	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
+		return -EINVAL;
+
+	task = ctx->task;
+	pe.value = value;
+
+	if (!task) {
+		cpu_function_call(event->cpu, __perf_event_period, &pe);
+		return 0;
+	}
+
+retry:
+	if (!task_function_call(task, __perf_event_period, &pe))
+		return 0;
+
+	raw_spin_lock_irq(&ctx->lock);
+	if (ctx->is_active) {
+		raw_spin_unlock_irq(&ctx->lock);
+		task = ctx->task;
+		goto retry;
+	}
+
+	/*
+	 * The context is inactive, so update the event by hand.  Calling
+	 * __perf_event_period() here would take ctx->lock a second time
+	 * and deadlock.
+	 */
+	if (event->attr.freq) {
+		event->attr.sample_freq = value;
+	} else {
+		event->attr.sample_period = value;
+		event->hw.sample_period = value;
+	}
+
+	local64_set(&event->hw.period_left, 0);
 	raw_spin_unlock_irq(&ctx->lock);
 
-	return ret;
+	return 0;
 }
 
 static const struct file_operations perf_fops;
@@ -4376,14 +4393,6 @@ static void ring_buffer_wakeup(struct perf_event *event)
 	rcu_read_unlock();
 }
 
-static void rb_free_rcu(struct rcu_head *rcu_head)
-{
-	struct ring_buffer *rb;
-
-	rb = container_of(rcu_head, struct ring_buffer, rcu_head);
-	rb_free(rb);
-}
-
 struct ring_buffer *ring_buffer_get(struct perf_event *event)
 {
 	struct ring_buffer *rb;
@@ -4766,12 +4775,20 @@ static const struct file_operations perf_fops = {
  * to user-space before waking everybody up.
  */
 
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+	/* only the parent has fasync state */
+	if (event->parent)
+		event = event->parent;
+	return &event->fasync;
+}
+
 void perf_event_wakeup(struct perf_event *event)
 {
 	ring_buffer_wakeup(event);
 
 	if (event->pending_kill) {
-		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+		kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
 		event->pending_kill = 0;
 	}
 }
@@ -5381,9 +5398,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 }
 
-static void perf_event_output(struct perf_event *event,
-				struct perf_sample_data *data,
-				struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+			struct perf_sample_data *data,
+			struct pt_regs *regs)
 {
 	struct perf_output_handle handle;
 	struct perf_event_header header;
@@ -5812,7 +5829,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
-		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+		name = file_path(file, buf, PATH_MAX - sizeof(u64));
 		if (IS_ERR(name)) {
 			name = "//toolong";
 			goto cpy_name;
@@ -5975,6 +5992,39 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
 }
 
 /*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				lost;
+	} lost_samples_event = {
+		.header = {
+			.type = PERF_RECORD_LOST_SAMPLES,
+			.misc = 0,
+			.size = sizeof(lost_samples_event),
+		},
+		.lost		= lost,
+	};
+
+	perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+	ret = perf_output_begin(&handle, event,
+				lost_samples_event.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, lost_samples_event);
+	perf_event__output_id_sample(event, &handle, &sample);
+	perf_output_end(&handle);
+}
+
+/*
  * IRQ throttle logging
  */
 
@@ -6117,7 +6167,7 @@ static int __perf_event_overflow(struct perf_event *event,
 	else
 		perf_event_output(event, data, regs);
 
-	if (event->fasync && event->pending_kill) {
+	if (*perf_event_fasync(event) && event->pending_kill) {
 		event->pending_wakeup = 1;
 		irq_work_queue(&event->pending);
 	}
@@ -6864,9 +6914,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
 	} else {
 		period = max_t(u64, 10000, hwc->sample_period);
 	}
-	__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -7167,6 +7216,8 @@ perf_event_mux_interval_ms_show(struct device *dev,
 	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
 }
 
+static DEFINE_MUTEX(mux_interval_mutex);
+
 static ssize_t
 perf_event_mux_interval_ms_store(struct device *dev,
 				 struct device_attribute *attr,
@@ -7186,17 +7237,21 @@ perf_event_mux_interval_ms_store(struct device *dev,
 	if (timer == pmu->hrtimer_interval_ms)
 		return count;
 
+	mutex_lock(&mux_interval_mutex);
 	pmu->hrtimer_interval_ms = timer;
 
 	/* update all cpuctx for this PMU */
-	for_each_possible_cpu(cpu) {
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
 		struct perf_cpu_context *cpuctx;
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 
-		if (hrtimer_active(&cpuctx->hrtimer))
-			hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+		cpu_function_call(cpu,
+			(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
 	}
+	put_online_cpus();
+	mutex_unlock(&mux_interval_mutex);
 
 	return count;
 }
@@ -7301,7 +7356,7 @@ skip_type:
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.pmu = pmu;
 
-		__perf_cpu_hrtimer_init(cpuctx, cpu);
+		__perf_mux_hrtimer_init(cpuctx, cpu);
 
 		cpuctx->unique_pmu = pmu;
 	}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 9f6ce9ba4..2bbad9c12 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,6 +11,7 @@
 struct ring_buffer {
 	atomic_t			refcount;
 	struct rcu_head			rcu_head;
+	struct irq_work			irq_work;
 #ifdef CONFIG_PERF_USE_VMALLOC
 	struct work_struct		work;
 	int				page_order;	/* allocation order  */
@@ -55,6 +56,15 @@ struct ring_buffer {
 };
 
 extern void rb_free(struct ring_buffer *rb);
+
+static inline void rb_free_rcu(struct rcu_head *rcu_head)
+{
+	struct ring_buffer *rb;
+
+	rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+	rb_free(rb);
+}
+
 extern struct ring_buffer *
 rb_alloc(int nr_pages, long watermark, int cpu, int flags);
 extern void perf_event_wakeup(struct perf_event *event);
@@ -72,15 +82,6 @@ static inline bool rb_has_aux(struct ring_buffer *rb)
 void perf_event_aux_event(struct perf_event *event, unsigned long head,
 			  unsigned long size, u64 flags);
 
-extern void
-perf_event_header__init_id(struct perf_event_header *header,
-			   struct perf_sample_data *data,
-			   struct perf_event *event);
-extern void
-perf_event__output_id_sample(struct perf_event *event,
-			     struct perf_output_handle *handle,
-			     struct perf_sample_data *sample);
-
 extern struct page *
 perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 725c41608..c8aa3f75b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_get_handle(handle);
 
 	do {
-		tail = ACCESS_ONCE(rb->user_page->data_tail);
+		tail = READ_ONCE_CTRL(rb->user_page->data_tail);
 		offset = head = local_read(&rb->head);
 		if (!rb->overwrite &&
 		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
@@ -221,6 +221,8 @@ void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
+static void rb_irq_work(struct irq_work *work);
+
 static void
 ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 {
@@ -241,6 +243,16 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 
 	INIT_LIST_HEAD(&rb->event_list);
 	spin_lock_init(&rb->event_lock);
+	init_irq_work(&rb->irq_work, rb_irq_work);
+}
+
+static void ring_buffer_put_async(struct ring_buffer *rb)
+{
+	if (!atomic_dec_and_test(&rb->refcount))
+		return;	/* other references remain */
+
+	rb->rcu_head.next = (void *)rb;	/* flag for rb_irq_work(): free rb */
+	irq_work_queue(&rb->irq_work);
 }
 
 /*
@@ -319,7 +331,7 @@ err_put:
 	rb_free_aux(rb);
 
 err:
-	ring_buffer_put(rb);
+	ring_buffer_put_async(rb);
 	handle->event = NULL;
 
 	return NULL;
@@ -370,7 +382,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 
 	local_set(&rb->aux_nest, 0);
 	rb_free_aux(rb);
-	ring_buffer_put(rb);
+	ring_buffer_put_async(rb);
 }
 
 /*
@@ -547,17 +559,30 @@ static void __rb_free_aux(struct ring_buffer *rb)
 		rb->aux_priv = NULL;
 	}
 
-	for (pg = 0; pg < rb->aux_nr_pages; pg++)
-		rb_free_aux_page(rb, pg);
+	if (rb->aux_nr_pages) {
+		for (pg = 0; pg < rb->aux_nr_pages; pg++)
+			rb_free_aux_page(rb, pg);
 
-	kfree(rb->aux_pages);
-	rb->aux_nr_pages = 0;
+		kfree(rb->aux_pages);
+		rb->aux_nr_pages = 0;
+	}
 }
 
 void rb_free_aux(struct ring_buffer *rb)
 {
 	if (atomic_dec_and_test(&rb->aux_refcount))
+		irq_work_queue(&rb->irq_work);
+}
+
+static void rb_irq_work(struct irq_work *work)
+{
+	struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
+
+	if (!atomic_read(&rb->aux_refcount))	/* AUX refcount is zero */
 		__rb_free_aux(rb);
+
+	if (rb->rcu_head.next == (void *)rb)	/* last rb reference dropped */
+		call_rcu(&rb->rcu_head, rb_free_rcu);
 }
 
 #ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/exit.c b/kernel/exit.c
index 490a707c7..031325e9a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -135,7 +135,7 @@ static void __exit_signal(struct task_struct *tsk)
 	sig->inblock += task_io_get_inblock(tsk);
 	sig->oublock += task_io_get_oublock(tsk);
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
-	sig->sum_sched_runtime += tsk_seruntime(tsk);
+	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
 	__unhash_process(tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk)
 	mm_update_next_owner(mm);
 	mmput(mm);
 	if (test_thread_flag(TIF_MEMDIE))
-		unmark_oom_victim();
+		exit_oom_victim();
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -711,10 +711,10 @@ void do_exit(long code)
 			current->comm, task_pid_nr(current),
 			preempt_count());
 
-	acct_update_integrals(tsk);
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)
 		sync_mm_rss(tsk->mm);
+	acct_update_integrals(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index e37f372d3..d6dfe2c23 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -138,7 +138,7 @@ static struct kmem_cache *task_struct_cachep;
 
 static inline struct task_struct *alloc_task_struct_node(int node)
 {
-	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node);
+	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
 }
 
 static inline void free_task_struct(struct task_struct *tsk)
@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
 void __init fork_init(void)
 {
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -295,7 +300,7 @@ void __init fork_init(void)
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", arch_task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
@@ -456,7 +461,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			struct inode *inode = file_inode(file);
 			struct address_space *mapping = file->f_mapping;
 
-			vma_get_file(tmp);
+			get_file(file);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
 			i_mmap_lock_write(mapping);
@@ -1091,10 +1096,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 {
 	unsigned long cpu_limit;
 
-	/* Thread group counters. */
-	thread_group_cputime_init(sig);
-
-	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
 		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 		sig->cputimer.running = 1;
@@ -1144,10 +1146,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
-#ifdef CONFIG_CGROUPS
-	init_rwsem(&sig->group_rwsem);
-#endif
-
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
@@ -1241,7 +1239,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 					unsigned long stack_size,
 					int __user *child_tidptr,
 					struct pid *pid,
-					int trace)
+					int trace,
+					unsigned long tls)
 {
 	int retval;
 	struct task_struct *p;
@@ -1396,6 +1395,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->hardirq_context = 0;
 	p->softirq_context = 0;
 #endif
+
+	p->pagefault_disabled = 0;
+
 #ifdef CONFIG_LOCKDEP
 	p->lockdep_depth = 0; /* no locks held yet */
 	p->curr_chain_key = 0;
@@ -1447,7 +1449,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	retval = copy_io(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread(clone_flags, stack_start, stack_size, p);
+	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
@@ -1659,7 +1661,7 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task->pids);
 		init_idle(task, cpu);
@@ -1674,11 +1676,12 @@ struct task_struct *fork_idle(int cpu)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long do_fork(unsigned long clone_flags,
+long _do_fork(unsigned long clone_flags,
 	      unsigned long stack_start,
 	      unsigned long stack_size,
 	      int __user *parent_tidptr,
-	      int __user *child_tidptr)
+	      int __user *child_tidptr,
+	      unsigned long tls)
 {
 	struct task_struct *p;
 	int trace = 0;
@@ -1703,7 +1706,7 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	p = copy_process(clone_flags, stack_start, stack_size,
-			 child_tidptr, NULL, trace);
+			 child_tidptr, NULL, trace, tls);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1744,20 +1747,34 @@ long do_fork(unsigned long clone_flags,
 	return nr;
 }
 
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+/* For compatibility with architectures that call do_fork directly rather than
+ * using the syscall entry points below; forwards to _do_fork() with tls = 0. */
+long do_fork(unsigned long clone_flags,
+	      unsigned long stack_start,
+	      unsigned long stack_size,
+	      int __user *parent_tidptr,
+	      int __user *child_tidptr)
+{
+	return _do_fork(clone_flags, stack_start, stack_size,
+			parent_tidptr, child_tidptr, 0);
+}
+#endif
+
 /*
  * Create a kernel thread.
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
-	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-		(unsigned long)arg, NULL, NULL);
+	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+		(unsigned long)arg, NULL, NULL, 0);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-	return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
 #else
 	/* can not support in nommu mode */
 	return -EINVAL;
@@ -1768,8 +1785,8 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-			0, NULL, NULL);
+	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+			0, NULL, NULL, 0);
 }
 #endif
 
@@ -1777,27 +1794,27 @@ SYSCALL_DEFINE0(vfork)
 #ifdef CONFIG_CLONE_BACKWARDS
 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 int __user *, parent_tidptr,
-		 int, tls_val,
+		 unsigned long, tls,
 		 int __user *, child_tidptr)
 #elif defined(CONFIG_CLONE_BACKWARDS2)
 SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
 		 int __user *, parent_tidptr,
 		 int __user *, child_tidptr,
-		 int, tls_val)
+		 unsigned long, tls)
 #elif defined(CONFIG_CLONE_BACKWARDS3)
 SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
 		int, stack_size,
 		int __user *, parent_tidptr,
 		int __user *, child_tidptr,
-		int, tls_val)
+		unsigned long, tls)
 #else
 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 int __user *, parent_tidptr,
 		 int __user *, child_tidptr,
-		 int, tls_val)
+		 unsigned long, tls)
 #endif
 {
-	return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
 }
 #endif
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407f..c4a182f53 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
 
 /*
  * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
  */
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 {
 	struct task_struct *p = q->task;
 
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
 		return;
 
 	/*
-	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
-	 * a non-futex wake up happens on another CPU then the task
-	 * might exit and p would dereference a non-existing task
-	 * struct. Prevent this by holding a reference on p across the
-	 * wake up.
+	 * Queue the task for later wakeup for after we've released
+	 * the hb->lock. wake_q_add() grabs reference to p.
 	 */
-	get_task_struct(p);
-
+	wake_q_add(wake_q, p);
 	__unqueue_futex(q);
 	/*
 	 * The waiting task can free the futex_q as soon as
@@ -1117,16 +1115,16 @@ static void wake_futex(struct futex_q *q)
 	 */
 	smp_wmb();
 	q->lock_ptr = NULL;
-
-	wake_up_state(p, TASK_NORMAL);
-	put_task_struct(p);
 }
 
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+			 struct futex_hash_bucket *hb)
 {
 	struct task_struct *new_owner;
 	struct futex_pi_state *pi_state = this->pi_state;
 	u32 uninitialized_var(curval), newval;
+	WAKE_Q(wake_q);
+	bool deboost;
 	int ret = 0;
 
 	if (!pi_state)
@@ -1178,7 +1176,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	raw_spin_unlock_irq(&new_owner->pi_lock);
 
 	raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-	rt_mutex_unlock(&pi_state->pi_mutex);
+
+	deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+
+	/*
+	 * First unlock HB so the waiter does not spin on it once he got woken
+	 * up. Second wake up the waiter before the priority is adjusted. If we
+	 * deboost first (and lose our higher priority), then the task might get
+	 * scheduled away before the wake up can take place.
+	 */
+	spin_unlock(&hb->lock);
+	wake_up_q(&wake_q);
+	if (deboost)
+		rt_mutex_adjust_prio(current);
 
 	return 0;
 }
@@ -1217,6 +1227,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	struct futex_q *this, *next;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
+	WAKE_Q(wake_q);
 
 	if (!bitset)
 		return -EINVAL;
@@ -1244,13 +1255,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 			if (!(this->bitset & bitset))
 				continue;
 
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
 	}
 
 	spin_unlock(&hb->lock);
+	wake_up_q(&wake_q);
 out_put_key:
 	put_futex_key(&key);
 out:
@@ -1269,6 +1281,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
 	int ret, op_ret;
+	WAKE_Q(wake_q);
 
 retry:
 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1333,7 @@ retry_private:
 				ret = -EINVAL;
 				goto out_unlock;
 			}
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
@@ -1334,7 +1347,7 @@ retry_private:
 					ret = -EINVAL;
 					goto out_unlock;
 				}
-				wake_futex(this);
+				mark_wake_futex(&wake_q, this);
 				if (++op_ret >= nr_wake2)
 					break;
 			}
@@ -1344,6 +1357,7 @@ retry_private:
 
 out_unlock:
 	double_unlock_hb(hb1, hb2);
+	wake_up_q(&wake_q);
 out_put_keys:
 	put_futex_key(&key2);
 out_put_key1:
@@ -1503,6 +1517,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
+	WAKE_Q(wake_q);
 
 	if (requeue_pi) {
 		/*
@@ -1679,7 +1694,7 @@ retry_private:
 		 * woken by futex_unlock_pi().
 		 */
 		if (++task_count <= nr_wake && !requeue_pi) {
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			continue;
 		}
 
@@ -1719,6 +1734,7 @@ retry_private:
 out_unlock:
 	free_pi_state(pi_state);
 	double_unlock_hb(hb1, hb2);
+	wake_up_q(&wake_q);
 	hb_waiters_dec(hb2);
 
 	/*
@@ -2055,7 +2071,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 {
 	/*
 	 * The task state is guaranteed to be set before another task can
-	 * wake it. set_current_state() is implemented using set_mb() and
+	 * wake it. set_current_state() is implemented using smp_store_mb() and
 	 * queue_me() calls spin_unlock() upon completion, both serializing
 	 * access to the hash list and forcing another memory barrier.
 	 */
@@ -2063,11 +2079,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 	queue_me(q, hb);
 
 	/* Arm the timer */
-	if (timeout) {
+	if (timeout)
 		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
 
 	/*
 	 * If we have been removed from the hash list, then another task
@@ -2412,13 +2425,23 @@ retry:
 	 */
 	match = futex_top_waiter(hb, &key);
 	if (match) {
-		ret = wake_futex_pi(uaddr, uval, match);
+		ret = wake_futex_pi(uaddr, uval, match, hb);
+		/*
+		 * In case of success wake_futex_pi dropped the hash
+		 * bucket lock.
+		 */
+		if (!ret)
+			goto out_putkey;
 		/*
 		 * The atomic access to the futex value generated a
 		 * pagefault, so retry the user-access and the wakeup:
 		 */
 		if (ret == -EFAULT)
 			goto pi_faulted;
+		/*
+		 * wake_futex_pi has detected invalid state. Tell user
+		 * space.
+		 */
 		goto out_unlock;
 	}
 
@@ -2439,6 +2462,7 @@ retry:
 
 out_unlock:
 	spin_unlock(&hb->lock);
+out_putkey:
 	put_futex_key(&key);
 	return ret;
 
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index a744098e4..7080ae1eb 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -92,6 +92,12 @@ void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
 }
 EXPORT_SYMBOL(__gcov_merge_time_profile);
 
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
 /**
  * gcov_enable_events - enable event reporting through gcov_event()
  *
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 826ba9fb5..e25e92fb4 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
 #include <linux/vmalloc.h>
 #include "gcov.h"
 
-#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1
+#define GCOV_COUNTERS			10
+#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
 #define GCOV_COUNTERS			9
 #else
 #define GCOV_COUNTERS			8
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eb9a4ea39..ae216824e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -719,15 +719,9 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
 }
 
 void
-__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
-		  const char *name)
+__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
+		     int is_chained, const char *name)
 {
-	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
-	if (!desc)
-		return;
-
 	if (!handle) {
 		handle = handle_bad_irq;
 	} else {
@@ -749,13 +743,13 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 			 * right away.
 			 */
 			if (WARN_ON(is_chained))
-				goto out;
+				return;
 			/* Try the parent */
 			irq_data = irq_data->parent_data;
 		}
 #endif
 		if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
-			goto out;
+			return;
 	}
 
 	/* Uninstall? */
@@ -774,12 +768,41 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		irq_settings_set_nothread(desc);
 		irq_startup(desc, true);
 	}
-out:
+}
+
+void
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+		  const char *name)
+{
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+	if (!desc)
+		return;
+
+	__irq_do_set_handler(desc, handle, is_chained, name);
 	irq_put_desc_busunlock(desc, flags);
 }
 EXPORT_SYMBOL_GPL(__irq_set_handler);
 
 void
+irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+				 void *data)
+{
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+	if (!desc)
+		return;
+
+	__irq_do_set_handler(desc, handle, 1, NULL);	/* chained, no name */
+	desc->irq_data.handler_data = data;
+
+	irq_put_desc_busunlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
+
+void
 irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
 			      irq_flow_handler_t handle, const char *name)
 {
@@ -876,6 +899,34 @@ void irq_cpu_offline(void)
 
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 /**
+ * irq_chip_enable_parent - Enable the parent interrupt (falls back to
+ * irq_unmask() if the parent chip has no irq_enable() callback)
+ * @data:	Pointer to interrupt specific data
+ */
+void irq_chip_enable_parent(struct irq_data *data)
+{
+	data = data->parent_data;
+	if (data->chip->irq_enable)
+		data->chip->irq_enable(data);
+	else
+		data->chip->irq_unmask(data);
+}
+
+/**
+ * irq_chip_disable_parent - Disable the parent interrupt (falls back to
+ * irq_mask() if the parent chip has no irq_disable() callback)
+ * @data:	Pointer to interrupt specific data
+ */
+void irq_chip_disable_parent(struct irq_data *data)
+{
+	data = data->parent_data;
+	if (data->chip->irq_disable)
+		data->chip->irq_disable(data);
+	else
+		data->chip->irq_mask(data);
+}
+
+/**
  * irq_chip_ack_parent - Acknowledge the parent interrupt
  * @data:	Pointer to interrupt specific data
  */
@@ -934,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
 }
 
 /**
+ * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
+ * @data:	Pointer to interrupt specific data
+ * @type:	IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ *
+ * Returns the parent chip's irq_set_type() result, or -ENOSYS if it has none.
+ */
+int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
+{
+	data = data->parent_data;
+
+	if (data->chip->irq_set_type)
+		return data->chip->irq_set_type(data, type);
+
+	return -ENOSYS;
+}
+
+/**
  * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
  * @data:	Pointer to interrupt specific data
  *
@@ -946,6 +1014,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
 		if (data->chip && data->chip->irq_retrigger)
 			return data->chip->irq_retrigger(data);
 
+	return 0;
+}
+
+/**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data:	Pointer to interrupt specific data
+ * @vcpu_info:	The vcpu affinity information
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+	data = data->parent_data;
+	if (data->chip->irq_set_vcpu_affinity)
+		return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
 	return -ENOSYS;
 }
 
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 2feb6feca..326a67f24 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -42,6 +42,7 @@ struct irq_chip no_irq_chip = {
 	.irq_enable	= noop,
 	.irq_disable	= noop,
 	.irq_ack	= ack_bad,
+	.flags		= IRQCHIP_SKIP_SET_WAKE,
 };
 
 /*
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 61024e8ab..15b370daf 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -360,7 +360,7 @@ static struct lock_class_key irq_nested_lock_class;
 int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
 			 irq_hw_number_t hw_irq)
 {
-	struct irq_data *data = irq_get_irq_data(virq);
+	struct irq_data *data = irq_domain_get_irq_data(d, virq);
 	struct irq_domain_chip_generic *dgc = d->gc;
 	struct irq_chip_generic *gc;
 	struct irq_chip_type *ct;
@@ -405,8 +405,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
 	else
 		data->mask = 1 << idx;
 
-	irq_set_chip_and_handler(virq, chip, ct->handler);
-	irq_set_chip_data(virq, gc);
+	irq_domain_set_info(d, virq, hw_irq, chip, gc, ct->handler, NULL, NULL);
 	irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
 	return 0;
 }
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af..61008b843 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,8 +59,6 @@ enum {
 #include "debug.h"
 #include "settings.h"
 
-#define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
-
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
 extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
@@ -78,12 +76,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
 
 #ifdef CONFIG_SPARSE_IRQ
 static inline void irq_mark_irq(unsigned int irq) { }
-extern void irq_lock_sparse(void);
-extern void irq_unlock_sparse(void);
 #else
 extern void irq_mark_irq(unsigned int irq);
-static inline void irq_lock_sparse(void) { }
-static inline void irq_unlock_sparse(void) { }
 #endif
 
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
@@ -170,27 +164,27 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
  */
 static inline void irqd_set_move_pending(struct irq_data *d)
 {
-	d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+	__irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING;
 }
 
 static inline void irqd_clr_move_pending(struct irq_data *d)
 {
-	d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+	__irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING;
 }
 
 static inline void irqd_clear(struct irq_data *d, unsigned int mask)
 {
-	d->state_use_accessors &= ~mask;
+	__irqd_to_state(d) &= ~mask;
 }
 
 static inline void irqd_set(struct irq_data *d, unsigned int mask)
 {
-	d->state_use_accessors |= mask;
+	__irqd_to_state(d) |= mask;
 }
 
 static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
 {
-	return d->state_use_accessors & mask;
+	return __irqd_to_state(d) & mask;
 }
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
@@ -199,6 +193,11 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
 	__this_cpu_inc(kstat.irqs_sum);
 }
 
+static inline int irq_desc_get_node(struct irq_desc *desc)
+{
+	return irq_data_get_node(&desc->irq_data);
+}
+
 #ifdef CONFIG_PM_SLEEP
 bool irq_pm_check_wakeup(struct irq_desc *desc);
 void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 99793b9b6..4afc45761 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -59,16 +59,10 @@ static void desc_smp_init(struct irq_desc *desc, int node)
 #endif
 }
 
-static inline int desc_node(struct irq_desc *desc)
-{
-	return desc->irq_data.node;
-}
-
 #else
 static inline int
 alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
 static inline void desc_smp_init(struct irq_desc *desc, int node) { }
-static inline int desc_node(struct irq_desc *desc) { return 0; }
 #endif
 
 static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
@@ -76,6 +70,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 {
 	int cpu;
 
+	desc->irq_data.common = &desc->irq_common_data;
 	desc->irq_data.irq = irq;
 	desc->irq_data.chip = &no_irq_chip;
 	desc->irq_data.chip_data = NULL;
@@ -299,7 +294,7 @@ static void free_desc(unsigned int irq)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
-	desc_set_defaults(irq, desc, desc_node(desc), NULL);
+	desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 
@@ -619,7 +614,7 @@ unsigned int kstat_irqs(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	int cpu;
-	int sum = 0;
+	unsigned int sum = 0;
 
 	if (!desc || !desc->kstat_irqs)
 		return 0;
@@ -639,7 +634,7 @@ unsigned int kstat_irqs(unsigned int irq)
  */
 unsigned int kstat_irqs_usr(unsigned int irq)
 {
-	int sum;
+	unsigned int sum;
 
 	irq_lock_sparse();
 	sum = kstat_irqs(irq);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 7fac31105..8c3577fef 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -830,10 +830,12 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
 {
 	struct irq_data *irq_data;
 
-	irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+	irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL,
+				irq_data_get_node(child));
 	if (irq_data) {
 		child->parent_data = irq_data;
 		irq_data->irq = child->irq;
+		irq_data->common = child->common;
 		irq_data->node = child->node;
 		irq_data->domain = domain;
 	}
@@ -1232,6 +1234,27 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
 	return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
 }
 
+/**
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain:		Interrupt domain to match (unused here)
+ * @virq:		IRQ number
+ * @hwirq:		The hardware interrupt number (unused here)
+ * @chip:		The associated interrupt chip
+ * @chip_data:		The associated interrupt chip data
+ * @handler:		The interrupt flow handler
+ * @handler_data:	The interrupt flow handler data
+ * @handler_name:	The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+			 irq_hw_number_t hwirq, struct irq_chip *chip,
+			 void *chip_data, irq_flow_handler_t handler,
+			 void *handler_data, const char *handler_name)
+{
+	irq_set_chip_and_handler_name(virq, chip, handler, handler_name);
+	irq_set_chip_data(virq, chip_data);
+	irq_set_handler_data(virq, handler_data);
+}
+
 static void irq_domain_check_hierarchy(struct irq_domain *domain)
 {
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e68932bb3..f9744853b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,37 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 }
 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
 
+/**
+ *	irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ *	@irq: interrupt number to set affinity
+ *	@vcpu_info: vCPU specific data
+ *
+ *	Passes @vcpu_info, supplied from outside (e.g. on the code path
+ *	KVM -> IOMMU -> irq_set_vcpu_affinity()), to the irq chip's
+ *	irq_set_vcpu_affinity() callback and returns its result. Returns
+ *	-EINVAL if the descriptor lookup fails, -ENOSYS without a callback.
+ */
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+	struct irq_data *data;
+	struct irq_chip *chip;
+	int ret = -ENOSYS;
+
+	if (!desc)
+		return -EINVAL;
+
+	data = irq_desc_get_irq_data(desc);
+	chip = irq_data_get_irq_chip(data);
+	if (chip && chip->irq_set_vcpu_affinity)
+		ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+	irq_put_desc_unlock(desc, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
 static void irq_affinity_notify(struct work_struct *work)
 {
 	struct irq_affinity_notify *notify =
@@ -332,7 +363,7 @@ static int
 setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 {
 	struct cpumask *set = irq_default_affinity;
-	int node = desc->irq_data.node;
+	int node = irq_desc_get_node(desc);
 
 	/* Excludes PER_CPU and NO_BALANCE interrupts */
 	if (!irq_can_set_affinity(irq))
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index ca3f4aaff..37ddb7bda 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,21 +7,21 @@
 void irq_move_masked_irq(struct irq_data *idata)
 {
 	struct irq_desc *desc = irq_data_to_desc(idata);
-	struct irq_chip *chip = idata->chip;
+	struct irq_chip *chip = desc->irq_data.chip;
 
 	if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
 		return;
 
+	irqd_clr_move_pending(&desc->irq_data);
+
 	/*
 	 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
 	 */
-	if (!irqd_can_balance(&desc->irq_data)) {
+	if (irqd_is_per_cpu(&desc->irq_data)) {
 		WARN_ON(1);
 		return;
 	}
 
-	irqd_clr_move_pending(&desc->irq_data);
-
 	if (unlikely(cpumask_empty(desc->pending_mask)))
 		return;
 
@@ -52,6 +52,13 @@ void irq_move_irq(struct irq_data *idata)
 {
 	bool masked;
 
+	/*
+	 * Get top level irq_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled,
+	 * and it should be optimized away when CONFIG_IRQ_DOMAIN_HIERARCHY is
+	 * disabled. So we avoid an "#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY" here.
+	 */
+	idata = irq_desc_get_irq_data(irq_data_to_desc(idata));
+
 	if (likely(!irqd_is_setaffinity_pending(idata)))
 		return;
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 474de5cb3..7bf1f1bbb 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -124,7 +124,7 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
 	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
-static struct irq_domain_ops msi_domain_ops = {
+static const struct irq_domain_ops msi_domain_ops = {
 	.alloc		= msi_domain_alloc,
 	.free		= msi_domain_free,
 	.activate	= msi_domain_activate,
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 5204a6d1b..d22786a6d 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -123,6 +123,8 @@ void suspend_device_irqs(void)
 		unsigned long flags;
 		bool sync;
 
+		if (irq_settings_is_nested_thread(desc))
+			continue;
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		sync = suspend_device_irq(desc, irq);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -163,6 +165,8 @@ static void resume_irqs(bool want_early)
 
 		if (!is_early && want_early)
 			continue;
+		if (irq_settings_is_nested_thread(desc))
+			continue;
 
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		resume_irq(desc, irq);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index df2f4642d..0e97c142c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -241,7 +241,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long) m->private);
 
-	seq_printf(m, "%d\n", desc->irq_data.node);
+	seq_printf(m, "%d\n", irq_desc_get_node(desc));
 	return 0;
 }
 
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 9019f15de..52ebaca1b 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -302,7 +302,7 @@ static int jump_label_add_module(struct module *mod)
 			continue;
 
 		key = iterk;
-		if (__module_address(iter->key) == mod) {
+		if (within_module(iter->key, mod)) {
 			/*
 			 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
 			 */
@@ -339,7 +339,7 @@ static void jump_label_del_module(struct module *mod)
 
 		key = (struct static_key *)(unsigned long)iter->key;
 
-		if (__module_address(iter->key) == mod)
+		if (within_module(iter->key, mod))
 			continue;
 
 		prev = &key->next;
@@ -443,14 +443,16 @@ static void jump_label_update(struct static_key *key, int enable)
 {
 	struct jump_entry *stop = __stop___jump_table;
 	struct jump_entry *entry = jump_label_get_entries(key);
-
 #ifdef CONFIG_MODULES
-	struct module *mod = __module_address((unsigned long)key);
+	struct module *mod;
 
 	__jump_label_mod_update(key, enable);
 
+	preempt_disable();
+	mod = __module_address((unsigned long)key);
 	if (mod)
 		stop = mod->jump_entries + mod->num_jump_entries;
+	preempt_enable();
 #endif
 	/* if there are no users, entry can be NULL */
 	if (entry)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7a36fdcca..a785c1015 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -84,6 +84,17 @@ struct resource crashk_low_res = {
 
 int kexec_should_crash(struct task_struct *p)
 {
+	/*
+	 * If crash_kexec_post_notifiers is enabled, don't run
+	 * crash_kexec() here yet, which must be run after panic
+	 * notifiers in panic().
+	 */
+	if (crash_kexec_post_notifiers)
+		return 0;
+	/*
+	 * There are 4 panic() calls in the do_exit() path, one for each
+	 * of these 4 conditions.
+	 */
 	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
 		return 1;
 	return 0;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c4237f12c..fdea0bee7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -97,6 +97,7 @@ bool kthread_should_park(void)
 {
 	return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
 }
+EXPORT_SYMBOL_GPL(kthread_should_park);
 
 /**
  * kthread_freezable_should_stop - should this freezable kthread return now?
@@ -171,6 +172,7 @@ void kthread_parkme(void)
 {
 	__kthread_parkme(to_kthread(current));
 }
+EXPORT_SYMBOL_GPL(kthread_parkme);
 
 static int kthread(void *_create)
 {
@@ -272,7 +274,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct task_struct *task;
 	struct kthread_create_info *create = kmalloc(sizeof(*create),
-						     GFP_KERNEL | ___GFP_TOI_NOTRACK);
+						     GFP_KERNEL);
 
 	if (!create)
 		return ERR_PTR(-ENOMEM);
@@ -411,6 +413,7 @@ void kthread_unpark(struct task_struct *k)
 	if (kthread)
 		__kthread_unpark(k, kthread);
 }
+EXPORT_SYMBOL_GPL(kthread_unpark);
 
 /**
  * kthread_park - park a thread created by kthread_create().
@@ -441,6 +444,7 @@ int kthread_park(struct task_struct *k)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(kthread_park);
 
 /**
  * kthread_stop - stop a thread created by kthread_create().
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 9ec555732..c40ebcca0 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -128,7 +128,7 @@ static bool klp_is_patch_registered(struct klp_patch *patch)
 
 static bool klp_initialized(void)
 {
-	return klp_root_kobj;
+	return !!klp_root_kobj;
 }
 
 struct klp_find_arg {
@@ -242,8 +242,9 @@ static int klp_find_verify_func_addr(struct klp_object *obj,
 	int ret;
 
 #if defined(CONFIG_RANDOMIZE_BASE)
-	/* KASLR is enabled, disregard old_addr from user */
-	func->old_addr = 0;
+	/* If KASLR has been enabled, adjust old_addr accordingly */
+	if (kaslr_enabled() && func->old_addr)
+		func->old_addr += kaslr_offset();
 #endif
 
 	if (!func->old_addr || klp_is_module(obj))
@@ -430,7 +431,7 @@ static void klp_disable_object(struct klp_object *obj)
 {
 	struct klp_func *func;
 
-	for (func = obj->funcs; func->old_name; func++)
+	klp_for_each_func(obj, func)
 		if (func->state == KLP_ENABLED)
 			klp_disable_func(func);
 
@@ -448,7 +449,7 @@ static int klp_enable_object(struct klp_object *obj)
 	if (WARN_ON(!klp_is_object_loaded(obj)))
 		return -EINVAL;
 
-	for (func = obj->funcs; func->old_name; func++) {
+	klp_for_each_func(obj, func) {
 		ret = klp_enable_func(func);
 		if (ret) {
 			klp_disable_object(obj);
@@ -471,7 +472,7 @@ static int __klp_disable_patch(struct klp_patch *patch)
 
 	pr_notice("disabling patch '%s'\n", patch->mod->name);
 
-	for (obj = patch->objs; obj->funcs; obj++) {
+	klp_for_each_object(patch, obj) {
 		if (obj->state == KLP_ENABLED)
 			klp_disable_object(obj);
 	}
@@ -531,7 +532,7 @@ static int __klp_enable_patch(struct klp_patch *patch)
 
 	pr_notice("enabling patch '%s'\n", patch->mod->name);
 
-	for (obj = patch->objs; obj->funcs; obj++) {
+	klp_for_each_object(patch, obj) {
 		if (!klp_is_object_loaded(obj))
 			continue;
 
@@ -659,6 +660,15 @@ static struct kobj_type klp_ktype_patch = {
 	.default_attrs = klp_patch_attrs,
 };
 
+static void klp_kobj_release_object(struct kobject *kobj)
+{
+}
+
+static struct kobj_type klp_ktype_object = {
+	.release = klp_kobj_release_object,
+	.sysfs_ops = &kobj_sysfs_ops,
+};
+
 static void klp_kobj_release_func(struct kobject *kobj)
 {
 }
@@ -688,7 +698,7 @@ static void klp_free_object_loaded(struct klp_object *obj)
 
 	obj->mod = NULL;
 
-	for (func = obj->funcs; func->old_name; func++)
+	klp_for_each_func(obj, func)
 		func->old_addr = 0;
 }
 
@@ -703,7 +713,7 @@ static void klp_free_objects_limited(struct klp_patch *patch,
 
 	for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
 		klp_free_funcs_limited(obj, NULL);
-		kobject_put(obj->kobj);
+		kobject_put(&obj->kobj);
 	}
 }
 
@@ -721,7 +731,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
 	func->state = KLP_DISABLED;
 
 	return kobject_init_and_add(&func->kobj, &klp_ktype_func,
-				    obj->kobj, "%s", func->old_name);
+				    &obj->kobj, "%s", func->old_name);
 }
 
 /* parts of the initialization that is done only when the object is loaded */
@@ -737,7 +747,7 @@ static int klp_init_object_loaded(struct klp_patch *patch,
 			return ret;
 	}
 
-	for (func = obj->funcs; func->old_name; func++) {
+	klp_for_each_func(obj, func) {
 		ret = klp_find_verify_func_addr(obj, func);
 		if (ret)
 			return ret;
@@ -761,11 +771,12 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
 	klp_find_object_module(obj);
 
 	name = klp_is_module(obj) ? obj->name : "vmlinux";
-	obj->kobj = kobject_create_and_add(name, &patch->kobj);
-	if (!obj->kobj)
-		return -ENOMEM;
+	ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object,
+				   &patch->kobj, "%s", name);
+	if (ret)
+		return ret;
 
-	for (func = obj->funcs; func->old_name; func++) {
+	klp_for_each_func(obj, func) {
 		ret = klp_init_func(obj, func);
 		if (ret)
 			goto free;
@@ -781,7 +792,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
 
 free:
 	klp_free_funcs_limited(obj, func);
-	kobject_put(obj->kobj);
+	kobject_put(&obj->kobj);
 	return ret;
 }
 
@@ -802,7 +813,7 @@ static int klp_init_patch(struct klp_patch *patch)
 	if (ret)
 		goto unlock;
 
-	for (obj = patch->objs; obj->funcs; obj++) {
+	klp_for_each_object(patch, obj) {
 		ret = klp_init_object(patch, obj);
 		if (ret)
 			goto free;
@@ -891,7 +902,7 @@ int klp_register_patch(struct klp_patch *patch)
 }
 EXPORT_SYMBOL_GPL(klp_register_patch);
 
-static void klp_module_notify_coming(struct klp_patch *patch,
+static int klp_module_notify_coming(struct klp_patch *patch,
 				     struct klp_object *obj)
 {
 	struct module *pmod = patch->mod;
@@ -899,22 +910,23 @@ static void klp_module_notify_coming(struct klp_patch *patch,
 	int ret;
 
 	ret = klp_init_object_loaded(patch, obj);
-	if (ret)
-		goto err;
+	if (ret) {
+		pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n",
+			pmod->name, mod->name, ret);
+		return ret;
+	}
 
 	if (patch->state == KLP_DISABLED)
-		return;
+		return 0;
 
 	pr_notice("applying patch '%s' to loading module '%s'\n",
 		  pmod->name, mod->name);
 
 	ret = klp_enable_object(obj);
-	if (!ret)
-		return;
-
-err:
-	pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
-		pmod->name, mod->name, ret);
+	if (ret)
+		pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+			pmod->name, mod->name, ret);
+	return ret;
 }
 
 static void klp_module_notify_going(struct klp_patch *patch,
@@ -938,6 +950,7 @@ disabled:
 static int klp_module_notify(struct notifier_block *nb, unsigned long action,
 			     void *data)
 {
+	int ret;
 	struct module *mod = data;
 	struct klp_patch *patch;
 	struct klp_object *obj;
@@ -957,13 +970,18 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action,
 		mod->klp_alive = false;
 
 	list_for_each_entry(patch, &klp_patches, list) {
-		for (obj = patch->objs; obj->funcs; obj++) {
+		klp_for_each_object(patch, obj) {
 			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
 				continue;
 
 			if (action == MODULE_STATE_COMING) {
 				obj->mod = mod;
-				klp_module_notify_coming(patch, obj);
+				ret = klp_module_notify_coming(patch, obj);
+				if (ret) {
+					obj->mod = NULL;
+					pr_warn("patch '%s' is in an inconsistent state!\n",
+						patch->mod->name);
+				}
 			} else /* MODULE_STATE_GOING */
 				klp_module_notify_going(patch, obj);
 
@@ -981,7 +999,7 @@ static struct notifier_block klp_module_nb = {
 	.priority = INT_MIN+1, /* called late but before ftrace notifier */
 };
 
-static int klp_init(void)
+static int __init klp_init(void)
 {
 	int ret;
 
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416cc..7dd5c9918 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
 obj-$(CONFIG_SMP) += lglock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
@@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
 obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf..951cfcd10 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
 }
 EXPORT_SYMBOL(lg_local_unlock_cpu);
 
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+	BUG_ON(cpu1 == cpu2);
+
+	/* lock in cpu order, just like lg_global_lock */
+	if (cpu2 < cpu1)
+		swap(cpu1, cpu2);
+
+	preempt_disable();
+	lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+	arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+	arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+	arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+	preempt_enable();
+}
+
 void lg_global_lock(struct lglock *lg)
 {
 	int i;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index aaeae885d..8acfbf773 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = lockstat_clock();
 #endif
+	hlock->pin_count = 0;
 
 	if (check && !mark_irqflags(curr, hlock))
 		return 0;
@@ -3260,26 +3261,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
 	return 0;
 }
 
-/*
- * Common debugging checks for both nested and non-nested unlock:
- */
-static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
-			unsigned long ip)
-{
-	if (unlikely(!debug_locks))
-		return 0;
-	/*
-	 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
-	 */
-	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
-		return 0;
-
-	if (curr->lockdep_depth <= 0)
-		return print_unlock_imbalance_bug(curr, lock, ip);
-
-	return 1;
-}
-
 static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 {
 	if (hlock->instance == lock)
@@ -3376,31 +3357,35 @@ found_it:
 }
 
 /*
- * Remove the lock to the list of currently held locks in a
- * potentially non-nested (out of order) manner. This is a
- * relatively rare operation, as all the unlock APIs default
- * to nested mode (which uses lock_release()):
+ * Remove the lock from the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
+ *
+ * @nested is an hysterical artifact, needs a tree wide cleanup.
  */
 static int
-lock_release_non_nested(struct task_struct *curr,
-			struct lockdep_map *lock, unsigned long ip)
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 {
+	struct task_struct *curr = current;
 	struct held_lock *hlock, *prev_hlock;
 	unsigned int depth;
 	int i;
 
-	/*
-	 * Check whether the lock exists in the current stack
-	 * of held locks:
-	 */
+	if (unlikely(!debug_locks))
+		return 0;
+
 	depth = curr->lockdep_depth;
 	/*
 	 * So we're all set to release this lock.. wait what lock? We don't
 	 * own any locks, you've been drinking again?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(!depth))
-		return 0;
+	if (DEBUG_LOCKS_WARN_ON(depth <= 0))
+		 return print_unlock_imbalance_bug(curr, lock, ip);
 
+	/*
+	 * Check whether the lock exists in the current stack
+	 * of held locks:
+	 */
 	prev_hlock = NULL;
 	for (i = depth-1; i >= 0; i--) {
 		hlock = curr->held_locks + i;
@@ -3419,6 +3404,8 @@ found_it:
 	if (hlock->instance == lock)
 		lock_release_holdtime(hlock);
 
+	WARN(hlock->pin_count, "releasing a pinned lock\n");
+
 	if (hlock->references) {
 		hlock->references--;
 		if (hlock->references) {
@@ -3456,91 +3443,66 @@ found_it:
 	 */
 	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
 		return 0;
+
 	return 1;
 }
 
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static int lock_release_nested(struct task_struct *curr,
-			       struct lockdep_map *lock, unsigned long ip)
+static int __lock_is_held(struct lockdep_map *lock)
 {
-	struct held_lock *hlock;
-	unsigned int depth;
-
-	/*
-	 * Pop off the top of the lock stack:
-	 */
-	depth = curr->lockdep_depth - 1;
-	hlock = curr->held_locks + depth;
-
-	/*
-	 * Is the unlock non-nested:
-	 */
-	if (hlock->instance != lock || hlock->references)
-		return lock_release_non_nested(curr, lock, ip);
-	curr->lockdep_depth--;
-
-	/*
-	 * No more locks, but somehow we've got hash left over, who left it?
-	 */
-	if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
-		return 0;
+	struct task_struct *curr = current;
+	int i;
 
-	curr->curr_chain_key = hlock->prev_chain_key;
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
 
-	lock_release_holdtime(hlock);
+		if (match_held_lock(hlock, lock))
+			return 1;
+	}
 
-#ifdef CONFIG_DEBUG_LOCKDEP
-	hlock->prev_chain_key = 0;
-	hlock->class_idx = 0;
-	hlock->acquire_ip = 0;
-	hlock->irq_context = 0;
-#endif
-	return 1;
+	return 0;
 }
 
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static void
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+static void __lock_pin_lock(struct lockdep_map *lock)
 {
 	struct task_struct *curr = current;
+	int i;
 
-	if (!check_unlock(curr, lock, ip))
+	if (unlikely(!debug_locks))
 		return;
 
-	if (nested) {
-		if (!lock_release_nested(curr, lock, ip))
-			return;
-	} else {
-		if (!lock_release_non_nested(curr, lock, ip))
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			hlock->pin_count++;
 			return;
+		}
 	}
 
-	check_chain_key(curr);
+	WARN(1, "pinning an unheld lock\n");
 }
 
-static int __lock_is_held(struct lockdep_map *lock)
+static void __lock_unpin_lock(struct lockdep_map *lock)
 {
 	struct task_struct *curr = current;
 	int i;
 
+	if (unlikely(!debug_locks))
+		return;
+
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock))
-			return 1;
+		if (match_held_lock(hlock, lock)) {
+			if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+				return;
+
+			hlock->pin_count--;
+			return;
+		}
 	}
 
-	return 0;
+	WARN(1, "unpinning an unheld lock\n");
 }
 
 /*
@@ -3639,7 +3601,8 @@ void lock_release(struct lockdep_map *lock, int nested,
 	check_flags(flags);
 	current->lockdep_recursion = 1;
 	trace_lock_release(lock, ip);
-	__lock_release(lock, nested, ip);
+	if (__lock_release(lock, nested, ip))
+		check_chain_key(current);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
@@ -3665,6 +3628,40 @@ int lock_is_held(struct lockdep_map *lock)
 }
 EXPORT_SYMBOL_GPL(lock_is_held);
 
+void lock_pin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_pin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_unpin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
 void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
 {
 	current->lockdep_reclaim_gfp = gfp_mask;
@@ -4067,8 +4064,7 @@ void __init lockdep_info(void)
 
 #ifdef CONFIG_DEBUG_LOCKDEP
 	if (lockdep_init_error) {
-		printk("WARNING: lockdep init error! lock-%s was acquired"
-			"before lockdep_init\n", lock_init_error);
+		printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
 		printk("Call stack leading to lockdep invocation was:\n");
 		print_stack_trace(&lockdep_init_trace, 0);
 	}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ec8cce259..32244186f 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -122,12 +122,12 @@ static int torture_lock_busted_write_lock(void)
 
 static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_us = 100;
+	const unsigned long longdelay_ms = 100;
 
 	/* We want a long delay occasionally to force massive contention.  */
 	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_us)))
-		mdelay(longdelay_us);
+	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+		mdelay(longdelay_ms);
 #ifdef CONFIG_PREEMPT
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
 		preempt_schedule();  /* Allow test to be preempted. */
@@ -160,14 +160,14 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
 static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 2;
-	const unsigned long longdelay_us = 100;
+	const unsigned long longdelay_ms = 100;
 
 	/* We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
 	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_us)))
-		mdelay(longdelay_us);
+	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+		mdelay(longdelay_ms);
 	if (!(torture_random(trsp) %
 	      (cxt.nrealwriters_stress * 2 * shortdelay_us)))
 		udelay(shortdelay_us);
@@ -309,7 +309,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
 static void torture_rwlock_read_unlock_irq(void)
 __releases(torture_rwlock)
 {
-	write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+	read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
 }
 
 static struct lock_torture_ops rw_lock_irq_ops = {
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 75e114bdf..fd91aaa45 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,6 +17,7 @@
 struct mcs_spinlock {
 	struct mcs_spinlock *next;
 	int locked; /* 1 if lock acquired */
+	int count;  /* nesting count, see qspinlock.c */
 };
 
 #ifndef arch_mcs_spin_lock_contended
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f956ede7f..6c5da4839 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,5 +1,5 @@
 /*
- * Queue read/write lock
+ * Queued read/write locks
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,6 +22,26 @@
 #include <linux/hardirq.h>
 #include <asm/qrwlock.h>
 
+/*
+ * This internal data structure is used for optimizing access to some of
+ * the subfields within the atomic_t cnts.
+ */
+struct __qrwlock {
+	union {
+		atomic_t cnts;
+		struct {
+#ifdef __LITTLE_ENDIAN
+			u8 wmode;	/* Writer mode   */
+			u8 rcnts[3];	/* Reader counts */
+#else
+			u8 rcnts[3];	/* Reader counts */
+			u8 wmode;	/* Writer mode   */
+#endif
+		};
+	};
+	arch_spinlock_t	lock;
+};
+
 /**
  * rspin_until_writer_unlock - inc reader count & spin until writer is gone
  * @lock  : Pointer to queue rwlock structure
@@ -107,10 +127,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
 	 * or wait for a previous writer to go away.
 	 */
 	for (;;) {
-		cnts = atomic_read(&lock->cnts);
-		if (!(cnts & _QW_WMASK) &&
-		    (atomic_cmpxchg(&lock->cnts, cnts,
-				    cnts | _QW_WAITING) == cnts))
+		struct __qrwlock *l = (struct __qrwlock *)lock;
+
+		if (!READ_ONCE(l->wmode) &&
+		   (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
 			break;
 
 		cpu_relax_lowlatency();
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000..38c49202d
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ *          Peter Zijlstra <peterz@infradead.org>
+ */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * A spinlock disables recursion of its own context, and there is a limit
+ * to the contexts that can nest, namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ *      atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES	8
+#else
+#define MAX_NODES	4
+#endif
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+	u32 tail;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+	BUG_ON(idx > 3);
+#endif
+	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+	return tail;
+}
+
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+	return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
+ */
+struct __qspinlock {
+	union {
+		atomic_t val;
+#ifdef __LITTLE_ENDIAN
+		struct {
+			u8	locked;
+			u8	pending;
+		};
+		struct {
+			u16	locked_pending;
+			u16	tail;
+		};
+#else
+		struct {
+			u16	tail;
+			u16	locked_pending;
+		};
+		struct {
+			u8	reserved[2];
+			u8	pending;
+			u8	locked;
+		};
+#endif
+	};
+};
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	u32 old, new, val = atomic_read(&lock->val);
+
+	for (;;) {
+		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+		old = atomic_cmpxchg(&lock->val, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+	return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+
+
+/*
+ * Generate the native code for queued_spin_lock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
+
+#define pv_enabled()		false
+
+#define pv_init_node		__pv_init_node
+#define pv_wait_node		__pv_wait_node
+#define pv_kick_node		__pv_kick_node
+#define pv_wait_head		__pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ *              fast     :    slow                                  :    unlock
+ *                       :                                          :
+ * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ *                       :       | ^--------.------.             /  :
+ *                       :       v           \      \            |  :
+ * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
+ *                       :       | ^--'              |           |  :
+ *                       :       v                   |           |  :
+ * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
+ *   queue               :       | ^--'                          |  :
+ *                       :       v                               |  :
+ * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
+ *   queue               :         ^--'                             :
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	struct mcs_spinlock *prev, *next, *node;
+	u32 new, old, tail;
+	int idx;
+
+	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+	if (pv_enabled())
+		goto queue;
+
+	if (virt_queued_spin_lock(lock))
+		return;
+
+	/*
+	 * wait for in-progress pending->locked hand-overs
+	 *
+	 * 0,1,0 -> 0,0,1
+	 */
+	if (val == _Q_PENDING_VAL) {
+		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+			cpu_relax();
+	}
+
+	/*
+	 * trylock || pending
+	 *
+	 * 0,0,0 -> 0,0,1 ; trylock
+	 * 0,0,1 -> 0,1,1 ; pending
+	 */
+	for (;;) {
+		/*
+		 * If we observe any contention; queue.
+		 */
+		if (val & ~_Q_LOCKED_MASK)
+			goto queue;
+
+		new = _Q_LOCKED_VAL;
+		if (val == new)
+			new |= _Q_PENDING_VAL;
+
+		old = atomic_cmpxchg(&lock->val, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	/*
+	 * we won the trylock
+	 */
+	if (new == _Q_LOCKED_VAL)
+		return;
+
+	/*
+	 * we're pending, wait for the owner to go away.
+	 *
+	 * *,1,1 -> *,1,0
+	 *
+	 * this wait loop must be a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because not all clear_pending_set_locked()
+	 * implementations imply full barriers.
+	 */
+	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+		cpu_relax();
+
+	/*
+	 * take ownership and clear the pending bit.
+	 *
+	 * *,1,0 -> *,0,1
+	 */
+	clear_pending_set_locked(lock);
+	return;
+
+	/*
+	 * End of pending bit optimistic spinning and beginning of MCS
+	 * queuing.
+	 */
+queue:
+	node = this_cpu_ptr(&mcs_nodes[0]);
+	idx = node->count++;
+	tail = encode_tail(smp_processor_id(), idx);
+
+	node += idx;
+	node->locked = 0;
+	node->next = NULL;
+	pv_init_node(node);
+
+	/*
+	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
+	 * attempt the trylock once more in the hope someone let go while we
+	 * weren't watching.
+	 */
+	if (queued_spin_trylock(lock))
+		goto release;
+
+	/*
+	 * We have already touched the queueing cacheline; don't bother with
+	 * pending stuff.
+	 *
+	 * p,*,* -> n,*,*
+	 */
+	old = xchg_tail(lock, tail);
+
+	/*
+	 * if there was a previous node; link it and wait until reaching the
+	 * head of the waitqueue.
+	 */
+	if (old & _Q_TAIL_MASK) {
+		prev = decode_tail(old);
+		WRITE_ONCE(prev->next, node);
+
+		pv_wait_node(node);
+		arch_mcs_spin_lock_contended(&node->locked);
+	}
+
+	/*
+	 * we're at the head of the waitqueue, wait for the owner & pending to
+	 * go away.
+	 *
+	 * *,x,y -> *,0,0
+	 *
+	 * this wait loop must use a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because the set_locked() function below
+	 * does not imply a full barrier.
+	 *
+	 */
+	pv_wait_head(lock, node);
+	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+		cpu_relax();
+
+	/*
+	 * claim the lock:
+	 *
+	 * n,0,0 -> 0,0,1 : lock, uncontended
+	 * *,0,0 -> *,0,1 : lock, contended
+	 *
+	 * If the queue head is the only one in the queue (lock value == tail),
+	 * clear the tail code and grab the lock. Otherwise, we only need
+	 * to grab the lock.
+	 */
+	for (;;) {
+		if (val != tail) {
+			set_locked(lock);
+			break;
+		}
+		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+		if (old == val)
+			goto release;	/* No contention */
+
+		val = old;
+	}
+
+	/*
+	 * contended path; wait for next, release.
+	 */
+	while (!(next = READ_ONCE(node->next)))
+		cpu_relax();
+
+	arch_mcs_spin_unlock_contended(&next->locked);
+	pv_kick_node(next);
+
+release:
+	/*
+	 * release the node
+	 */
+	this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_lock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef  pv_enabled
+#define pv_enabled()	true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+
+#undef  queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000..df19ae4de
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,334 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+#include <linux/debug_locks.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ *   pv_kick(cpu)             -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
+
+enum vcpu_state {
+	vcpu_running = 0,
+	vcpu_halted,
+};
+
+struct pv_node {
+	struct mcs_spinlock	mcs;
+	struct mcs_spinlock	__res[3];
+
+	int			cpu;
+	u8			state;
+};
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+	struct qspinlock *lock;
+	struct pv_node   *node;
+};
+
+#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+	if (pv_hash_size < PV_HE_MIN)
+		pv_hash_size = PV_HE_MIN;
+
+	/*
+	 * Allocate space from bootmem which should be page-size aligned
+	 * and hence cacheline aligned.
+	 */
+	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+					       sizeof(struct pv_hash_entry),
+					       pv_hash_size, 0, HASH_EARLY,
+					       &pv_lock_hash_bits, NULL,
+					       pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash)						\
+	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
+	     offset < (1 << pv_lock_hash_bits);						\
+	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+	struct pv_hash_entry *he;
+
+	for_each_hash_entry(he, offset, hash) {
+		if (!cmpxchg(&he->lock, NULL, lock)) {
+			WRITE_ONCE(he->node, node);
+			return &he->lock;
+		}
+	}
+	/*
+	 * Hard assume there is a free entry for us.
+	 *
+	 * This is guaranteed by ensuring every blocked lock only ever consumes
+	 * a single entry, and since we only have 4 nesting levels per CPU and
+	 * allocated at least 4 * num_possible_cpus() entries, this must be so.
+	 *
+	 * The single entry is guaranteed by having the lock owner unhash
+	 * before it releases.
+	 */
+	BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+	struct pv_hash_entry *he;
+	struct pv_node *node;
+
+	for_each_hash_entry(he, offset, hash) {
+		if (READ_ONCE(he->lock) == lock) {
+			node = READ_ONCE(he->node);
+			WRITE_ONCE(he->lock, NULL);
+			return node;
+		}
+	}
+	/*
+	 * Hard assume we'll find an entry.
+	 *
+	 * This guarantees a limited lookup time and is itself guaranteed by
+	 * having the lock owner do the unhash -- IFF the unlock sees the
+	 * SLOW flag, there MUST be a hash entry.
+	 */
+	BUG();
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+
+	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+	pn->cpu = smp_processor_id();
+	pn->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to wake the vcpu again.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+	int loop;
+
+	for (;;) {
+		for (loop = SPIN_THRESHOLD; loop; loop--) {
+			if (READ_ONCE(node->locked))
+				return;
+			cpu_relax();
+		}
+
+		/*
+		 * Order pn->state vs node->locked thusly:
+		 *
+		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
+		 *     MB			      MB
+		 * [L] node->locked		[RmW] pn->state = vcpu_running
+		 *
+		 * Matches the xchg() from pv_kick_node().
+		 */
+		smp_store_mb(pn->state, vcpu_halted);
+
+		if (!READ_ONCE(node->locked))
+			pv_wait(&pn->state, vcpu_halted);
+
+		/*
+		 * Reset the vCPU state to avoid unnecessary CPU kicking
+		 */
+		WRITE_ONCE(pn->state, vcpu_running);
+
+		/*
+		 * If the locked flag is still not set after wakeup, it is a
+		 * spurious wakeup and the vCPU should wait again. However,
+		 * there is a pretty high overhead for CPU halting and kicking.
+		 * So it is better to spin for a while in the hope that the
+		 * MCS lock will be released soon.
+		 */
+	}
+	/*
+	 * By now our node->locked should be 1 and our caller will not actually
+	 * spin-wait for it. We do however rely on our caller to do a
+	 * load-acquire for us.
+	 */
+}
+
+/*
+ * Called after setting next->locked = 1, used to wake those stuck in
+ * pv_wait_node().
+ */
+static void pv_kick_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+
+	/*
+	 * Note that because node->locked is already set, this actual
+	 * mcs_spinlock entry could be re-used already.
+	 *
+	 * This should be fine however, kicking people for no reason is
+	 * harmless.
+	 *
+	 * See the comment in pv_wait_node().
+	 */
+	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+		pv_kick(pn->cpu);
+}
+
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
+	struct qspinlock **lp = NULL;
+	int loop;
+
+	for (;;) {
+		for (loop = SPIN_THRESHOLD; loop; loop--) {
+			if (!READ_ONCE(l->locked))
+				return;
+			cpu_relax();
+		}
+
+		WRITE_ONCE(pn->state, vcpu_halted);
+		if (!lp) { /* ONCE */
+			lp = pv_hash(lock, pn);
+			/*
+			 * lp must be set before setting _Q_SLOW_VAL
+			 *
+			 * [S] lp = lock                [RmW] l = l->locked = 0
+			 *     MB                             MB
+			 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+			 *
+			 * Matches the cmpxchg() in __pv_queued_spin_unlock().
+			 */
+			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+				/*
+				 * The lock is free and _Q_SLOW_VAL has never
+				 * been set. Therefore we need to unhash before
+				 * getting the lock.
+				 */
+				WRITE_ONCE(*lp, NULL);
+				return;
+			}
+		}
+		pv_wait(&l->locked, _Q_SLOW_VAL);
+
+		/*
+		 * The unlocker should have freed the lock before kicking the
+		 * CPU. So if the lock is still not free, it is a spurious
+		 * wakeup and so the vCPU should wait again after spinning for
+		 * a while.
+		 */
+	}
+
+	/*
+	 * Lock is unlocked now; the caller will acquire it without waiting.
+	 * As with pv_wait_node() we rely on the caller to do a load-acquire
+	 * for us.
+	 */
+}
+
+/*
+ * PV version of the unlock function to be used instead of
+ * queued_spin_unlock().
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+	struct pv_node *node;
+	u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+
+	/*
+	 * We must not unlock if SLOW, because in that case we must first
+	 * unhash. Otherwise it would be possible to have multiple @lock
+	 * entries, which would be BAD.
+	 */
+	if (likely(lockval == _Q_LOCKED_VAL))
+		return;
+
+	if (unlikely(lockval != _Q_SLOW_VAL)) {
+		if (debug_locks_silent)
+			return;
+		WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val));
+		return;
+	}
+
+	/*
+	 * Since the above failed to release, this must be the SLOW path.
+	 * Therefore start by looking up the blocked node and unhashing it.
+	 */
+	node = pv_unhash(lock);
+
+	/*
+	 * Now that we have a reference to the (likely) blocked pv_node,
+	 * release the lock.
+	 */
+	smp_store_release(&l->locked, 0);
+
+	/*
+	 * At this point the memory pointed at by lock can be freed/reused,
+	 * however we can still use the pv_node to kick the CPU.
+	 */
+	if (READ_ONCE(node->state) == vcpu_halted)
+		pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock(), directly after its definition, to make
+ * sure that the callee-save thunk and the real unlock function are
+ * close to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
+
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b025295f4..5674b0734 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 }
 
 /*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
+ * We can speed up the acquire/release, if there's no debugging state to be
+ * set up.
  */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+#ifndef CONFIG_DEBUG_RT_MUTEXES
 # define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
@@ -300,7 +300,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
  * of task. We do not use the spin_xx_mutex() variants here as we are
  * outside of the debug path.)
  */
-static void rt_mutex_adjust_prio(struct task_struct *task)
+void rt_mutex_adjust_prio(struct task_struct *task)
 {
 	unsigned long flags;
 
@@ -624,7 +624,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	 */
 	prerequeue_top_waiter = rt_mutex_top_waiter(lock);
 
-	/* [7] Requeue the waiter in the lock waiter list. */
+	/* [7] Requeue the waiter in the lock waiter tree. */
 	rt_mutex_dequeue(lock, waiter);
 	waiter->prio = task->prio;
 	rt_mutex_enqueue(lock, waiter);
@@ -662,7 +662,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		/*
 		 * The waiter became the new top (highest priority)
 		 * waiter on the lock. Replace the previous top waiter
-		 * in the owner tasks pi waiters list with this waiter
+		 * in the owner tasks pi waiters tree with this waiter
 		 * and adjust the priority of the owner.
 		 */
 		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
@@ -673,7 +673,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		/*
 		 * The waiter was the top waiter on the lock, but is
 		 * no longer the top prority waiter. Replace waiter in
-		 * the owner tasks pi waiters list with the new top
+		 * the owner tasks pi waiters tree with the new top
 		 * (highest priority) waiter and adjust the priority
 		 * of the owner.
 		 * The new top waiter is stored in @waiter so that
@@ -747,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
  *
  * @lock:   The lock to be acquired.
  * @task:   The task which wants to acquire the lock
- * @waiter: The waiter that is queued to the lock's wait list if the
+ * @waiter: The waiter that is queued to the lock's wait tree if the
  *	    callsite called task_blocked_on_lock(), otherwise NULL
  */
 static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
@@ -782,7 +782,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 
 	/*
 	 * If @waiter != NULL, @task has already enqueued the waiter
-	 * into @lock waiter list. If @waiter == NULL then this is a
+	 * into @lock waiter tree. If @waiter == NULL then this is a
 	 * trylock attempt.
 	 */
 	if (waiter) {
@@ -795,7 +795,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 
 		/*
 		 * We can acquire the lock. Remove the waiter from the
-		 * lock waiters list.
+		 * lock waiters tree.
 		 */
 		rt_mutex_dequeue(lock, waiter);
 
@@ -827,7 +827,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 			 * No waiters. Take the lock without the
 			 * pi_lock dance.@task->pi_blocked_on is NULL
 			 * and we have no waiters to enqueue in @task
-			 * pi waiters list.
+			 * pi waiters tree.
 			 */
 			goto takeit;
 		}
@@ -844,7 +844,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 	/*
 	 * Finish the lock acquisition. @task is the new owner. If
 	 * other waiters exist we have to insert the highest priority
-	 * waiter into @task->pi_waiters list.
+	 * waiter into @task->pi_waiters tree.
 	 */
 	if (rt_mutex_has_waiters(lock))
 		rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
@@ -955,14 +955,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 }
 
 /*
- * Wake up the next waiter on the lock.
- *
- * Remove the top waiter from the current tasks pi waiter list and
- * wake it up.
+ * Remove the top waiter from the current tasks pi waiter tree and
+ * queue it up.
  *
  * Called with lock->wait_lock held.
  */
-static void wakeup_next_waiter(struct rt_mutex *lock)
+static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
+				    struct rt_mutex *lock)
 {
 	struct rt_mutex_waiter *waiter;
 	unsigned long flags;
@@ -991,12 +990,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
 
 	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
 
-	/*
-	 * It's safe to dereference waiter as it cannot go away as
-	 * long as we hold lock->wait_lock. The waiter task needs to
-	 * acquire it in order to dequeue the waiter.
-	 */
-	wake_up_process(waiter->task);
+	wake_q_add(wake_q, waiter->task);
 }
 
 /*
@@ -1182,11 +1176,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	set_current_state(state);
 
 	/* Setup the timer, when timeout != NULL */
-	if (unlikely(timeout)) {
+	if (unlikely(timeout))
 		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
 
 	ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
 
@@ -1253,10 +1244,11 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 }
 
 /*
- * Slow path to release a rt-mutex:
+ * Slow path to release a rt-mutex.
+ * Return whether the current task needs to undo a potential priority boosting.
  */
-static void __sched
-rt_mutex_slowunlock(struct rt_mutex *lock)
+static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
+					struct wake_q_head *wake_q)
 {
 	raw_spin_lock(&lock->wait_lock);
 
@@ -1298,7 +1290,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
 	while (!rt_mutex_has_waiters(lock)) {
 		/* Drops lock->wait_lock ! */
 		if (unlock_rt_mutex_safe(lock) == true)
-			return;
+			return false;
 		/* Relock the rtmutex and try again */
 		raw_spin_lock(&lock->wait_lock);
 	}
@@ -1306,13 +1298,15 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
 	/*
 	 * The wakeup next waiter path does not suffer from the above
 	 * race. See the comments there.
+	 *
+	 * Queue the next waiter for wakeup once we release the wait_lock.
 	 */
-	wakeup_next_waiter(lock);
+	mark_wakeup_next_waiter(wake_q, lock);
 
 	raw_spin_unlock(&lock->wait_lock);
 
-	/* Undo pi boosting if necessary: */
-	rt_mutex_adjust_prio(current);
+	/* check PI boosting */
+	return true;
 }
 
 /*
@@ -1363,12 +1357,23 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
 
 static inline void
 rt_mutex_fastunlock(struct rt_mutex *lock,
-		    void (*slowfn)(struct rt_mutex *lock))
+		    bool (*slowfn)(struct rt_mutex *lock,
+				   struct wake_q_head *wqh))
 {
-	if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+	WAKE_Q(wake_q);
+
+	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
 		rt_mutex_deadlock_account_unlock(current);
-	else
-		slowfn(lock);
+
+	} else {
+		bool deboost = slowfn(lock, &wake_q);
+
+		wake_up_q(&wake_q);
+
+		/* Undo pi boosting if necessary: */
+		if (deboost)
+			rt_mutex_adjust_prio(current);
+	}
 }
 
 /**
@@ -1443,10 +1448,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
  *
  * @lock:	the rt_mutex to be locked
  *
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
  * Returns 1 on success and 0 on contention
  */
 int __sched rt_mutex_trylock(struct rt_mutex *lock)
 {
+	if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+		return 0;
+
 	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
@@ -1463,6 +1475,23 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 
 /**
+ * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
+ * @lock: the rt_mutex to be unlocked
+ * @wqh: wake queue the top waiter is added to on the slow path
+ *
+ * Returns: true if a potential priority boost must be undone, else false.
+ */
+bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
+				   struct wake_q_head *wqh)
+{
+	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+		rt_mutex_deadlock_account_unlock(current);
+		return false;
+	}
+	return rt_mutex_slowunlock(lock, wqh);
+}
+
+/**
  * rt_mutex_destroy - mark a mutex unusable
  * @lock: the mutex to be destroyed
  *
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 855212501..7844f8f0e 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -131,6 +131,9 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 				      struct hrtimer_sleeper *to,
 				      struct rt_mutex_waiter *waiter);
 extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
+				  struct wake_q_head *wqh);
+extern void rt_mutex_adjust_prio(struct task_struct *task);
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 3417d0172..0f189714e 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -409,11 +409,24 @@ done:
 	return taken;
 }
 
+/*
+ * Return true if the rwsem has an active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+	return osq_is_locked(&sem->osq);
+}
+
 #else
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
 	return false;
 }
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+	return false;
+}
 #endif
 
 /*
@@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
 
+	/*
+	 * If a spinner is present, it is not necessary to do the wakeup.
+	 * Try to do wakeup only if the trylock succeeds to minimize
+	 * spinlock contention which may introduce too much delay in the
+	 * unlock operation.
+	 *
+	 *    spinning writer		up_write/up_read caller
+	 *    ---------------		-----------------------
+	 * [S]   osq_unlock()		[L]   osq
+	 *	 MB			      RMB
+	 * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+	 *
+	 * Here, it is important to make sure that there won't be a missed
+	 * wakeup while the rwsem is free and the only spinning writer goes
+	 * to sleep without taking the rwsem. Even when the spinning writer
+	 * is just going to break out of the waiting loop, it will still do
+	 * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+	 * rwsem_has_spinner() is true, it will guarantee at least one
+	 * trylock attempt on the rwsem later on.
+	 */
+	if (rwsem_has_spinner(sem)) {
+		/*
+		 * The smp_rmb() here is to make sure that the spinner
+		 * state is consulted before reading the wait_lock.
+		 */
+		smp_rmb();
+		if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+			return sem;
+		goto locked;
+	}
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
diff --git a/kernel/module.c b/kernel/module.c
index cfc9e843a..b86b7bf1b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,7 +18,7 @@
 */
 #include <linux/export.h>
 #include <linux/moduleloader.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
 #include <linux/file.h>
@@ -101,48 +101,201 @@
 DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
 
-#ifdef CONFIG_MODULE_SIG
-#ifdef CONFIG_MODULE_SIG_FORCE
-static bool sig_enforce = true;
-#else
-static bool sig_enforce = false;
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * Because modules have two address ranges: init and core, we need two
+ * latch_tree_nodes entries. Therefore we need the back-pointer from
+ * mod_tree_node.
+ *
+ * Because init ranges are short lived we mark them unlikely and have placed
+ * them outside the critical cacheline in struct module.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
+ */
 
-static int param_set_bool_enable_only(const char *val,
-				      const struct kernel_param *kp)
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
 {
-	int err;
-	bool test;
-	struct kernel_param dummy_kp = *kp;
+	struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+	struct module *mod = mtn->mod;
 
-	dummy_kp.arg = &test;
+	if (unlikely(mtn == &mod->mtn_init))
+		return (unsigned long)mod->module_init;
 
-	err = param_set_bool(val, &dummy_kp);
-	if (err)
-		return err;
+	return (unsigned long)mod->module_core;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+	struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+	struct module *mod = mtn->mod;
+
+	if (unlikely(mtn == &mod->mtn_init))
+		return (unsigned long)mod->init_size;
 
-	/* Don't let them unset it once it's set! */
-	if (!test && sig_enforce)
-		return -EROFS;
+	return (unsigned long)mod->core_size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+	return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+	unsigned long val = (unsigned long)key;
+	unsigned long start, end;
+
+	start = __mod_tree_val(n);
+	if (val < start)
+		return -1;
+
+	end = start + __mod_tree_size(n);
+	if (val >= end)
+		return 1;
 
-	if (test)
-		sig_enforce = true;
 	return 0;
 }
 
-static const struct kernel_param_ops param_ops_bool_enable_only = {
-	.flags = KERNEL_PARAM_OPS_FL_NOARG,
-	.set = param_set_bool_enable_only,
-	.get = param_get_bool,
+static const struct latch_tree_ops mod_tree_ops = {
+	.less = mod_tree_less,
+	.comp = mod_tree_comp,
 };
-#define param_check_bool_enable_only param_check_bool
 
+static struct mod_tree_root {
+	struct latch_tree_root root;
+	unsigned long addr_min;
+	unsigned long addr_max;
+} mod_tree __cacheline_aligned = {
+	.addr_min = -1UL,
+};
+
+#define module_addr_min mod_tree.addr_min
+#define module_addr_max mod_tree.addr_max
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node)
+{
+	latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node)
+{
+	latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+static void mod_tree_insert(struct module *mod)
+{
+	mod->mtn_core.mod = mod;
+	mod->mtn_init.mod = mod;
+
+	__mod_tree_insert(&mod->mtn_core);
+	if (mod->init_size)
+		__mod_tree_insert(&mod->mtn_init);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+	if (mod->init_size)
+		__mod_tree_remove(&mod->mtn_init);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+	__mod_tree_remove(&mod->mtn_core);
+	mod_tree_remove_init(mod);
+}
+
+static struct module *mod_find(unsigned long addr)
+{
+	struct latch_tree_node *ltn;
+
+	ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
+	if (!ltn)
+		return NULL;
+
+	return container_of(ltn, struct mod_tree_node, node)->mod;
+}
+
+#else /* MODULES_TREE_LOOKUP */
+
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+
+static void mod_tree_insert(struct module *mod) { }
+static void mod_tree_remove_init(struct module *mod) { }
+static void mod_tree_remove(struct module *mod) { }
+
+static struct module *mod_find(unsigned long addr)
+{
+	struct module *mod;
+
+	list_for_each_entry_rcu(mod, &modules, list) {
+		if (within_module(addr, mod))
+			return mod;
+	}
+
+	return NULL;
+}
+
+#endif /* MODULES_TREE_LOOKUP */
+
+/*
+ * Bounds of module text, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
+static void __mod_update_bounds(void *base, unsigned int size)
+{
+	unsigned long min = (unsigned long)base;
+	unsigned long max = min + size;
+
+	if (min < module_addr_min)
+		module_addr_min = min;
+	if (max > module_addr_max)
+		module_addr_max = max;
+}
+
+static void mod_update_bounds(struct module *mod)
+{
+	__mod_update_bounds(mod->module_core, mod->core_size);
+	if (mod->init_size)
+		__mod_update_bounds(mod->module_init, mod->init_size);
+}
+
+#ifdef CONFIG_KGDB_KDB
+struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
+#endif /* CONFIG_KGDB_KDB */
+
+static void module_assert_mutex(void)
+{
+	lockdep_assert_held(&module_mutex);
+}
+
+static void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+	if (unlikely(!debug_locks))
+		return;
+
+	WARN_ON(!rcu_read_lock_sched_held() &&
+		!lockdep_is_held(&module_mutex));
+#endif
+}
+
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+#ifndef CONFIG_MODULE_SIG_FORCE
 module_param(sig_enforce, bool_enable_only, 0644);
 #endif /* !CONFIG_MODULE_SIG_FORCE */
-#endif /* CONFIG_MODULE_SIG */
 
 /* Block module loading/unloading? */
 int modules_disabled = 0;
@@ -153,10 +306,6 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
 static BLOCKING_NOTIFIER_HEAD(module_notify_list);
 
-/* Bounds of module allocation, for speeding __module_address.
- * Protected by module_mutex. */
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
 int register_module_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -318,6 +467,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
 #endif
 	};
 
+	module_assert_mutex_or_preempt();
+
 	if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
 		return true;
 
@@ -451,12 +602,17 @@ const struct kernel_symbol *find_symbol(const char *name,
 }
 EXPORT_SYMBOL_GPL(find_symbol);
 
-/* Search for module by name: must hold module_mutex. */
+/*
+ * Search for module by name: must hold module_mutex (or preempt disabled
+ * for read-only access).
+ */
 static struct module *find_module_all(const char *name, size_t len,
 				      bool even_unformed)
 {
 	struct module *mod;
 
+	module_assert_mutex_or_preempt();
+
 	list_for_each_entry(mod, &modules, list) {
 		if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
 			continue;
@@ -468,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len,
 
 struct module *find_module(const char *name)
 {
+	module_assert_mutex();
 	return find_module_all(name, strlen(name), false);
 }
 EXPORT_SYMBOL_GPL(find_module);
@@ -1169,11 +1326,17 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 {
 	const unsigned long *crc;
 
-	/* Since this should be found in kernel (which can't be removed),
-	 * no locking is necessary. */
+	/*
+	 * Since this should be found in kernel (which can't be removed), no
+	 * locking is necessary -- use preempt_disable() to placate lockdep.
+	 */
+	preempt_disable();
 	if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
-			 &crc, true, false))
+			 &crc, true, false)) {
+		preempt_enable();
 		BUG();
+	}
+	preempt_enable();
 	return check_version(sechdrs, versindex,
 			     VMLINUX_SYMBOL_STR(module_layout), mod, crc,
 			     NULL);
@@ -1661,6 +1824,10 @@ static void mod_sysfs_fini(struct module *mod)
 	mod_kobject_put(mod);
 }
 
+static void init_param_lock(struct module *mod)
+{
+	mutex_init(&mod->param_lock);
+}
 #else /* !CONFIG_SYSFS */
 
 static int mod_sysfs_setup(struct module *mod,
@@ -1683,6 +1850,9 @@ static void del_usage_links(struct module *mod)
 {
 }
 
+static void init_param_lock(struct module *mod)
+{
+}
 #endif /* CONFIG_SYSFS */
 
 static void mod_sysfs_teardown(struct module *mod)
@@ -1852,10 +2022,11 @@ static void free_module(struct module *mod)
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	/* Remove this module from bug list, this uses list_del_rcu */
 	module_bug_cleanup(mod);
-	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
-	synchronize_rcu();
+	/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
+	synchronize_sched();
 	mutex_unlock(&module_mutex);
 
 	/* This may be NULL, but that's OK */
@@ -2384,22 +2555,6 @@ void * __weak module_alloc(unsigned long size)
 	return vmalloc_exec(size);
 }
 
-static void *module_alloc_update_bounds(unsigned long size)
-{
-	void *ret = module_alloc(size);
-
-	if (ret) {
-		mutex_lock(&module_mutex);
-		/* Update module bounds. */
-		if ((unsigned long)ret < module_addr_min)
-			module_addr_min = (unsigned long)ret;
-		if ((unsigned long)ret + size > module_addr_max)
-			module_addr_max = (unsigned long)ret + size;
-		mutex_unlock(&module_mutex);
-	}
-	return ret;
-}
-
 #ifdef CONFIG_DEBUG_KMEMLEAK
 static void kmemleak_load_module(const struct module *mod,
 				 const struct load_info *info)
@@ -2805,7 +2960,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	void *ptr;
 
 	/* Do the allocs. */
-	ptr = module_alloc_update_bounds(mod->core_size);
+	ptr = module_alloc(mod->core_size);
 	/*
 	 * The pointer to this block is stored in the module structure
 	 * which is inside the block. Just mark it as not being a
@@ -2819,7 +2974,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	mod->module_core = ptr;
 
 	if (mod->init_size) {
-		ptr = module_alloc_update_bounds(mod->init_size);
+		ptr = module_alloc(mod->init_size);
 		/*
 		 * The pointer to this block is stored in the module structure
 		 * which is inside the block. This block doesn't need to be
@@ -3107,7 +3262,7 @@ static noinline int do_init_module(struct module *mod)
 	 *
 	 * http://thread.gmane.org/gmane.linux.kernel/1420814
 	 */
-	if (current->flags & PF_USED_ASYNC)
+	if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
 		async_synchronize_full();
 
 	mutex_lock(&module_mutex);
@@ -3119,6 +3274,7 @@ static noinline int do_init_module(struct module *mod)
 	mod->symtab = mod->core_symtab;
 	mod->strtab = mod->core_strtab;
 #endif
+	mod_tree_remove_init(mod);
 	unset_module_init_ro_nx(mod);
 	module_arch_freeing_init(mod);
 	mod->module_init = NULL;
@@ -3127,11 +3283,11 @@ static noinline int do_init_module(struct module *mod)
 	mod->init_text_size = 0;
 	/*
 	 * We want to free module_init, but be aware that kallsyms may be
-	 * walking this with preempt disabled.  In all the failure paths,
-	 * we call synchronize_rcu/synchronize_sched, but we don't want
-	 * to slow down the success path, so use actual RCU here.
+	 * walking this with preempt disabled.  In all the failure paths, we
+	 * call synchronize_sched(), but we don't want to slow down the success
+	 * path, so defer the free to an RCU-sched callback instead.
 	 */
-	call_rcu(&freeinit->rcu, do_free_init);
+	call_rcu_sched(&freeinit->rcu, do_free_init);
 	mutex_unlock(&module_mutex);
 	wake_up_all(&module_wq);
 
@@ -3188,7 +3344,9 @@ again:
 		err = -EEXIST;
 		goto out;
 	}
+	mod_update_bounds(mod);
 	list_add_rcu(&mod->list, &modules);
+	mod_tree_insert(mod);
 	err = 0;
 
 out:
@@ -3237,10 +3395,19 @@ out:
 	return err;
 }
 
-static int unknown_module_param_cb(char *param, char *val, const char *modname)
+static int unknown_module_param_cb(char *param, char *val, const char *modname,
+				   void *arg)
 {
+	struct module *mod = arg;
+	int ret;
+
+	if (strcmp(param, "async_probe") == 0) {
+		mod->async_probe_requested = true;
+		return 0;
+	}
+
 	/* Check for magic 'dyndbg' arg */
-	int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+	ret = ddebug_dyndbg_module_param_cb(param, val, modname);
 	if (ret != 0)
 		pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
 	return 0;
@@ -3295,6 +3462,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	if (err)
 		goto unlink_mod;
 
+	init_param_lock(mod);
+
 	/* Now we've got everything in the final locations, we can
 	 * find optional sections. */
 	err = find_module_sections(mod, info);
@@ -3342,7 +3511,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	/* Module is ready to execute: parsing args may do that. */
 	after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-				  -32768, 32767, unknown_module_param_cb);
+				  -32768, 32767, mod,
+				  unknown_module_param_cb);
 	if (IS_ERR(after_dashes)) {
 		err = PTR_ERR(after_dashes);
 		goto bug_cleanup;
@@ -3391,9 +3561,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	wake_up_all(&module_wq);
-	/* Wait for RCU synchronizing before releasing mod->list. */
-	synchronize_rcu();
+	/* Wait for RCU-sched synchronizing before releasing mod->list. */
+	synchronize_sched();
 	mutex_unlock(&module_mutex);
  free_module:
 	/* Free lock-classes; relies on the preceding sync_rcu() */
@@ -3517,19 +3688,15 @@ const char *module_address_lookup(unsigned long addr,
 			    char **modname,
 			    char *namebuf)
 {
-	struct module *mod;
 	const char *ret = NULL;
+	struct module *mod;
 
 	preempt_disable();
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod)) {
-			if (modname)
-				*modname = mod->name;
-			ret = get_ksymbol(mod, addr, size, offset);
-			break;
-		}
+	mod = __module_address(addr);
+	if (mod) {
+		if (modname)
+			*modname = mod->name;
+		ret = get_ksymbol(mod, addr, size, offset);
 	}
 	/* Make a copy in here where it's safe */
 	if (ret) {
@@ -3537,6 +3704,7 @@ const char *module_address_lookup(unsigned long addr,
 		ret = namebuf;
 	}
 	preempt_enable();
+
 	return ret;
 }
 
@@ -3660,6 +3828,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 	unsigned int i;
 	int ret;
 
+	module_assert_mutex();
+
 	list_for_each_entry(mod, &modules, list) {
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
@@ -3834,13 +4004,15 @@ struct module *__module_address(unsigned long addr)
 	if (addr < module_addr_min || addr > module_addr_max)
 		return NULL;
 
-	list_for_each_entry_rcu(mod, &modules, list) {
+	module_assert_mutex_or_preempt();
+
+	mod = mod_find(addr);
+	if (mod) {
+		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod))
-			return mod;
+			mod = NULL;
 	}
-	return NULL;
+	return mod;
 }
 EXPORT_SYMBOL_GPL(__module_address);
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 8136ad76e..04e91ff75 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,7 +32,7 @@ static unsigned long tainted_mask;
 static int pause_on_oops;
 static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
-static bool crash_kexec_post_notifiers;
+bool crash_kexec_post_notifiers;
 int panic_on_warn __read_mostly;
 
 int panic_timeout = CONFIG_PANIC_TIMEOUT;
@@ -142,7 +142,8 @@ void panic(const char *fmt, ...)
 	 * Note: since some panic_notifiers can make crashed kernel
 	 * more unstable, it can increase risks of the kdump failure too.
 	 */
-	crash_kexec(NULL);
+	if (crash_kexec_post_notifiers)
+		crash_kexec(NULL);
 
 	bust_spinlocks(0);
 
diff --git a/kernel/params.c b/kernel/params.c
index a22d6a759..b6554aa71 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,15 +25,34 @@
 #include <linux/slab.h>
 #include <linux/ctype.h>
 
-/* Protects all parameters, and incidentally kmalloced_param list. */
+#ifdef CONFIG_SYSFS
+/* Protects all built-in parameters, modules use their own param_lock */
 static DEFINE_MUTEX(param_lock);
 
+/* Use the module's mutex, or if built-in use the built-in mutex */
+#ifdef CONFIG_MODULES
+#define KPARAM_MUTEX(mod)	((mod) ? &(mod)->param_lock : &param_lock)
+#else
+#define KPARAM_MUTEX(mod)	(&param_lock)
+#endif
+
+static inline void check_kparam_locked(struct module *mod)
+{
+	BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod)));
+}
+#else
+static inline void check_kparam_locked(struct module *mod)
+{
+}
+#endif /* !CONFIG_SYSFS */
+
 /* This just allows us to keep track of which parameters are kmalloced. */
 struct kmalloced_param {
 	struct list_head list;
 	char val[];
 };
 static LIST_HEAD(kmalloced_params);
+static DEFINE_SPINLOCK(kmalloced_params_lock);
 
 static void *kmalloc_parameter(unsigned int size)
 {
@@ -43,7 +62,10 @@ static void *kmalloc_parameter(unsigned int size)
 	if (!p)
 		return NULL;
 
+	spin_lock(&kmalloced_params_lock);
 	list_add(&p->list, &kmalloced_params);
+	spin_unlock(&kmalloced_params_lock);
+
 	return p->val;
 }
 
@@ -52,6 +74,7 @@ static void maybe_kfree_parameter(void *param)
 {
 	struct kmalloced_param *p;
 
+	spin_lock(&kmalloced_params_lock);
 	list_for_each_entry(p, &kmalloced_params, list) {
 		if (p->val == param) {
 			list_del(&p->list);
@@ -59,6 +82,7 @@ static void maybe_kfree_parameter(void *param)
 			break;
 		}
 	}
+	spin_unlock(&kmalloced_params_lock);
 }
 
 static char dash2underscore(char c)
@@ -100,8 +124,9 @@ static int parse_one(char *param,
 		     unsigned num_params,
 		     s16 min_level,
 		     s16 max_level,
+		     void *arg,
 		     int (*handle_unknown)(char *param, char *val,
-				     const char *doing))
+				     const char *doing, void *arg))
 {
 	unsigned int i;
 	int err;
@@ -118,17 +143,17 @@ static int parse_one(char *param,
 				return -EINVAL;
 			pr_debug("handling %s with %p\n", param,
 				params[i].ops->set);
-			mutex_lock(&param_lock);
+			kernel_param_lock(params[i].mod);
 			param_check_unsafe(&params[i]);
 			err = params[i].ops->set(val, &params[i]);
-			mutex_unlock(&param_lock);
+			kernel_param_unlock(params[i].mod);
 			return err;
 		}
 	}
 
 	if (handle_unknown) {
 		pr_debug("doing %s: %s='%s'\n", doing, param, val);
-		return handle_unknown(param, val, doing);
+		return handle_unknown(param, val, doing, arg);
 	}
 
 	pr_debug("Unknown argument '%s'\n", param);
@@ -194,7 +219,9 @@ char *parse_args(const char *doing,
 		 unsigned num,
 		 s16 min_level,
 		 s16 max_level,
-		 int (*unknown)(char *param, char *val, const char *doing))
+		 void *arg,
+		 int (*unknown)(char *param, char *val,
+				const char *doing, void *arg))
 {
 	char *param, *val;
 
@@ -214,7 +241,7 @@ char *parse_args(const char *doing,
 			return args;
 		irq_was_disabled = irqs_disabled();
 		ret = parse_one(param, val, doing, params, num,
-				min_level, max_level, unknown);
+				min_level, max_level, arg, unknown);
 		if (irq_was_disabled && !irqs_disabled())
 			pr_warn("%s: option '%s' enabled irq's!\n",
 				doing, param);
@@ -251,7 +278,7 @@ char *parse_args(const char *doing,
 		return scnprintf(buffer, PAGE_SIZE, format,		\
 				*((type *)kp->arg));			\
 	}								\
-	struct kernel_param_ops param_ops_##name = {			\
+	const struct kernel_param_ops param_ops_##name = {			\
 		.set = param_set_##name,				\
 		.get = param_get_##name,				\
 	};								\
@@ -303,7 +330,7 @@ static void param_free_charp(void *arg)
 	maybe_kfree_parameter(*((char **)arg));
 }
 
-struct kernel_param_ops param_ops_charp = {
+const struct kernel_param_ops param_ops_charp = {
 	.set = param_set_charp,
 	.get = param_get_charp,
 	.free = param_free_charp,
@@ -328,13 +355,44 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_get_bool);
 
-struct kernel_param_ops param_ops_bool = {
+const struct kernel_param_ops param_ops_bool = {
 	.flags = KERNEL_PARAM_OPS_FL_NOARG,
 	.set = param_set_bool,
 	.get = param_get_bool,
 };
 EXPORT_SYMBOL(param_ops_bool);
 
+/* Set a bool parameter that can be switched on but never back off. */
+int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
+{
+	const bool was_set = *(bool *)kp->arg;
+	struct kernel_param scratch_kp = *kp;
+	bool requested;
+	int rc;
+
+	/* Parse into a scratch copy first so kp->arg is not written yet. */
+	scratch_kp.arg = &requested;
+
+	rc = param_set_bool(val, &scratch_kp);
+	if (rc)
+		return rc;
+
+	/* An "off" request is refused once set, and a no-op otherwise. */
+	if (!requested)
+		return was_set ? -EROFS : 0;
+
+	/* Enabling: run the real setter against the caller's kp. */
+	return param_set_bool(val, kp);
+}
+EXPORT_SYMBOL_GPL(param_set_bool_enable_only);
+
+const struct kernel_param_ops param_ops_bool_enable_only = {
+	.flags = KERNEL_PARAM_OPS_FL_NOARG,
+	.set = param_set_bool_enable_only,
+	.get = param_get_bool,
+};
+EXPORT_SYMBOL_GPL(param_ops_bool_enable_only);
+
 /* This one must be bool. */
 int param_set_invbool(const char *val, const struct kernel_param *kp)
 {
@@ -356,7 +414,7 @@ int param_get_invbool(char *buffer, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_get_invbool);
 
-struct kernel_param_ops param_ops_invbool = {
+const struct kernel_param_ops param_ops_invbool = {
 	.set = param_set_invbool,
 	.get = param_get_invbool,
 };
@@ -364,12 +422,11 @@ EXPORT_SYMBOL(param_ops_invbool);
 
 int param_set_bint(const char *val, const struct kernel_param *kp)
 {
-	struct kernel_param boolkp;
+	/* Match bool exactly, by re-using it. */
+	struct kernel_param boolkp = *kp;
 	bool v;
 	int ret;
 
-	/* Match bool exactly, by re-using it. */
-	boolkp = *kp;
 	boolkp.arg = &v;
 
 	ret = param_set_bool(val, &boolkp);
@@ -379,7 +436,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_set_bint);
 
-struct kernel_param_ops param_ops_bint = {
+const struct kernel_param_ops param_ops_bint = {
 	.flags = KERNEL_PARAM_OPS_FL_NOARG,
 	.set = param_set_bint,
 	.get = param_get_int,
@@ -387,7 +444,8 @@ struct kernel_param_ops param_ops_bint = {
 EXPORT_SYMBOL(param_ops_bint);
 
 /* We break the rule and mangle the string. */
-static int param_array(const char *name,
+static int param_array(struct module *mod,
+		       const char *name,
 		       const char *val,
 		       unsigned int min, unsigned int max,
 		       void *elem, int elemsize,
@@ -418,7 +476,7 @@ static int param_array(const char *name,
 		/* nul-terminate and parse */
 		save = val[len];
 		((char *)val)[len] = '\0';
-		BUG_ON(!mutex_is_locked(&param_lock));
+		check_kparam_locked(mod);
 		ret = set(val, &kp);
 
 		if (ret != 0)
@@ -440,7 +498,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
 	const struct kparam_array *arr = kp->arr;
 	unsigned int temp_num;
 
-	return param_array(kp->name, val, 1, arr->max, arr->elem,
+	return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem,
 			   arr->elemsize, arr->ops->set, kp->level,
 			   arr->num ?: &temp_num);
 }
@@ -449,14 +507,13 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
 {
 	int i, off, ret;
 	const struct kparam_array *arr = kp->arr;
-	struct kernel_param p;
+	struct kernel_param p = *kp;
 
-	p = *kp;
 	for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
 		if (i)
 			buffer[off++] = ',';
 		p.arg = arr->elem + arr->elemsize * i;
-		BUG_ON(!mutex_is_locked(&param_lock));
+		check_kparam_locked(p.mod);
 		ret = arr->ops->get(buffer + off, &p);
 		if (ret < 0)
 			return ret;
@@ -476,7 +533,7 @@ static void param_array_free(void *arg)
 			arr->ops->free(arr->elem + arr->elemsize * i);
 }
 
-struct kernel_param_ops param_array_ops = {
+const struct kernel_param_ops param_array_ops = {
 	.set = param_array_set,
 	.get = param_array_get,
 	.free = param_array_free,
@@ -504,7 +561,7 @@ int param_get_string(char *buffer, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_get_string);
 
-struct kernel_param_ops param_ops_string = {
+const struct kernel_param_ops param_ops_string = {
 	.set = param_set_copystring,
 	.get = param_get_string,
 };
@@ -539,9 +596,9 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 	if (!attribute->param->ops->get)
 		return -EPERM;
 
-	mutex_lock(&param_lock);
+	kernel_param_lock(mk->mod);
 	count = attribute->param->ops->get(buf, attribute->param);
-	mutex_unlock(&param_lock);
+	kernel_param_unlock(mk->mod);
 	if (count > 0) {
 		strcat(buf, "\n");
 		++count;
@@ -551,7 +608,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 
 /* sysfs always hands a nul-terminated string in buf.  We rely on that. */
 static ssize_t param_attr_store(struct module_attribute *mattr,
-				struct module_kobject *km,
+				struct module_kobject *mk,
 				const char *buf, size_t len)
 {
  	int err;
@@ -560,10 +617,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
 	if (!attribute->param->ops->set)
 		return -EPERM;
 
-	mutex_lock(&param_lock);
+	kernel_param_lock(mk->mod);
 	param_check_unsafe(attribute->param);
 	err = attribute->param->ops->set(buf, attribute->param);
-	mutex_unlock(&param_lock);
+	kernel_param_unlock(mk->mod);
 	if (!err)
 		return len;
 	return err;
@@ -577,17 +634,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
 #endif
 
 #ifdef CONFIG_SYSFS
-void __kernel_param_lock(void)
+void kernel_param_lock(struct module *mod)
 {
-	mutex_lock(&param_lock);
+	mutex_lock(KPARAM_MUTEX(mod));
 }
-EXPORT_SYMBOL(__kernel_param_lock);
 
-void __kernel_param_unlock(void)
+void kernel_param_unlock(struct module *mod)
 {
-	mutex_unlock(&param_lock);
+	mutex_unlock(KPARAM_MUTEX(mod));
 }
-EXPORT_SYMBOL(__kernel_param_unlock);
+
+EXPORT_SYMBOL(kernel_param_lock);
+EXPORT_SYMBOL(kernel_param_unlock);
 
 /*
  * add_sysfs_param - add a parameter to sysfs
@@ -853,6 +911,7 @@ static void __init version_sysfs_builtin(void)
 		mk = locate_module_kobject(vattr->module_name);
 		if (mk) {
 			err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+			WARN_ON_ONCE(err);
 			kobject_uevent(&mk->kobj, KOBJ_ADD);
 			kobject_put(&mk->kobj);
 		}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 89a46f3ff..9e302315e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -91,284 +91,6 @@ config PM_STD_PARTITION
 	  suspended image to. It will simply pick the first available swap 
 	  device.
 
-menuconfig TOI_CORE
-	bool "Enhanced Hibernation (TuxOnIce)"
-	depends on HIBERNATION
-	default y
-	---help---
-	  TuxOnIce is the 'new and improved' suspend support.
-
-	  See the TuxOnIce home page (tuxonice.net)
-	  for FAQs, HOWTOs and other documentation.
-
-	comment "Image Storage (you need at least one allocator)"
-		depends on TOI_CORE
-
-	config TOI_FILE
-		bool "File Allocator"
-		depends on TOI_CORE
-		default y
-		---help---
-		  This option enables support for storing an image in a
-		  simple file. You might want this if your swap is
-		  sometimes full enough that you don't have enough spare
-		  space to store an image.
-
-	config TOI_SWAP
-		bool "Swap Allocator"
-		depends on TOI_CORE && SWAP
-		default y
-		---help---
-		  This option enables support for storing an image in your
-		  swap space.
-
-	comment "General Options"
-		depends on TOI_CORE
-
-	config TOI_PRUNE
-		bool "Image pruning support"
-		depends on TOI_CORE && CRYPTO && BROKEN
-		default y
-		---help---
-		  This option adds support for using cryptoapi hashing
-		  algorithms to identify pages with the same content. We
-		  then write a much smaller pointer to the first copy of
-		  the data instead of a complete (perhaps compressed)
-                  additional copy.
-
-		  You probably want this, so say Y here.
-
-	comment "No image pruning support available without Cryptoapi support."
-		depends on TOI_CORE && !CRYPTO
-
-	config TOI_CRYPTO
-		bool "Compression support"
-		depends on TOI_CORE && CRYPTO
-		default y
-		---help---
-		  This option adds support for using cryptoapi compression
-		  algorithms. Compression is particularly useful as it can
-		  more than double your suspend and resume speed (depending
-		  upon how well your image compresses).
-
-		  You probably want this, so say Y here.
-
-	comment "No compression support available without Cryptoapi support."
-		depends on TOI_CORE && !CRYPTO
-
-	config TOI_USERUI
-		bool "Userspace User Interface support"
-		depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
-		default y
-		---help---
-		  This option enabled support for a userspace based user interface
-		  to TuxOnIce, which allows you to have a nice display while suspending
-		  and resuming, and also enables features such as pressing escape to
-		  cancel a cycle or interactive debugging.
-
-	config TOI_USERUI_DEFAULT_PATH
-		string "Default userui program location"
-		default "/usr/local/sbin/tuxoniceui_text"
-		depends on TOI_USERUI
-		---help---
-		  This entry allows you to specify a default path to the userui binary.
-
-	config TOI_DEFAULT_IMAGE_SIZE_LIMIT
-		int "Default image size limit"
-		range -2 65536 
-		default "-2"
-		depends on TOI_CORE
-		---help---
-		  This entry allows you to specify a default image size limit. It can
-		  be overridden at run-time using /sys/power/tuxonice/image_size_limit.
-
-	config TOI_KEEP_IMAGE
-		bool "Allow Keep Image Mode"
-		depends on TOI_CORE
-		---help---
-		  This option allows you to keep and image and reuse it. It is intended
-		  __ONLY__ for use with systems where all filesystems are mounted read-
-		  only (kiosks, for example). To use it, compile this option in and boot
-		  normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend.
-		  When you resume, the image will not be removed. You will be unable to turn
-		  off swap partitions (assuming you are using the swap allocator), but future
-		  suspends simply do a power-down. The image can be updated using the
-		  kernel command line parameter suspend_act= to turn off the keep image
-		  bit. Keep image mode is a little less user friendly on purpose - it
-		  should not be used without thought!
-
-	config TOI_INCREMENTAL
-		bool "Incremental Image Support"
-		depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE
-		default n
-		---help---
-		  This option enables the work in progress toward using the dirty page
-		  tracking to record changes to pages. It is hoped that
-		  this will be an initial step toward implementing storing just
-		  the differences between consecutive images, which will
-		  increase the amount of storage needed for the image, but also
-		  increase the speed at which writing an image occurs and
-		  reduce the wear and tear on drives.
-
-		  At the moment, all that is implemented is the first step of keeping
-		  an existing image and then comparing it to the contents in memory
-		  (by setting /sys/power/tuxonice/verify_image to 1 and triggering a
-		  (fake) resume) to see what the page change tracking should find to be
-		  different. If you have verify_image set to 1, TuxOnIce will automatically
-		  invalidate the old image when you next try to hibernate, so there's no
-		  greater chance of disk corruption than normal.
-
-	comment "No incremental image support available without Keep Image support."
-		depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT
-
-	config TOI_REPLACE_SWSUSP
-		bool "Replace swsusp by default"
-		default y
-		depends on TOI_CORE
-		---help---
-		  TuxOnIce can replace swsusp. This option makes that the default state,
-		  requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
-		  to use the vanilla kernel functionality. Note that your initrd/ramfs will
-		  need to do this before trying to resume, too.
-		  With overriding swsusp enabled, echoing disk  to /sys/power/state will
-		  start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
-		  the swap and file allocators are compiled in, the swap allocator will be
-		  used by default.
-
-	config TOI_IGNORE_LATE_INITCALL
-		bool "Wait for initrd/ramfs to run, by default"
-		default n
-		depends on TOI_CORE
-		---help---
-		  When booting, TuxOnIce can check for an image and start to resume prior
-		  to any initrd/ramfs running (via a late initcall).
-
-		  If you don't have an initrd/ramfs, this is what you want to happen -
-		  otherwise you won't be able to safely resume. You should set this option
-		  to 'No'.
-
-		  If, however, you want your initrd/ramfs to run anyway before resuming,
-		  you need to tell TuxOnIce to ignore that earlier opportunity to resume.
-		  This can be done either by using this compile time option, or by
-		  overriding this option with the boot-time parameter toi_initramfs_resume_only=1.
-
-		  Note that if TuxOnIce can't resume at the earlier opportunity, the
-		  value of this option won't matter - the initramfs/initrd (if any) will
-		  run anyway.
-
-	menuconfig TOI_CLUSTER
-		bool "Cluster support"
-		default n
-		depends on TOI_CORE && NET && BROKEN
-		---help---
-		  Support for linking multiple machines in a cluster so that they suspend
-		  and resume together.
-
-	config TOI_DEFAULT_CLUSTER_INTERFACE
-		string "Default cluster interface"
-		depends on TOI_CLUSTER
-		---help---
-		  The default interface on which to communicate with other nodes in
-		  the cluster.
-
-		  If no value is set here, cluster support will be disabled by default.
-
-	config TOI_DEFAULT_CLUSTER_KEY
-		string "Default cluster key"
-		default "Default"
-		depends on TOI_CLUSTER
-		---help---
-		  The default key used by this node. All nodes in the same cluster
-		  have the same key. Multiple clusters may coexist on the same lan
-		  by using different values for this key.
-
-	config TOI_CLUSTER_IMAGE_TIMEOUT
-		int "Timeout when checking for image"
-		default 15
-		depends on TOI_CLUSTER
-		---help---
-		  Timeout (seconds) before continuing to boot when waiting to see
-		  whether other nodes might have an image. Set to -1 to wait
-		  indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue
-		  booting sooner than this timeout.
-
-	config TOI_CLUSTER_WAIT_UNTIL_NODES
-		int "Nodes without image before continuing"
-		default 0
-		depends on TOI_CLUSTER
-		---help---
-		  When booting and no image is found, we wait to see if other nodes
-		  have an image before continuing to boot. This value lets us
-		  continue after seeing a certain number of nodes without an image,
-		  instead of continuing to wait for the timeout. Set to 0 to only
-		  use the timeout.
-
-	config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
-		string "Default pre-hibernate script"
-		depends on TOI_CLUSTER
-		---help---
-		  The default script to be called when starting to hibernate.
-
-	config TOI_DEFAULT_CLUSTER_POST_HIBERNATE
-		string "Default post-hibernate script"
-		depends on TOI_CLUSTER
-		---help---
-		  The default script to be called after resuming from hibernation.
-
-	config TOI_DEFAULT_WAIT
-		int "Default waiting time for emergency boot messages"
-		default "25"
-		range -1 32768
-		depends on TOI_CORE
-		help
-		  TuxOnIce can display warnings very early in the process of resuming,
-		  if (for example) it appears that you have booted a kernel that doesn't
-		  match an image on disk. It can then give you the opportunity to either
-		  continue booting that kernel, or reboot the machine. This option can be
-		  used to control how long to wait in such circumstances. -1 means wait
-		  forever. 0 means don't wait at all (do the default action, which will
-		  generally be to continue booting and remove the image). Values of 1 or
-		  more indicate a number of seconds (up to 255) to wait before doing the
-		  default.
-
-	config  TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE
-		int "Default extra pages allowance"
-		default "2000"
-		range 500 32768
-		depends on TOI_CORE
-		help
-		  This value controls the default for the allowance TuxOnIce makes for
-		  drivers to allocate extra memory during the atomic copy. The default
-		  value of 2000 will be okay in most cases. If you are using
-		  DRI, the easiest way to find what value to use is to try to hibernate
-		  and look at how many pages were actually needed in the sysfs entry
-		  /sys/power/tuxonice/debug_info (first number on the last line), adding
-		  a little extra because the value is not always the same.
-
-	config TOI_CHECKSUM
-		bool "Checksum pageset2"
-		default n
-		depends on TOI_CORE
-		select CRYPTO
-		select CRYPTO_ALGAPI
-		select CRYPTO_MD4
-		---help---
-		  Adds support for checksumming pageset2 pages, to ensure you really get an
-		  atomic copy. Since some filesystems (XFS especially) change metadata even
-		  when there's no other activity, we need this to check for pages that have
-		  been changed while we were saving the page cache. If your debugging output
-		  always says no pages were resaved, you may be able to safely disable this
-		  option.
-
-config TOI
-	bool
-	depends on TOI_CORE!=n
-	default y
-
-config TOI_ZRAM_SUPPORT
-	def_bool y
-	depends on TOI && ZRAM!=n
-
 config PM_SLEEP
 	def_bool y
 	depends on SUSPEND || HIBERNATE_CALLBACKS
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index b8d7b68f7..cb880a14c 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,46 +1,13 @@
 
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
-tuxonice_core-y := tuxonice_modules.o
-
-obj-$(CONFIG_TOI)		+= tuxonice_builtin.o
-obj-$(CONFIG_TOI_INCREMENTAL)   += tuxonice_incremental.o \
-    tuxonice_copy_before_write.o
-
-tuxonice_core-$(CONFIG_PM_DEBUG)	+= tuxonice_alloc.o
-
-# Compile these in after allocation debugging, if used.
-
-tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \
-		tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \
-		tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \
-		tuxonice_power_off.o tuxonice_atomic_copy.o
-
-tuxonice_core-$(CONFIG_TOI_CHECKSUM)	+= tuxonice_checksum.o
-
-tuxonice_core-$(CONFIG_NET)	+= tuxonice_storage.o tuxonice_netlink.o
-
-obj-$(CONFIG_TOI_CORE)		+= tuxonice_core.o
-obj-$(CONFIG_TOI_PRUNE)		+= tuxonice_prune.o
-obj-$(CONFIG_TOI_CRYPTO)	+= tuxonice_compress.o
-
-tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \
-		tuxonice_bio_signature.o
-
-obj-$(CONFIG_TOI_SWAP)		+= tuxonice_bio.o tuxonice_swap.o
-obj-$(CONFIG_TOI_FILE)		+= tuxonice_bio.o tuxonice_file.o
-obj-$(CONFIG_TOI_CLUSTER)	+= tuxonice_cluster.o
-
-obj-$(CONFIG_TOI_USERUI)	+= tuxonice_userui.o
-
 obj-y				+= qos.o
 obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
-obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o \
-				   block_io.o
+obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o
 obj-$(CONFIG_PM_AUTOSLEEP)	+= autosleep.o
 obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
 
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
deleted file mode 100644
index 9a58bc258..000000000
--- a/kernel/power/block_io.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * This file provides functions for block I/O operations on swap/file.
- *
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
- * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/bio.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-
-#include "power.h"
-
-/**
- *	submit - submit BIO request.
- *	@rw:	READ or WRITE.
- *	@off	physical offset of page.
- *	@page:	page we're reading or writing.
- *	@bio_chain: list of pending biod (for async reading)
- *
- *	Straight from the textbook - allocate and initialize the bio.
- *	If we're reading, make sure the page is marked as dirty.
- *	Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, struct block_device *bdev, sector_t sector,
-		struct page *page, struct bio **bio_chain)
-{
-	const int bio_rw = rw | REQ_SYNC;
-	struct bio *bio;
-
-	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
-	bio->bi_iter.bi_sector = sector;
-	bio->bi_bdev = bdev;
-	bio->bi_end_io = end_swap_bio_read;
-
-	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
-			(unsigned long long)sector);
-		bio_put(bio);
-		return -EFAULT;
-	}
-
-	lock_page(page);
-	bio_get(bio);
-
-	if (bio_chain == NULL) {
-		submit_bio(bio_rw, bio);
-		wait_on_page_locked(page);
-		if (rw == READ)
-			bio_set_pages_dirty(bio);
-		bio_put(bio);
-	} else {
-		if (rw == READ)
-			get_page(page);	/* These pages are freed later */
-		bio->bi_private = *bio_chain;
-		*bio_chain = bio;
-		submit_bio(bio_rw, bio);
-	}
-	return 0;
-}
-
-int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
-	return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
-			virt_to_page(addr), bio_chain);
-}
-
-int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
-	return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
-			virt_to_page(addr), bio_chain);
-}
-
-int hib_wait_on_bio_chain(struct bio **bio_chain)
-{
-	struct bio *bio;
-	struct bio *next_bio;
-	int ret = 0;
-
-	if (bio_chain == NULL)
-		return 0;
-
-	bio = *bio_chain;
-	if (bio == NULL)
-		return 0;
-	while (bio) {
-		struct page *page;
-
-		next_bio = bio->bi_private;
-		page = bio->bi_io_vec[0].bv_page;
-		wait_on_page_locked(page);
-		if (!PageUptodate(page) || PageError(page))
-			ret = -EIO;
-		put_page(page);
-		bio_put(bio);
-		bio = next_bio;
-	}
-	*bio_chain = NULL;
-	return ret;
-}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b7d3bc724..690f78f21 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -31,7 +31,7 @@
 #include <linux/ktime.h>
 #include <trace/events/power.h>
 
-#include "tuxonice.h"
+#include "power.h"
 
 
 static int nocompress;
@@ -39,7 +39,7 @@ static int noresume;
 static int nohibernate;
 static int resume_wait;
 static unsigned int resume_delay;
-char resume_file[256] = CONFIG_PM_STD_PARTITION;
+static char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
 __visible int in_suspend __nosavedata;
@@ -123,7 +123,7 @@ static int hibernation_test(int level) { return 0; }
  * platform_begin - Call platform to start hibernation.
  * @platform_mode: Whether or not to use the platform driver.
  */
-int platform_begin(int platform_mode)
+static int platform_begin(int platform_mode)
 {
 	return (platform_mode && hibernation_ops) ?
 		hibernation_ops->begin() : 0;
@@ -133,7 +133,7 @@ int platform_begin(int platform_mode)
  * platform_end - Call platform to finish transition to the working state.
  * @platform_mode: Whether or not to use the platform driver.
  */
-void platform_end(int platform_mode)
+static void platform_end(int platform_mode)
 {
 	if (platform_mode && hibernation_ops)
 		hibernation_ops->end();
@@ -147,7 +147,7 @@ void platform_end(int platform_mode)
  * if so configured, and return an error code if that fails.
  */
 
-int platform_pre_snapshot(int platform_mode)
+static int platform_pre_snapshot(int platform_mode)
 {
 	return (platform_mode && hibernation_ops) ?
 		hibernation_ops->pre_snapshot() : 0;
@@ -162,7 +162,7 @@ int platform_pre_snapshot(int platform_mode)
  *
  * This routine is called on one CPU with interrupts disabled.
  */
-void platform_leave(int platform_mode)
+static void platform_leave(int platform_mode)
 {
 	if (platform_mode && hibernation_ops)
 		hibernation_ops->leave();
@@ -177,7 +177,7 @@ void platform_leave(int platform_mode)
  *
  * This routine must be called after platform_prepare().
  */
-void platform_finish(int platform_mode)
+static void platform_finish(int platform_mode)
 {
 	if (platform_mode && hibernation_ops)
 		hibernation_ops->finish();
@@ -193,7 +193,7 @@ void platform_finish(int platform_mode)
  * If the restore fails after this function has been called,
  * platform_restore_cleanup() must be called.
  */
-int platform_pre_restore(int platform_mode)
+static int platform_pre_restore(int platform_mode)
 {
 	return (platform_mode && hibernation_ops) ?
 		hibernation_ops->pre_restore() : 0;
@@ -210,7 +210,7 @@ int platform_pre_restore(int platform_mode)
  * function must be called too, regardless of the result of
  * platform_pre_restore().
  */
-void platform_restore_cleanup(int platform_mode)
+static void platform_restore_cleanup(int platform_mode)
 {
 	if (platform_mode && hibernation_ops)
 		hibernation_ops->restore_cleanup();
@@ -220,7 +220,7 @@ void platform_restore_cleanup(int platform_mode)
  * platform_recover - Recover from a failure to suspend devices.
  * @platform_mode: Whether or not to use the platform driver.
  */
-void platform_recover(int platform_mode)
+static void platform_recover(int platform_mode)
 {
 	if (platform_mode && hibernation_ops && hibernation_ops->recover)
 		hibernation_ops->recover();
@@ -552,7 +552,7 @@ int hibernation_platform_enter(void)
 
 	error = disable_nonboot_cpus();
 	if (error)
-		goto Platform_finish;
+		goto Enable_cpus;
 
 	local_irq_disable();
 	syscore_suspend();
@@ -568,6 +568,8 @@ int hibernation_platform_enter(void)
  Power_up:
 	syscore_resume();
 	local_irq_enable();
+
+ Enable_cpus:
 	enable_nonboot_cpus();
 
  Platform_finish:
@@ -646,9 +648,6 @@ int hibernate(void)
 {
 	int error;
 
-	if (test_action_state(TOI_REPLACE_SWSUSP))
-		return try_tuxonice_hibernate();
-
 	if (!hibernation_available()) {
 		pr_debug("PM: Hibernation not available.\n");
 		return -EPERM;
@@ -738,19 +737,11 @@ int hibernate(void)
  * attempts to recover gracefully and make the kernel return to the normal mode
  * of operation.
  */
-int software_resume(void)
+static int software_resume(void)
 {
 	int error;
 	unsigned int flags;
 
-	resume_attempted = 1;
-
-	/*
-	 * We can't know (until an image header - if any - is loaded), whether
-	 * we did override swsusp. We therefore ensure that both are tried.
-	 */
-	try_tuxonice_resume();
-
 	/*
 	 * If the user said "noresume".. bail out early.
 	 */
@@ -1137,7 +1128,6 @@ static int __init hibernate_setup(char *str)
 static int __init noresume_setup(char *str)
 {
 	noresume = 1;
-	set_toi_state(TOI_NORESUME_SPECIFIED);
 	return 1;
 }
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 86e8157a4..63d395b5d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -272,7 +272,7 @@ static inline void pm_print_times_init(void)
 {
 	pm_print_times_enabled = !!initcall_debug;
 }
-#else /* !CONFIG_PP_SLEEP_DEBUG */
+#else /* !CONFIG_PM_SLEEP_DEBUG */
 static inline void pm_print_times_init(void) {}
 #endif /* CONFIG_PM_SLEEP_DEBUG */
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 095ed9f03..caadb566e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -36,12 +36,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
 	return arch_hibernation_header_restore(info) ?
 			"architecture specific data" : NULL;
 }
-#else
-extern char *check_image_kernel(struct swsusp_info *info);
 #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-extern int init_header(struct swsusp_info *info);
 
-extern char resume_file[256];
 /*
  * Keep some memory free so that I/O operations can succeed without paging
  * [Might this be more than 4 MB?]
@@ -81,8 +77,6 @@ static struct kobj_attribute _name##_attr = {	\
 	.store	= _name##_store,		\
 }
 
-extern struct pbe *restore_pblist;
-
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
 /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
@@ -169,15 +163,6 @@ extern void swsusp_close(fmode_t);
 extern int swsusp_unmark(void);
 #endif
 
-/* kernel/power/block_io.c */
-extern struct block_device *hib_resume_bdev;
-
-extern int hib_bio_read_page(pgoff_t page_off, void *addr,
-		struct bio **bio_chain);
-extern int hib_bio_write_page(pgoff_t page_off, void *addr,
-		struct bio **bio_chain);
-extern int hib_wait_on_bio_chain(struct bio **bio_chain);
-
 struct timeval;
 /* kernel/power/swsusp.c */
 extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
@@ -275,31 +260,6 @@ static inline void suspend_thaw_processes(void)
 }
 #endif
 
-extern struct page *saveable_page(struct zone *z, unsigned long p);
-#ifdef CONFIG_HIGHMEM
-struct page *saveable_highmem_page(struct zone *z, unsigned long p);
-#else
-static
-inline void *saveable_highmem_page(struct zone *z, unsigned long p)
-{
-	return NULL;
-}
-#endif
-
-#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
-extern struct list_head nosave_regions;
-
-/**
- *	This structure represents a range of page frames the contents of which
- *	should not be saved during the suspend.
- */
-
-struct nosave_region {
-	struct list_head list;
-	unsigned long start_pfn;
-	unsigned long end_pfn;
-};
-
 #ifdef CONFIG_PM_AUTOSLEEP
 
 /* kernel/power/autosleep.c */
@@ -326,10 +286,3 @@ extern int pm_wake_lock(const char *buf);
 extern int pm_wake_unlock(const char *buf);
 
 #endif /* !CONFIG_PM_WAKELOCKS */
-
-#ifdef CONFIG_TOI
-unsigned long toi_get_nonconflicting_page(void);
-#define BM_END_OF_MAP	(~0UL)
-#else
-#define toi_get_nonconflicting_page() (0)
-#endif
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ba9d20ebc..5235dd4e1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,9 +36,6 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
-#include "tuxonice_modules.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_alloc.h"
 #include "power.h"
 
 static int swsusp_page_is_free(struct page *);
@@ -101,9 +98,6 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
 {
 	void *res;
 
-        if (toi_running)
-            return (void *) toi_get_nonconflicting_page();
-
 	res = (void *)get_zeroed_page(gfp_mask);
 	if (safe_needed)
 		while (res && swsusp_page_is_free(virt_to_page(res))) {
@@ -149,11 +143,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
 
 	page = virt_to_page(addr);
 
-        if (toi_running) {
-            toi__free_page(29, page);
-            return;
-        }
-
 	swsusp_unset_page_forbidden(page);
 	if (clear_nosave_free)
 		swsusp_unset_page_free(page);
@@ -313,15 +302,13 @@ struct bm_position {
 	int node_bit;
 };
 
-#define BM_POSITION_SLOTS (NR_CPUS * 2)
-
 struct memory_bitmap {
 	struct list_head zones;
 	struct linked_page *p_list;	/* list of pages used to store zone
 					 * bitmap objects and bitmap block
 					 * objects
 					 */
-	struct bm_position cur[BM_POSITION_SLOTS];    /* most recently used bit position */
+	struct bm_position cur;	/* most recently used bit position */
 };
 
 /* Functions that operate on memory bitmaps */
@@ -486,39 +473,16 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
 		free_image_page(node->data, clear_nosave_free);
 }
 
-void memory_bm_position_reset(struct memory_bitmap *bm)
+static void memory_bm_position_reset(struct memory_bitmap *bm)
 {
-    int index;
-
-    for (index = 0; index < BM_POSITION_SLOTS; index++) {
-	bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+	bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
 				  list);
-	bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+	bm->cur.node = list_entry(bm->cur.zone->leaves.next,
 				  struct rtree_node, list);
-	bm->cur[index].node_pfn = 0;
-	bm->cur[index].node_bit = 0;
-    }
+	bm->cur.node_pfn = 0;
+	bm->cur.node_bit = 0;
 }
 
-static void memory_bm_clear_current(struct memory_bitmap *bm, int index);
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
-
-/**
- *      memory_bm_clear
- *      @param bm - The bitmap to clear
- *
- *      Only run while single threaded - locking not needed
- */
-void memory_bm_clear(struct memory_bitmap *bm)
-{
-    memory_bm_position_reset(bm);
-
-    while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) {
-        memory_bm_clear_current(bm, 0);
-    }
-
-    memory_bm_position_reset(bm);
-}
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
 
 struct mem_extent {
@@ -631,8 +595,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 	}
 
 	bm->p_list = ca.chain;
-
-        memory_bm_position_reset(bm);
+	memory_bm_position_reset(bm);
  Exit:
 	free_mem_extents(&mem_extents);
 	return error;
@@ -668,24 +631,14 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
  *	It walks the radix tree to find the page which contains the bit for
  *	pfn and returns the bit position in **addr and *bit_nr.
  */
-int memory_bm_find_bit(struct memory_bitmap *bm, int index,
-        unsigned long pfn, void **addr, unsigned int *bit_nr)
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+			      void **addr, unsigned int *bit_nr)
 {
 	struct mem_zone_bm_rtree *curr, *zone;
 	struct rtree_node *node;
 	int i, block_nr;
 
-        if (!bm->cur[index].zone) {
-            // Reset
-            bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
-                    list);
-            bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
-                    struct rtree_node, list);
-            bm->cur[index].node_pfn = 0;
-            bm->cur[index].node_bit = 0;
-        }
-
-	zone = bm->cur[index].zone;
+	zone = bm->cur.zone;
 
 	if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
 		goto zone_found;
@@ -709,8 +662,8 @@ zone_found:
 	 * node for our pfn.
 	 */
 
-	node = bm->cur[index].node;
-	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn)
+	node = bm->cur.node;
+	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
 		goto node_found;
 
 	node      = zone->rtree;
@@ -727,9 +680,9 @@ zone_found:
 
 node_found:
 	/* Update last position */
-	bm->cur[index].zone = zone;
-	bm->cur[index].node = node;
-	bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+	bm->cur.zone = zone;
+	bm->cur.node = node;
+	bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
 
 	/* Set return values */
 	*addr = node->data;
@@ -738,66 +691,66 @@ node_found:
 	return 0;
 }
 
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 	int error;
 
-	error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
 	BUG_ON(error);
 	set_bit(bit, addr);
 }
 
-int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn)
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 	int error;
 
-	error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
 	if (!error)
 		set_bit(bit, addr);
 
 	return error;
 }
 
-void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 	int error;
 
-	error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
 	BUG_ON(error);
 	clear_bit(bit, addr);
 }
 
-static void memory_bm_clear_current(struct memory_bitmap *bm, int index)
+static void memory_bm_clear_current(struct memory_bitmap *bm)
 {
 	int bit;
 
-	bit = max(bm->cur[index].node_bit - 1, 0);
-	clear_bit(bit, bm->cur[index].node->data);
+	bit = max(bm->cur.node_bit - 1, 0);
+	clear_bit(bit, bm->cur.node->data);
 }
 
-int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 	int error;
 
-	error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
 	BUG_ON(error);
 	return test_bit(bit, addr);
 }
 
-static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn)
+static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 
-	return !memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+	return !memory_bm_find_bit(bm, pfn, &addr, &bit);
 }
 
 /*
@@ -810,25 +763,25 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned
  *
  *	Returns true if there is a next node, false otherwise.
  */
-static bool rtree_next_node(struct memory_bitmap *bm, int index)
+static bool rtree_next_node(struct memory_bitmap *bm)
 {
-	bm->cur[index].node = list_entry(bm->cur[index].node->list.next,
+	bm->cur.node = list_entry(bm->cur.node->list.next,
 				  struct rtree_node, list);
-	if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) {
-		bm->cur[index].node_pfn += BM_BITS_PER_BLOCK;
-		bm->cur[index].node_bit  = 0;
+	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+		bm->cur.node_bit  = 0;
 		touch_softlockup_watchdog();
 		return true;
 	}
 
 	/* No more nodes, goto next zone */
-	bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next,
+	bm->cur.zone = list_entry(bm->cur.zone->list.next,
 				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur[index].zone->list != &bm->zones) {
-		bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+	if (&bm->cur.zone->list != &bm->zones) {
+		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
 					  struct rtree_node, list);
-		bm->cur[index].node_pfn = 0;
-		bm->cur[index].node_bit = 0;
+		bm->cur.node_pfn = 0;
+		bm->cur.node_bit = 0;
 		return true;
 	}
 
@@ -846,29 +799,38 @@ static bool rtree_next_node(struct memory_bitmap *bm, int index)
  *	It is required to run memory_bm_position_reset() before the
  *	first call to this function.
  */
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index)
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 {
 	unsigned long bits, pfn, pages;
 	int bit;
 
-        index += NR_CPUS; /* Iteration state is separated from get/set/test */
-
 	do {
-		pages	  = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn;
-		bits      = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK);
-		bit	  = find_next_bit(bm->cur[index].node->data, bits,
-					  bm->cur[index].node_bit);
+		pages	  = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
+		bits      = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
+		bit	  = find_next_bit(bm->cur.node->data, bits,
+					  bm->cur.node_bit);
 		if (bit < bits) {
-			pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit;
-			bm->cur[index].node_bit = bit + 1;
+			pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
+			bm->cur.node_bit = bit + 1;
 			return pfn;
 		}
-	} while (rtree_next_node(bm, index));
+	} while (rtree_next_node(bm));
 
 	return BM_END_OF_MAP;
 }
 
-LIST_HEAD(nosave_regions);
+/**
+ *	This structure represents a range of page frames the contents of which
+ *	should not be saved during the suspend.
+ */
+
+struct nosave_region {
+	struct list_head list;		/* presumably links into nosave_regions - confirm */
+	unsigned long start_pfn;	/* first page frame number of the range */
+	unsigned long end_pfn;		/* end of range; inclusive or not - TODO confirm */
+};
+
+static LIST_HEAD(nosave_regions);
 
 /**
  *	register_nosave_region - register a range of page frames the contents
@@ -927,37 +889,37 @@ static struct memory_bitmap *free_pages_map;
 void swsusp_set_page_free(struct page *page)
 {
 	if (free_pages_map)
-		memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page));
+		memory_bm_set_bit(free_pages_map, page_to_pfn(page));
 }
 
 static int swsusp_page_is_free(struct page *page)
 {
 	return free_pages_map ?
-		memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0;
+		memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
 }
 
 void swsusp_unset_page_free(struct page *page)
 {
 	if (free_pages_map)
-		memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page));
+		memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
 }
 
 static void swsusp_set_page_forbidden(struct page *page)
 {
 	if (forbidden_pages_map)
-		memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page));
+		memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
 }
 
 int swsusp_page_is_forbidden(struct page *page)
 {
 	return forbidden_pages_map ?
-		memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0;
+		memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
 }
 
 static void swsusp_unset_page_forbidden(struct page *page)
 {
 	if (forbidden_pages_map)
-		memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page));
+		memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
 }
 
 /**
@@ -988,7 +950,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 				 * touch the PFNs for which the error is
 				 * returned anyway.
 				 */
-				mem_bm_set_bit_check(bm, 0, pfn);
+				mem_bm_set_bit_check(bm, pfn);
 			}
 	}
 }
@@ -1116,7 +1078,7 @@ static unsigned int count_free_highmem_pages(void)
  *	We should save the page if it isn't Nosave or NosaveFree, or Reserved,
  *	and it isn't a part of a free chunk of pages.
  */
-struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
+static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
 {
 	struct page *page;
 
@@ -1163,6 +1125,11 @@ static unsigned int count_highmem_pages(void)
 	}
 	return n;
 }
+#else
+static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
+{	/* Stub used when CONFIG_HIGHMEM is off; both arguments are ignored. */
+	return NULL;	/* never reports a saveable highmem page */
+}
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -1173,7 +1140,7 @@ static unsigned int count_highmem_pages(void)
  *	of pages statically defined as 'unsaveable', and it isn't a part of
  *	a free chunk of pages.
  */
-struct page *saveable_page(struct zone *zone, unsigned long pfn)
+static struct page *saveable_page(struct zone *zone, unsigned long pfn)
 {
 	struct page *page;
 
@@ -1311,15 +1278,15 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 		max_zone_pfn = zone_end_pfn(zone);
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 			if (page_is_saveable(zone, pfn))
-				memory_bm_set_bit(orig_bm, 0, pfn);
+				memory_bm_set_bit(orig_bm, pfn);
 	}
 	memory_bm_position_reset(orig_bm);
 	memory_bm_position_reset(copy_bm);
 	for(;;) {
-		pfn = memory_bm_next_pfn(orig_bm, 0);
+		pfn = memory_bm_next_pfn(orig_bm);
 		if (unlikely(pfn == BM_END_OF_MAP))
 			break;
-		copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn);
+		copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
 	}
 }
 
@@ -1365,8 +1332,8 @@ void swsusp_free(void)
 	memory_bm_position_reset(free_pages_map);
 
 loop:
-	fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
-	fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+	fr_pfn = memory_bm_next_pfn(free_pages_map);
+	fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
 
 	/*
 	 * Find the next bit set in both bitmaps. This is guaranteed to
@@ -1374,16 +1341,16 @@ loop:
 	 */
 	do {
 		if (fb_pfn < fr_pfn)
-			fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+			fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
 		if (fr_pfn < fb_pfn)
-			fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
+			fr_pfn = memory_bm_next_pfn(free_pages_map);
 	} while (fb_pfn != fr_pfn);
 
 	if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
 		struct page *page = pfn_to_page(fr_pfn);
 
-		memory_bm_clear_current(forbidden_pages_map, 0);
-		memory_bm_clear_current(free_pages_map, 0);
+		memory_bm_clear_current(forbidden_pages_map);
+		memory_bm_clear_current(free_pages_map);
 		__free_page(page);
 		goto loop;
 	}
@@ -1418,7 +1385,7 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
 		page = alloc_image_page(mask);
 		if (!page)
 			break;
-		memory_bm_set_bit(&copy_bm, 0, page_to_pfn(page));
+		memory_bm_set_bit(&copy_bm, page_to_pfn(page));
 		if (PageHighMem(page))
 			alloc_highmem++;
 		else
@@ -1514,7 +1481,7 @@ static unsigned long free_unnecessary_pages(void)
 	memory_bm_position_reset(&copy_bm);
 
 	while (to_free_normal > 0 || to_free_highmem > 0) {
-		unsigned long pfn = memory_bm_next_pfn(&copy_bm, 0);
+		unsigned long pfn = memory_bm_next_pfn(&copy_bm);
 		struct page *page = pfn_to_page(pfn);
 
 		if (PageHighMem(page)) {
@@ -1528,7 +1495,7 @@ static unsigned long free_unnecessary_pages(void)
 			to_free_normal--;
 			alloc_normal--;
 		}
-		memory_bm_clear_bit(&copy_bm, 0, pfn);
+		memory_bm_clear_bit(&copy_bm, pfn);
 		swsusp_unset_page_forbidden(page);
 		swsusp_unset_page_free(page);
 		__free_page(page);
@@ -1813,7 +1780,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
 		struct page *page;
 
 		page = alloc_image_page(__GFP_HIGHMEM);
-		memory_bm_set_bit(bm, 0, page_to_pfn(page));
+		memory_bm_set_bit(bm, page_to_pfn(page));
 	}
 	return nr_highmem;
 }
@@ -1856,7 +1823,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 			page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
 			if (!page)
 				goto err_out;
-			memory_bm_set_bit(copy_bm, 0, page_to_pfn(page));
+			memory_bm_set_bit(copy_bm, page_to_pfn(page));
 		}
 	}
 
@@ -1871,9 +1838,6 @@ asmlinkage __visible int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
 
-        if (toi_running)
-            return toi_post_context_save();
-
 	printk(KERN_INFO "PM: Creating hibernation image:\n");
 
 	drain_local_pages(NULL);
@@ -1921,7 +1885,7 @@ static int init_header_complete(struct swsusp_info *info)
 	return 0;
 }
 
-char *check_image_kernel(struct swsusp_info *info)
+static char *check_image_kernel(struct swsusp_info *info)
 {
 	if (info->version_code != LINUX_VERSION_CODE)
 		return "kernel version";
@@ -1942,7 +1906,7 @@ unsigned long snapshot_get_image_size(void)
 	return nr_copy_pages + nr_meta_pages + 1;
 }
 
-int init_header(struct swsusp_info *info)
+static int init_header(struct swsusp_info *info)
 {
 	memset(info, 0, sizeof(struct swsusp_info));
 	info->num_physpages = get_num_physpages();
@@ -1964,7 +1928,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 	int j;
 
 	for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
-		buf[j] = memory_bm_next_pfn(bm, 0);
+		buf[j] = memory_bm_next_pfn(bm);
 		if (unlikely(buf[j] == BM_END_OF_MAP))
 			break;
 		/* Save page key for data page (s390 only). */
@@ -2015,7 +1979,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
 	} else {
 		struct page *page;
 
-		page = pfn_to_page(memory_bm_next_pfn(&copy_bm, 0));
+		page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
 		if (PageHighMem(page)) {
 			/* Highmem pages are copied to the buffer,
 			 * because we can't return with a kmapped
@@ -2057,7 +2021,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
 	/* Mark pages that correspond to the "original" pfns as "unsafe" */
 	memory_bm_position_reset(bm);
 	do {
-		pfn = memory_bm_next_pfn(bm, 0);
+		pfn = memory_bm_next_pfn(bm);
 		if (likely(pfn != BM_END_OF_MAP)) {
 			if (likely(pfn_valid(pfn)))
 				swsusp_set_page_free(pfn_to_page(pfn));
@@ -2077,10 +2041,10 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
 	unsigned long pfn;
 
 	memory_bm_position_reset(src);
-	pfn = memory_bm_next_pfn(src, 0);
+	pfn = memory_bm_next_pfn(src);
 	while (pfn != BM_END_OF_MAP) {
-		memory_bm_set_bit(dst, 0, pfn);
-		pfn = memory_bm_next_pfn(src, 0);
+		memory_bm_set_bit(dst, pfn);
+		pfn = memory_bm_next_pfn(src);
 	}
 }
 
@@ -2131,8 +2095,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 		/* Extract and buffer page key for data page (s390 only). */
 		page_key_memorize(buf + j);
 
-		if (memory_bm_pfn_present(bm, 0, buf[j]))
-			memory_bm_set_bit(bm, 0, buf[j]);
+		if (memory_bm_pfn_present(bm, buf[j]))
+			memory_bm_set_bit(bm, buf[j]);
 		else
 			return -EFAULT;
 	}
@@ -2175,12 +2139,12 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
 	unsigned int cnt = 0;
 
 	memory_bm_position_reset(bm);
-	pfn = memory_bm_next_pfn(bm, 0);
+	pfn = memory_bm_next_pfn(bm);
 	while (pfn != BM_END_OF_MAP) {
 		if (PageHighMem(pfn_to_page(pfn)))
 			cnt++;
 
-		pfn = memory_bm_next_pfn(bm, 0);
+		pfn = memory_bm_next_pfn(bm);
 	}
 	return cnt;
 }
@@ -2225,7 +2189,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
 		page = alloc_page(__GFP_HIGHMEM);
 		if (!swsusp_page_is_free(page)) {
 			/* The page is "safe", set its bit the bitmap */
-			memory_bm_set_bit(bm, 0, page_to_pfn(page));
+			memory_bm_set_bit(bm, page_to_pfn(page));
 			safe_highmem_pages++;
 		}
 		/* Mark the page as allocated */
@@ -2283,7 +2247,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
 
 		/* Copy of the page will be stored in high memory */
 		kaddr = buffer;
-		tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0));
+		tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
 		safe_highmem_pages--;
 		last_highmem_page = tmp;
 		pbe->copy_page = tmp;
@@ -2454,7 +2418,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 {
 	struct pbe *pbe;
 	struct page *page;
-	unsigned long pfn = memory_bm_next_pfn(bm, 0);
+	unsigned long pfn = memory_bm_next_pfn(bm);
 
 	if (pfn == BM_END_OF_MAP)
 		return ERR_PTR(-EFAULT);
@@ -2641,82 +2605,3 @@ int restore_highmem(void)
 	return 0;
 }
 #endif /* CONFIG_HIGHMEM */
-
-struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map,
-  *pageset1_copy_map, *io_map, *page_resave_map, *compare_map;
-
-int resume_attempted;
-
-int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
-	(int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
-{
-    int result;
-
-    memory_bm_position_reset(bm);
-
-    do {
-        result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
-
-        if (result)
-            return result;
-    } while (rtree_next_node(bm, 0));
-    return 0;
-}
-
-int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
-	(int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
-{
-    int result;
-
-    memory_bm_position_reset(bm);
-
-    do {
-        result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
-
-        if (result)
-            return result;
-
-    } while (rtree_next_node(bm, 0));
-    return 0;
-}
-
-int memory_bm_space_needed(struct memory_bitmap *bm)
-{
-    unsigned long bytes = 0;
-
-    memory_bm_position_reset(bm);
-    do {
-        bytes += PAGE_SIZE;
-    } while (rtree_next_node(bm, 0));
-    return bytes;
-}
-
-int toi_alloc_bitmap(struct memory_bitmap **bm)
-{
-    int error;
-    struct memory_bitmap *bm1;
-
-    bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
-    if (!bm1)
-        return -ENOMEM;
-
-    error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
-    if (error) {
-        printk("Error returned - %d.\n", error);
-        kfree(bm1);
-        return -ENOMEM;
-    }
-
-    *bm = bm1;
-    return 0;
-}
-
-void toi_free_bitmap(struct memory_bitmap **bm)
-{
-    if (!*bm)
-        return;
-
-    memory_bm_free(*bm, 0);
-    kfree(*bm);
-    *bm = NULL;
-}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8d7a1ef72..53266b729 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -366,6 +366,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 			trace_suspend_resume(TPS("machine_suspend"),
 				state, false);
 			events_check_enabled = false;
+		} else if (*wakeup) {
+			error = -EBUSY;
 		}
 		syscore_resume();
 	}
@@ -468,7 +470,7 @@ static int enter_state(suspend_state_t state)
 	if (state == PM_SUSPEND_FREEZE) {
 #ifdef CONFIG_PM_DEBUG
 		if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
-			pr_warning("PM: Unsupported test mode for freeze state,"
+			pr_warning("PM: Unsupported test mode for suspend to idle,"
 				   "please choose none/freezer/devices/platform.\n");
 			return -EAGAIN;
 		}
@@ -488,7 +490,7 @@ static int enter_state(suspend_state_t state)
 	printk("done.\n");
 	trace_suspend_resume(TPS("sync_filesystems"), 0, false);
 
-	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+	pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
 	error = suspend_prepare(state);
 	if (error)
 		goto Unlock;
@@ -497,7 +499,7 @@ static int enter_state(suspend_state_t state)
 		goto Finish;
 
 	trace_suspend_resume(TPS("suspend_enter"), state, false);
-	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+	pr_debug("PM: Suspending system (%s)\n", pm_states[state]);
 	pm_restrict_gfp_mask();
 	error = suspend_devices_and_enter(state);
 	pm_restore_gfp_mask();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 570aff817..2f30ca91e 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -212,7 +212,84 @@ int swsusp_swap_in_use(void)
  */
 
 static unsigned short root_swap = 0xffff;
-struct block_device *hib_resume_bdev;
+static struct block_device *hib_resume_bdev;
+
+struct hib_bio_batch {	/* completion tracking shared by a group of asynchronously submitted bios */
+	atomic_t		count;	/* incremented per bio in hib_submit_io(), decremented in hib_end_io() */
+	wait_queue_head_t	wait;	/* woken by hib_end_io() when count drops to zero */
+	int			error;	/* first non-zero error stored by hib_end_io(); 0 if none */
+};
+
+static void hib_init_batch(struct hib_bio_batch *hb)
+{	/* Prepare @hb for use: zero counter, fresh wait queue, no error. */
+	atomic_set(&hb->count, 0);	/* no bios outstanding */
+	init_waitqueue_head(&hb->wait);
+	hb->error = 0;			/* nothing has failed yet */
+}
+
+static void hib_end_io(struct bio *bio, int error)
+{	/* Completion callback that hib_submit_io() installs as bi_end_io for batched bios. */
+	struct hib_bio_batch *hb = bio->bi_private;	/* batch pointer stored by hib_submit_io() */
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct page *page = bio->bi_io_vec[0].bv_page;	/* the single page attached to this bio */
+
+	if (!uptodate || error) {	/* NOTE(review): message says "Read-error" whatever the bio direction */
+		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
+				imajor(bio->bi_bdev->bd_inode),
+				iminor(bio->bi_bdev->bd_inode),
+				(unsigned long long)bio->bi_iter.bi_sector);
+
+		if (!error)	/* failure signalled only through BIO_UPTODATE: report it as -EIO */
+			error = -EIO;
+	}
+
+	if (bio_data_dir(bio) == WRITE)	/* only write bios give up their page reference here */
+		put_page(page);	/* presumably the copy page allocated by write_page() -- TODO confirm */
+
+	if (error && !hb->error)	/* keep the first error; later ones do not overwrite it */
+		hb->error = error;
+	if (atomic_dec_and_test(&hb->count))	/* counter reached zero: wake the waiter in hib_wait_io() */
+		wake_up(&hb->wait);
+
+	bio_put(bio);
+}
+
+static int hib_submit_io(int rw, pgoff_t page_off, void *addr,	/* one-page I/O on hib_resume_bdev */
+		struct hib_bio_batch *hb)	/* @hb: batch for async completion, or NULL to wait here */
+{
+	struct page *page = virt_to_page(addr);
+	struct bio *bio;
+	int error = 0;
+
+	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);	/* room for a single bio_vec */
+	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);	/* page offset -> 512-byte units */
+	bio->bi_bdev = hib_resume_bdev;
+
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {	/* whole page was not accepted */
+		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
+			(unsigned long long)bio->bi_iter.bi_sector);
+		bio_put(bio);
+		return -EFAULT;
+	}
+
+	if (hb) {	/* batched: return right after submission, hib_end_io() handles completion */
+		bio->bi_end_io = hib_end_io;
+		bio->bi_private = hb;
+		atomic_inc(&hb->count);	/* counted before submit_bio(); hib_end_io() does the decrement */
+		submit_bio(rw, bio);
+	} else {	/* unbatched: submit_bio_wait() result becomes the return value */
+		error = submit_bio_wait(rw, bio);
+		bio_put(bio);
+	}
+
+	return error;	/* still 0 on the batched path; I/O errors there are read back via hib_wait_io() */
+}
+
+static int hib_wait_io(struct hib_bio_batch *hb)
+{	/* Wait until every bio counted in @hb has completed, then report the batch status. */
+	wait_event(hb->wait, atomic_read(&hb->count) == 0);
+	return hb->error;	/* first error stored by hib_end_io(), or 0; the field is not reset here */
+}
 
 /*
  * Saving part
@@ -222,7 +299,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 {
 	int error;
 
-	hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -231,7 +308,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		swsusp_header->flags = flags;
 		if (flags & SF_CRC32_MODE)
 			swsusp_header->crc32 = handle->crc32;
-		error = hib_bio_write_page(swsusp_resume_block,
+		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Swap header not found!\n");
@@ -271,10 +348,10 @@ static int swsusp_swap_check(void)
  *	write_page - Write one page to given swap location.
  *	@buf:		Address we're writing.
  *	@offset:	Offset of the swap page we're writing to.
- *	@bio_chain:	Link the next write BIO here
+ *	@hb:		bio completion batch
  */
 
-static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 {
 	void *src;
 	int ret;
@@ -282,13 +359,13 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 	if (!offset)
 		return -ENOSPC;
 
-	if (bio_chain) {
+	if (hb) {
 		src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
 		                              __GFP_NORETRY);
 		if (src) {
 			copy_page(src, buf);
 		} else {
-			ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
+			ret = hib_wait_io(hb); /* Free pages */
 			if (ret)
 				return ret;
 			src = (void *)__get_free_page(__GFP_WAIT |
@@ -298,14 +375,14 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 				copy_page(src, buf);
 			} else {
 				WARN_ON_ONCE(1);
-				bio_chain = NULL;	/* Go synchronous */
+				hb = NULL;	/* Go synchronous */
 				src = buf;
 			}
 		}
 	} else {
 		src = buf;
 	}
-	return hib_bio_write_page(offset, src, bio_chain);
+	return hib_submit_io(WRITE_SYNC, offset, src, hb);
 }
 
 static void release_swap_writer(struct swap_map_handle *handle)
@@ -348,7 +425,7 @@ err_close:
 }
 
 static int swap_write_page(struct swap_map_handle *handle, void *buf,
-				struct bio **bio_chain)
+		struct hib_bio_batch *hb)
 {
 	int error = 0;
 	sector_t offset;
@@ -356,7 +433,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 	if (!handle->cur)
 		return -EINVAL;
 	offset = alloc_swapdev_block(root_swap);
-	error = write_page(buf, offset, bio_chain);
+	error = write_page(buf, offset, hb);
 	if (error)
 		return error;
 	handle->cur->entries[handle->k++] = offset;
@@ -365,15 +442,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		if (!offset)
 			return -ENOSPC;
 		handle->cur->next_swap = offset;
-		error = write_page(handle->cur, handle->cur_swap, bio_chain);
+		error = write_page(handle->cur, handle->cur_swap, hb);
 		if (error)
 			goto out;
 		clear_page(handle->cur);
 		handle->cur_swap = offset;
 		handle->k = 0;
 
-		if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
-			error = hib_wait_on_bio_chain(bio_chain);
+		if (hb && low_free_pages() <= handle->reqd_free_pages) {
+			error = hib_wait_io(hb);
 			if (error)
 				goto out;
 			/*
@@ -445,23 +522,24 @@ static int save_image(struct swap_map_handle *handle,
 	int ret;
 	int nr_pages;
 	int err2;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 
+	hib_init_batch(&hb);
+
 	printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
 		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	while (1) {
 		ret = snapshot_read_next(snapshot);
 		if (ret <= 0)
 			break;
-		ret = swap_write_page(handle, data_of(*snapshot), &bio);
+		ret = swap_write_page(handle, data_of(*snapshot), &hb);
 		if (ret)
 			break;
 		if (!(nr_pages % m))
@@ -469,7 +547,7 @@ static int save_image(struct swap_map_handle *handle,
 			       nr_pages / m * 10);
 		nr_pages++;
 	}
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -580,7 +658,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	int ret = 0;
 	int nr_pages;
 	int err2;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 	size_t off;
@@ -589,6 +667,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	struct cmp_data *data = NULL;
 	struct crc_data *crc = NULL;
 
+	hib_init_batch(&hb);
+
 	/*
 	 * We'll limit the number of threads for compression to limit memory
 	 * footprint.
@@ -674,7 +754,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	for (;;) {
 		for (thr = 0; thr < nr_threads; thr++) {
@@ -748,7 +827,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 			     off += PAGE_SIZE) {
 				memcpy(page, data[thr].cmp + off, PAGE_SIZE);
 
-				ret = swap_write_page(handle, page, &bio);
+				ret = swap_write_page(handle, page, &hb);
 				if (ret)
 					goto out_finish;
 			}
@@ -759,7 +838,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	}
 
 out_finish:
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -906,7 +985,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 			return -ENOMEM;
 		}
 
-		error = hib_bio_read_page(offset, tmp->map, NULL);
+		error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
 		if (error) {
 			release_swap_reader(handle);
 			return error;
@@ -919,7 +998,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 }
 
 static int swap_read_page(struct swap_map_handle *handle, void *buf,
-				struct bio **bio_chain)
+		struct hib_bio_batch *hb)
 {
 	sector_t offset;
 	int error;
@@ -930,7 +1009,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = hib_bio_read_page(offset, buf, bio_chain);
+	error = hib_submit_io(READ_SYNC, offset, buf, hb);
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -968,27 +1047,28 @@ static int load_image(struct swap_map_handle *handle,
 	int ret = 0;
 	ktime_t start;
 	ktime_t stop;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	int err2;
 	unsigned nr_pages;
 
+	hib_init_batch(&hb);
+
 	printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
 		nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	for ( ; ; ) {
 		ret = snapshot_write_next(snapshot);
 		if (ret <= 0)
 			break;
-		ret = swap_read_page(handle, data_of(*snapshot), &bio);
+		ret = swap_read_page(handle, data_of(*snapshot), &hb);
 		if (ret)
 			break;
 		if (snapshot->sync_read)
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 		if (ret)
 			break;
 		if (!(nr_pages % m))
@@ -996,7 +1076,7 @@ static int load_image(struct swap_map_handle *handle,
 			       nr_pages / m * 10);
 		nr_pages++;
 	}
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -1067,7 +1147,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	unsigned int m;
 	int ret = 0;
 	int eof = 0;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 	unsigned nr_pages;
@@ -1080,6 +1160,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	struct dec_data *data = NULL;
 	struct crc_data *crc = NULL;
 
+	hib_init_batch(&hb);
+
 	/*
 	 * We'll limit the number of threads for decompression to limit memory
 	 * footprint.
@@ -1190,7 +1272,6 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 
 	ret = snapshot_write_next(snapshot);
@@ -1199,7 +1280,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	for(;;) {
 		for (i = 0; !eof && i < want; i++) {
-			ret = swap_read_page(handle, page[ring], &bio);
+			ret = swap_read_page(handle, page[ring], &hb);
 			if (ret) {
 				/*
 				 * On real read error, finish. On end of data,
@@ -1226,7 +1307,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			if (!asked)
 				break;
 
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 			if (ret)
 				goto out_finish;
 			have += asked;
@@ -1281,7 +1362,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		 * Wait for more data while we are decompressing.
 		 */
 		if (have < LZO_CMP_PAGES && asked) {
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 			if (ret)
 				goto out_finish;
 			have += asked;
@@ -1430,7 +1511,7 @@ int swsusp_check(void)
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
-		error = hib_bio_read_page(swsusp_resume_block,
+		error = hib_submit_io(READ_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
 			goto put;
@@ -1438,7 +1519,7 @@ int swsusp_check(void)
 		if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
 			/* Reset swap signature now */
-			error = hib_bio_write_page(swsusp_resume_block,
+			error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 						swsusp_header, NULL);
 		} else {
 			error = -EINVAL;
@@ -1482,10 +1563,10 @@ int swsusp_unmark(void)
 {
 	int error;
 
-	hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
 	if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
-		error = hib_bio_write_page(swsusp_resume_block,
+		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
deleted file mode 100644
index 1aff98026..000000000
--- a/kernel/power/tuxonice.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * kernel/power/tuxonice.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations used throughout swsusp.
- *
- */
-
-#ifndef KERNEL_POWER_TOI_H
-#define KERNEL_POWER_TOI_H
-
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/suspend.h>
-#include <linux/fs.h>
-#include <asm/setup.h>
-#include "tuxonice_pageflags.h"
-#include "power.h"
-
-#define TOI_CORE_VERSION "3.3"
-#define	TOI_HEADER_VERSION 3
-#define MY_BOOT_KERNEL_DATA_VERSION 4
-
-struct toi_boot_kernel_data {
-	int version;
-	int size;
-	unsigned long toi_action;
-	unsigned long toi_debug_state;
-	u32 toi_default_console_level;
-	int toi_io_time[2][2];
-	char toi_nosave_commandline[COMMAND_LINE_SIZE];
-	unsigned long pages_used[33];
-	unsigned long incremental_bytes_in;
-	unsigned long incremental_bytes_out;
-	unsigned long compress_bytes_in;
-	unsigned long compress_bytes_out;
-	unsigned long pruned_pages;
-};
-
-extern struct toi_boot_kernel_data toi_bkd;
-
-/* Location of book kernel data struct in kernel being resumed */
-extern unsigned long boot_kernel_data_buffer;
-
-/*		 == Action states == 		*/
-
-enum {
-	TOI_REBOOT,
-	TOI_PAUSE,
-	TOI_LOGALL,
-	TOI_CAN_CANCEL,
-	TOI_KEEP_IMAGE,
-	TOI_FREEZER_TEST,
-	TOI_SINGLESTEP,
-	TOI_PAUSE_NEAR_PAGESET_END,
-	TOI_TEST_FILTER_SPEED,
-	TOI_TEST_BIO,
-	TOI_NO_PAGESET2,
-	TOI_IGNORE_ROOTFS,
-	TOI_REPLACE_SWSUSP,
-	TOI_PAGESET2_FULL,
-	TOI_ABORT_ON_RESAVE_NEEDED,
-	TOI_NO_MULTITHREADED_IO,
-	TOI_NO_DIRECT_LOAD, /* Obsolete */
-	TOI_LATE_CPU_HOTPLUG, /* Obsolete */
-	TOI_GET_MAX_MEM_ALLOCD,
-	TOI_NO_FLUSHER_THREAD,
-	TOI_NO_PS2_IF_UNNEEDED,
-	TOI_POST_RESUME_BREAKPOINT,
-	TOI_NO_READAHEAD,
-        TOI_TRACE_DEBUG_ON,
-        TOI_INCREMENTAL_IMAGE,
-};
-
-extern unsigned long toi_bootflags_mask;
-
-#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
-
-/*		 == Result states == 		*/
-
-enum {
-	TOI_ABORTED,
-	TOI_ABORT_REQUESTED,
-	TOI_NOSTORAGE_AVAILABLE,
-	TOI_INSUFFICIENT_STORAGE,
-	TOI_FREEZING_FAILED,
-	TOI_KEPT_IMAGE,
-	TOI_WOULD_EAT_MEMORY,
-	TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
-	TOI_PM_SEM,
-	TOI_DEVICE_REFUSED,
-	TOI_SYSDEV_REFUSED,
-	TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
-	TOI_UNABLE_TO_PREPARE_IMAGE,
-	TOI_FAILED_MODULE_INIT,
-	TOI_FAILED_MODULE_CLEANUP,
-	TOI_FAILED_IO,
-	TOI_OUT_OF_MEMORY,
-	TOI_IMAGE_ERROR,
-	TOI_PLATFORM_PREP_FAILED,
-	TOI_CPU_HOTPLUG_FAILED,
-	TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */
-	TOI_RESAVE_NEEDED,
-	TOI_CANT_SUSPEND,
-	TOI_NOTIFIERS_PREPARE_FAILED,
-	TOI_PRE_SNAPSHOT_FAILED,
-	TOI_PRE_RESTORE_FAILED,
-	TOI_USERMODE_HELPERS_ERR,
-	TOI_CANT_USE_ALT_RESUME,
-	TOI_HEADER_TOO_BIG,
-	TOI_WAKEUP_EVENT,
-	TOI_SYSCORE_REFUSED,
-	TOI_DPM_PREPARE_FAILED,
-	TOI_DPM_SUSPEND_FAILED,
-	TOI_NUM_RESULT_STATES	/* Used in printing debug info only */
-};
-
-extern unsigned long toi_result;
-
-#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
-#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
-				test_and_set_bit(bit, &toi_result))
-#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
-#define test_result_state(bit) (test_bit(bit, &toi_result))
-
-/*	 == Debug sections and levels == 	*/
-
-/* debugging levels. */
-enum {
-	TOI_STATUS = 0,
-	TOI_ERROR = 2,
-	TOI_LOW,
-	TOI_MEDIUM,
-	TOI_HIGH,
-	TOI_VERBOSE,
-};
-
-enum {
-	TOI_ANY_SECTION,
-	TOI_EAT_MEMORY,
-	TOI_IO,
-	TOI_HEADER,
-	TOI_WRITER,
-	TOI_MEMORY,
-	TOI_PAGEDIR,
-	TOI_COMPRESS,
-	TOI_BIO,
-};
-
-#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state))
-#define clear_debug_state(bit) \
-	(test_and_clear_bit(bit, &toi_bkd.toi_debug_state))
-#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state))
-
-/*		== Steps in hibernating ==	*/
-
-enum {
-	STEP_HIBERNATE_PREPARE_IMAGE,
-	STEP_HIBERNATE_SAVE_IMAGE,
-	STEP_HIBERNATE_POWERDOWN,
-	STEP_RESUME_CAN_RESUME,
-	STEP_RESUME_LOAD_PS1,
-	STEP_RESUME_DO_RESTORE,
-	STEP_RESUME_READ_PS2,
-	STEP_RESUME_GO,
-	STEP_RESUME_ALT_IMAGE,
-	STEP_CLEANUP,
-	STEP_QUIET_CLEANUP
-};
-
-/*		== TuxOnIce states ==
-	(see also include/linux/suspend.h)	*/
-
-#define get_toi_state()  (toi_state)
-#define restore_toi_state(saved_state) \
-	do { toi_state = saved_state; } while (0)
-
-/*		== Module support ==		*/
-
-struct toi_core_fns {
-	int (*post_context_save)(void);
-	unsigned long (*get_nonconflicting_page)(void);
-	int (*try_hibernate)(void);
-	void (*try_resume)(void);
-};
-
-extern struct toi_core_fns *toi_core_fns;
-
-/*		== All else ==			*/
-#define KB(x) ((x) << (PAGE_SHIFT - 10))
-#define MB(x) ((x) >> (20 - PAGE_SHIFT))
-
-extern int toi_start_anything(int toi_or_resume);
-extern void toi_finish_anything(int toi_or_resume);
-
-extern int save_image_part1(void);
-extern int toi_atomic_restore(void);
-
-extern int toi_try_hibernate(void);
-extern void toi_try_resume(void);
-
-extern int __toi_post_context_save(void);
-
-extern unsigned int nr_hibernates;
-extern char alt_resume_param[256];
-
-extern void copyback_post(void);
-extern int toi_hibernate(void);
-extern unsigned long extra_pd1_pages_used;
-
-#define SECTOR_SIZE 512
-
-extern void toi_early_boot_message(int can_erase_image, int default_answer,
-	char *warning_reason, ...);
-
-extern int do_check_can_resume(void);
-extern int do_toi_step(int step);
-extern int toi_launch_userspace_program(char *command, int channel_no,
-		int wait, int debug);
-
-extern char tuxonice_signature[9];
-
-extern int toi_start_other_threads(void);
-extern void toi_stop_other_threads(void);
-
-extern int toi_trace_index;
-#define TOI_TRACE_DEBUG(PFN, DESC, ...) \
-    do { \
-        if (test_action_state(TOI_TRACE_DEBUG_ON)) { \
-            printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \
-        } \
-    } while(0)
-
-#ifdef CONFIG_TOI_KEEP_IMAGE
-#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE))
-#else
-#define toi_keeping_image (0)
-#endif
-
-#ifdef CONFIG_TOI_INCREMENTAL
-extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose);
-extern int toi_reset_dirtiness(int verbose);
-extern void toi_cbw_write(void);
-extern void toi_cbw_restore(void);
-extern int toi_allocate_cbw_data(void);
-extern void toi_free_cbw_data(void);
-extern int toi_cbw_init(void);
-extern void toi_mark_tasks_cbw(void);
-#else
-static inline int toi_reset_dirtiness(int verbose) { return 0; }
-#define toi_cbw_write() do { } while(0)
-#define toi_cbw_restore() do { } while(0)
-#define toi_allocate_cbw_data() do { } while(0)
-#define toi_free_cbw_data() do { } while(0)
-static inline int toi_cbw_init(void) { return 0; }
-#endif
-#endif
diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c
deleted file mode 100644
index 5729240d8..000000000
--- a/kernel/power/tuxonice_alloc.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * kernel/power/tuxonice_alloc.c
- *
- * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <linux/export.h>
-#include <linux/slab.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-
-#define TOI_ALLOC_PATHS 41
-
-static DEFINE_MUTEX(toi_alloc_mutex);
-
-static struct toi_module_ops toi_alloc_ops;
-
-static int toi_fail_num;
-
-static atomic_t toi_alloc_count[TOI_ALLOC_PATHS],
-		toi_free_count[TOI_ALLOC_PATHS],
-		toi_test_count[TOI_ALLOC_PATHS],
-		toi_fail_count[TOI_ALLOC_PATHS];
-static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS];
-static int cur_allocd, max_allocd;
-
-static char *toi_alloc_desc[TOI_ALLOC_PATHS] = {
-	"", /* 0 */
-	"get_io_info_struct",
-	"extent",
-	"extent (loading chain)",
-	"userui channel",
-	"userui arg", /* 5 */
-	"attention list metadata",
-	"extra pagedir memory metadata",
-	"bdev metadata",
-	"extra pagedir memory",
-	"header_locations_read", /* 10 */
-	"bio queue",
-	"prepare_readahead",
-	"i/o buffer",
-	"writer buffer in bio_init",
-	"checksum buffer", /* 15 */
-	"compression buffer",
-	"filewriter signature op",
-	"set resume param alloc1",
-	"set resume param alloc2",
-	"debugging info buffer", /* 20 */
-	"check can resume buffer",
-	"write module config buffer",
-	"read module config buffer",
-	"write image header buffer",
-	"read pageset1 buffer", /* 25 */
-	"get_have_image_data buffer",
-	"checksum page",
-	"worker rw loop",
-	"get nonconflicting page",
-	"ps1 load addresses", /* 30 */
-	"remove swap image",
-	"swap image exists",
-	"swap parse sig location",
-	"sysfs kobj",
-	"swap mark resume attempted buffer", /* 35 */
-	"cluster member",
-	"boot kernel data buffer",
-	"setting swap signature",
-	"block i/o bdev struct",
-        "copy before write", /* 40 */
-};
-
-#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \
-	do { \
-		BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \
-		\
-		if (FAIL_NUM == toi_fail_num) { \
-			atomic_inc(&toi_test_count[FAIL_NUM]); \
-			toi_fail_num = 0; \
-			return FAIL_VAL; \
-		} \
-	} while (0)
-
-static void alloc_update_stats(int fail_num, void *result, int size)
-{
-	if (!result) {
-		atomic_inc(&toi_fail_count[fail_num]);
-		return;
-	}
-
-	atomic_inc(&toi_alloc_count[fail_num]);
-	if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
-		mutex_lock(&toi_alloc_mutex);
-		toi_cur_allocd[fail_num]++;
-		cur_allocd += size;
-		if (unlikely(cur_allocd > max_allocd)) {
-			int i;
-
-			for (i = 0; i < TOI_ALLOC_PATHS; i++)
-				toi_max_allocd[i] = toi_cur_allocd[i];
-			max_allocd = cur_allocd;
-		}
-		mutex_unlock(&toi_alloc_mutex);
-	}
-}
-
-static void free_update_stats(int fail_num, int size)
-{
-	BUG_ON(fail_num >= TOI_ALLOC_PATHS);
-	atomic_inc(&toi_free_count[fail_num]);
-	if (unlikely(atomic_read(&toi_free_count[fail_num]) >
-				atomic_read(&toi_alloc_count[fail_num])))
-		dump_stack();
-	if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
-		mutex_lock(&toi_alloc_mutex);
-		cur_allocd -= size;
-		toi_cur_allocd[fail_num]--;
-		mutex_unlock(&toi_alloc_mutex);
-	}
-}
-
-void *toi_kzalloc(int fail_num, size_t size, gfp_t flags)
-{
-	void *result;
-
-	if (toi_alloc_ops.enabled)
-		MIGHT_FAIL(fail_num, NULL);
-	result = kzalloc(size, flags);
-	if (toi_alloc_ops.enabled)
-		alloc_update_stats(fail_num, result, size);
-	if (fail_num == toi_trace_allocs)
-		dump_stack();
-	return result;
-}
-
-unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
-		unsigned int order)
-{
-	unsigned long result;
-
-        mask |= ___GFP_TOI_NOTRACK;
-	if (toi_alloc_ops.enabled)
-		MIGHT_FAIL(fail_num, 0);
-	result = __get_free_pages(mask, order);
-	if (toi_alloc_ops.enabled)
-		alloc_update_stats(fail_num, (void *) result,
-				PAGE_SIZE << order);
-	if (fail_num == toi_trace_allocs)
-		dump_stack();
-	return result;
-}
-
-struct page *toi_alloc_page(int fail_num, gfp_t mask)
-{
-	struct page *result;
-
-	if (toi_alloc_ops.enabled)
-		MIGHT_FAIL(fail_num, NULL);
-        mask |= ___GFP_TOI_NOTRACK;
-	result = alloc_page(mask);
-	if (toi_alloc_ops.enabled)
-		alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
-	if (fail_num == toi_trace_allocs)
-		dump_stack();
-	return result;
-}
-
-unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask)
-{
-	unsigned long result;
-
-	if (toi_alloc_ops.enabled)
-		MIGHT_FAIL(fail_num, 0);
-        mask |= ___GFP_TOI_NOTRACK;
-	result = get_zeroed_page(mask);
-	if (toi_alloc_ops.enabled)
-		alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
-	if (fail_num == toi_trace_allocs)
-		dump_stack();
-	return result;
-}
-
-void toi_kfree(int fail_num, const void *arg, int size)
-{
-	if (arg && toi_alloc_ops.enabled)
-		free_update_stats(fail_num, size);
-
-	if (fail_num == toi_trace_allocs)
-		dump_stack();
-	kfree(arg);
-}
-
-void toi_free_page(int fail_num, unsigned long virt)
-{
-	if (virt && toi_alloc_ops.enabled)
-		free_update_stats(fail_num, PAGE_SIZE);
-
-	if (fail_num == toi_trace_allocs)
-		dump_stack();
-	free_page(virt);
-}
-
-void toi__free_page(int fail_num, struct page *page)
-{
-	if (page && toi_alloc_ops.enabled)
-		free_update_stats(fail_num, PAGE_SIZE);
-
-	if (fail_num == toi_trace_allocs)
-		dump_stack();
-	__free_page(page);
-}
-
-void toi_free_pages(int fail_num, struct page *page, int order)
-{
-	if (page && toi_alloc_ops.enabled)
-		free_update_stats(fail_num, PAGE_SIZE << order);
-
-	if (fail_num == toi_trace_allocs)
-		dump_stack();
-	__free_pages(page, order);
-}
-
-void toi_alloc_print_debug_stats(void)
-{
-	int i, header_done = 0;
-
-	if (!toi_alloc_ops.enabled)
-		return;
-
-	for (i = 0; i < TOI_ALLOC_PATHS; i++)
-		if (atomic_read(&toi_alloc_count[i]) !=
-		    atomic_read(&toi_free_count[i])) {
-			if (!header_done) {
-				printk(KERN_INFO "Idx  Allocs   Frees   Tests "
-					"  Fails     Max Description\n");
-				header_done = 1;
-			}
-
-			printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i,
-				atomic_read(&toi_alloc_count[i]),
-				atomic_read(&toi_free_count[i]),
-				atomic_read(&toi_test_count[i]),
-				atomic_read(&toi_fail_count[i]),
-				toi_max_allocd[i],
-				toi_alloc_desc[i]);
-		}
-}
-
-static int toi_alloc_initialise(int starting_cycle)
-{
-	int i;
-
-	if (!starting_cycle)
-		return 0;
-
-	if (toi_trace_allocs)
-		dump_stack();
-
-	for (i = 0; i < TOI_ALLOC_PATHS; i++) {
-		atomic_set(&toi_alloc_count[i], 0);
-		atomic_set(&toi_free_count[i], 0);
-		atomic_set(&toi_test_count[i], 0);
-		atomic_set(&toi_fail_count[i], 0);
-		toi_cur_allocd[i] = 0;
-		toi_max_allocd[i] = 0;
-	};
-
-	max_allocd = 0;
-	cur_allocd = 0;
-	return 0;
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL),
-	SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0,
-			NULL),
-	SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_GET_MAX_MEM_ALLOCD, 0),
-	SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0,
-			NULL)
-};
-
-static struct toi_module_ops toi_alloc_ops = {
-	.type					= MISC_HIDDEN_MODULE,
-	.name					= "allocation debugging",
-	.directory				= "alloc",
-	.module					= THIS_MODULE,
-	.early					= 1,
-	.initialise				= toi_alloc_initialise,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-int toi_alloc_init(void)
-{
-	int result = toi_register_module(&toi_alloc_ops);
-	return result;
-}
-
-void toi_alloc_exit(void)
-{
-	toi_unregister_module(&toi_alloc_ops);
-}
diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h
deleted file mode 100644
index 28c5af193..000000000
--- a/kernel/power/tuxonice_alloc.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * kernel/power/tuxonice_alloc.h
- *
- * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <linux/slab.h>
-#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN)
-#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN)
-
-#ifdef CONFIG_PM_DEBUG
-extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags);
-extern void toi_kfree(int fail_num, const void *arg, int size);
-
-extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
-		unsigned int order);
-#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0)
-extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask);
-extern void toi_free_page(int fail_num, unsigned long buf);
-extern void toi__free_page(int fail_num, struct page *page);
-extern void toi_free_pages(int fail_num, struct page *page, int order);
-extern struct page *toi_alloc_page(int fail_num, gfp_t mask);
-extern int toi_alloc_init(void);
-extern void toi_alloc_exit(void);
-
-extern void toi_alloc_print_debug_stats(void);
-
-#else /* CONFIG_PM_DEBUG */
-
-#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS))
-#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN))
-
-#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER)
-#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS)
-#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS)
-#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0)
-#define toi__free_page(FAIL, PAGE) __free_page(PAGE)
-#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER)
-#define toi_alloc_page(FAIL, MASK) alloc_page(MASK)
-static inline int toi_alloc_init(void)
-{
-	return 0;
-}
-
-static inline void toi_alloc_exit(void) { }
-
-static inline void toi_alloc_print_debug_stats(void) { }
-
-#endif
-
-extern int toi_trace_allocs;
diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c
deleted file mode 100644
index 7b9886f54..000000000
--- a/kernel/power/tuxonice_atomic_copy.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * kernel/power/tuxonice_atomic_copy.c
- *
- * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * Routines for doing the atomic save/restore.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <linux/console.h>
-#include <linux/syscore_ops.h>
-#include <linux/ftrace.h>
-#include <asm/suspend.h>
-#include "tuxonice.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_io.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_modules.h"
-
-unsigned long extra_pd1_pages_used;
-
-/**
- * free_pbe_list - free page backup entries used by the atomic copy code.
- * @list:	List to free.
- * @highmem:	Whether the list is in highmem.
- *
- * Normally, this function isn't used. If, however, we need to abort before
- * doing the atomic copy, we use this to free the pbes previously allocated.
- **/
-static void free_pbe_list(struct pbe **list, int highmem)
-{
-	while (*list) {
-		int i;
-		struct pbe *free_pbe, *next_page = NULL;
-		struct page *page;
-
-		if (highmem) {
-			page = (struct page *) *list;
-			free_pbe = (struct pbe *) kmap(page);
-		} else {
-			page = virt_to_page(*list);
-			free_pbe = *list;
-		}
-
-		for (i = 0; i < PBES_PER_PAGE; i++) {
-			if (!free_pbe)
-				break;
-			if (highmem)
-				toi__free_page(29, free_pbe->address);
-			else
-				toi_free_page(29,
-					(unsigned long) free_pbe->address);
-			free_pbe = free_pbe->next;
-		}
-
-		if (highmem) {
-			if (free_pbe)
-				next_page = free_pbe;
-			kunmap(page);
-		} else {
-			if (free_pbe)
-				next_page = free_pbe;
-		}
-
-		toi__free_page(29, page);
-		*list = (struct pbe *) next_page;
-	};
-}
-
-/**
- * copyback_post - post atomic-restore actions
- *
- * After doing the atomic restore, we have a few more things to do:
- *	1) We want to retain some values across the restore, so we now copy
- *	these from the nosave variables to the normal ones.
- *	2) Set the status flags.
- *	3) Resume devices.
- *	4) Tell userui so it can redraw & restore settings.
- *	5) Reread the page cache.
- **/
-void copyback_post(void)
-{
-	struct toi_boot_kernel_data *bkd =
-		(struct toi_boot_kernel_data *) boot_kernel_data_buffer;
-
-	if (toi_activate_storage(1))
-		panic("Failed to reactivate our storage.");
-
-	toi_post_atomic_restore_modules(bkd);
-
-	toi_cond_pause(1, "About to reload secondary pagedir.");
-
-	if (read_pageset2(0))
-		panic("Unable to successfully reread the page cache.");
-
-	/*
-	 * If the user wants to sleep again after resuming from full-off,
-	 * it's most likely to be in order to suspend to ram, so we'll
-	 * do this check after loading pageset2, to give them the fastest
-	 * wakeup when they are ready to use the computer again.
-	 */
-	toi_check_resleep();
-
-        if (test_action_state(TOI_INCREMENTAL_IMAGE))
-            toi_reset_dirtiness(1);
-}
-
-/**
- * toi_copy_pageset1 - do the atomic copy of pageset1
- *
- * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
- * because we can't be sure what side effects it has. On my old Duron, with
- * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
- * count at resume time 4 instead of 3.
- *
- * We don't want to call kmap_atomic unconditionally because it has the side
- * effect of incrementing the preempt count, which will leave it one too high
- * post resume (the page containing the preempt count will be copied after
- * its incremented. This is essentially the same problem.
- **/
-void toi_copy_pageset1(void)
-{
-	int i;
-	unsigned long source_index, dest_index;
-
-	memory_bm_position_reset(pageset1_map);
-	memory_bm_position_reset(pageset1_copy_map);
-
-	source_index = memory_bm_next_pfn(pageset1_map, 0);
-	dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
-
-	for (i = 0; i < pagedir1.size; i++) {
-		unsigned long *origvirt, *copyvirt;
-		struct page *origpage, *copypage;
-		int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1,
-		    was_present1, was_present2;
-
-		origpage = pfn_to_page(source_index);
-		copypage = pfn_to_page(dest_index);
-
-		origvirt = PageHighMem(origpage) ?
-			kmap_atomic(origpage) :
-			page_address(origpage);
-
-		copyvirt = PageHighMem(copypage) ?
-			kmap_atomic(copypage) :
-			page_address(copypage);
-
-		was_present1 = kernel_page_present(origpage);
-		if (!was_present1)
-			kernel_map_pages(origpage, 1, 1);
-
-		was_present2 = kernel_page_present(copypage);
-		if (!was_present2)
-			kernel_map_pages(copypage, 1, 1);
-
-		while (loop >= 0) {
-			*(copyvirt + loop) = *(origvirt + loop);
-			loop--;
-		}
-
-		if (!was_present1)
-			kernel_map_pages(origpage, 1, 0);
-
-		if (!was_present2)
-			kernel_map_pages(copypage, 1, 0);
-
-		if (PageHighMem(origpage))
-			kunmap_atomic(origvirt);
-
-		if (PageHighMem(copypage))
-			kunmap_atomic(copyvirt);
-
-		source_index = memory_bm_next_pfn(pageset1_map, 0);
-		dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
-	}
-}
-
-/**
- * __toi_post_context_save - steps after saving the cpu context
- *
- * Steps taken after saving the CPU state to make the actual
- * atomic copy.
- *
- * Called from swsusp_save in snapshot.c via toi_post_context_save.
- **/
-int __toi_post_context_save(void)
-{
-	unsigned long old_ps1_size = pagedir1.size;
-
-	check_checksums();
-
-	free_checksum_pages();
-
-	toi_recalculate_image_contents(1);
-
-	extra_pd1_pages_used = pagedir1.size > old_ps1_size ?
-		pagedir1.size - old_ps1_size : 0;
-
-	if (extra_pd1_pages_used > extra_pd1_pages_allowance) {
-		printk(KERN_INFO "Pageset1 has grown by %lu pages. "
-			"extra_pages_allowance is currently only %lu.\n",
-			pagedir1.size - old_ps1_size,
-			extra_pd1_pages_allowance);
-
-		/*
-		 * Highlevel code will see this, clear the state and
-		 * retry if we haven't already done so twice.
-		 */
-		if (any_to_free(1)) {
-			set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
-			return 1;
-		}
-		if (try_allocate_extra_memory()) {
-			printk(KERN_INFO "Failed to allocate the extra memory"
-					" needed. Restarting the process.");
-			set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
-			return 1;
-		}
-		printk(KERN_INFO "However it looks like there's enough"
-			" free ram and storage to handle this, so "
-			" continuing anyway.");
-		/* 
-		 * What if try_allocate_extra_memory above calls
-		 * toi_allocate_extra_pagedir_memory and it allocs a new
-		 * slab page via toi_kzalloc which should be in ps1? So...
-		 */
-		toi_recalculate_image_contents(1);
-	}
-
-	if (!test_action_state(TOI_TEST_FILTER_SPEED) &&
-	    !test_action_state(TOI_TEST_BIO))
-		toi_copy_pageset1();
-
-	return 0;
-}
-
-/**
- * toi_hibernate - high level code for doing the atomic copy
- *
- * High-level code which prepares to do the atomic copy. Loosely based
- * on the swsusp version, but with the following twists:
- *	- We set toi_running so the swsusp code uses our code paths.
- *	- We give better feedback regarding what goes wrong if there is a
- *	  problem.
- *	- We use an extra function to call the assembly, just in case this code
- *	  is in a module (return address).
- **/
-int toi_hibernate(void)
-{
-	int error;
-
-	error = toi_lowlevel_builtin();
-
-	if (!error) {
-		struct toi_boot_kernel_data *bkd =
-			(struct toi_boot_kernel_data *) boot_kernel_data_buffer;
-
-		/*
-		 * The boot kernel's data may be larger (newer version) or
-		 * smaller (older version) than ours. Copy the minimum
-		 * of the two sizes, so that we don't overwrite valid values
-		 * from pre-atomic copy.
-		 */
-
-		memcpy(&toi_bkd, (char *) boot_kernel_data_buffer,
-			min_t(int, sizeof(struct toi_boot_kernel_data),
-				bkd->size));
-	}
-
-	return error;
-}
-
-/**
- * toi_atomic_restore - prepare to do the atomic restore
- *
- * Get ready to do the atomic restore. This part gets us into the same
- * state we are in prior to do calling do_toi_lowlevel while
- * hibernating: hot-unplugging secondary cpus and freeze processes,
- * before starting the thread that will do the restore.
- **/
-int toi_atomic_restore(void)
-{
-	int error;
-
-	toi_prepare_status(DONT_CLEAR_BAR,	"Atomic restore.");
-
-	memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line,
-		strlen(saved_command_line));
-
-	toi_pre_atomic_restore_modules(&toi_bkd);
-
-	if (add_boot_kernel_data_pbe())
-		goto Failed;
-
-	toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
-
-	if (toi_go_atomic(PMSG_QUIESCE, 0))
-		goto Failed;
-
-	/* We'll ignore saved state, but this gets preempt count (etc) right */
-	save_processor_state();
-
-	error = swsusp_arch_resume();
-	/*
-	 * Code below is only ever reached in case of failure. Otherwise
-	 * execution continues at place where swsusp_arch_suspend was called.
-	 *
-	 * We don't know whether it's safe to continue (this shouldn't happen),
-	 * so lets err on the side of caution.
-	 */
-	BUG();
-
-Failed:
-	free_pbe_list(&restore_pblist, 0);
-#ifdef CONFIG_HIGHMEM
-	free_pbe_list(&restore_highmem_pblist, 1);
-#endif
-	return 1;
-}
-
-/**
- * toi_go_atomic - do the actual atomic copy/restore
- * @state:	   The state to use for dpm_suspend_start & power_down calls.
- * @suspend_time:  Whether we're suspending or resuming.
- **/
-int toi_go_atomic(pm_message_t state, int suspend_time)
-{
-  if (suspend_time) {
-    if (platform_begin(1)) {
-      set_abort_result(TOI_PLATFORM_PREP_FAILED);
-      toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
-      return 1;
-    }
-
-    if (dpm_prepare(PMSG_FREEZE)) {
-      set_abort_result(TOI_DPM_PREPARE_FAILED);
-      dpm_complete(PMSG_RECOVER);
-      toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
-      return 1;
-    }
-  }
-
-	suspend_console();
-	pm_restrict_gfp_mask();
-
-  if (suspend_time) {
-    if (dpm_suspend(state)) {
-      set_abort_result(TOI_DPM_SUSPEND_FAILED);
-      toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
-      return 1;
-    }
-  } else {
-    if (dpm_suspend_start(state)) {
-      set_abort_result(TOI_DPM_SUSPEND_FAILED);
-      toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
-      return 1;
-    }
-  }
-
-	/* At this point, dpm_suspend_start() has been called, but *not*
-	 * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now.
-	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
-	 * become desynchronized with the actual state of the hardware
-	 * at resume time, and evil weirdness ensues.
-	 */
-
-	if (dpm_suspend_end(state)) {
-		set_abort_result(TOI_DEVICE_REFUSED);
-		toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1);
-		return 1;
-	}
-
-	if (suspend_time) {
-		if (platform_pre_snapshot(1))
-			set_abort_result(TOI_PRE_SNAPSHOT_FAILED);
-	} else {
-		if (platform_pre_restore(1))
-			set_abort_result(TOI_PRE_RESTORE_FAILED);
-	}
-
-	if (test_result_state(TOI_ABORTED)) {
-		toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1);
-		return 1;
-	}
-
-        if (disable_nonboot_cpus()) {
-            set_abort_result(TOI_CPU_HOTPLUG_FAILED);
-            toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG,
-                    suspend_time, 1);
-            return 1;
-        }
-
-	local_irq_disable();
-
-	if (syscore_suspend()) {
-		set_abort_result(TOI_SYSCORE_REFUSED);
-		toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1);
-		return 1;
-	}
-
-	if (suspend_time && pm_wakeup_pending()) {
-		set_abort_result(TOI_WAKEUP_EVENT);
-		toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1);
-		return 1;
-	}
-	return 0;
-}
-
-/**
- * toi_end_atomic - post atomic copy/restore routines
- * @stage:		What step to start at.
- * @suspend_time:	Whether we're suspending or resuming.
- * @error:		Whether we're recovering from an error.
- **/
-void toi_end_atomic(int stage, int suspend_time, int error)
-{
-	pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) :
-		PMSG_RESTORE;
-
-	switch (stage) {
-	case ATOMIC_ALL_STEPS:
-		if (!suspend_time) {
-			events_check_enabled = false;
-		}
-		platform_leave(1);
-	case ATOMIC_STEP_SYSCORE_RESUME:
-		syscore_resume();
-	case ATOMIC_STEP_IRQS:
-		local_irq_enable();
-	case ATOMIC_STEP_CPU_HOTPLUG:
-                enable_nonboot_cpus();
-	case ATOMIC_STEP_PLATFORM_FINISH:
-		if (!suspend_time && error & 2)
-			platform_restore_cleanup(1);
-		else 
-			platform_finish(1);
-		dpm_resume_start(msg);
-	case ATOMIC_STEP_DEVICE_RESUME:
-		if (suspend_time && (error & 2))
-			platform_recover(1);
-		dpm_resume(msg);
-		if (!toi_in_suspend()) {
-                    dpm_resume_end(PMSG_RECOVER);
-                }
-		if (error || !toi_in_suspend()) {
-			pm_restore_gfp_mask();
-                }
-		resume_console();
-	case ATOMIC_STEP_DPM_COMPLETE:
-		dpm_complete(msg);
-	case ATOMIC_STEP_PLATFORM_END:
-		platform_end(1);
-
-		toi_prepare_status(DONT_CLEAR_BAR, "Post atomic.");
-	}
-}
diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h
deleted file mode 100644
index 2de0e3b49..000000000
--- a/kernel/power/tuxonice_atomic_copy.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * kernel/power/tuxonice_atomic_copy.h
- *
- * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * Routines for doing the atomic save/restore.
- */
-
-enum {
-	ATOMIC_ALL_STEPS,
-	ATOMIC_STEP_SYSCORE_RESUME,
-	ATOMIC_STEP_IRQS,
-	ATOMIC_STEP_CPU_HOTPLUG,
-	ATOMIC_STEP_PLATFORM_FINISH,
-	ATOMIC_STEP_DEVICE_RESUME,
-	ATOMIC_STEP_DPM_COMPLETE,
-	ATOMIC_STEP_PLATFORM_END,
-};
-
-int toi_go_atomic(pm_message_t state, int toi_time);
-void toi_end_atomic(int stage, int toi_time, int error);
-
-extern void platform_recover(int platform_mode);
diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h
deleted file mode 100644
index 201e3cd47..000000000
--- a/kernel/power/tuxonice_bio.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * kernel/power/tuxonice_bio.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains declarations for functions exported from
- * tuxonice_bio.c, which contains low level io functions.
- */
-
-#include <linux/buffer_head.h>
-#include "tuxonice_extent.h"
-
-void toi_put_extent_chain(struct hibernate_extent_chain *chain);
-int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
-		unsigned long start, unsigned long end);
-
-struct hibernate_extent_saved_state {
-	int extent_num;
-	struct hibernate_extent *extent_ptr;
-	unsigned long offset;
-};
-
-struct toi_bdev_info {
-	struct toi_bdev_info *next;
-	struct hibernate_extent_chain blocks;
-	struct block_device *bdev;
-	struct toi_module_ops *allocator;
-	int allocator_index;
-	struct hibernate_extent_chain allocations;
-	char name[266]; /* "swap on " or "file " + up to 256 chars */
-
-	/* Saved in header */
-	char uuid[17];
-	dev_t dev_t;
-	int prio;
-	int bmap_shift;
-	int blocks_per_page;
-	unsigned long pages_used;
-	struct hibernate_extent_saved_state saved_state[4];
-};
-
-struct toi_extent_iterate_state {
-	struct toi_bdev_info *current_chain;
-	int num_chains;
-	int saved_chain_number[4];
-	struct toi_bdev_info *saved_chain_ptr[4];
-};
-
-/*
- * Our exported interface so the swapwriter and filewriter don't
- * need these functions duplicated.
- */
-struct toi_bio_ops {
-	int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
-			struct page *page);
-	int (*register_storage)(struct toi_bdev_info *new);
-	void (*free_storage)(void);
-};
-
-struct toi_allocator_ops {
-	unsigned long (*toi_swap_storage_available) (void);
-};
-
-extern struct toi_bio_ops toi_bio_ops;
-
-extern char *toi_writer_buffer;
-extern int toi_writer_buffer_posn;
-
-struct toi_bio_allocator_ops {
-	int (*register_storage) (void);
-	unsigned long (*storage_available)(void);
-	int (*allocate_storage) (struct toi_bdev_info *, unsigned long);
-	int (*bmap) (struct toi_bdev_info *);
-	void (*free_storage) (struct toi_bdev_info *);
-	unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used);
-};
diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c
deleted file mode 100644
index 364fae9db..000000000
--- a/kernel/power/tuxonice_bio_chains.c
+++ /dev/null
@@ -1,1126 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_devinfo.c
- *
- * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- */
-
-#include <linux/mm_types.h>
-#include "tuxonice_bio.h"
-#include "tuxonice_bio_internal.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-#include "tuxonice.h"
-#include "tuxonice_io.h"
-
-static struct toi_bdev_info *prio_chain_head;
-static int num_chains;
-
-/* Pointer to current entry being loaded/saved. */
-struct toi_extent_iterate_state toi_writer_posn;
-
-#define metadata_size (sizeof(struct toi_bdev_info) - \
-		offsetof(struct toi_bdev_info, uuid))
-
-/*
- * After section 0 (header) comes 2 => next_section[0] = 2
- */
-static int next_section[3] = { 2, 3, 1 };
-
-/**
- * dump_block_chains - print the contents of the bdev info array.
- **/
-void dump_block_chains(void)
-{
-	int i = 0;
-	int j;
-	struct toi_bdev_info *cur_chain = prio_chain_head;
-
-	while (cur_chain) {
-		struct hibernate_extent *this = cur_chain->blocks.first;
-
-		printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio);
-
-		while (this) {
-			printk(KERN_CONT " [%lu-%lu]%s", this->start,
-					this->end, this->next ? "," : "");
-			this = this->next;
-		}
-
-		printk("\n");
-		cur_chain = cur_chain->next;
-		i++;
-	}
-
-	printk(KERN_DEBUG "Saved states:\n");
-	for (i = 0; i < 4; i++) {
-		printk(KERN_DEBUG "Slot %d: Chain %d.\n",
-			i, toi_writer_posn.saved_chain_number[i]);
-
-		cur_chain = prio_chain_head;
-		j = 0;
-		while (cur_chain) {
-			printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n",
-					j, cur_chain->saved_state[i].extent_num,
-					cur_chain->saved_state[i].offset);
-			cur_chain = cur_chain->next;
-			j++;
-		}
-		printk(KERN_CONT "\n");
-	}
-}
-
-/**
- *
- **/
-static void toi_extent_chain_next(void)
-{
-	struct toi_bdev_info *this = toi_writer_posn.current_chain;
-
-	if (!this->blocks.current_extent)
-		return;
-
-	if (this->blocks.current_offset == this->blocks.current_extent->end) {
-		if (this->blocks.current_extent->next) {
-			this->blocks.current_extent =
-				this->blocks.current_extent->next;
-			this->blocks.current_offset =
-				this->blocks.current_extent->start;
-		} else {
-			this->blocks.current_extent = NULL;
-			this->blocks.current_offset = 0;
-		}
-	} else
-		this->blocks.current_offset++;
-}
-
-/**
- *
- */
-
-static struct toi_bdev_info *__find_next_chain_same_prio(void)
-{
-	struct toi_bdev_info *start_chain = toi_writer_posn.current_chain;
-	struct toi_bdev_info *this = start_chain;
-	int orig_prio = this->prio;
-
-	do {
-		this = this->next;
-
-		if (!this)
-			this = prio_chain_head;
-
-		/* Back on original chain? Use it again. */
-		if (this == start_chain)
-			return start_chain;
-
-	} while (!this->blocks.current_extent || this->prio != orig_prio);
-
-	return this;
-}
-
-static void find_next_chain(void)
-{
-	struct toi_bdev_info *this;
-
-	this = __find_next_chain_same_prio();
-
-	/*
-	 * If we didn't get another chain of the same priority that we
-	 * can use, look for the next priority.
-	 */
-	while (this && !this->blocks.current_extent)
-		this = this->next;
-
-	toi_writer_posn.current_chain = this;
-}
-
-/**
- * toi_extent_state_next - go to the next extent
- * @blocks: The number of values to progress.
- * @stripe_mode: Whether to spread usage across all chains.
- *
- * Given a state, progress to the next valid entry. We may begin in an
- * invalid state, as we do when invoked after extent_state_goto_start below.
- *
- * When using compression and expected_compression > 0, we let the image size
- * be larger than storage, so we can validly run out of data to return.
- **/
-static unsigned long toi_extent_state_next(int blocks, int current_stream)
-{
-	int i;
-
-	if (!toi_writer_posn.current_chain)
-		return -ENOSPC;
-
-	/* Assume chains always have lengths that are multiples of @blocks */
-	for (i = 0; i < blocks; i++)
-		toi_extent_chain_next();
-
-	/* The header stream is not striped */
-	if (current_stream ||
-	    !toi_writer_posn.current_chain->blocks.current_extent)
-		find_next_chain();
-
-	return  toi_writer_posn.current_chain ? 0 : -ENOSPC;
-}
-
-static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this)
-{
-	struct toi_bdev_info **prev_ptr;
-	struct toi_bdev_info *cur;
-
-	/* Loop through the existing chain, finding where to insert it */
-	prev_ptr = &prio_chain_head;
-	cur = prio_chain_head;
-
-	while (cur && cur->prio >= this->prio) {
-		prev_ptr = &cur->next;
-		cur = cur->next;
-	}
-
-	this->next = *prev_ptr;
-	*prev_ptr = this;
-
-	this = prio_chain_head;
-	while (this)
-		this = this->next;
-	num_chains++;
-}
-
-/**
- * toi_extent_state_goto_start - reinitialize an extent chain iterator
- * @state:	Iterator to reinitialize
- **/
-void toi_extent_state_goto_start(void)
-{
-	struct toi_bdev_info *this = prio_chain_head;
-
-	while (this) {
-		toi_message(TOI_BIO, TOI_VERBOSE, 0,
-			"Setting current extent to %p.", this->blocks.first);
-		this->blocks.current_extent = this->blocks.first;
-		if (this->blocks.current_extent) {
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Setting current offset to %lu.",
-					this->blocks.current_extent->start);
-			this->blocks.current_offset =
-				this->blocks.current_extent->start;
-		}
-
-		this = this->next;
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.",
-			prio_chain_head);
-	toi_writer_posn.current_chain = prio_chain_head;
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start.");
-}
-
-/**
- * toi_extent_state_save - save state of the iterator
- * @state:		Current state of the chain
- * @saved_state:	Iterator to populate
- *
- * Given a state and a struct hibernate_extent_state_store, save the current
- * position in a format that can be used with relocated chains (at
- * resume time).
- **/
-void toi_extent_state_save(int slot)
-{
-	struct toi_bdev_info *cur_chain = prio_chain_head;
-	struct hibernate_extent *extent;
-	struct hibernate_extent_saved_state *chain_state;
-	int i = 0;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.",
-			slot);
-
-	if (!toi_writer_posn.current_chain) {
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => "
-				"chain_num = -1.");
-		toi_writer_posn.saved_chain_number[slot] = -1;
-		return;
-	}
-
-	while (cur_chain) {
-		i++;
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) "
-				"state, slot %d.", i, cur_chain, slot);
-
-		chain_state = &cur_chain->saved_state[slot];
-
-		chain_state->offset = cur_chain->blocks.current_offset;
-
-		if (toi_writer_posn.current_chain == cur_chain) {
-			toi_writer_posn.saved_chain_number[slot] = i;
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain "
-					"we were on => chain_num is %d.", i);
-		}
-
-		if (!cur_chain->blocks.current_extent) {
-			chain_state->extent_num = 0;
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent "
-					"for this chain => extent_num %d is 0.",
-					i);
-			cur_chain = cur_chain->next;
-			continue;
-		}
-
-		extent = cur_chain->blocks.first;
-		chain_state->extent_num = 1;
-
-		while (extent != cur_chain->blocks.current_extent) {
-			chain_state->extent_num++;
-			extent = extent->next;
-		}
-
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i,
-				chain_state->extent_num);
-
-		cur_chain = cur_chain->next;
-	}
-	toi_message(TOI_BIO, TOI_VERBOSE, 0,
-			"Completed saving extent state slot %d.", slot);
-}
-
-/**
- * toi_extent_state_restore - restore the position saved by extent_state_save
- * @state:		State to populate
- * @saved_state:	Iterator saved to restore
- **/
-void toi_extent_state_restore(int slot)
-{
-	int i = 0;
-	struct toi_bdev_info *cur_chain = prio_chain_head;
-	struct hibernate_extent_saved_state *chain_state;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0,
-			"toi_extent_state_restore - slot %d.", slot);
-
-	if (toi_writer_posn.saved_chain_number[slot] == -1) {
-		toi_writer_posn.current_chain = NULL;
-		return;
-	}
-
-	while (cur_chain) {
-		int posn;
-		int j;
-		i++;
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) "
-				"state, slot %d.", i, cur_chain, slot);
-
-		chain_state = &cur_chain->saved_state[slot];
-
-		posn = chain_state->extent_num;
-
-		cur_chain->blocks.current_extent = cur_chain->blocks.first;
-		cur_chain->blocks.current_offset = chain_state->offset;
-
-		if (i == toi_writer_posn.saved_chain_number[slot]) {
-			toi_writer_posn.current_chain = cur_chain;
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Found current chain.");
-		}
-
-		for (j = 0; j < 4; j++)
-			if (i == toi_writer_posn.saved_chain_number[j]) {
-				toi_writer_posn.saved_chain_ptr[j] = cur_chain;
-				toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Found saved chain ptr %d (%p) (offset"
-					" %d).", j, cur_chain,
-					cur_chain->saved_state[j].offset);
-			}
-
-		if (posn) {
-			while (--posn)
-				cur_chain->blocks.current_extent =
-					cur_chain->blocks.current_extent->next;
-		} else
-			cur_chain->blocks.current_extent = NULL;
-
-		cur_chain = cur_chain->next;
-	}
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done.");
-	if (test_action_state(TOI_LOGALL))
-		dump_block_chains();
-}
-
-/*
- * Storage needed
- *
- * Returns amount of space in the image header required
- * for the chain data. This ignores the links between
- * pages, which we factor in when allocating the space.
- */
-int toi_bio_devinfo_storage_needed(void)
-{
-	int result = sizeof(num_chains);
-	struct toi_bdev_info *chain = prio_chain_head;
-
-	while (chain) {
-		result += metadata_size;
-
-		/* Chain size */
-		result += sizeof(int);
-
-		/* Extents */
-		result += (2 * sizeof(unsigned long) *
-			chain->blocks.num_extents);
-
-		chain = chain->next;
-	}
-
-	result += 4 * sizeof(int);
-	return result;
-}
-
-static unsigned long chain_pages_used(struct toi_bdev_info *chain)
-{
-	struct hibernate_extent *this = chain->blocks.first;
-	struct hibernate_extent_saved_state *state = &chain->saved_state[3];
-	unsigned long size = 0;
-	int extent_idx = 1;
-
-	if (!state->extent_num) {
-		if (!this)
-			return 0;
-		else
-			return chain->blocks.size;
-	}
-
-	while (extent_idx < state->extent_num) {
-		size += (this->end - this->start + 1);
-		this = this->next;
-		extent_idx++;
-	}
-
-	/* We didn't use the one we're sitting on, so don't count it */
-	return size + state->offset - this->start;
-}
-
-void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain)
-{
-    unsigned long used = chain_pages_used(chain);
-
-    /* Free the storage */
-    unsigned long first_freed = 0;
-
-    if (chain->allocator->bio_allocator_ops->free_unused_storage)
-        first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used);
-
-    printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed);
-
-    /* Adjust / free the extents. */
-    toi_put_extent_chain_from(&chain->blocks, first_freed);
-
-    {
-        struct hibernate_extent *this = chain->blocks.first;
-        while (this) {
-            printk("Extent %lx-%lx.\n", this->start, this->end);
-            this = this->next;
-        }
-    }
-}
-
-/**
- * toi_serialise_extent_chain - write a chain in the image
- * @chain:	Chain to write.
- **/
-static int toi_serialise_extent_chain(struct toi_bdev_info *chain)
-{
-	struct hibernate_extent *this;
-	int ret;
-	int i = 1;
-
-	chain->pages_used = chain_pages_used(chain);
-
-	if (test_action_state(TOI_LOGALL))
-		dump_block_chains();
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).",
-			chain->dev_t);
-	/* Device info -  dev_t, prio, bmap_shift, blocks per page, positions */
-	ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
-			(char *) &chain->uuid, metadata_size);
-	if (ret)
-		return ret;
-
-	/* Num extents */
-	ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
-			(char *) &chain->blocks.num_extents, sizeof(int));
-	if (ret)
-		return ret;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
-			chain->blocks.num_extents);
-
-	this = chain->blocks.first;
-	while (this) {
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i);
-		ret = toiActiveAllocator->rw_header_chunk(WRITE,
-				&toi_blockwriter_ops,
-				(char *) this, 2 * sizeof(this->start));
-		if (ret)
-			return ret;
-		this = this->next;
-		i++;
-	}
-
-	return ret;
-}
-
-int toi_serialise_extent_chains(void)
-{
-	struct toi_bdev_info *this = prio_chain_head;
-	int result;
-
-	/* Write the number of chains */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)",
-			num_chains);
-	result = toiActiveAllocator->rw_header_chunk(WRITE,
-			&toi_blockwriter_ops, (char *) &num_chains,
-			sizeof(int));
-	if (result)
-		return result;
-
-	/* Then the chains themselves */
-	while (this) {
-		result = toi_serialise_extent_chain(this);
-		if (result)
-			return result;
-		this = this->next;
-	}
-
-	/*
-	 * Finally, the chain we should be on at the start of each
-	 * section.
-	 */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers.");
-	result = toiActiveAllocator->rw_header_chunk(WRITE,
-			&toi_blockwriter_ops,
-			(char *) &toi_writer_posn.saved_chain_number[0],
-			4 * sizeof(int));
-
-	return result;
-}
-
-int toi_register_storage_chain(struct toi_bdev_info *new)
-{
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.",
-			new);
-	toi_insert_chain_in_prio_list(new);
-	return 0;
-}
-
-static void free_bdev_info(struct toi_bdev_info *chain)
-{
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain);
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents.");
-	toi_put_extent_chain(&chain->blocks);
-
-	/*
-	 * The allocator may need to do more than just free the chains
-	 * (swap_free, for example). Don't call from boot kernel.
-	 */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents.");
-	if (chain->allocator)
-		chain->allocator->bio_allocator_ops->free_storage(chain);
-
-	/*
-	 * Dropping out of reading atomic copy? Need to undo
-	 * toi_open_by_devnum.
-	 */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev.");
-	if (chain->bdev && !IS_ERR(chain->bdev) &&
-			chain->bdev != resume_block_device &&
-			chain->bdev != header_block_device &&
-			test_toi_state(TOI_TRYING_TO_RESUME))
-		toi_close_bdev(chain->bdev);
-
-	/* Poison */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct.");
-	toi_kfree(39, chain, sizeof(*chain));
-
-	if (prio_chain_head == chain)
-		prio_chain_head = NULL;
-
-	num_chains--;
-}
-
-void free_all_bdev_info(void)
-{
-	struct toi_bdev_info *this = prio_chain_head;
-
-	while (this) {
-		struct toi_bdev_info *next = this->next;
-		free_bdev_info(this);
-		this = next;
-	}
-
-	memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn));
-	prio_chain_head = NULL;
-}
-
-static void set_up_start_position(void)
-{
-	toi_writer_posn.current_chain = prio_chain_head;
-	go_next_page(0, 0);
-}
-
-/**
- * toi_load_extent_chain - read back a chain saved in the image
- * @chain:	Chain to load
- *
- * The linked list of extents is reconstructed from the disk. chain will point
- * to the first entry.
- **/
-int toi_load_extent_chain(int index, int *num_loaded)
-{
-	struct toi_bdev_info *chain = toi_kzalloc(39,
-			sizeof(struct toi_bdev_info), GFP_ATOMIC);
-	struct hibernate_extent *this, *last = NULL;
-	int i, ret;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index);
-	/* Get dev_t, prio, bmap_shift, blocks per page, positions */
-	ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
-			(char *) &chain->uuid, metadata_size);
-
-	if (ret) {
-		printk(KERN_ERR "Failed to read the size of extent chain.\n");
-		toi_kfree(39, chain, sizeof(*chain));
-		return 1;
-	}
-
-	toi_bkd.pages_used[index] = chain->pages_used;
-
-	ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
-			(char *) &chain->blocks.num_extents, sizeof(int));
-	if (ret) {
-		printk(KERN_ERR "Failed to read the size of extent chain.\n");
-		toi_kfree(39, chain, sizeof(*chain));
-		return 1;
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
-			chain->blocks.num_extents);
-
-	for (i = 0; i < chain->blocks.num_extents; i++) {
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1);
-
-		this = toi_kzalloc(2, sizeof(struct hibernate_extent),
-				TOI_ATOMIC_GFP);
-		if (!this) {
-			printk(KERN_INFO "Failed to allocate a new extent.\n");
-			free_bdev_info(chain);
-			return -ENOMEM;
-		}
-		this->next = NULL;
-		/* Get the next page */
-		ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
-				NULL, (char *) this, 2 * sizeof(this->start));
-		if (ret) {
-			printk(KERN_INFO "Failed to read an extent.\n");
-			toi_kfree(2, this, sizeof(struct hibernate_extent));
-			free_bdev_info(chain);
-			return 1;
-		}
-
-		if (last)
-			last->next = this;
-		else {
-			char b1[32], b2[32], b3[32];
-			/*
-			 * Open the bdev
-			 */
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-				"Chain dev_t is %s. Resume dev t is %s. Header"
-				" bdev_t is %s.\n",
-				format_dev_t(b1, chain->dev_t),
-				format_dev_t(b2, resume_dev_t),
-				format_dev_t(b3, toi_sig_data->header_dev_t));
-
-			if (chain->dev_t == resume_dev_t)
-				chain->bdev = resume_block_device;
-			else if (chain->dev_t == toi_sig_data->header_dev_t)
-				chain->bdev = header_block_device;
-			else {
-				chain->bdev = toi_open_bdev(chain->uuid,
-						chain->dev_t, 1);
-				if (IS_ERR(chain->bdev)) {
-					free_bdev_info(chain);
-					return -ENODEV;
-				}
-			}
-
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift "
-					"is %d and blocks per page is %d.",
-					chain->bmap_shift,
-					chain->blocks_per_page);
-
-			chain->blocks.first = this;
-
-			/*
-			 * Couldn't do this earlier, but can't do
-			 * goto_start now - we may have already used blocks
-			 * in the first chain.
-			 */
-			chain->blocks.current_extent = this;
-			chain->blocks.current_offset = this->start;
-
-			/*
-			 * Can't wait until we've read the whole chain
-			 * before we insert it in the list. We might need
-			 * this chain to read the next page in the header
-			 */
-			toi_insert_chain_in_prio_list(chain);
-		}
-
-		/*
-		 * We have to wait until 2 extents are loaded before setting up
-		 * properly because if the first extent has only one page, we
-		 * will need to put the position on the second extent. Sounds
-		 * obvious, but it wasn't!
-		 */
-		(*num_loaded)++;
-		if ((*num_loaded) == 2)
-			set_up_start_position();
-		last = this;
-	}
-
-	/*
-	 * Shouldn't get empty chains, but it's not impossible. Link them in so
-	 * they get freed properly later.
-	 */
-	if (!chain->blocks.num_extents)
-		toi_insert_chain_in_prio_list(chain);
-
-	if (!chain->blocks.current_extent) {
-		chain->blocks.current_extent = chain->blocks.first;
-		if (chain->blocks.current_extent)
-			chain->blocks.current_offset =
-				chain->blocks.current_extent->start;
-	}
-	return 0;
-}
-
-int toi_load_extent_chains(void)
-{
-	int result;
-	int to_load;
-	int i;
-	int extents_loaded = 0;
-
-	result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
-			(char *) &to_load,
-			sizeof(int));
-	if (result)
-		return result;
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load);
-
-	for (i = 0; i < to_load; i++) {
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.",
-				i, to_load);
-		result = toi_load_extent_chain(i, &extents_loaded);
-		if (result)
-			return result;
-	}
-
-	/* If we never got to a second extent, we still need to do this. */
-	if (extents_loaded == 1)
-		set_up_start_position();
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers.");
-	result = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
-			&toi_blockwriter_ops,
-			(char *) &toi_writer_posn.saved_chain_number[0],
-			4 * sizeof(int));
-
-	return result;
-}
-
-static int toi_end_of_stream(int writing, int section_barrier)
-{
-	struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
-	int compare_to = next_section[current_stream];
-	struct toi_bdev_info *compare_chain =
-		toi_writer_posn.saved_chain_ptr[compare_to];
-	int compare_offset = compare_chain ?
-		compare_chain->saved_state[compare_to].offset : 0;
-
-	if (!section_barrier)
-		return 0;
-
-	if (!cur_chain)
-		return 1;
-
-	if (cur_chain == compare_chain &&
-	    cur_chain->blocks.current_offset == compare_offset) {
-		if (writing) {
-			if (!current_stream) {
-				debug_broken_header();
-				return 1;
-			}
-		} else {
-			more_readahead = 0;
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Reached the end of stream %d "
-					"(not an error).", current_stream);
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * go_next_page - skip blocks to the start of the next page
- * @writing: Whether we're reading or writing the image.
- *
- * Go forward one page.
- **/
-int go_next_page(int writing, int section_barrier)
-{
-	struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
-	int max = cur_chain ? cur_chain->blocks_per_page : 1;
-
-	/* Nope. Go foward a page - or maybe two. Don't stripe the header,
-	 * so that bad fragmentation doesn't put the extent data containing
-	 * the location of the second page out of the first header page.
-	 */
-	if (toi_extent_state_next(max, current_stream)) {
-		/* Don't complain if readahead falls off the end */
-		if (writing && section_barrier) {
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
-				"Expected compression ratio too optimistic?");
-			if (test_action_state(TOI_LOGALL))
-				dump_block_chains();
-		}
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
-				"read/write. (Not necessarily a fatal error.");
-		return -ENOSPC;
-	}
-
-	return 0;
-}
-
-int devices_of_same_priority(struct toi_bdev_info *this)
-{
-	struct toi_bdev_info *check = prio_chain_head;
-	int i = 0;
-
-	while (check) {
-		if (check->prio == this->prio)
-			i++;
-		check = check->next;
-	}
-
-	return i;
-}
-
-/**
- * toi_bio_rw_page - do i/o on the next disk page in the image
- * @writing: Whether reading or writing.
- * @page: Page to do i/o on.
- * @is_readahead: Whether we're doing readahead
- * @free_group: The group used in allocating the page
- *
- * Submit a page for reading or writing, possibly readahead.
- * Pass the group used in allocating the page as well, as it should
- * be freed on completion of the bio if we're writing the page.
- **/
-int toi_bio_rw_page(int writing, struct page *page,
-		int is_readahead, int free_group)
-{
-	int result = toi_end_of_stream(writing, 1);
-	struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
-
-	if (result) {
-		if (writing)
-			abort_hibernate(TOI_INSUFFICIENT_STORAGE,
-				"Insufficient storage for your image.");
-		else
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
-				"read/write another page when stream has "
-				"ended.");
-		return -ENOSPC;
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0,
-			"%s %lx:%ld",
-			writing ? "Write" : "Read",
-			dev_info->dev_t, dev_info->blocks.current_offset);
-
-	result = toi_do_io(writing, dev_info->bdev,
-		dev_info->blocks.current_offset << dev_info->bmap_shift,
-		page, is_readahead, 0, free_group);
-
-	/* Ignore the result here - will check end of stream if come in again */
-	go_next_page(writing, 1);
-
-	if (result)
-		printk(KERN_ERR "toi_do_io returned %d.\n", result);
-	return result;
-}
-
-dev_t get_header_dev_t(void)
-{
-	return prio_chain_head->dev_t;
-}
-
-struct block_device *get_header_bdev(void)
-{
-	return prio_chain_head->bdev;
-}
-
-unsigned long get_headerblock(void)
-{
-	return prio_chain_head->blocks.first->start <<
-		prio_chain_head->bmap_shift;
-}
-
-int get_main_pool_phys_params(void)
-{
-	struct toi_bdev_info *this = prio_chain_head;
-	int result;
-
-	while (this) {
-		result = this->allocator->bio_allocator_ops->bmap(this);
-		if (result)
-			return result;
-		this = this->next;
-	}
-
-	return 0;
-}
-
-static int apply_header_reservation(void)
-{
-	int i;
-
-	if (!header_pages_reserved) {
-		toi_message(TOI_BIO, TOI_VERBOSE, 0,
-				"No header pages reserved at the moment.");
-		return 0;
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation.");
-
-	/* Apply header space reservation */
-	toi_extent_state_goto_start();
-
-	for (i = 0; i < header_pages_reserved; i++)
-		if (go_next_page(1, 0))
-			return -ENOSPC;
-
-	/* The end of header pages will be the start of pageset 2 */
-	toi_extent_state_save(2);
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0,
-			"Finished applying header reservation.");
-	return 0;
-}
-
-static int toi_bio_register_storage(void)
-{
-	int result = 0;
-	struct toi_module_ops *this_module;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled ||
-		    this_module->type != BIO_ALLOCATOR_MODULE)
-			continue;
-		toi_message(TOI_BIO, TOI_VERBOSE, 0,
-				"Registering storage from %s.",
-				this_module->name);
-		result = this_module->bio_allocator_ops->register_storage();
-		if (result)
-			break;
-	}
-
-	return result;
-}
-
-void toi_bio_free_unused_storage(void)
-{
-    struct toi_bdev_info *this = prio_chain_head;
-
-    while (this) {
-        toi_bio_free_unused_storage_chain(this);
-        this = this->next;
-    }
-}
-
-int toi_bio_allocate_storage(unsigned long request)
-{
-	struct toi_bdev_info *chain = prio_chain_head;
-	unsigned long to_get = request;
-	unsigned long extra_pages, needed;
-	int no_free = 0;
-
-	if (!chain) {
-		int result = toi_bio_register_storage();
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
-			"Registering storage.");
-		if (result)
-			return 0;
-		chain = prio_chain_head;
-		if (!chain) {
-			printk("TuxOnIce: No storage was registered.\n");
-			return 0;
-		}
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
-			"Request is %lu pages.", request);
-	extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long)
-			       + sizeof(int)), PAGE_SIZE);
-	needed = request + extra_pages + header_pages_reserved;
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu "
-			"for header => %lu.",
-			extra_pages, header_pages_reserved, needed);
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.",
-			raw_pages_allocd);
-
-	to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : 0;
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get);
-
-	if (!to_get)
-		return apply_header_reservation();
-
-	while (to_get && chain) {
-		int num_group = devices_of_same_priority(chain);
-		int divisor = num_group - no_free;
-		int i;
-		unsigned long portion = DIV_ROUND_UP(to_get, divisor);
-		unsigned long got = 0;
-		unsigned long got_this_round = 0;
-		struct toi_bdev_info *top = chain;
-
-		toi_message(TOI_BIO, TOI_VERBOSE, 0,
-				" Start of loop. To get is %lu. Divisor is %d.",
-				to_get, divisor);
-		no_free = 0;
-
-		/*
-		 * We're aiming to spread the allocated storage as evenly
-		 * as possible, but we also want to get all the storage we
-		 * can off this priority.
-		 */
-		for (i = 0; i < num_group; i++) {
-			struct toi_bio_allocator_ops *ops =
-				chain->allocator->bio_allocator_ops;
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					" Asking for %lu pages from chain %p.",
-					portion, chain);
-			got = ops->allocate_storage(chain, portion);
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					" Got %lu pages from allocator %p.",
-					got, chain);
-			if (!got)
-				no_free++;
-			got_this_round += got;
-			chain = chain->next;
-		}
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a "
-				"total of %lu pages from %d allocators.",
-				got_this_round, divisor - no_free);
-
-		raw_pages_allocd += got_this_round;
-		to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd :
-			0;
-
-		/*
-		 * If we got anything from chains of this priority and we
-		 * still have storage to allocate, go over this priority
-		 * again.
-		 */
-		if (got_this_round && to_get)
-			chain = top;
-		else
-			no_free = 0;
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling "
-			"get_main_pool_phys_params");
-	/* Now let swap allocator bmap the pages */
-	get_main_pool_phys_params();
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. Reserving header.");
-	return apply_header_reservation();
-}
-
-void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd)
-{
-	int i = 0;
-	struct toi_bdev_info *cur_chain = prio_chain_head;
-
-	while (cur_chain) {
-		cur_chain->pages_used = bkd->pages_used[i];
-		cur_chain = cur_chain->next;
-		i++;
-	}
-}
-
-int toi_bio_chains_debug_info(char *buffer, int size)
-{
-	/* Show what we actually used */
-	struct toi_bdev_info *cur_chain = prio_chain_head;
-	int len = 0;
-
-	while (cur_chain) {
-		len += scnprintf(buffer + len, size - len, "  Used %lu pages "
-				"from %s.\n", cur_chain->pages_used,
-				cur_chain->name);
-		cur_chain = cur_chain->next;
-	}
-
-	return len;
-}
-
-void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
-{
-    struct toi_bdev_info *this = toi_writer_posn.current_chain,
-                         *cmp = prio_chain_head;
-
-    ptr->save.chain = 1;
-    while (this != cmp) {
-        ptr->save.chain++;
-        cmp = cmp->next;
-    }
-    ptr->save.block = this->blocks.current_offset;
-
-    /* Save the raw info internally for quicker access when updating pointers */
-    ptr->bdev = this->bdev;
-    ptr->block = this->blocks.current_offset << this->bmap_shift;
-}
-
-void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
-{
-    int i = ptr->save.chain - 1;
-    struct toi_bdev_info *this;
-    struct hibernate_extent *hib;
-
-    /* Find chain by stored index */
-    this = prio_chain_head;
-    while (i) {
-        this = this->next;
-        i--;
-    }
-    toi_writer_posn.current_chain = this;
-
-    /* Restore block */
-    this->blocks.current_offset = ptr->save.block;
-
-    /* Find current offset from block number */
-    hib = this->blocks.first;
-
-    while (hib->start > ptr->save.block) {
-        hib = hib->next;
-    }
-
-    this->blocks.last_touched = this->blocks.current_extent = hib;
-}
diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c
deleted file mode 100644
index d18f2751c..000000000
--- a/kernel/power/tuxonice_bio_core.c
+++ /dev/null
@@ -1,1933 +0,0 @@
-/*
- * kernel/power/tuxonice_bio.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains block io functions for TuxOnIce. These are
- * used by the swapwriter and it is planned that they will also
- * be used by the NFSwriter.
- *
- */
-
-#include <linux/blkdev.h>
-#include <linux/syscalls.h>
-#include <linux/suspend.h>
-#include <linux/ctype.h>
-#include <linux/fs_uuid.h>
-#include <linux/mount.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_bio_internal.h"
-
-#define MEMORY_ONLY 1
-#define THROTTLE_WAIT 2
-
-/* #define MEASURE_MUTEX_CONTENTION */
-#ifndef MEASURE_MUTEX_CONTENTION
-#define my_mutex_lock(index, the_lock) mutex_lock(the_lock)
-#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock)
-#else
-unsigned long mutex_times[2][2][NR_CPUS];
-#define my_mutex_lock(index, the_lock) do { \
-	int have_mutex; \
-	have_mutex = mutex_trylock(the_lock); \
-	if (!have_mutex) { \
-		mutex_lock(the_lock); \
-		mutex_times[index][0][smp_processor_id()]++; \
-	} else { \
-		mutex_times[index][1][smp_processor_id()]++; \
-	}
-
-#define my_mutex_unlock(index, the_lock) \
-	mutex_unlock(the_lock); \
-} while (0)
-#endif
-
-static int page_idx, reset_idx;
-
-static int target_outstanding_io = 1024;
-static int max_outstanding_writes, max_outstanding_reads;
-
-static struct page *bio_queue_head, *bio_queue_tail;
-static atomic_t toi_bio_queue_size;
-static DEFINE_SPINLOCK(bio_queue_lock);
-
-static int free_mem_throttle, throughput_throttle;
-int more_readahead = 1;
-static struct page *readahead_list_head, *readahead_list_tail;
-
-static struct page *waiting_on;
-
-static atomic_t toi_io_in_progress, toi_io_done;
-static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait);
-
-int current_stream;
-/* Not static, so that the allocators can setup and complete
- * writing the header */
-char *toi_writer_buffer;
-int toi_writer_buffer_posn;
-
-static DEFINE_MUTEX(toi_bio_mutex);
-static DEFINE_MUTEX(toi_bio_readahead_mutex);
-
-static struct task_struct *toi_queue_flusher;
-static int toi_bio_queue_flush_pages(int dedicated_thread);
-
-struct toi_module_ops toi_blockwriter_ops;
-
-struct toi_incremental_image_pointer toi_inc_ptr[2][2];
-
-#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \
-	       atomic_read(&toi_bio_queue_size))
-
-unsigned long raw_pages_allocd, header_pages_reserved;
-
-static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
-		int no_readahead);
-
-/**
- * set_free_mem_throttle - set the point where we pause to avoid oom.
- *
- * Initially, this value is zero, but when we first fail to allocate memory,
- * we set it (plus a buffer) and thereafter throttle i/o once that limit is
- * reached.
- **/
-static void set_free_mem_throttle(void)
-{
-	int new_throttle = nr_free_buffer_pages() + 256;
-
-	if (new_throttle > free_mem_throttle)
-		free_mem_throttle = new_throttle;
-}
-
-#define NUM_REASONS 7
-static atomic_t reasons[NUM_REASONS];
-static char *reason_name[NUM_REASONS] = {
-	"readahead not ready",
-	"bio allocation",
-	"synchronous I/O",
-	"toi_bio_get_new_page",
-	"memory low",
-	"readahead buffer allocation",
-	"throughput_throttle",
-};
-
-/* User Specified Parameters. */
-unsigned long resume_firstblock;
-dev_t resume_dev_t;
-struct block_device *resume_block_device;
-static atomic_t resume_bdev_open_count;
-
-struct block_device *header_block_device;
-
-/**
- * toi_open_bdev: Open a bdev at resume time.
- *
- * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t
- * (the user can have resume= pointing at a swap partition/file that isn't
- * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the
- * header. It will be from a swap partition that was enabled when we hibernated,
- * but we don't know it's real index until we read that first page.
- * dev_t: The device major/minor.
- * display_errs: Whether to try to do this quietly.
- *
- * We stored a dev_t in the image header. Open the matching device without
- * requiring /dev/<whatever> in most cases and record the details needed
- * to close it later and avoid duplicating work.
- */
-struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
-		int display_errs)
-{
-	struct block_device *bdev;
-	dev_t device = default_device;
-	char buf[32];
-	int retried = 0;
-
-retry:
-	if (uuid) {
-		struct fs_info seek;
-		strncpy((char *) &seek.uuid, uuid, 16);
-		seek.dev_t = 0;
-		seek.last_mount_size = 0;
-		device = blk_lookup_fs_info(&seek);
-		if (!device) {
-			device = default_device;
-			printk(KERN_DEBUG "Unable to resolve uuid. Falling back"
-					" to dev_t.\n");
-		} else
-			printk(KERN_DEBUG "Resolved uuid to device %s.\n",
-					format_dev_t(buf, device));
-	}
-
-	if (!device) {
-		printk(KERN_ERR "TuxOnIce attempting to open a "
-				"blank dev_t!\n");
-		dump_stack();
-		return NULL;
-	}
-	bdev = toi_open_by_devnum(device);
-
-	if (IS_ERR(bdev) || !bdev) {
-		if (!retried) {
-			retried = 1;
-			wait_for_device_probe();
-			goto retry;
-		}
-		if (display_errs)
-			toi_early_boot_message(1, TOI_CONTINUE_REQ,
-				"Failed to get access to block device "
-				"\"%x\" (error %d).\n Maybe you need "
-				"to run mknod and/or lvmsetup in an "
-				"initrd/ramfs?", device, bdev);
-		return ERR_PTR(-EINVAL);
-	}
-	toi_message(TOI_BIO, TOI_VERBOSE, 0,
-			"TuxOnIce got bdev %p for dev_t %x.",
-			bdev, device);
-
-	return bdev;
-}
-
-static void toi_bio_reserve_header_space(unsigned long request)
-{
-	header_pages_reserved = request;
-}
-
-/**
- * do_bio_wait - wait for some TuxOnIce I/O to complete
- * @reason: The array index of the reason we're waiting.
- *
- * Wait for a particular page of I/O if we're after a particular page.
- * If we're not after a particular page, wait instead for all in flight
- * I/O to be completed or for us to have enough free memory to be able
- * to submit more I/O.
- *
- * If we wait, we also update our statistics regarding why we waited.
- **/
-static void do_bio_wait(int reason)
-{
-	struct page *was_waiting_on = waiting_on;
-
-	/* On SMP, waiting_on can be reset, so we make a copy */
-	if (was_waiting_on) {
-		wait_on_page_locked(was_waiting_on);
-		atomic_inc(&reasons[reason]);
-	} else {
-		atomic_inc(&reasons[reason]);
-
-		wait_event(num_in_progress_wait,
-			!atomic_read(&toi_io_in_progress) ||
-			nr_free_buffer_pages() > free_mem_throttle);
-	}
-}
-
-/**
- * throttle_if_needed - wait for I/O completion if throttle points are reached
- * @flags: What to check and how to act.
- *
- * Check whether we need to wait for some I/O to complete. We always check
- * whether we have enough memory available, but may also (depending upon
- * @reason) check if the throughput throttle limit has been reached.
- **/
-static int throttle_if_needed(int flags)
-{
-	int free_pages = nr_free_buffer_pages();
-
-	/* Getting low on memory and I/O is in progress? */
-	while (unlikely(free_pages < free_mem_throttle) &&
-			atomic_read(&toi_io_in_progress) &&
-			!test_result_state(TOI_ABORTED)) {
-		if (!(flags & THROTTLE_WAIT))
-			return -ENOMEM;
-		do_bio_wait(4);
-		free_pages = nr_free_buffer_pages();
-	}
-
-	while (!(flags & MEMORY_ONLY) && throughput_throttle &&
-		TOTAL_OUTSTANDING_IO >= throughput_throttle &&
-		!test_result_state(TOI_ABORTED)) {
-		int result = toi_bio_queue_flush_pages(0);
-		if (result)
-			return result;
-		atomic_inc(&reasons[6]);
-		wait_event(num_in_progress_wait,
-			!atomic_read(&toi_io_in_progress) ||
-			TOTAL_OUTSTANDING_IO < throughput_throttle);
-	}
-
-	return 0;
-}
-
-/**
- * update_throughput_throttle - update the raw throughput throttle
- * @jif_index: The number of times this function has been called.
- *
- * This function is called four times per second by the core, and used to limit
- * the amount of I/O we submit at once, spreading out our waiting through the
- * whole job and letting userui get an opportunity to do its work.
- *
- * We don't start limiting I/O until 1/4s has gone so that we get a
- * decent sample for our initial limit, and keep updating it because
- * throughput may vary (on rotating media, eg) with our block number.
- *
- * We throttle to 1/10s worth of I/O.
- **/
-static void update_throughput_throttle(int jif_index)
-{
-	int done = atomic_read(&toi_io_done);
-	throughput_throttle = done * 2 / 5 / jif_index;
-}
-
-/**
- * toi_finish_all_io - wait for all outstanding i/o to complete
- *
- * Flush any queued but unsubmitted I/O and wait for it all to complete.
- **/
-static int toi_finish_all_io(void)
-{
-	int result = toi_bio_queue_flush_pages(0);
-	toi_bio_queue_flusher_should_finish = 1;
-	wake_up(&toi_io_queue_flusher);
-	wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO);
-	return result;
-}
-
-/**
- * toi_end_bio - bio completion function.
- * @bio: bio that has completed.
- * @err: Error value. Yes, like end_swap_bio_read, we ignore it.
- *
- * Function called by the block driver from interrupt context when I/O is
- * completed. If we were writing the page, we want to free it and will have
- * set bio->bi_private to the parameter we should use in telling the page
- * allocation accounting code what the page was allocated for. If we're
- * reading the page, it will be in the singly linked list made from
- * page->private pointers.
- **/
-static void toi_end_bio(struct bio *bio, int err)
-{
-	struct page *page = bio->bi_io_vec[0].bv_page;
-
-	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
-
-	unlock_page(page);
-	bio_put(bio);
-
-	if (waiting_on == page)
-		waiting_on = NULL;
-
-	put_page(page);
-
-	if (bio->bi_private)
-		toi__free_page((int) ((unsigned long) bio->bi_private) , page);
-
-	bio_put(bio);
-
-	atomic_dec(&toi_io_in_progress);
-	atomic_inc(&toi_io_done);
-
-	wake_up(&num_in_progress_wait);
-}
-
-/**
- * submit - submit BIO request
- * @writing: READ or WRITE.
- * @dev: The block device we're using.
- * @first_block: The first sector we're using.
- * @page: The page being used for I/O.
- * @free_group: If writing, the group that was used in allocating the page
- * 	and which will be used in freeing the page from the completion
- * 	routine.
- *
- * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the
- * textbook - allocate and initialize the bio. If we're writing, make sure
- * the page is marked as dirty. Then submit it and carry on."
- *
- * If we're just testing the speed of our own code, we fake having done all
- * the hard work and all toi_end_bio immediately.
- **/
-static int submit(int writing, struct block_device *dev, sector_t first_block,
-		struct page *page, int free_group)
-{
-	struct bio *bio = NULL;
-	int cur_outstanding_io, result;
-
-	/*
-	 * Shouldn't throttle if reading - can deadlock in the single
-	 * threaded case as pages are only freed when we use the
-	 * readahead.
-	 */
-	if (writing) {
-		result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT);
-		if (result)
-			return result;
-	}
-
-	while (!bio) {
-		bio = bio_alloc(TOI_ATOMIC_GFP, 1);
-		if (!bio) {
-			set_free_mem_throttle();
-			do_bio_wait(1);
-		}
-	}
-
-	bio->bi_bdev = dev;
-	bio->bi_iter.bi_sector = first_block;
-	bio->bi_private = (void *) ((unsigned long) free_group);
-	bio->bi_end_io = toi_end_bio;
-	bio->bi_flags |= (1 << BIO_TOI);
-
-	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n",
-				(unsigned long long) first_block);
-		bio_put(bio);
-		return -EFAULT;
-	}
-
-	bio_get(bio);
-
-	cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress);
-	if (writing) {
-		if (cur_outstanding_io > max_outstanding_writes)
-			max_outstanding_writes = cur_outstanding_io;
-	} else {
-		if (cur_outstanding_io > max_outstanding_reads)
-			max_outstanding_reads = cur_outstanding_io;
-	}
-
-	/* Still read the header! */
-	if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) {
-		/* Fake having done the hard work */
-		set_bit(BIO_UPTODATE, &bio->bi_flags);
-		toi_end_bio(bio, 0);
-	} else
-		submit_bio(writing | REQ_SYNC, bio);
-
-	return 0;
-}
-
-/**
- * toi_do_io: Prepare to do some i/o on a page and submit or batch it.
- *
- * @writing: Whether reading or writing.
- * @bdev: The block device which we're using.
- * @block0: The first sector we're reading or writing.
- * @page: The page on which I/O is being done.
- * @readahead_index: If doing readahead, the index (reset this flag when done).
- * @syncio: Whether the i/o is being done synchronously.
- *
- * Prepare and start a read or write operation.
- *
- * Note that we always work with our own page. If writing, we might be given a
- * compression buffer that will immediately be used to start compressing the
- * next page. For reading, we do readahead and therefore don't know the final
- * address where the data needs to go.
- **/
-int toi_do_io(int writing, struct block_device *bdev, long block0,
-	struct page *page, int is_readahead, int syncio, int free_group)
-{
-	page->private = 0;
-
-	/* Do here so we don't race against toi_bio_get_next_page_read */
-	lock_page(page);
-
-	if (is_readahead) {
-		if (readahead_list_head)
-			readahead_list_tail->private = (unsigned long) page;
-		else
-			readahead_list_head = page;
-
-		readahead_list_tail = page;
-	}
-
-	/* Done before submitting to avoid races. */
-	if (syncio)
-		waiting_on = page;
-
-	/* Submit the page */
-	get_page(page);
-
-	if (submit(writing, bdev, block0, page, free_group))
-		return -EFAULT;
-
-	if (syncio)
-		do_bio_wait(2);
-
-	return 0;
-}
-
-/**
- * toi_bdev_page_io - simpler interface to do directly i/o on a single page
- * @writing: Whether reading or writing.
- * @bdev: Block device on which we're operating.
- * @pos: Sector at which page to read or write starts.
- * @page: Page to be read/written.
- *
- * A simple interface to submit a page of I/O and wait for its completion.
- * The caller must free the page used.
- **/
-static int toi_bdev_page_io(int writing, struct block_device *bdev,
-		long pos, struct page *page)
-{
-	return toi_do_io(writing, bdev, pos, page, 0, 1, 0);
-}
-
-/**
- * toi_bio_memory_needed - report the amount of memory needed for block i/o
- *
- * We want to have at least enough memory so as to have target_outstanding_io
- * or more transactions on the fly at once. If we can do more, fine.
- **/
-static int toi_bio_memory_needed(void)
-{
-	return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) +
-				sizeof(struct bio));
-}
-
-/**
- * toi_bio_print_debug_stats - put out debugging info in the buffer provided
- * @buffer: A buffer of size @size into which text should be placed.
- * @size: The size of @buffer.
- *
- * Fill a buffer with debugging info. This is used for both our debug_info sysfs
- * entry and for recording the same info in dmesg.
- **/
-static int toi_bio_print_debug_stats(char *buffer, int size)
-{
-	int len = 0;
-
-	if (toiActiveAllocator != &toi_blockwriter_ops) {
-		len = scnprintf(buffer, size,
-				"- Block I/O inactive.\n");
-		return len;
-	}
-
-	len = scnprintf(buffer, size, "- Block I/O active.\n");
-
-	len += toi_bio_chains_debug_info(buffer + len, size - len);
-
-	len += scnprintf(buffer + len, size - len,
-			"- Max outstanding reads %d. Max writes %d.\n",
-			max_outstanding_reads, max_outstanding_writes);
-
-	len += scnprintf(buffer + len, size - len,
-		"  Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n",
-		target_outstanding_io,
-		PAGE_SIZE, (unsigned int) sizeof(struct request),
-		(unsigned int) sizeof(struct bio), toi_bio_memory_needed());
-
-#ifdef MEASURE_MUTEX_CONTENTION
-	{
-	int i;
-
-	len += scnprintf(buffer + len, size - len,
-		"  Mutex contention while reading:\n  Contended      Free\n");
-
-	for_each_online_cpu(i)
-		len += scnprintf(buffer + len, size - len,
-		"  %9lu %9lu\n",
-		mutex_times[0][0][i], mutex_times[0][1][i]);
-
-	len += scnprintf(buffer + len, size - len,
-		"  Mutex contention while writing:\n  Contended      Free\n");
-
-	for_each_online_cpu(i)
-		len += scnprintf(buffer + len, size - len,
-		"  %9lu %9lu\n",
-		mutex_times[1][0][i], mutex_times[1][1][i]);
-
-	}
-#endif
-
-	return len + scnprintf(buffer + len, size - len,
-		"  Free mem throttle point reached %d.\n", free_mem_throttle);
-}
-
-static int total_header_bytes;
-static int unowned;
-
-void debug_broken_header(void)
-{
-	printk(KERN_DEBUG "Image header too big for size allocated!\n");
-	print_toi_header_storage_for_modules();
-	printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed());
-	printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header));
-	printk(KERN_DEBUG "Total unowned : %d.\n", unowned);
-	printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes,
-			DIV_ROUND_UP(total_header_bytes, PAGE_SIZE));
-	printk(KERN_DEBUG "Space needed now : %ld.\n",
-			get_header_storage_needed());
-	dump_block_chains();
-	abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small.");
-}
-
-static int toi_bio_update_previous_inc_img_ptr(int stream)
-{
-    int result;
-    char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP);
-    struct page *page;
-    struct toi_incremental_image_pointer *prev, *this;
-
-    prev = &toi_inc_ptr[stream][0];
-    this = &toi_inc_ptr[stream][1];
-
-    if (!buffer) {
-        // We're at the start of writing a pageset. Memory should not be that scarce.
-        return -ENOMEM;
-    }
-
-    page = virt_to_page(buffer);
-    result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0);
-
-    if (result)
-        goto out;
-
-    memcpy(buffer, (char *) this, sizeof(this->save));
-
-    result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12);
-
-    // If the IO is successfully submitted (!result), the page will be freed
-    // asynchronously on completion.
-out:
-    if (result)
-        toi__free_page(12, virt_to_page(buffer));
-    return result;
-}
-
-/**
- * toi_rw_init_incremental - incremental image part of setting up to write new section
- */
-static int toi_write_init_incremental(int stream)
-{
-    int result = 0;
-
-    // Remember the location of this block so we can link to it.
-    toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]);
-
-    // Update the pointer at the start of the last pageset with the same stream number.
-    result = toi_bio_update_previous_inc_img_ptr(stream);
-    if (result)
-        return result;
-
-    // Move the current to the previous slot.
-    memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]));
-
-    // Store a blank pointer at the start of this incremental pageset
-    memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1]));
-    result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0);
-    if (result)
-        return result;
-
-    // Serialise extent chains if this is an incremental pageset
-    return toi_serialise_extent_chains();
-}
-
-/**
- * toi_read_init_incremental - incremental image part of setting up to read new section
- */
-static int toi_read_init_incremental(int stream)
-{
-    int result;
-
-    // Set our position to the start of the next pageset
-    toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]);
-
-    // Read the start of the next incremental pageset (if any)
-    result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0);
-
-    if (!result)
-        result = toi_load_extent_chains();
-
-    return result;
-}
-
-/**
- * toi_rw_init - prepare to read or write a stream in the image
- * @writing: Whether reading or writing.
- * @stream number: Section of the image being processed.
- *
- * Prepare to read or write a section ('stream') in the image.
- **/
-static int toi_rw_init(int writing, int stream_number)
-{
-	if (stream_number)
-		toi_extent_state_restore(stream_number);
-	else
-		toi_extent_state_goto_start();
-
-	if (writing) {
-		reset_idx = 0;
-		if (!current_stream)
-			page_idx = 0;
-	} else {
-		reset_idx = 1;
-	}
-
-	atomic_set(&toi_io_done, 0);
-	if (!toi_writer_buffer)
-		toi_writer_buffer = (char *) toi_get_zeroed_page(11,
-				TOI_ATOMIC_GFP);
-	toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE;
-
-	current_stream = stream_number;
-
-	more_readahead = 1;
-
-        if (test_result_state(TOI_KEPT_IMAGE)) {
-            int result;
-
-            if (writing) {
-                result = toi_write_init_incremental(stream_number);
-            } else {
-                result = toi_read_init_incremental(stream_number);
-            }
-
-            if (result)
-                return result;
-        }
-
-	return toi_writer_buffer ? 0 : -ENOMEM;
-}
-
-/**
- * toi_bio_queue_write - queue a page for writing
- * @full_buffer: Pointer to a page to be queued
- *
- * Add a page to the queue to be submitted. If we're the queue flusher,
- * we'll do this once we've dropped toi_bio_mutex, so other threads can
- * continue to submit I/O while we're on the slow path doing the actual
- * submission.
- **/
-static void toi_bio_queue_write(char **full_buffer)
-{
-	struct page *page = virt_to_page(*full_buffer);
-	unsigned long flags;
-
-	*full_buffer = NULL;
-	page->private = 0;
-
-	spin_lock_irqsave(&bio_queue_lock, flags);
-	if (!bio_queue_head)
-		bio_queue_head = page;
-	else
-		bio_queue_tail->private = (unsigned long) page;
-
-	bio_queue_tail = page;
-	atomic_inc(&toi_bio_queue_size);
-
-	spin_unlock_irqrestore(&bio_queue_lock, flags);
-	wake_up(&toi_io_queue_flusher);
-}
-
-/**
- * toi_rw_cleanup - Cleanup after i/o.
- * @writing: Whether we were reading or writing.
- *
- * Flush all I/O and clean everything up after reading or writing a
- * section of the image.
- **/
-static int toi_rw_cleanup(int writing)
-{
-	int i, result = 0;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup.");
-	if (writing) {
-		if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED))
-			toi_bio_queue_write(&toi_writer_buffer);
-
-		while (bio_queue_head && !result)
-			result = toi_bio_queue_flush_pages(0);
-
-		if (result)
-			return result;
-
-		if (current_stream == 2)
-			toi_extent_state_save(1);
-		else if (current_stream == 1)
-			toi_extent_state_save(3);
-	}
-
-	result = toi_finish_all_io();
-
-	while (readahead_list_head) {
-		void *next = (void *) readahead_list_head->private;
-		toi__free_page(12, readahead_list_head);
-		readahead_list_head = next;
-	}
-
-	readahead_list_tail = NULL;
-
-	if (!current_stream)
-		return result;
-
-	for (i = 0; i < NUM_REASONS; i++) {
-		if (!atomic_read(&reasons[i]))
-			continue;
-		printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n",
-				reason_name[i], atomic_read(&reasons[i]));
-		atomic_set(&reasons[i], 0);
-	}
-
-	current_stream = 0;
-	return result;
-}
-
-/**
- * toi_start_one_readahead - start one page of readahead
- * @dedicated_thread: Is this a thread dedicated to doing readahead?
- *
- * Start one new page of readahead. If this is being called by a thread
- * whose only just is to submit readahead, don't quit because we failed
- * to allocate a page.
- **/
-static int toi_start_one_readahead(int dedicated_thread)
-{
-	char *buffer = NULL;
-	int oom = 0, result;
-
-	result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0);
-	if (result) {
-            printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result);
-            return result;
-        }
-
-	mutex_lock(&toi_bio_readahead_mutex);
-
-	while (!buffer) {
-		buffer = (char *) toi_get_zeroed_page(12,
-				TOI_ATOMIC_GFP);
-		if (!buffer) {
-			if (oom && !dedicated_thread) {
-				mutex_unlock(&toi_bio_readahead_mutex);
-                                printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result);
-				return -ENOMEM;
-			}
-
-			oom = 1;
-			set_free_mem_throttle();
-			do_bio_wait(5);
-		}
-	}
-
-	result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
-        if (result) {
-            printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result);
-        }
-	if (result == -ENOSPC)
-		toi__free_page(12, virt_to_page(buffer));
-	mutex_unlock(&toi_bio_readahead_mutex);
-	if (result) {
-		if (result == -ENOSPC)
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Last readahead page submitted.");
-		else
-			printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n",
-					result);
-	}
-	return result;
-}
-
-/**
- * toi_start_new_readahead - start new readahead
- * @dedicated_thread: Are we dedicated to this task?
- *
- * Start readahead of image pages.
- *
- * We can be called as a thread dedicated to this task (may be helpful on
- * systems with lots of CPUs), in which case we don't exit until there's no
- * more readahead.
- *
- * If this is not called by a dedicated thread, we top up our queue until
- * there's no more readahead to submit, we've submitted the number given
- * in target_outstanding_io or the number in progress exceeds the target
- * outstanding I/O value.
- *
- * No mutex needed because this is only ever called by the first cpu.
- **/
-static int toi_start_new_readahead(int dedicated_thread)
-{
-	int last_result, num_submitted = 0;
-
-	/* Start a new readahead? */
-	if (!more_readahead)
-		return 0;
-
-	do {
-		last_result = toi_start_one_readahead(dedicated_thread);
-
-		if (last_result) {
-			if (last_result == -ENOMEM || last_result == -ENOSPC)
-				return 0;
-
-			printk(KERN_DEBUG
-				"Begin read chunk returned %d.\n",
-				last_result);
-		} else
-			num_submitted++;
-
-	} while (more_readahead && !last_result &&
-		 (dedicated_thread ||
-		  (num_submitted < target_outstanding_io &&
-		   atomic_read(&toi_io_in_progress) < target_outstanding_io)));
-
-	return last_result;
-}
-
-/**
- * bio_io_flusher - start the dedicated I/O flushing routine
- * @writing: Whether we're writing the image.
- **/
-static int bio_io_flusher(int writing)
-{
-
-	if (writing)
-		return toi_bio_queue_flush_pages(1);
-	else
-		return toi_start_new_readahead(1);
-}
-
-/**
- * toi_bio_get_next_page_read - read a disk page, perhaps with readahead
- * @no_readahead: Whether we can use readahead
- *
- * Read a page from disk, submitting readahead and cleaning up finished i/o
- * while we wait for the page we're after.
- **/
-static int toi_bio_get_next_page_read(int no_readahead)
-{
-	char *virt;
-	struct page *old_readahead_list_head;
-
-	/*
-	 * When reading the second page of the header, we have to
-	 * delay submitting the read until after we've gotten the
-	 * extents out of the first page.
-	 */
-        if (unlikely(no_readahead)) {
-            int result = toi_start_one_readahead(0);
-            if (result) {
-                printk(KERN_EMERG "No readahead and toi_start_one_readahead "
-                        "returned non-zero.\n");
-                return -EIO;
-            }
-        }
-
-	if (unlikely(!readahead_list_head)) {
-		/*
-		 * If the last page finishes exactly on the page
-		 * boundary, we will be called one extra time and
-		 * have no data to return. In this case, we should
-		 * not BUG(), like we used to!
-		 */
-		if (!more_readahead) {
-			printk(KERN_EMERG "No more readahead.\n");
-			return -ENOSPC;
-		}
-		if (unlikely(toi_start_one_readahead(0))) {
-			printk(KERN_EMERG "No readahead and "
-			 "toi_start_one_readahead returned non-zero.\n");
-			return -EIO;
-		}
-	}
-
-	if (PageLocked(readahead_list_head)) {
-		waiting_on = readahead_list_head;
-		do_bio_wait(0);
-	}
-
-	virt = page_address(readahead_list_head);
-	memcpy(toi_writer_buffer, virt, PAGE_SIZE);
-	
-	mutex_lock(&toi_bio_readahead_mutex);
-	old_readahead_list_head = readahead_list_head;
-	readahead_list_head = (struct page *) readahead_list_head->private;
-	mutex_unlock(&toi_bio_readahead_mutex);
-	toi__free_page(12, old_readahead_list_head);
-	return 0;
-}
-
-/**
- * toi_bio_queue_flush_pages - flush the queue of pages queued for writing
- * @dedicated_thread: Whether we're a dedicated thread
- *
- * Flush the queue of pages ready to be written to disk.
- *
- * If we're a dedicated thread, stay in here until told to leave,
- * sleeping in wait_event.
- *
- * The first thread is normally the only one to come in here. Another
- * thread can enter this routine too, though, via throttle_if_needed.
- * Since that's the case, we must be careful to only have one thread
- * doing this work at a time. Otherwise we have a race and could save
- * pages out of order.
- *
- * If an error occurs, free all remaining pages without submitting them
- * for I/O.
- **/
-
-int toi_bio_queue_flush_pages(int dedicated_thread)
-{
-	unsigned long flags;
-	int result = 0;
-	static DEFINE_MUTEX(busy);
-
-	if (!mutex_trylock(&busy))
-		return 0;
-
-top:
-	spin_lock_irqsave(&bio_queue_lock, flags);
-	while (bio_queue_head) {
-		struct page *page = bio_queue_head;
-		bio_queue_head = (struct page *) page->private;
-		if (bio_queue_tail == page)
-			bio_queue_tail = NULL;
-		atomic_dec(&toi_bio_queue_size);
-		spin_unlock_irqrestore(&bio_queue_lock, flags);
-
-		/* Don't generate more error messages if already had one */
-		if (!result)
-			result = toi_bio_rw_page(WRITE, page, 0, 11);
-		/*
-		 * If writing the page failed, don't drop out.
-		 * Flush the rest of the queue too.
-		 */
-		if (result)
-			toi__free_page(11 , page);
-		spin_lock_irqsave(&bio_queue_lock, flags);
-	}
-	spin_unlock_irqrestore(&bio_queue_lock, flags);
-
-	if (dedicated_thread) {
-		wait_event(toi_io_queue_flusher, bio_queue_head ||
-				toi_bio_queue_flusher_should_finish);
-		if (likely(!toi_bio_queue_flusher_should_finish))
-			goto top;
-		toi_bio_queue_flusher_should_finish = 0;
-	}
-
-	mutex_unlock(&busy);
-	return result;
-}
-
-/**
- * toi_bio_get_new_page - get a new page for I/O
- * @full_buffer: Pointer to a page to allocate.
- **/
-static int toi_bio_get_new_page(char **full_buffer)
-{
-	int result = throttle_if_needed(THROTTLE_WAIT);
-	if (result)
-		return result;
-
-	while (!*full_buffer) {
-		*full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
-		if (!*full_buffer) {
-			set_free_mem_throttle();
-			do_bio_wait(3);
-		}
-	}
-
-	return 0;
-}
-
-/**
- * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O
- * @writing:		Bool - whether writing (or reading).
- * @buffer:		The start of the buffer to write or fill.
- * @buffer_size:	The size of the buffer to write or fill.
- * @no_readahead:	Don't try to start readhead (when getting extents).
- **/
-static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
-		int no_readahead)
-{
-	int bytes_left = buffer_size, result = 0;
-
-	while (bytes_left) {
-		char *source_start = buffer + buffer_size - bytes_left;
-		char *dest_start = toi_writer_buffer + toi_writer_buffer_posn;
-		int capacity = PAGE_SIZE - toi_writer_buffer_posn;
-		char *to = writing ? dest_start : source_start;
-		char *from = writing ? source_start : dest_start;
-
-		if (bytes_left <= capacity) {
-			memcpy(to, from, bytes_left);
-			toi_writer_buffer_posn += bytes_left;
-			return 0;
-		}
-
-		/* Complete this page and start a new one */
-		memcpy(to, from, capacity);
-		bytes_left -= capacity;
-
-		if (!writing) {
-			/*
-			 * Perform actual I/O:
-			 * read readahead_list_head into toi_writer_buffer
-			 */
-			int result = toi_bio_get_next_page_read(no_readahead);
-			if (result && bytes_left) {
-				printk("toi_bio_get_next_page_read "
-                                        "returned %d. Expecting to read %d bytes.\n", result, bytes_left);
-                                return result;
-			}
-		} else {
-			toi_bio_queue_write(&toi_writer_buffer);
-			result = toi_bio_get_new_page(&toi_writer_buffer);
-			if (result) {
-				printk(KERN_ERR "toi_bio_get_new_page returned "
-						"%d.\n", result);
-				return result;
-			}
-		}
-
-		toi_writer_buffer_posn = 0;
-		toi_cond_pause(0, NULL);
-	}
-
-	return 0;
-}
-
-/**
- * toi_bio_read_page - read a page of the image
- * @pfn:		The pfn where the data belongs.
- * @buffer_page:	The page containing the (possibly compressed) data.
- * @buf_size:		The number of bytes on @buffer_page used (PAGE_SIZE).
- *
- * Read a (possibly compressed) page from the image, into buffer_page,
- * returning its pfn and the buffer size.
- **/
-static int toi_bio_read_page(unsigned long *pfn, int buf_type,
-		void *buffer_page, unsigned int *buf_size)
-{
-	int result = 0;
-	int this_idx;
-	char *buffer_virt = TOI_MAP(buf_type, buffer_page);
-
-	/*
-	 * Only call start_new_readahead if we don't have a dedicated thread
-	 * and we're the queue flusher.
-	 */
-	if (current == toi_queue_flusher && more_readahead &&
-			!test_action_state(TOI_NO_READAHEAD)) {
-		int result2 = toi_start_new_readahead(0);
-		if (result2) {
-			printk(KERN_DEBUG "Queue flusher and "
-			 "toi_start_one_readahead returned non-zero.\n");
-			result = -EIO;
-			goto out;
-		}
-	}
-
-	my_mutex_lock(0, &toi_bio_mutex);
-
-	/*
-	 * Structure in the image:
-	 *	[destination pfn|page size|page data]
-	 * buf_size is PAGE_SIZE
-	 * We can validly find there's nothing to read in a multithreaded
-	 * situation.
-	 */
-	if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) ||
-	    toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
-	    toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
-	    toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
-		result = -ENODATA;
-		goto out_unlock;
-	}
-
-	if (reset_idx) {
-		page_idx = this_idx;
-		reset_idx = 0;
-	} else {
-		page_idx++;
-		if (!this_idx)
-			result = -ENODATA;
-		else if (page_idx != this_idx)
-			printk(KERN_ERR "Got page index %d, expected %d.\n",
-					this_idx, page_idx);
-	}
-
-out_unlock:
-	my_mutex_unlock(0, &toi_bio_mutex);
-out:
-	TOI_UNMAP(buf_type, buffer_page);
-	return result;
-}
-
-/**
- * toi_bio_write_page - write a page of the image
- * @pfn:		The pfn where the data belongs.
- * @buffer_page:	The page containing the (possibly compressed) data.
- * @buf_size:	The number of bytes on @buffer_page used.
- *
- * Write a (possibly compressed) page to the image from the buffer, together
- * with it's index and buffer size.
- **/
-static int toi_bio_write_page(unsigned long pfn, int buf_type,
-		void *buffer_page, unsigned int buf_size)
-{
-	char *buffer_virt;
-	int result = 0, result2 = 0;
-
-	if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
-		return 0;
-
-	my_mutex_lock(1, &toi_bio_mutex);
-
-	if (test_result_state(TOI_ABORTED)) {
-		my_mutex_unlock(1, &toi_bio_mutex);
-		return 0;
-	}
-
-	buffer_virt = TOI_MAP(buf_type, buffer_page);
-	page_idx++;
-
-	/*
-	 * Structure in the image:
-	 *	[destination pfn|page size|page data]
-	 * buf_size is PAGE_SIZE
-	 */
-	if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) ||
-	    toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
-	    toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
-	    toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
-		printk(KERN_DEBUG "toi_rw_buffer returned non-zero to "
-				"toi_bio_write_page.\n");
-		result = -EIO;
-	}
-
-	TOI_UNMAP(buf_type, buffer_page);
-	my_mutex_unlock(1, &toi_bio_mutex);
-
-	if (current == toi_queue_flusher)
-		result2 = toi_bio_queue_flush_pages(0);
-
-	return result ? result : result2;
-}
-
-/**
- * _toi_rw_header_chunk - read or write a portion of the image header
- * @writing:		Whether reading or writing.
- * @owner:		The module for which we're writing.
- *			Used for confirming that modules
- *			don't use more header space than they asked for.
- * @buffer:		Address of the data to write.
- * @buffer_size:	Size of the data buffer.
- * @no_readahead:	Don't try to start readhead (when getting extents).
- *
- * Perform PAGE_SIZE I/O. Start readahead if needed.
- **/
-static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
-		char *buffer, int buffer_size, int no_readahead)
-{
-	int result = 0;
-
-	if (owner) {
-		owner->header_used += buffer_size;
-		toi_message(TOI_HEADER, TOI_LOW, 1,
-			"Header: %s : %d bytes (%d/%d) from offset %d.",
-			owner->name,
-			buffer_size, owner->header_used,
-			owner->header_requested,
-			toi_writer_buffer_posn);
-		if (owner->header_used > owner->header_requested && writing) {
-			printk(KERN_EMERG "TuxOnIce module %s is using more "
-				"header space (%u) than it requested (%u).\n",
-				owner->name,
-				owner->header_used,
-				owner->header_requested);
-			return buffer_size;
-		}
-	} else {
-		unowned += buffer_size;
-		toi_message(TOI_HEADER, TOI_LOW, 1,
-			"Header: (No owner): %d bytes (%d total so far) from "
-			"offset %d.", buffer_size, unowned,
-			toi_writer_buffer_posn);
-	}
-
-	if (!writing && !no_readahead && more_readahead) {
-		result = toi_start_new_readahead(0);
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead "
-				"returned %d.", result);
-	}
-
-	if (!result) {
-		result = toi_rw_buffer(writing, buffer, buffer_size,
-				no_readahead);
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned "
-				"%d.", result);
-	}
-
-	total_header_bytes += buffer_size;
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning "
-			"%d.", result);
-	return result;
-}
-
-static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
-		char *buffer, int size)
-{
-	return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
-}
-
-static int toi_rw_header_chunk_noreadahead(int writing,
-		struct toi_module_ops *owner, char *buffer, int size)
-{
-	return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
-}
-
-/**
- * toi_bio_storage_needed - get the amount of storage needed for my fns
- **/
-static int toi_bio_storage_needed(void)
-{
-	return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed();
-}
-
-/**
- * toi_bio_save_config_info - save block I/O config to image header
- * @buf:	PAGE_SIZE'd buffer into which data should be saved.
- **/
-static int toi_bio_save_config_info(char *buf)
-{
-	int *ints = (int *) buf;
-	ints[0] = target_outstanding_io;
-	return sizeof(int);
-}
-
-/**
- * toi_bio_load_config_info - restore block I/O config
- * @buf:	Data to be reloaded.
- * @size:	Size of the buffer saved.
- **/
-static void toi_bio_load_config_info(char *buf, int size)
-{
-	int *ints = (int *) buf;
-	target_outstanding_io  = ints[0];
-}
-
-void close_resume_dev_t(int force)
-{
-	if (!resume_block_device)
-		return;
-
-	if (force)
-		atomic_set(&resume_bdev_open_count, 0);
-	else
-		atomic_dec(&resume_bdev_open_count);
-
-	if (!atomic_read(&resume_bdev_open_count)) {
-		toi_close_bdev(resume_block_device);
-		resume_block_device = NULL;
-	}
-}
-
-int open_resume_dev_t(int force, int quiet)
-{
-	if (force) {
-		close_resume_dev_t(1);
-		atomic_set(&resume_bdev_open_count, 1);
-	} else
-		atomic_inc(&resume_bdev_open_count);
-
-	if (resume_block_device)
-		return 0;
-
-	resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0);
-	if (IS_ERR(resume_block_device)) {
-		if (!quiet)
-			toi_early_boot_message(1, TOI_CONTINUE_REQ,
-				"Failed to open device %x, where"
-				" the header should be found.",
-				resume_dev_t);
-		resume_block_device = NULL;
-		atomic_set(&resume_bdev_open_count, 0);
-		return 1;
-	}
-
-	return 0;
-}
-
-/**
- * toi_bio_initialise - initialise bio code at start of some action
- * @starting_cycle:	Whether starting a hibernation cycle, or just reading or
- *			writing a sysfs value.
- **/
-static int toi_bio_initialise(int starting_cycle)
-{
-	int result;
-
-	if (!starting_cycle || !resume_dev_t)
-		return 0;
-
-	max_outstanding_writes = 0;
-	max_outstanding_reads = 0;
-	current_stream = 0;
-	toi_queue_flusher = current;
-#ifdef MEASURE_MUTEX_CONTENTION
-	{
-		int i, j, k;
-
-		for (i = 0; i < 2; i++)
-			for (j = 0; j < 2; j++)
-				for_each_online_cpu(k)
-					mutex_times[i][j][k] = 0;
-	}
-#endif
-	result = open_resume_dev_t(0, 1);
-
-	if (result)
-		return result;
-
-	return get_signature_page();
-}
-
-static unsigned long raw_to_real(unsigned long raw)
-{
-	unsigned long extra;
-
-	extra = (raw * (sizeof(unsigned long) + sizeof(int)) +
-		(PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) /
-		(PAGE_SIZE + sizeof(unsigned long) + sizeof(int));
-
-	return raw > extra ? raw - extra : 0;
-}
-
-static unsigned long toi_bio_storage_available(void)
-{
-	unsigned long sum = 0;
-	struct toi_module_ops *this_module;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled ||
-		    this_module->type != BIO_ALLOCATOR_MODULE)
-			continue;
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage "
-				"available from %s.", this_module->name);
-		sum += this_module->bio_allocator_ops->storage_available();
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu "
-			"pages (%d header pages).", sum, header_pages_reserved);
-
-	return sum > header_pages_reserved ?
-		raw_to_real(sum - header_pages_reserved) : 0;
-
-}
-
-static unsigned long toi_bio_storage_allocated(void)
-{
-	return raw_pages_allocd > header_pages_reserved ?
-		raw_to_real(raw_pages_allocd - header_pages_reserved) : 0;
-}
-
-/*
- * If we have read part of the image, we might have filled  memory with
- * data that should be zeroed out.
- */
-static void toi_bio_noresume_reset(void)
-{
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset.");
-	toi_rw_cleanup(READ);
-	free_all_bdev_info();
-}
-
-/**
- * toi_bio_cleanup - cleanup after some action
- * @finishing_cycle:	Whether completing a cycle.
- **/
-static void toi_bio_cleanup(int finishing_cycle)
-{
-	if (!finishing_cycle)
-		return;
-
-	if (toi_writer_buffer) {
-		toi_free_page(11, (unsigned long) toi_writer_buffer);
-		toi_writer_buffer = NULL;
-	}
-
-	forget_signature_page();
-
-	if (header_block_device && toi_sig_data &&
-			toi_sig_data->header_dev_t != resume_dev_t)
-		toi_close_bdev(header_block_device);
-
-	header_block_device = NULL;
-
-	close_resume_dev_t(0);
-}
-
-static int toi_bio_write_header_init(void)
-{
-	int result;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init");
-	toi_rw_init(WRITE, 0);
-	toi_writer_buffer_posn = 0;
-
-	/* Info needed to bootstrap goes at the start of the header.
-	 * First we save the positions and devinfo, including the number
-	 * of header pages. Then we save the structs containing data needed
-	 * for reading the header pages back.
-	 * Note that even if header pages take more than one page, when we
-	 * read back the info, we will have restored the location of the
-	 * next header page by the time we go to use it.
-	 */
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains.");
-	result = toi_serialise_extent_chains();
-
-	if (result)
-		return result;
-
-	/*
-	 * Signature page hasn't been modified at this point. Write it in
-	 * the header so we can restore it later.
-	 */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page.");
-	return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops,
-			(char *) toi_cur_sig_page,
-			PAGE_SIZE);
-}
-
-static int toi_bio_write_header_cleanup(void)
-{
-	int result = 0;
-
-	if (toi_writer_buffer_posn)
-		toi_bio_queue_write(&toi_writer_buffer);
-
-	result = toi_finish_all_io();
-
-	unowned = 0;
-	total_header_bytes = 0;
-
-	/* Set signature to save we have an image */
-	if (!result)
-		result = toi_bio_mark_have_image();
-
-	return result;
-}
-
-/*
- * toi_bio_read_header_init()
- *
- * Description:
- * 1. Attempt to read the device specified with resume=.
- * 2. Check the contents of the swap header for our signature.
- * 3. Warn, ignore, reset and/or continue as appropriate.
- * 4. If continuing, read the toi_swap configuration section
- *    of the header and set up block device info so we can read
- *    the rest of the header & image.
- *
- * Returns:
- * May not return if user choose to reboot at a warning.
- * -EINVAL if cannot resume at this time. Booting should continue
- * normally.
- */
-
-static int toi_bio_read_header_init(void)
-{
-	int result = 0;
-	char buf[32];
-
-	toi_writer_buffer_posn = 0;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init");
-
-	if (!toi_sig_data) {
-		printk(KERN_INFO "toi_bio_read_header_init called when we "
-				"haven't verified there is an image!\n");
-		return -EINVAL;
-	}
-
-	/*
-	 * If the header is not on the resume_swap_dev_t, get the resume device
-	 * first.
-	 */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.",
-			toi_sig_data->header_dev_t);
-	if (toi_sig_data->have_uuid) {
-		struct fs_info seek;
-		dev_t device;
-
-		strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16);
-		seek.dev_t = toi_sig_data->header_dev_t;
-		seek.last_mount_size = 0;
-		device = blk_lookup_fs_info(&seek);
-		if (device) {
-			printk("Using dev_t %s, returned by blk_lookup_fs_info.\n",
-					format_dev_t(buf, device));
-			toi_sig_data->header_dev_t = device;
-		}
-	}
-	if (toi_sig_data->header_dev_t != resume_dev_t) {
-		header_block_device = toi_open_bdev(NULL,
-				toi_sig_data->header_dev_t, 1);
-
-		if (IS_ERR(header_block_device))
-			return PTR_ERR(header_block_device);
-	} else
-		header_block_device = resume_block_device;
-
-	if (!toi_writer_buffer)
-		toi_writer_buffer = (char *) toi_get_zeroed_page(11,
-				TOI_ATOMIC_GFP);
-	more_readahead = 1;
-
-	/*
-	 * Read toi_swap configuration.
-	 * Headerblock size taken into account already.
-	 */
-	result = toi_bio_ops.bdev_page_io(READ, header_block_device,
-			toi_sig_data->first_header_block,
-			virt_to_page((unsigned long) toi_writer_buffer));
-	if (result)
-		return result;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains.");
-	result = toi_load_extent_chains();
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page.");
-	toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
-	if (!toi_orig_sig_page) {
-		printk(KERN_ERR "Failed to allocate memory for the current"
-			" image signature.\n");
-		return -ENOMEM;
-	}
-
-	return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops,
-			(char *) toi_orig_sig_page,
-			PAGE_SIZE);
-}
-
-static int toi_bio_read_header_cleanup(void)
-{
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup.");
-	return toi_rw_cleanup(READ);
-}
-
-/* Works only for digits and letters, but small and fast */
-#define TOLOWER(x) ((x) | 0x20)
-
-/*
- * UUID must be 32 chars long. It may have dashes, but nothing
- * else.
- */
-char *uuid_from_commandline(char *commandline)
-{
-	int low = 0;
-	char *result = NULL, *output, *ptr;
-
-	if (strncmp(commandline, "UUID=", 5))
-		return NULL;
-
-	result = kzalloc(17, GFP_KERNEL);
-	if (!result) {
-		printk("Failed to kzalloc UUID text memory.\n");
-		return NULL;
-	}
-
-	ptr = commandline + 5;
-	output = result;
-
-	while (*ptr && (output - result) < 16) {
-		if (isxdigit(*ptr)) {
-			int value = isdigit(*ptr) ? *ptr - '0' :
-				TOLOWER(*ptr) - 'a' + 10;
-			if (low) {
-				*output += value;
-				output++;
-			} else {
-				*output = value << 4;
-			}
-			low = !low;
-		} else if (*ptr != '-')
-			break;
-		ptr++;
-	}
-
-	if ((output - result) < 16 || *ptr) {
-		printk(KERN_DEBUG "Found resume=UUID=, but the value looks "
-				"invalid.\n");
-		kfree(result);
-		result = NULL;
-	}
-
-	return result;
-}
-
-#define retry_if_fails(command) \
-do { \
-	command; \
-	if (!resume_dev_t && !waited_for_device_probe) { \
-		wait_for_device_probe(); \
-		command; \
-		waited_for_device_probe = 1; \
-	} \
-} while(0)
-
-/**
- * try_to_open_resume_device: Try to parse and open resume=
- *
- * Any "swap:" has been stripped away and we just have the path to deal with.
- * We attempt to do name_to_dev_t, open and stat the file. Having opened the
- * file, get the struct block_device * to match.
- */
-static int try_to_open_resume_device(char *commandline, int quiet)
-{
-	struct kstat stat;
-	int error = 0;
-	char *uuid = uuid_from_commandline(commandline);
-	int waited_for_device_probe = 0;
-
-	resume_dev_t = MKDEV(0, 0);
-
-	if (!strlen(commandline))
-		retry_if_fails(toi_bio_scan_for_image(quiet));
-
-	if (uuid) {
-		struct fs_info seek;
-		strncpy((char *) &seek.uuid, uuid, 16);
-		seek.dev_t = resume_dev_t;
-		seek.last_mount_size = 0;
-		retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek));
-		kfree(uuid);
-	}
-
-	if (!resume_dev_t)
-		retry_if_fails(resume_dev_t = name_to_dev_t(commandline));
-
-	if (!resume_dev_t) {
-		struct file *file = filp_open(commandline,
-				O_RDONLY|O_LARGEFILE, 0);
-
-		if (!IS_ERR(file) && file) {
-			vfs_getattr(&file->f_path, &stat);
-			filp_close(file, NULL);
-		} else
-			error = vfs_stat(commandline, &stat);
-		if (!error)
-			resume_dev_t = stat.rdev;
-	}
-
-	if (!resume_dev_t) {
-		if (quiet)
-			return 1;
-
-		if (test_toi_state(TOI_TRYING_TO_RESUME))
-			toi_early_boot_message(1, toi_translate_err_default,
-			  "Failed to translate \"%s\" into a device id.\n",
-			  commandline);
-		else
-			printk("TuxOnIce: Can't translate \"%s\" into a device "
-					"id yet.\n", commandline);
-		return 1;
-	}
-
-	return open_resume_dev_t(1, quiet);
-}
-
-/*
- * Parse Image Location
- *
- * Attempt to parse a resume= parameter.
- * Swap Writer accepts:
- * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
- *
- * Where:
- * DEVNAME is convertable to a dev_t by name_to_dev_t
- * FIRSTBLOCK is the location of the first block in the swap file
- * (specifying for a swap partition is nonsensical but not prohibited).
- * Data is validated by attempting to read a swap header from the
- * location given. Failure will result in toi_swap refusing to
- * save an image, and a reboot with correct parameters will be
- * necessary.
- */
-static int toi_bio_parse_sig_location(char *commandline,
-		int only_allocator, int quiet)
-{
-	char *thischar, *devstart, *colon = NULL;
-	int signature_found, result = -EINVAL, temp_result = 0;
-
-	if (strncmp(commandline, "swap:", 5) &&
-	    strncmp(commandline, "file:", 5)) {
-		/*
-		 * Failing swap:, we'll take a simple resume=/dev/hda2, or a
-		 * blank value (scan) but fall through to other allocators
-		 * if /dev/ or UUID= isn't matched.
-		 */
-		if (strncmp(commandline, "/dev/", 5) &&
-		    strncmp(commandline, "UUID=", 5) &&
-		    strlen(commandline))
-			return 1;
-	} else
-		commandline += 5;
-
-	devstart = commandline;
-	thischar = commandline;
-	while ((*thischar != ':') && (*thischar != '@') &&
-		((thischar - commandline) < 250) && (*thischar))
-		thischar++;
-
-	if (*thischar == ':') {
-		colon = thischar;
-		*colon = 0;
-		thischar++;
-	}
-
-	while ((thischar - commandline) < 250 && *thischar)
-		thischar++;
-
-	if (colon) {
-		unsigned long block;
-		temp_result = kstrtoul(colon + 1, 0, &block);
-		if (!temp_result)
-			resume_firstblock = (int) block;
-	} else
-		resume_firstblock = 0;
-
-	clear_toi_state(TOI_CAN_HIBERNATE);
-	clear_toi_state(TOI_CAN_RESUME);
-
-	if (!temp_result)
-		temp_result = try_to_open_resume_device(devstart, quiet);
-
-	if (colon)
-		*colon = ':';
-
-	/* No error if we only scanned */
-	if (temp_result)
-		return strlen(commandline) ? -EINVAL : 1;
-
-	signature_found = toi_bio_image_exists(quiet);
-
-	if (signature_found != -1) {
-		result = 0;
-		/*
-		 * TODO: If only file storage, CAN_HIBERNATE should only be
-		 * set if file allocator's target is valid.
-		 */
-		set_toi_state(TOI_CAN_HIBERNATE);
-		set_toi_state(TOI_CAN_RESUME);
-	} else
-		if (!quiet)
-			printk(KERN_ERR "TuxOnIce: Block I/O: No "
-				"signature found at %s.\n", devstart);
-
-	return result;
-}
-
-static void toi_bio_release_storage(void)
-{
-	header_pages_reserved = 0;
-	raw_pages_allocd = 0;
-
-	free_all_bdev_info();
-}
-
-/* toi_swap_remove_image
- *
- */
-static int toi_bio_remove_image(void)
-{
-	int result;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image.");
-
-	result = toi_bio_restore_original_signature();
-
-	/*
-	 * We don't do a sanity check here: we want to restore the swap
-	 * whatever version of kernel made the hibernate image.
-	 *
-	 * We need to write swap, but swap may not be enabled so
-	 * we write the device directly
-	 *
-	 * If we don't have an current_signature_page, we didn't
-	 * read an image header, so don't change anything.
-	 */
-
-	toi_bio_release_storage();
-
-	return result;
-}
-
-struct toi_bio_ops toi_bio_ops = {
-	.bdev_page_io = toi_bdev_page_io,
-	.register_storage = toi_register_storage_chain,
-	.free_storage = toi_bio_release_storage,
-};
-
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io,
-			0, 16384, 0, NULL),
-};
-
-struct toi_module_ops toi_blockwriter_ops = {
-	.type				= WRITER_MODULE,
-	.name				= "block i/o",
-	.directory			= "block_io",
-	.module				= THIS_MODULE,
-	.memory_needed			= toi_bio_memory_needed,
-	.print_debug_info		= toi_bio_print_debug_stats,
-	.storage_needed			= toi_bio_storage_needed,
-	.save_config_info		= toi_bio_save_config_info,
-	.load_config_info		= toi_bio_load_config_info,
-	.initialise			= toi_bio_initialise,
-	.cleanup			= toi_bio_cleanup,
-	.post_atomic_restore		= toi_bio_chains_post_atomic,
-
-	.rw_init			= toi_rw_init,
-	.rw_cleanup			= toi_rw_cleanup,
-	.read_page			= toi_bio_read_page,
-	.write_page			= toi_bio_write_page,
-	.rw_header_chunk		= toi_rw_header_chunk,
-	.rw_header_chunk_noreadahead	= toi_rw_header_chunk_noreadahead,
-	.io_flusher			= bio_io_flusher,
-	.update_throughput_throttle	= update_throughput_throttle,
-	.finish_all_io			= toi_finish_all_io,
-
-	.noresume_reset			= toi_bio_noresume_reset,
-	.storage_available 		= toi_bio_storage_available,
-	.storage_allocated		= toi_bio_storage_allocated,
-	.reserve_header_space		= toi_bio_reserve_header_space,
-	.allocate_storage		= toi_bio_allocate_storage,
-        .free_unused_storage            = toi_bio_free_unused_storage,
-	.image_exists			= toi_bio_image_exists,
-	.mark_resume_attempted		= toi_bio_mark_resume_attempted,
-	.write_header_init		= toi_bio_write_header_init,
-	.write_header_cleanup		= toi_bio_write_header_cleanup,
-	.read_header_init		= toi_bio_read_header_init,
-	.read_header_cleanup		= toi_bio_read_header_cleanup,
-	.get_header_version		= toi_bio_get_header_version,
-	.remove_image			= toi_bio_remove_image,
-	.parse_sig_location		= toi_bio_parse_sig_location,
-
-	.sysfs_data			= sysfs_params,
-	.num_sysfs_entries		= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/**
- * toi_block_io_load - load time routine for block I/O module
- *
- * Register block i/o ops and sysfs entries.
- **/
-static __init int toi_block_io_load(void)
-{
-	return toi_register_module(&toi_blockwriter_ops);
-}
-
-late_initcall(toi_block_io_load);
diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h
deleted file mode 100644
index cf9211ed9..000000000
--- a/kernel/power/tuxonice_bio_internal.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_internal.h
- *
- * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains declarations for functions exported from
- * tuxonice_bio.c, which contains low level io functions.
- */
-
-/* Extent chains */
-void toi_extent_state_goto_start(void);
-void toi_extent_state_save(int slot);
-int go_next_page(int writing, int section_barrier);
-void toi_extent_state_restore(int slot);
-void free_all_bdev_info(void);
-int devices_of_same_priority(struct toi_bdev_info *this);
-int toi_register_storage_chain(struct toi_bdev_info *new);
-int toi_serialise_extent_chains(void);
-int toi_load_extent_chains(void);
-int toi_bio_rw_page(int writing, struct page *page, int is_readahead,
-		int free_group);
-int toi_bio_restore_original_signature(void);
-int toi_bio_devinfo_storage_needed(void);
-unsigned long get_headerblock(void);
-dev_t get_header_dev_t(void);
-struct block_device *get_header_bdev(void);
-int toi_bio_allocate_storage(unsigned long request);
-void toi_bio_free_unused_storage(void);
-
-/* Signature functions */
-#define HaveImage "HaveImage"
-#define NoImage "TuxOnIce"
-#define sig_size (sizeof(HaveImage))
-
-struct sig_data {
-	char sig[sig_size];
-	int have_image;
-	int resumed_before;
-
-	char have_uuid;
-	char header_uuid[17];
-	dev_t header_dev_t;
-	unsigned long first_header_block;
-
-	/* Repeat the signature to be sure we have a header version */
-	char sig2[sig_size];
-	int header_version;
-};
-
-void forget_signature_page(void);
-int toi_check_for_signature(void);
-int toi_bio_image_exists(int quiet);
-int get_signature_page(void);
-int toi_bio_mark_resume_attempted(int);
-extern char *toi_cur_sig_page;
-extern char *toi_orig_sig_page;
-int toi_bio_mark_have_image(void);
-extern struct sig_data *toi_sig_data;
-extern dev_t resume_dev_t;
-extern struct block_device *resume_block_device;
-extern struct block_device *header_block_device;
-extern unsigned long resume_firstblock;
-
-struct block_device *open_bdev(dev_t device, int display_errs);
-extern int current_stream;
-extern int more_readahead;
-int toi_do_io(int writing, struct block_device *bdev, long block0,
-	struct page *page, int is_readahead, int syncio, int free_group);
-int get_main_pool_phys_params(void);
-
-void toi_close_bdev(struct block_device *bdev);
-struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
-		int display_errs);
-
-extern struct toi_module_ops toi_blockwriter_ops;
-void dump_block_chains(void);
-void debug_broken_header(void);
-extern unsigned long raw_pages_allocd, header_pages_reserved;
-int toi_bio_chains_debug_info(char *buffer, int size);
-void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd);
-int toi_bio_scan_for_image(int quiet);
-int toi_bio_get_header_version(void);
-
-void close_resume_dev_t(int force);
-int open_resume_dev_t(int force, int quiet);
-
-struct toi_incremental_image_pointer_saved_data {
-    unsigned long block;
-    int chain;
-};
-
-struct toi_incremental_image_pointer {
-    struct toi_incremental_image_pointer_saved_data save;
-    struct block_device *bdev;
-    unsigned long block;
-};
-
-void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
-void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c
deleted file mode 100644
index ead874f8e..000000000
--- a/kernel/power/tuxonice_bio_signature.c
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_signature.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- */
-
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_bio_internal.h"
-
-struct sig_data *toi_sig_data;
-
-/* Struct of swap header pages */
-
-struct old_sig_data {
-	dev_t device;
-	unsigned long sector;
-	int resume_attempted;
-	int orig_sig_type;
-};
-
-union diskpage {
-	union swap_header swh;	/* swh.magic is the only member used */
-	struct sig_data sig_data;
-	struct old_sig_data old_sig_data;
-};
-
-union p_diskpage {
-	union diskpage *pointer;
-	char *ptr;
-	unsigned long address;
-};
-
-char *toi_cur_sig_page;
-char *toi_orig_sig_page;
-int have_image;
-int have_old_image;
-
-int get_signature_page(void)
-{
-	if (!toi_cur_sig_page) {
-		toi_message(TOI_IO, TOI_VERBOSE, 0,
-				"Allocating current signature page.");
-		toi_cur_sig_page = (char *) toi_get_zeroed_page(38,
-			TOI_ATOMIC_GFP);
-		if (!toi_cur_sig_page) {
-			printk(KERN_ERR "Failed to allocate memory for the "
-				"current image signature.\n");
-			return -ENOMEM;
-		}
-
-		toi_sig_data = (struct sig_data *) toi_cur_sig_page;
-	}
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx,"
-			" sector %d.",
-			resume_block_device->bd_dev, resume_firstblock);
-
-	return toi_bio_ops.bdev_page_io(READ, resume_block_device,
-		resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-void forget_signature_page(void)
-{
-	if (toi_cur_sig_page) {
-		toi_sig_data = NULL;
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page"
-				" (%p).", toi_cur_sig_page);
-		toi_free_page(38, (unsigned long) toi_cur_sig_page);
-		toi_cur_sig_page = NULL;
-	}
-
-	if (toi_orig_sig_page) {
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page"
-				" (%p).", toi_orig_sig_page);
-		toi_free_page(38, (unsigned long) toi_orig_sig_page);
-		toi_orig_sig_page = NULL;
-	}
-}
-
-/*
- * We need to ensure we use the signature page that's currently on disk,
- * so as to not remove the image header. Post-atomic-restore, the orig sig
- * page will be empty, so we can use that as our method of knowing that we
- * need to load the on-disk signature and not use the non-image sig in
- * memory. (We're going to powerdown after writing the change, so it's safe.
- */
-int toi_bio_mark_resume_attempted(int flag)
-{
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.",
-			flag);
-	if (!toi_orig_sig_page) {
-		forget_signature_page();
-		get_signature_page();
-	}
-	toi_sig_data->resumed_before = flag;
-	return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
-		resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-int toi_bio_mark_have_image(void)
-{
-	int result = 0;
-	char buf[32];
-	struct fs_info *fs_info;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists.");
-	memcpy(toi_sig_data->sig, tuxonice_signature,
-			sizeof(tuxonice_signature));
-	toi_sig_data->have_image = 1;
-	toi_sig_data->resumed_before = 0;
-	toi_sig_data->header_dev_t = get_header_dev_t();
-	toi_sig_data->have_uuid = 0;
-
-	fs_info = fs_info_from_block_dev(get_header_bdev());
-	if (fs_info && !IS_ERR(fs_info)) {
-		memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16);
-		free_fs_info(fs_info);
-	} else
-		result = (int) PTR_ERR(fs_info);
-
-	if (!result) {
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.",
-				format_dev_t(buf, get_header_dev_t()));
-		toi_sig_data->have_uuid = 1;
-	} else
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for "
-				"dev_t %s.",
-				format_dev_t(buf, get_header_dev_t()));
-
-	toi_sig_data->first_header_block = get_headerblock();
-	have_image = 1;
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block "
-			"is %d.", toi_sig_data->header_dev_t,
-			toi_sig_data->first_header_block);
-
-	memcpy(toi_sig_data->sig2, tuxonice_signature,
-			sizeof(tuxonice_signature));
-	toi_sig_data->header_version = TOI_HEADER_VERSION;
-
-	return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
-		resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-int remove_old_signature(void)
-{
-	union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page;
-	char *orig_sig;
-	char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
-	int result;
-	struct block_device *header_bdev;
-	struct old_sig_data *old_sig_data =
-		&swap_header_page.pointer->old_sig_data;
-
-	header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1);
-	result = toi_bio_ops.bdev_page_io(READ, header_bdev,
-			old_sig_data->sector, virt_to_page(header_start));
-
-	if (result)
-		goto out;
-
-	/*
-	 * TODO: Get the original contents of the first bytes of the swap
-	 * header page.
-	 */
-	if (!old_sig_data->orig_sig_type)
-		orig_sig = "SWAP-SPACE";
-	else
-		orig_sig = "SWAPSPACE2";
-
-	memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10);
-	memcpy(swap_header_page.ptr, header_start, 10);
-
-	result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
-		resume_firstblock, virt_to_page(swap_header_page.ptr));
-
-out:
-	toi_close_bdev(header_bdev);
-	have_old_image = 0;
-	toi_free_page(38, (unsigned long) header_start);
-	return result;
-}
-
-/*
- * toi_bio_restore_original_signature - restore the original signature
- *
- * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used.
- * It will have the original signature page contents, stored in the image
- * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain
- * the contents that were loaded when we started the cycle.
- */
-int toi_bio_restore_original_signature(void)
-{
-	char *use = toi_orig_sig_page ? toi_orig_sig_page : toi_cur_sig_page;
-
-	if (have_old_image)
-		return remove_old_signature();
-
-	if (!use) {
-		printk("toi_bio_restore_original_signature: No signature "
-				"page loaded.\n");
-		return 0;
-	}
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists.");
-	have_image = 0;
-	toi_sig_data->have_image = 0;
-	return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
-		resume_firstblock, virt_to_page(use));
-}
-
-/*
- * check_for_signature - See whether we have an image.
- *
- * Returns 0 if no image, 1 if there is one, -1 if indeterminate.
- */
-int toi_check_for_signature(void)
-{
-	union p_diskpage swap_header_page;
-	int type;
-	const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" };
-	const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" };
-	char *swap_header;
-
-	if (!toi_cur_sig_page) {
-		int result = get_signature_page();
-
-		if (result)
-			return result;
-	}
-
-	/*
-	 * Start by looking for the binary header.
-	 */
-	if (!memcmp(tuxonice_signature, toi_cur_sig_page,
-				sizeof(tuxonice_signature))) {
-		have_image = toi_sig_data->have_image;
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. "
-				"Have image is %d.", have_image);
-		if (have_image)
-			toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is "
-					"%x. First block is %d.",
-					toi_sig_data->header_dev_t,
-					toi_sig_data->first_header_block);
-		return toi_sig_data->have_image;
-	}
-
-	/*
-	 * Failing that, try old file allocator headers.
-	 */
-
-	if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) {
-		have_image = 1;
-		return 1;
-	}
-
-	have_image = 0;
-
-	if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage)))
-		return 0;
-
-	/*
-	 * Nope? How about swap?
-	 */
-	swap_header_page = (union p_diskpage) toi_cur_sig_page;
-	swap_header = swap_header_page.pointer->swh.magic.magic;
-
-	/* Normal swapspace? */
-	for (type = 0; type < 2; type++)
-		if (!memcmp(normal_sigs[type], swap_header,
-					strlen(normal_sigs[type])))
-			return 0;
-
-	/* Swsusp or uswsusp? */
-	for (type = 0; type < 3; type++)
-		if (!memcmp(swsusp_sigs[type], swap_header,
-					strlen(swsusp_sigs[type])))
-			return 2;
-
-	/* Old TuxOnIce version? */
-	if (!memcmp(tuxonice_signature, swap_header,
-				sizeof(tuxonice_signature) - 1)) {
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce "
-				"signature.");
-		have_old_image = 1;
-		return 3;
-	}
-
-	return -1;
-}
-
-/*
- * Image_exists
- *
- * Returns -1 if don't know, otherwise 0 (no) or 1 (yes).
- */
-int toi_bio_image_exists(int quiet)
-{
-	int result;
-	char *msg = NULL;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists.");
-
-	if (!resume_dev_t) {
-		if (!quiet)
-			printk(KERN_INFO "Not even trying to read header "
-				"because resume_dev_t is not set.\n");
-		return -1;
-	}
-
-	if (open_resume_dev_t(0, quiet))
-		return -1;
-
-	result = toi_check_for_signature();
-
-	clear_toi_state(TOI_RESUMED_BEFORE);
-	if (toi_sig_data->resumed_before)
-		set_toi_state(TOI_RESUMED_BEFORE);
-
-	if (quiet || result == -ENOMEM)
-		return result;
-
-	if (result == -1)
-		msg = "TuxOnIce: Unable to find a signature."
-				" Could you have moved a swap file?\n";
-	else if (!result)
-		msg = "TuxOnIce: No image found.\n";
-	else if (result == 1)
-		msg = "TuxOnIce: Image found.\n";
-	else if (result == 2)
-		msg = "TuxOnIce: uswsusp or swsusp image found.\n";
-	else if (result == 3)
-		msg = "TuxOnIce: Old implementation's signature found.\n";
-
-	printk(KERN_INFO "%s", msg);
-
-	return result;
-}
-
-int toi_bio_scan_for_image(int quiet)
-{
-	struct block_device *bdev;
-	char default_name[255] = "";
-
-	if (!quiet)
-		printk(KERN_DEBUG "Scanning swap devices for TuxOnIce "
-				"signature...\n");
-	for (bdev = next_bdev_of_type(NULL, "swap"); bdev;
-				bdev = next_bdev_of_type(bdev, "swap")) {
-		int result;
-		char name[255] = "";
-		sprintf(name, "%u:%u", MAJOR(bdev->bd_dev),
-				MINOR(bdev->bd_dev));
-		if (!quiet)
-			printk(KERN_DEBUG "- Trying %s.\n", name);
-		resume_block_device = bdev;
-		resume_dev_t = bdev->bd_dev;
-
-		result = toi_check_for_signature();
-
-		resume_block_device = NULL;
-		resume_dev_t = MKDEV(0, 0);
-
-		if (!default_name[0])
-			strcpy(default_name, name);
-
-		if (result == 1) {
-			/* Got one! */
-			strcpy(resume_file, name);
-			next_bdev_of_type(bdev, NULL);
-			if (!quiet)
-				printk(KERN_DEBUG " ==> Image found on %s.\n",
-						resume_file);
-			return 1;
-		}
-		forget_signature_page();
-	}
-
-	if (!quiet)
-		printk(KERN_DEBUG "TuxOnIce scan: No image found.\n");
-	strcpy(resume_file, default_name);
-	return 0;
-}
-
-int toi_bio_get_header_version(void)
-{
-	return (memcmp(toi_sig_data->sig2, tuxonice_signature,
-				sizeof(tuxonice_signature))) ?
-		0 : toi_sig_data->header_version;
-
-}
diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c
deleted file mode 100644
index 0a6733ae0..000000000
--- a/kernel/power/tuxonice_builtin.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-#include <linux/kernel.h>
-#include <linux/swap.h>
-#include <linux/syscalls.h>
-#include <linux/bio.h>
-#include <linux/root_dev.h>
-#include <linux/freezer.h>
-#include <linux/reboot.h>
-#include <linux/writeback.h>
-#include <linux/tty.h>
-#include <linux/crypto.h>
-#include <linux/cpu.h>
-#include <linux/ctype.h>
-#include <linux/kthread.h>
-#include "tuxonice_io.h"
-#include "tuxonice.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_alloc.h"
-
-unsigned long toi_bootflags_mask;
-
-/*
- * Highmem related functions (x86 only).
- */
-
-#ifdef CONFIG_HIGHMEM
-
-/**
- * copyback_high: Restore highmem pages.
- *
- * Highmem data and pbe lists are/can be stored in highmem.
- * The format is slightly different to the lowmem pbe lists
- * used for the assembly code: the last pbe in each page is
- * a struct page * instead of struct pbe *, pointing to the
- * next page where pbes are stored (or NULL if happens to be
- * the end of the list). Since we don't want to generate
- * unnecessary deltas against swsusp code, we use a cast
- * instead of a union.
- **/
-
-static void copyback_high(void)
-{
-	struct page *pbe_page = (struct page *) restore_highmem_pblist;
-	struct pbe *this_pbe, *first_pbe;
-	unsigned long *origpage, *copypage;
-	int pbe_index = 1;
-
-	if (!pbe_page)
-		return;
-
-	this_pbe = (struct pbe *) kmap_atomic(pbe_page);
-	first_pbe = this_pbe;
-
-	while (this_pbe) {
-		int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
-
-		origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address));
-		copypage = kmap_atomic((struct page *) this_pbe->address);
-
-		while (loop >= 0) {
-			*(origpage + loop) = *(copypage + loop);
-			loop--;
-		}
-
-		kunmap_atomic(origpage);
-		kunmap_atomic(copypage);
-
-		if (!this_pbe->next)
-			break;
-
-		if (pbe_index < PBES_PER_PAGE) {
-			this_pbe++;
-			pbe_index++;
-		} else {
-			pbe_page = (struct page *) this_pbe->next;
-			kunmap_atomic(first_pbe);
-			if (!pbe_page)
-				return;
-			this_pbe = (struct pbe *) kmap_atomic(pbe_page);
-			first_pbe = this_pbe;
-			pbe_index = 1;
-		}
-	}
-	kunmap_atomic(first_pbe);
-}
-
-#else /* CONFIG_HIGHMEM */
-static void copyback_high(void) { }
-#endif
-
-char toi_wait_for_keypress_dev_console(int timeout)
-{
-	int fd, this_timeout = 255, orig_kthread = 0;
-	char key = '\0';
-	struct termios t, t_backup;
-
-	/* We should be guaranteed /dev/console exists after populate_rootfs()
-	 * in init/main.c.
-	 */
-	fd = sys_open("/dev/console", O_RDONLY, 0);
-	if (fd < 0) {
-		printk(KERN_INFO "Couldn't open /dev/console.\n");
-		return key;
-	}
-
-	if (sys_ioctl(fd, TCGETS, (long)&t) < 0)
-		goto out_close;
-
-	memcpy(&t_backup, &t, sizeof(t));
-
-	t.c_lflag &= ~(ISIG|ICANON|ECHO);
-	t.c_cc[VMIN] = 0;
-
-new_timeout:
-	if (timeout > 0) {
-		this_timeout = timeout < 26 ? timeout : 25;
-		timeout -= this_timeout;
-		this_timeout *= 10;
-	}
-
-	t.c_cc[VTIME] = this_timeout;
-
-	if (sys_ioctl(fd, TCSETS, (long)&t) < 0)
-		goto out_restore;
-
-        if (current->flags & PF_KTHREAD) {
-            orig_kthread = (current->flags & PF_KTHREAD);
-            current->flags &= ~PF_KTHREAD;
-        }
-
-	while (1) {
-		if (sys_read(fd, &key, 1) <= 0) {
-			if (timeout)
-				goto new_timeout;
-			key = '\0';
-			break;
-		}
-		key = tolower(key);
-		if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) {
-			if (key == 'c') {
-				set_toi_state(TOI_CONTINUE_REQ);
-				break;
-			} else if (key == ' ')
-				break;
-		} else
-			break;
-	}
-        if (orig_kthread) {
-            current->flags |= PF_KTHREAD;
-        }
-
-out_restore:
-	sys_ioctl(fd, TCSETS, (long)&t_backup);
-out_close:
-	sys_close(fd);
-
-	return key;
-}
-
-struct toi_boot_kernel_data toi_bkd __nosavedata
-		__attribute__((aligned(PAGE_SIZE))) = {
-	MY_BOOT_KERNEL_DATA_VERSION,
-	0,
-#ifdef CONFIG_TOI_REPLACE_SWSUSP
-	(1 << TOI_REPLACE_SWSUSP) |
-#endif
-	(1 << TOI_NO_FLUSHER_THREAD) |
-	(1 << TOI_PAGESET2_FULL),
-};
-
-struct block_device *toi_open_by_devnum(dev_t dev)
-{
-	struct block_device *bdev = bdget(dev);
-	int err = -ENOMEM;
-	if (bdev)
-		err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
-	return err ? ERR_PTR(err) : bdev;
-}
-
-/**
- * toi_close_bdev: Close a swap bdev.
- *
- * int: The swap entry number to close.
- */
-void toi_close_bdev(struct block_device *bdev)
-{
-	blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
-}
-
-int toi_wait = CONFIG_TOI_DEFAULT_WAIT;
-struct toi_core_fns *toi_core_fns;
-unsigned long toi_result;
-struct pagedir pagedir1 = {1};
-struct toi_cbw **toi_first_cbw;
-int toi_next_cbw;
-
-unsigned long toi_get_nonconflicting_page(void)
-{
-	return toi_core_fns->get_nonconflicting_page();
-}
-
-int toi_post_context_save(void)
-{
-	return toi_core_fns->post_context_save();
-}
-
-int try_tuxonice_hibernate(void)
-{
-	if (!toi_core_fns)
-		return -ENODEV;
-
-	return toi_core_fns->try_hibernate();
-}
-
-static int num_resume_calls;
-#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL
-static int ignore_late_initcall = 1;
-#else
-static int ignore_late_initcall;
-#endif
-
-int toi_translate_err_default = TOI_CONTINUE_REQ;
-
-void try_tuxonice_resume(void)
-{
-        if (!hibernation_available())
-                return;
-
-	/* Don't let it wrap around eventually */
-	if (num_resume_calls < 2)
-		num_resume_calls++;
-
-	if (num_resume_calls == 1 && ignore_late_initcall) {
-		printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n");
-		return;
-	}
-
-	if (toi_core_fns)
-		toi_core_fns->try_resume();
-	else
-		printk(KERN_INFO "TuxOnIce core not loaded yet.\n");
-}
-
-int toi_lowlevel_builtin(void)
-{
-	int error = 0;
-
-	save_processor_state();
-	error = swsusp_arch_suspend();
-	if (error)
-		printk(KERN_ERR "Error %d hibernating\n", error);
-
-	/* Restore control flow appears here */
-	if (!toi_in_hibernate) {
-		copyback_high();
-		set_toi_state(TOI_NOW_RESUMING);
-	}
-
-	restore_processor_state();
-	return error;
-}
-
-unsigned long toi_compress_bytes_in;
-unsigned long toi_compress_bytes_out;
-
-int toi_in_suspend(void)
-{
-  return in_suspend;
-}
-
-unsigned long toi_state = ((1 << TOI_BOOT_TIME) |
-		(1 << TOI_IGNORE_LOGLEVEL) |
-		(1 << TOI_IO_STOPPED));
-
-/* The number of hibernates we have started (some may have been cancelled) */
-unsigned int nr_hibernates;
-int toi_running;
-__nosavedata int toi_in_hibernate;
-__nosavedata struct pbe *restore_highmem_pblist;
-
-int toi_trace_allocs;
-
-void toi_read_lock_tasklist(void)
-{
-	read_lock(&tasklist_lock);
-}
-
-void toi_read_unlock_tasklist(void)
-{
-	read_unlock(&tasklist_lock);
-}
-
-#ifdef CONFIG_TOI_ZRAM_SUPPORT
-int (*toi_flag_zram_disks) (void);
-
-int toi_do_flag_zram_disks(void)
-{
-	return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0;
-}
-
-#endif
-
-/* toi_generate_free_page_map
- *
- * Description:	This routine generates a bitmap of free pages from the
- * 		lists used by the memory manager. We then use the bitmap
- * 		to quickly calculate which pages to save and in which
- * 		pagesets.
- */
-void toi_generate_free_page_map(void)
-{
-	int order, cpu, t;
-	unsigned long flags, i;
-	struct zone *zone;
-	struct list_head *curr;
-	unsigned long pfn;
-	struct page *page;
-
-	for_each_populated_zone(zone) {
-
-		if (!zone->spanned_pages)
-			continue;
-
-		spin_lock_irqsave(&zone->lock, flags);
-
-		for (i = 0; i < zone->spanned_pages; i++) {
-			pfn = zone->zone_start_pfn + i;
-
-			if (!pfn_valid(pfn))
-				continue;
-
-			page = pfn_to_page(pfn);
-
-			ClearPageNosaveFree(page);
-		}
-
-		for_each_migratetype_order(order, t) {
-			list_for_each(curr,
-					&zone->free_area[order].free_list[t]) {
-				unsigned long j;
-
-				pfn = page_to_pfn(list_entry(curr, struct page,
-							lru));
-				for (j = 0; j < (1UL << order); j++)
-					SetPageNosaveFree(pfn_to_page(pfn + j));
-			}
-		}
-
-		for_each_online_cpu(cpu) {
-			struct per_cpu_pageset *pset =
-				per_cpu_ptr(zone->pageset, cpu);
-			struct per_cpu_pages *pcp = &pset->pcp;
-			struct page *page;
-			int t;
-
-			for (t = 0; t < MIGRATE_PCPTYPES; t++)
-				list_for_each_entry(page, &pcp->lists[t], lru)
-					SetPageNosaveFree(page);
-		}
-
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
-}
-
-/* toi_size_of_free_region
- *
- * Description:	Return the number of pages that are free, beginning with and
- * 		including this one.
- */
-int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn)
-{
-	unsigned long this_pfn = start_pfn,
-		      end_pfn = zone_end_pfn(zone);
-
-	while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn)))
-		this_pfn++;
-
-	return this_pfn - start_pfn;
-}
-
-static int __init toi_wait_setup(char *str)
-{
-	int value;
-
-	if (sscanf(str, "=%d", &value)) {
-		if (value < -1 || value > 255)
-			printk(KERN_INFO "TuxOnIce_wait outside range -1 to "
-					"255.\n");
-		else
-			toi_wait = value;
-	}
-
-	return 1;
-}
-__setup("toi_wait", toi_wait_setup);
-
-static int __init toi_translate_retry_setup(char *str)
-{
-	toi_translate_err_default = 0;
-	return 1;
-}
-__setup("toi_translate_retry", toi_translate_retry_setup);
-
-static int __init toi_debug_setup(char *str)
-{
-	toi_bkd.toi_action |= (1 << TOI_LOGALL);
-	toi_bootflags_mask |= (1 << TOI_LOGALL);
-	toi_bkd.toi_debug_state = 255;
-	toi_bkd.toi_default_console_level = 7;
-	return 1;
-}
-__setup("toi_debug_setup", toi_debug_setup);
-
-static int __init toi_pause_setup(char *str)
-{
-	toi_bkd.toi_action |= (1 << TOI_PAUSE);
-	toi_bootflags_mask |= (1 << TOI_PAUSE);
-	return 1;
-}
-__setup("toi_pause", toi_pause_setup);
-
-#ifdef CONFIG_PM_DEBUG
-static int __init toi_trace_allocs_setup(char *str)
-{
-	int value;
-
-	if (sscanf(str, "=%d", &value))
-		toi_trace_allocs = value;
-
-	return 1;
-}
-__setup("toi_trace_allocs", toi_trace_allocs_setup);
-#endif
-
-static int __init toi_ignore_late_initcall_setup(char *str)
-{
-	int value;
-
-	if (sscanf(str, "=%d", &value))
-		ignore_late_initcall = value;
-
-	return 1;
-}
-__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup);
-
-static int __init toi_force_no_multithreaded_setup(char *str)
-{
-	int value;
-
-	toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO);
-	toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO);
-
-	if (sscanf(str, "=%d", &value) && value)
-		toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO);
-
-	return 1;
-}
-__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup);
-
-#ifdef CONFIG_KGDB
-static int __init toi_post_resume_breakpoint_setup(char *str)
-{
-	int value;
-
-	toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT);
-	toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT);
-	if (sscanf(str, "=%d", &value) && value)
-		toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT);
-
-	return 1;
-}
-__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup);
-#endif
-
-static int __init toi_disable_readahead_setup(char *str)
-{
-	int value;
-
-	toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD);
-	toi_bootflags_mask |= (1 << TOI_NO_READAHEAD);
-	if (sscanf(str, "=%d", &value) && value)
-		toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD);
-
-	return 1;
-}
-__setup("toi_no_readahead", toi_disable_readahead_setup);
diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h
deleted file mode 100644
index 9539818e0..000000000
--- a/kernel/power/tuxonice_builtin.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-#include <asm/setup.h>
-
-extern struct toi_core_fns *toi_core_fns;
-extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
-extern unsigned int nr_hibernates;
-extern int toi_in_hibernate;
-
-extern __nosavedata struct pbe *restore_highmem_pblist;
-
-int toi_lowlevel_builtin(void);
-
-#ifdef CONFIG_HIGHMEM
-extern __nosavedata struct zone_data *toi_nosave_zone_list;
-extern __nosavedata unsigned long toi_nosave_max_pfn;
-#endif
-
-extern unsigned long toi_get_nonconflicting_page(void);
-extern int toi_post_context_save(void);
-
-extern char toi_wait_for_keypress_dev_console(int timeout);
-extern struct block_device *toi_open_by_devnum(dev_t dev);
-extern void toi_close_bdev(struct block_device *bdev);
-extern int toi_wait;
-extern int toi_translate_err_default;
-extern int toi_force_no_multithreaded;
-extern void toi_read_lock_tasklist(void);
-extern void toi_read_unlock_tasklist(void);
-extern int toi_in_suspend(void);
-extern void toi_generate_free_page_map(void);
-extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn);
-
-#ifdef CONFIG_TOI_ZRAM_SUPPORT
-extern int toi_do_flag_zram_disks(void);
-#else
-#define toi_do_flag_zram_disks() (0)
-#endif
diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c
deleted file mode 100644
index 8952c0fec..000000000
--- a/kernel/power/tuxonice_checksum.c
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * kernel/power/tuxonice_checksum.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data checksum routines for TuxOnIce,
- * using cryptoapi. They are used to locate any modifications
- * made to pageset 2 while we're saving it.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-#include <linux/scatterlist.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-
-static struct toi_module_ops toi_checksum_ops;
-
-/* Constant at the mo, but I might allow tuning later */
-static char toi_checksum_name[32] = "md4";
-/* Bytes per checksum */
-#define CHECKSUM_SIZE (16)
-
-#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE)
-
-struct cpu_context {
-	struct crypto_hash *transform;
-	struct hash_desc desc;
-	struct scatterlist sg[2];
-	char *buf;
-};
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-static int pages_allocated;
-static unsigned long page_list;
-
-static int toi_num_resaved;
-
-static unsigned long this_checksum, next_page;
-static int checksum_count;
-
-static inline int checksum_pages_needed(void)
-{
-	return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE);
-}
-
-/* ---- Local buffer management ---- */
-
-/*
- * toi_checksum_cleanup
- *
- * Frees memory allocated for our labours.
- */
-static void toi_checksum_cleanup(int ending_cycle)
-{
-	int cpu;
-
-	if (ending_cycle) {
-		for_each_online_cpu(cpu) {
-			struct cpu_context *this = &per_cpu(contexts, cpu);
-			if (this->transform) {
-				crypto_free_hash(this->transform);
-				this->transform = NULL;
-				this->desc.tfm = NULL;
-			}
-
-			if (this->buf) {
-				toi_free_page(27, (unsigned long) this->buf);
-				this->buf = NULL;
-			}
-		}
-	}
-}
-
-/*
- * toi_crypto_initialise
- *
- * Prepare to do some work by allocating buffers and transforms.
- * Returns: Int: Zero. Even if we can't set up checksum, we still
- * seek to hibernate.
- */
-static int toi_checksum_initialise(int starting_cycle)
-{
-	int cpu;
-
-	if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled)
-		return 0;
-
-	if (!*toi_checksum_name) {
-		printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n");
-		return 1;
-	}
-
-	for_each_online_cpu(cpu) {
-		struct cpu_context *this = &per_cpu(contexts, cpu);
-		struct page *page;
-
-		this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0);
-		if (IS_ERR(this->transform)) {
-			printk(KERN_INFO "TuxOnIce: Failed to initialise the "
-				"%s checksum algorithm: %ld.\n",
-				toi_checksum_name, (long) this->transform);
-			this->transform = NULL;
-			return 1;
-		}
-
-		this->desc.tfm = this->transform;
-		this->desc.flags = 0;
-
-		page = toi_alloc_page(27, GFP_KERNEL);
-		if (!page)
-			return 1;
-		this->buf = page_address(page);
-		sg_init_one(&this->sg[0], this->buf, PAGE_SIZE);
-	}
-	return 0;
-}
-
-/*
- * toi_checksum_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_checksum_print_debug_stats(char *buffer, int size)
-{
-	int len;
-
-	if (!toi_checksum_ops.enabled)
-		return scnprintf(buffer, size,
-			"- Checksumming disabled.\n");
-
-	len = scnprintf(buffer, size, "- Checksum method is '%s'.\n",
-			toi_checksum_name);
-	len += scnprintf(buffer + len, size - len,
-		"  %d pages resaved in atomic copy.\n", toi_num_resaved);
-	return len;
-}
-
-static int toi_checksum_memory_needed(void)
-{
-	return toi_checksum_ops.enabled ?
-		checksum_pages_needed() << PAGE_SHIFT : 0;
-}
-
-static int toi_checksum_storage_needed(void)
-{
-	if (toi_checksum_ops.enabled)
-		return strlen(toi_checksum_name) + sizeof(int) + 1;
-	else
-		return 0;
-}
-
-/*
- * toi_checksum_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save informaton needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_checksum_save_config_info(char *buffer)
-{
-	int namelen = strlen(toi_checksum_name) + 1;
-	int total_len;
-
-	*((unsigned int *) buffer) = namelen;
-	strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen);
-	total_len = sizeof(unsigned int) + namelen;
-	return total_len;
-}
-
-/* toi_checksum_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Description:	Reload information needed for dechecksuming the image at
- * resume time.
- */
-static void toi_checksum_load_config_info(char *buffer, int size)
-{
-	int namelen;
-
-	namelen = *((unsigned int *) (buffer));
-	strncpy(toi_checksum_name, buffer + sizeof(unsigned int),
-			namelen);
-	return;
-}
-
-/*
- * Free Checksum Memory
- */
-
-void free_checksum_pages(void)
-{
-	while (pages_allocated) {
-		unsigned long next = *((unsigned long *) page_list);
-		ClearPageNosave(virt_to_page(page_list));
-		toi_free_page(15, (unsigned long) page_list);
-		page_list = next;
-		pages_allocated--;
-	}
-}
-
-/*
- * Allocate Checksum Memory
- */
-
-int allocate_checksum_pages(void)
-{
-	int pages_needed = checksum_pages_needed();
-
-	if (!toi_checksum_ops.enabled)
-		return 0;
-
-	while (pages_allocated < pages_needed) {
-		unsigned long *new_page =
-		  (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP);
-		if (!new_page) {
-			printk(KERN_ERR "Unable to allocate checksum pages.\n");
-			return -ENOMEM;
-		}
-		SetPageNosave(virt_to_page(new_page));
-		(*new_page) = page_list;
-		page_list = (unsigned long) new_page;
-		pages_allocated++;
-	}
-
-	next_page = (unsigned long) page_list;
-	checksum_count = 0;
-
-	return 0;
-}
-
-char *tuxonice_get_next_checksum(void)
-{
-	if (!toi_checksum_ops.enabled)
-		return NULL;
-
-	if (checksum_count % CHECKSUMS_PER_PAGE)
-		this_checksum += CHECKSUM_SIZE;
-	else {
-		this_checksum = next_page + sizeof(void *);
-		next_page = *((unsigned long *) next_page);
-	}
-
-	checksum_count++;
-	return (char *) this_checksum;
-}
-
-int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
-{
-	char *pa;
-	int result, cpu = smp_processor_id();
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
-	if (!toi_checksum_ops.enabled)
-		return 0;
-
-	pa = kmap(page);
-	memcpy(ctx->buf, pa, PAGE_SIZE);
-	kunmap(page);
-	result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
-						checksum_locn);
-	if (result)
-		printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest "
-				"returned %d.\n", result);
-	return result;
-}
-/*
- * Calculate checksums
- */
-
-void check_checksums(void)
-{
-	int index = 0, cpu = smp_processor_id();
-	char current_checksum[CHECKSUM_SIZE];
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-        unsigned long pfn;
-
-	if (!toi_checksum_ops.enabled) {
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled.");
-		return;
-	}
-
-	next_page = (unsigned long) page_list;
-
-	toi_num_resaved = 0;
-	this_checksum = 0;
-
-        toi_trace_index++;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums.");
-	memory_bm_position_reset(pageset2_map);
-	for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != BM_END_OF_MAP;
-			pfn = memory_bm_next_pfn(pageset2_map, 0)) {
-		int ret, resave_needed = false;
-		char *pa;
-		struct page *page = pfn_to_page(pfn);
-
-                if (index < checksum_count) {
-                    if (index % CHECKSUMS_PER_PAGE) {
-                        this_checksum += CHECKSUM_SIZE;
-                    } else {
-                        this_checksum = next_page + sizeof(void *);
-                        next_page = *((unsigned long *) next_page);
-                    }
-
-                    /* Done when IRQs disabled so must be atomic */
-                    pa = kmap_atomic(page);
-                    memcpy(ctx->buf, pa, PAGE_SIZE);
-                    kunmap_atomic(pa);
-                    ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
-                            current_checksum);
-
-                    if (ret) {
-                        printk(KERN_INFO "Digest failed. Returned %d.\n", ret);
-                        return;
-                    }
-
-                    resave_needed = memcmp(current_checksum, (char *) this_checksum,
-                            CHECKSUM_SIZE);
-                } else {
-                    resave_needed = true;
-                }
-
-                if (resave_needed) {
-			TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed);
-			SetPageResave(pfn_to_page(pfn));
-			toi_num_resaved++;
-			if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED))
-				set_abort_result(TOI_RESAVE_NEEDED);
-		}
-
-		index++;
-	}
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete.");
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0,
-			NULL),
-	SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_ABORT_ON_RESAVE_NEEDED, 0)
-};
-
-/*
- * Ops structure.
- */
-static struct toi_module_ops toi_checksum_ops = {
-	.type			= MISC_MODULE,
-	.name			= "checksumming",
-	.directory		= "checksum",
-	.module			= THIS_MODULE,
-	.initialise		= toi_checksum_initialise,
-	.cleanup		= toi_checksum_cleanup,
-	.print_debug_info	= toi_checksum_print_debug_stats,
-	.save_config_info	= toi_checksum_save_config_info,
-	.load_config_info	= toi_checksum_load_config_info,
-	.memory_needed		= toi_checksum_memory_needed,
-	.storage_needed		= toi_checksum_storage_needed,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-int toi_checksum_init(void)
-{
-	int result = toi_register_module(&toi_checksum_ops);
-	return result;
-}
-
-void toi_checksum_exit(void)
-{
-	toi_unregister_module(&toi_checksum_ops);
-}
diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h
deleted file mode 100644
index 7d6478a6a..000000000
--- a/kernel/power/tuxonice_checksum.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * kernel/power/tuxonice_checksum.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data checksum routines for TuxOnIce,
- * using cryptoapi. They are used to locate any modifications
- * made to pageset 2 while we're saving it.
- */
-
-#if defined(CONFIG_TOI_CHECKSUM)
-extern int toi_checksum_init(void);
-extern void toi_checksum_exit(void);
-void check_checksums(void);
-int allocate_checksum_pages(void);
-void free_checksum_pages(void);
-char *tuxonice_get_next_checksum(void);
-int tuxonice_calc_checksum(struct page *page, char *checksum_locn);
-#else
-static inline int toi_checksum_init(void) { return 0; }
-static inline void toi_checksum_exit(void) { }
-static inline void check_checksums(void) { };
-static inline int allocate_checksum_pages(void) { return 0; };
-static inline void free_checksum_pages(void) { };
-static inline char *tuxonice_get_next_checksum(void) { return NULL; };
-static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
-	{ return 0; }
-#endif
-
diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
deleted file mode 100644
index cfe3383ab..000000000
--- a/kernel/power/tuxonice_cluster.c
+++ /dev/null
@@ -1,1058 +0,0 @@
-/*
- * kernel/power/tuxonice_cluster.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines for cluster hibernation support.
- *
- * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
- *
- * How does it work?
- *
- * There is no 'master' node that tells everyone else what to do. All nodes
- * send messages to the broadcast address/port, maintain a list of peers
- * and figure out when to progress to the next step in hibernating or resuming.
- * This makes us more fault tolerant when it comes to nodes coming and going
- * (which may be more of an issue if we're hibernating when power supplies
- * are being unreliable).
- *
- * At boot time, we start a ktuxonice thread that handles communication with
- * other nodes. This node maintains a state machine that controls our progress
- * through hibernating and resuming, keeping us in step with other nodes. Nodes
- * are identified by their hw address.
- *
- * On startup, the node sends CLUSTER_PING on the configured interface's
- * broadcast address, port $toi_cluster_port (see below) and begins to listen
- * for other broadcast messages. CLUSTER_PING messages are repeated at
- * intervals of 5 minutes, with a random offset to spread traffic out.
- *
- * A hibernation cycle is initiated from any node via
- *
- * echo > /sys/power/tuxonice/do_hibernate
- *
- * and (possibily) the hibernate script. At each step of the process, the node
- * completes its work, and waits for all other nodes to signal completion of
- * their work (or timeout) before progressing to the next step.
- *
- * Request/state  Action before reply	Possible reply	Next state
- * HIBERNATE	  capable, pre-script	HIBERNATE|ACK	NODE_PREP
- * 					HIBERNATE|NACK	INIT_0
- *
- * PREP		  prepare_image		PREP|ACK	IMAGE_WRITE
- *		 			PREP|NACK	INIT_0
- * 					ABORT		RUNNING
- *
- * IO		  write image		IO|ACK		power off
- * 					ABORT		POST_RESUME
- *
- * (Boot time)	  check for image	IMAGE|ACK	RESUME_PREP
- * 					(Note 1)
- * 					IMAGE|NACK	(Note 2)
- *
- * PREP		  prepare read image	PREP|ACK	IMAGE_READ
- * 					PREP|NACK	(As NACK_IMAGE)
- *
- * IO		  read image		IO|ACK		POST_RESUME
- *
- * POST_RESUME	  thaw, post-script			RUNNING
- *
- * INIT_0	  init 0
- *
- * Other messages:
- *
- * - PING: Request for all other live nodes to send a PONG. Used at startup to
- *   announce presence, when a node is suspected dead and periodically, in case
- *   segments of the network are [un]plugged.
- *
- * - PONG: Response to a PING.
- *
- * - ABORT: Request to cancel writing an image.
- *
- * - BYE: Notification that this node is shutting down.
- *
- * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
- * nodes which are slower to start up can get state synchronised. If a node
- * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
- * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
- * must invalidate its image (if any) and boot normally.
- *
- * Note 2: May occur when one node lost power or powered off while others
- * hibernated. This node waits for others to complete resuming (ACK_READ)
- * before completing its boot, so that it appears as a fail node restarting.
- *
- * If any node has an image, then it also has a list of nodes that hibernated
- * in synchronisation with it. The node will wait for other nodes to appear
- * or timeout before beginning its restoration.
- *
- * If a node has no image, it needs to wait, in case other nodes which do have
- * an image are going to resume, but are taking longer to announce their
- * presence. For this reason, the user can specify a timeout value and a number
- * of nodes detected before we just continue. (We might want to assume in a
- * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
- * the remaining nodes will too. This might help in situations where some nodes
- * are much slower to boot, or more subject to hardware failures or such like).
- */
-
-#include <linux/suspend.h>
-#include <linux/if.h>
-#include <linux/rtnetlink.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/if_arp.h>
-#include <linux/kthread.h>
-#include <linux/wait.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-
-#if 1
-#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
-#else
-#define PRINTK(a, b...) do { } while (0)
-#endif
-
-static int loopback_mode;
-static int num_local_nodes = 1;
-#define MAX_LOCAL_NODES 8
-#define SADDR (loopback_mode ? b->sid : h->saddr)
-
-#define MYNAME "TuxOnIce Clustering"
-
-enum cluster_message {
-	MSG_ACK = 1,
-	MSG_NACK = 2,
-	MSG_PING = 4,
-	MSG_ABORT = 8,
-	MSG_BYE = 16,
-	MSG_HIBERNATE = 32,
-	MSG_IMAGE = 64,
-	MSG_IO = 128,
-	MSG_RUNNING = 256
-};
-
-static char *str_message(int message)
-{
-	switch (message) {
-	case 4:
-		return "Ping";
-	case 8:
-		return "Abort";
-	case 9:
-		return "Abort acked";
-	case 10:
-		return "Abort nacked";
-	case 16:
-		return "Bye";
-	case 17:
-		return "Bye acked";
-	case 18:
-		return "Bye nacked";
-	case 32:
-		return "Hibernate request";
-	case 33:
-		return "Hibernate ack";
-	case 34:
-		return "Hibernate nack";
-	case 64:
-		return "Image exists?";
-	case 65:
-		return "Image does exist";
-	case 66:
-		return "No image here";
-	case 128:
-		return "I/O";
-	case 129:
-		return "I/O okay";
-	case 130:
-		return "I/O failed";
-	case 256:
-		return "Running";
-	default:
-		printk(KERN_ERR "Unrecognised message %d.\n", message);
-		return "Unrecognised message (see dmesg)";
-	}
-}
-
-#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
-#define MSG_STATE_MASK (~MSG_ACK_MASK)
-
-struct node_info {
-	struct list_head member_list;
-	wait_queue_head_t member_events;
-	spinlock_t member_list_lock;
-	spinlock_t receive_lock;
-	int peer_count, ignored_peer_count;
-	struct toi_sysfs_data sysfs_data;
-	enum cluster_message current_message;
-};
-
-struct node_info node_array[MAX_LOCAL_NODES];
-
-struct cluster_member {
-	__be32 addr;
-	enum cluster_message message;
-	struct list_head list;
-	int ignore;
-};
-
-#define toi_cluster_port_send 3501
-#define toi_cluster_port_recv 3502
-
-static struct net_device *net_dev;
-static struct toi_module_ops toi_cluster_ops;
-
-static int toi_recv(struct sk_buff *skb, struct net_device *dev,
-		struct packet_type *pt, struct net_device *orig_dev);
-
-static struct packet_type toi_cluster_packet_type = {
-	.type =	__constant_htons(ETH_P_IP),
-	.func =	toi_recv,
-};
-
-struct toi_pkt {		/* BOOTP packet format */
-	struct iphdr iph;	/* IP header */
-	struct udphdr udph;	/* UDP header */
-	u8 htype;		/* HW address type */
-	u8 hlen;		/* HW address length */
-	__be32 xid;		/* Transaction ID */
-	__be16 secs;		/* Seconds since we started */
-	__be16 flags;		/* Just what it says */
-	u8 hw_addr[16];		/* Sender's HW address */
-	u16 message;		/* Message */
-	unsigned long sid;	/* Source ID for loopback testing */
-};
-
-static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;
-
-static int added_pack;
-
-static int others_have_image;
-
-/* Key used to allow multiple clusters on the same lan */
-static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
-static char pre_hibernate_script[255] =
-	CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
-static char post_hibernate_script[255] =
-	CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
-
-/*			List of cluster members			*/
-static unsigned long continue_delay = 5 * HZ;
-static unsigned long cluster_message_timeout = 3 * HZ;
-
-/* 		=== Membership list === 	*/
-
-static void print_member_info(int index)
-{
-	struct cluster_member *this;
-
-	printk(KERN_INFO "==> Dumping node %d.\n", index);
-
-	list_for_each_entry(this, &node_array[index].member_list, list)
-		printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n",
-				NIPQUAD(this->addr),
-				str_message(this->message),
-				this->ignore ? "(Ignored)" : "");
-	printk(KERN_INFO "== Done ==\n");
-}
-
-static struct cluster_member *__find_member(int index, __be32 addr)
-{
-	struct cluster_member *this;
-
-	list_for_each_entry(this, &node_array[index].member_list, list) {
-		if (this->addr != addr)
-			continue;
-
-		return this;
-	}
-
-	return NULL;
-}
-
-static void set_ignore(int index, __be32 addr, struct cluster_member *this)
-{
-	if (this->ignore) {
-		PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
-				index, NIPQUAD(addr));
-		return;
-	}
-
-	PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
-				index, NIPQUAD(addr));
-	this->ignore = 1;
-	node_array[index].ignored_peer_count++;
-}
-
-static int __add_update_member(int index, __be32 addr, int message)
-{
-	struct cluster_member *this;
-
-	this = __find_member(index, addr);
-	if (this) {
-		if (this->message != message) {
-			this->message = message;
-			if ((message & MSG_NACK) &&
-			    (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
-				set_ignore(index, addr, this);
-			PRINTK("Node %d sees node %d.%d.%d.%d now sending "
-					"%s.\n", index, NIPQUAD(addr),
-					str_message(message));
-			wake_up(&node_array[index].member_events);
-		}
-		return 0;
-	}
-
-	this = (struct cluster_member *) toi_kzalloc(36,
-			sizeof(struct cluster_member), GFP_KERNEL);
-
-	if (!this)
-		return -1;
-
-	this->addr = addr;
-	this->message = message;
-	this->ignore = 0;
-	INIT_LIST_HEAD(&this->list);
-
-	node_array[index].peer_count++;
-
-	PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
-			NIPQUAD(addr), str_message(message));
-
-	if ((message & MSG_NACK) &&
-	    (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
-		set_ignore(index, addr, this);
-	list_add_tail(&this->list, &node_array[index].member_list);
-	return 1;
-}
-
-static int add_update_member(int index, __be32 addr, int message)
-{
-	int result;
-	unsigned long flags;
-	spin_lock_irqsave(&node_array[index].member_list_lock, flags);
-	result = __add_update_member(index, addr, message);
-	spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-
-	print_member_info(index);
-
-	wake_up(&node_array[index].member_events);
-
-	return result;
-}
-
-static void del_member(int index, __be32 addr)
-{
-	struct cluster_member *this;
-	unsigned long flags;
-
-	spin_lock_irqsave(&node_array[index].member_list_lock, flags);
-	this = __find_member(index, addr);
-
-	if (this) {
-		list_del_init(&this->list);
-		toi_kfree(36, this, sizeof(*this));
-		node_array[index].peer_count--;
-	}
-
-	spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-}
-
-/* 		=== Message transmission ===	*/
-
-static void toi_send_if(int message, unsigned long my_id);
-
-/*
- *  Process received TOI packet.
- */
-static int toi_recv(struct sk_buff *skb, struct net_device *dev,
-		struct packet_type *pt, struct net_device *orig_dev)
-{
-	struct toi_pkt *b;
-	struct iphdr *h;
-	int len, result, index;
-	unsigned long addr, message, ack;
-
-	/* Perform verifications before taking the lock.  */
-	if (skb->pkt_type == PACKET_OTHERHOST)
-		goto drop;
-
-	if (dev != net_dev)
-		goto drop;
-
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
-		return NET_RX_DROP;
-
-	if (!pskb_may_pull(skb,
-			   sizeof(struct iphdr) +
-			   sizeof(struct udphdr)))
-		goto drop;
-
-	b = (struct toi_pkt *)skb_network_header(skb);
-	h = &b->iph;
-
-	if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
-		goto drop;
-
-	/* Fragments are not supported */
-	if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
-		if (net_ratelimit())
-			printk(KERN_ERR "TuxOnIce: Ignoring fragmented "
-			       "cluster message.\n");
-		goto drop;
-	}
-
-	if (skb->len < ntohs(h->tot_len))
-		goto drop;
-
-	if (ip_fast_csum((char *) h, h->ihl))
-		goto drop;
-
-	if (b->udph.source != htons(toi_cluster_port_send) ||
-	    b->udph.dest != htons(toi_cluster_port_recv))
-		goto drop;
-
-	if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
-		goto drop;
-
-	len = ntohs(b->udph.len) - sizeof(struct udphdr);
-
-	/* Ok the front looks good, make sure we can get at the rest.  */
-	if (!pskb_may_pull(skb, skb->len))
-		goto drop;
-
-	b = (struct toi_pkt *)skb_network_header(skb);
-	h = &b->iph;
-
-	addr = SADDR;
-	PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
-			str_message(b->message), NIPQUAD(addr));
-
-	message = b->message & MSG_STATE_MASK;
-	ack = b->message & MSG_ACK_MASK;
-
-	for (index = 0; index < num_local_nodes; index++) {
-		int new_message = node_array[index].current_message,
-		    old_message = new_message;
-
-		if (index == SADDR || !old_message) {
-			PRINTK("Ignoring node %d (offline or self).\n", index);
-			continue;
-		}
-
-		/* One message at a time, please. */
-		spin_lock(&node_array[index].receive_lock);
-
-		result = add_update_member(index, SADDR, b->message);
-		if (result == -1) {
-			printk(KERN_INFO "Failed to add new cluster member "
-					NIPQUAD_FMT ".\n",
-					NIPQUAD(addr));
-			goto drop_unlock;
-		}
-
-		switch (b->message & MSG_STATE_MASK) {
-		case MSG_PING:
-			break;
-		case MSG_ABORT:
-			break;
-		case MSG_BYE:
-			break;
-		case MSG_HIBERNATE:
-			/* Can I hibernate? */
-			new_message = MSG_HIBERNATE |
-				((index & 1) ? MSG_NACK : MSG_ACK);
-			break;
-		case MSG_IMAGE:
-			/* Can I resume? */
-			new_message = MSG_IMAGE |
-				((index & 1) ? MSG_NACK : MSG_ACK);
-			if (new_message != old_message)
-				printk(KERN_ERR "Setting whether I can resume "
-						"to %d.\n", new_message);
-			break;
-		case MSG_IO:
-			new_message = MSG_IO | MSG_ACK;
-			break;
-		case MSG_RUNNING:
-			break;
-		default:
-			if (net_ratelimit())
-				printk(KERN_ERR "Unrecognised TuxOnIce cluster"
-					" message %d from " NIPQUAD_FMT ".\n",
-					b->message, NIPQUAD(addr));
-		};
-
-		if (old_message != new_message) {
-			node_array[index].current_message = new_message;
-			printk(KERN_INFO ">>> Sending new message for node "
-					"%d.\n", index);
-			toi_send_if(new_message, index);
-		} else if (!ack) {
-			printk(KERN_INFO ">>> Resending message for node %d.\n",
-					index);
-			toi_send_if(new_message, index);
-		}
-drop_unlock:
-		spin_unlock(&node_array[index].receive_lock);
-	};
-
-drop:
-	/* Throw the packet out. */
-	kfree_skb(skb);
-
-	return 0;
-}
-
-/*
- *  Send cluster message to single interface.
- */
-static void toi_send_if(int message, unsigned long my_id)
-{
-	struct sk_buff *skb;
-	struct toi_pkt *b;
-	int hh_len = LL_RESERVED_SPACE(net_dev);
-	struct iphdr *h;
-
-	/* Allocate packet */
-	skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL);
-	if (!skb)
-		return;
-	skb_reserve(skb, hh_len);
-	b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt));
-	memset(b, 0, sizeof(struct toi_pkt));
-
-	/* Construct IP header */
-	skb_reset_network_header(skb);
-	h = ip_hdr(skb);
-	h->version = 4;
-	h->ihl = 5;
-	h->tot_len = htons(sizeof(struct toi_pkt));
-	h->frag_off = htons(IP_DF);
-	h->ttl = 64;
-	h->protocol = IPPROTO_UDP;
-	h->daddr = htonl(INADDR_BROADCAST);
-	h->check = ip_fast_csum((unsigned char *) h, h->ihl);
-
-	/* Construct UDP header */
-	b->udph.source = htons(toi_cluster_port_send);
-	b->udph.dest = htons(toi_cluster_port_recv);
-	b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
-	/* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
-
-	/* Construct message */
-	b->message = message;
-	b->sid = my_id;
-	b->htype = net_dev->type; /* can cause undefined behavior */
-	b->hlen = net_dev->addr_len;
-	memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
-	b->secs = htons(3); /* 3 seconds */
-
-	/* Chain packet down the line... */
-	skb->dev = net_dev;
-	skb->protocol = htons(ETH_P_IP);
-	if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol),
-		     net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) ||
-			dev_queue_xmit(skb) < 0)
-		printk(KERN_INFO "E");
-}
-
-/*	=========================================		*/
-
-/*			kTOICluster			*/
-
-static atomic_t num_cluster_threads;
-static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);
-
-static int kTOICluster(void *data)
-{
-	unsigned long my_id;
-
-	my_id = atomic_add_return(1, &num_cluster_threads) - 1;
-	node_array[my_id].current_message = (unsigned long) data;
-
-	PRINTK("kTOICluster daemon %lu starting.\n", my_id);
-
-	current->flags |= PF_NOFREEZE;
-
-	while (node_array[my_id].current_message) {
-		toi_send_if(node_array[my_id].current_message, my_id);
-		sleep_on_timeout(&clusterd_events,
-				cluster_message_timeout);
-		PRINTK("Link state %lu is %d.\n", my_id,
-				node_array[my_id].current_message);
-	}
-
-	toi_send_if(MSG_BYE, my_id);
-	atomic_dec(&num_cluster_threads);
-	wake_up(&clusterd_events);
-
-	PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
-	__set_current_state(TASK_RUNNING);
-	return 0;
-}
-
-static void kill_clusterd(void)
-{
-	int i;
-
-	for (i = 0; i < num_local_nodes; i++) {
-		if (node_array[i].current_message) {
-			PRINTK("Seeking to kill clusterd %d.\n", i);
-			node_array[i].current_message = 0;
-		}
-	}
-	wait_event(clusterd_events,
-			!atomic_read(&num_cluster_threads));
-	PRINTK("All cluster daemons have exited.\n");
-}
-
-static int peers_not_in_message(int index, int message, int precise)
-{
-	struct cluster_member *this;
-	unsigned long flags;
-	int result = 0;
-
-	spin_lock_irqsave(&node_array[index].member_list_lock, flags);
-	list_for_each_entry(this, &node_array[index].member_list, list) {
-		if (this->ignore)
-			continue;
-
-		PRINTK("Peer %d.%d.%d.%d sending %s. "
-			"Seeking %s.\n",
-			NIPQUAD(this->addr),
-			str_message(this->message), str_message(message));
-		if ((precise ? this->message :
-					this->message & MSG_STATE_MASK) !=
-					message)
-			result++;
-	}
-	spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-	PRINTK("%d peers in sought message.\n", result);
-	return result;
-}
-
-static void reset_ignored(int index)
-{
-	struct cluster_member *this;
-	unsigned long flags;
-
-	spin_lock_irqsave(&node_array[index].member_list_lock, flags);
-	list_for_each_entry(this, &node_array[index].member_list, list)
-		this->ignore = 0;
-	node_array[index].ignored_peer_count = 0;
-	spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-}
-
-static int peers_in_message(int index, int message, int precise)
-{
-	return node_array[index].peer_count -
-		node_array[index].ignored_peer_count -
-		peers_not_in_message(index, message, precise);
-}
-
-static int time_to_continue(int index, unsigned long start, int message)
-{
-	int first = peers_not_in_message(index, message, 0);
-	int second = peers_in_message(index, message, 1);
-
-	PRINTK("First part returns %d, second returns %d.\n", first, second);
-
-	if (!first && !second) {
-		PRINTK("All peers answered message %d.\n",
-			message);
-		return 1;
-	}
-
-	if (time_after(jiffies, start + continue_delay)) {
-		PRINTK("Timeout reached.\n");
-		return 1;
-	}
-
-	PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
-			start + continue_delay);
-	return 0;
-}
-
-void toi_initiate_cluster_hibernate(void)
-{
-	int result;
-	unsigned long start;
-
-	result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
-	if (result)
-		return;
-
-	toi_send_if(MSG_HIBERNATE, 0);
-
-	start = jiffies;
-	wait_event(node_array[0].member_events,
-			time_to_continue(0, start, MSG_HIBERNATE));
-
-	if (test_action_state(TOI_FREEZER_TEST)) {
-		toi_send_if(MSG_ABORT, 0);
-
-		start = jiffies;
-		wait_event(node_array[0].member_events,
-			time_to_continue(0, start, MSG_RUNNING));
-
-		do_toi_step(STEP_QUIET_CLEANUP);
-		return;
-	}
-
-	toi_send_if(MSG_IO, 0);
-
-	result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
-	if (result)
-		return;
-
-	/* This code runs at resume time too! */
-	if (toi_in_hibernate)
-		result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
-}
-
-/* toi_cluster_print_debug_stats
- *
- * Description:	Print information to be recorded for debugging purposes into a
- * 		buffer.
- * Arguments:	buffer: Pointer to a buffer into which the debug info will be
- * 			printed.
- * 		size:	Size of the buffer.
- * Returns:	Number of characters written to the buffer.
- */
-static int toi_cluster_print_debug_stats(char *buffer, int size)
-{
-	int len;
-
-	if (strlen(toi_cluster_iface))
-		len = scnprintf(buffer, size,
-				"- Cluster interface is '%s'.\n",
-				toi_cluster_iface);
-	else
-		len = scnprintf(buffer, size,
-				"- Cluster support is disabled.\n");
-	return len;
-}
-
-/* cluster_memory_needed
- *
- * Description:	Tell the caller how much memory we need to operate during
- * 		hibernate/resume.
- * Returns:	Unsigned long. Maximum number of bytes of memory required for
- * 		operation.
- */
-static int toi_cluster_memory_needed(void)
-{
-	return 0;
-}
-
-static int toi_cluster_storage_needed(void)
-{
-	return 1 + strlen(toi_cluster_iface);
-}
-
-/* toi_cluster_save_config_info
- *
- * Description:	Save informaton needed when reloading the image at resume time.
- * Arguments:	Buffer:		Pointer to a buffer of size PAGE_SIZE.
- * Returns:	Number of bytes used for saving our data.
- */
-static int toi_cluster_save_config_info(char *buffer)
-{
-	strcpy(buffer, toi_cluster_iface);
-	return strlen(toi_cluster_iface + 1);
-}
-
-/* toi_cluster_load_config_info
- *
- * Description:	Reload information needed for declustering the image at
- * 		resume time.
- * Arguments:	Buffer:		Pointer to the start of the data.
- *		Size:		Number of bytes that were saved.
- */
-static void toi_cluster_load_config_info(char *buffer, int size)
-{
-	strncpy(toi_cluster_iface, buffer, size);
-	return;
-}
-
-static void cluster_startup(void)
-{
-	int have_image = do_check_can_resume(), i;
-	unsigned long start = jiffies, initial_message;
-	struct task_struct *p;
-
-	initial_message = MSG_IMAGE;
-
-	have_image = 1;
-
-	for (i = 0; i < num_local_nodes; i++) {
-		PRINTK("Starting ktoiclusterd %d.\n", i);
-		p = kthread_create(kTOICluster, (void *) initial_message,
-				"ktoiclusterd/%d", i);
-		if (IS_ERR(p)) {
-			printk(KERN_ERR "Failed to start ktoiclusterd.\n");
-			return;
-		}
-
-		wake_up_process(p);
-	}
-
-	/* Wait for delay or someone else sending first message */
-	wait_event(node_array[0].member_events, time_to_continue(0, start,
-				MSG_IMAGE));
-
-	others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
-
-	printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
-		" %d.\n", have_image ? "" : "don't ", others_have_image);
-
-	if (have_image) {
-		int result;
-
-		/* Start to resume */
-		printk(KERN_INFO "  === Starting to resume ===  \n");
-		node_array[0].current_message = MSG_IO;
-		toi_send_if(MSG_IO, 0);
-
-		/* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
-		result = 0;
-
-		if (!result) {
-			/*
-			 * Atomic restore - we'll come back in the hibernation
-			 * path.
-			 */
-
-			/* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
-			result = 0;
-
-			/* do_toi_step(STEP_QUIET_CLEANUP); */
-		}
-
-		node_array[0].current_message |= MSG_NACK;
-
-		/* For debugging - disable for real life? */
-		wait_event(node_array[0].member_events,
-				time_to_continue(0, start, MSG_IO));
-	}
-
-	if (others_have_image) {
-		/* Wait for them to resume */
-		printk(KERN_INFO "Waiting for other nodes to resume.\n");
-		start = jiffies;
-		wait_event(node_array[0].member_events,
-				time_to_continue(0, start, MSG_RUNNING));
-		if (peers_not_in_message(0, MSG_RUNNING, 0))
-			printk(KERN_INFO "Timed out while waiting for other "
-					"nodes to resume.\n");
-	}
-
-	/* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
-	 * as appropriate.
-	 *
-	 * If we don't have an image:
-	 * - Wait until someone else says they have one, or conditions are met
-	 *   for continuing to boot (n machines or t seconds).
-	 * - If anyone has an image, wait for them to resume before continuing
-	 *   to boot.
-	 *
-	 * If we have an image:
-	 * - Wait until conditions are met before continuing to resume (n
-	 *   machines or t seconds). Send RESUME_PREP and freeze processes.
-	 *   NACK_PREP if freezing fails (shouldn't) and follow logic for
-	 *   us having no image above. On success, wait for [N]ACK_PREP from
-	 *   other machines. Read image (including atomic restore) until done.
-	 *   Wait for ACK_READ from others (should never fail). Thaw processes
-	 *   and do post-resume. (The section after the atomic restore is done
-	 *   via the code for hibernating).
-	 */
-
-	node_array[0].current_message = MSG_RUNNING;
-}
-
-/* toi_cluster_open_iface
- *
- * Description:	Prepare to use an interface.
- */
-
-static int toi_cluster_open_iface(void)
-{
-	struct net_device *dev;
-
-	rtnl_lock();
-
-	for_each_netdev(&init_net, dev) {
-		if (/* dev == &init_net.loopback_dev || */
-		    strcmp(dev->name, toi_cluster_iface))
-			continue;
-
-		net_dev = dev;
-		break;
-	}
-
-	rtnl_unlock();
-
-	if (!net_dev) {
-		printk(KERN_ERR MYNAME ": Device %s not found.\n",
-				toi_cluster_iface);
-		return -ENODEV;
-	}
-
-	dev_add_pack(&toi_cluster_packet_type);
-	added_pack = 1;
-
-	loopback_mode = (net_dev == init_net.loopback_dev);
-	num_local_nodes = loopback_mode ? 8 : 1;
-
-	PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
-			loopback_mode ? "on" : "off", num_local_nodes);
-
-	cluster_startup();
-	return 0;
-}
-
-/* toi_cluster_close_iface
- *
- * Description: Stop using an interface.
- */
-
-static int toi_cluster_close_iface(void)
-{
-	kill_clusterd();
-	if (added_pack) {
-		dev_remove_pack(&toi_cluster_packet_type);
-		added_pack = 0;
-	}
-	return 0;
-}
-
-static void write_side_effect(void)
-{
-	if (toi_cluster_ops.enabled) {
-		toi_cluster_open_iface();
-		set_toi_state(TOI_CLUSTER_MODE);
-	} else {
-		toi_cluster_close_iface();
-		clear_toi_state(TOI_CLUSTER_MODE);
-	}
-}
-
-static void node_write_side_effect(void)
-{
-}
-
-/*
- * data for our sysfs entries.
- */
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0,
-			NULL),
-	SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0,
-			write_side_effect),
-	SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL),
-	SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script,
-			256, 0, NULL),
-	SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script,
-			256, 0, STRING),
-	SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ,
-			0)
-};
-
-/*
- * Ops structure.
- */
-
-static struct toi_module_ops toi_cluster_ops = {
-	.type			= FILTER_MODULE,
-	.name			= "Cluster",
-	.directory		= "cluster",
-	.module			= THIS_MODULE,
-	.memory_needed 		= toi_cluster_memory_needed,
-	.print_debug_info	= toi_cluster_print_debug_stats,
-	.save_config_info	= toi_cluster_save_config_info,
-	.load_config_info	= toi_cluster_load_config_info,
-	.storage_needed		= toi_cluster_storage_needed,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-#ifdef MODULE
-#define INIT static __init
-#define EXIT static __exit
-#else
-#define INIT
-#define EXIT
-#endif
-
-INIT int toi_cluster_init(void)
-{
-	int temp = toi_register_module(&toi_cluster_ops), i;
-	struct kobject *kobj = toi_cluster_ops.dir_kobj;
-
-	for (i = 0; i < MAX_LOCAL_NODES; i++) {
-		node_array[i].current_message = 0;
-		INIT_LIST_HEAD(&node_array[i].member_list);
-		init_waitqueue_head(&node_array[i].member_events);
-		spin_lock_init(&node_array[i].member_list_lock);
-		spin_lock_init(&node_array[i].receive_lock);
-
-		/* Set up sysfs entry */
-		node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
-				sizeof(node_array[i].sysfs_data.attr.name),
-				GFP_KERNEL);
-		sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d",
-				i);
-		node_array[i].sysfs_data.attr.mode = SYSFS_RW;
-		node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
-		node_array[i].sysfs_data.flags = 0;
-		node_array[i].sysfs_data.data.integer.variable =
-			(int *) &node_array[i].current_message;
-		node_array[i].sysfs_data.data.integer.minimum = 0;
-		node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
-		node_array[i].sysfs_data.write_side_effect =
-			node_write_side_effect;
-		toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
-	}
-
-	toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
-
-	if (toi_cluster_ops.enabled)
-		toi_cluster_open_iface();
-
-	return temp;
-}
-
-EXIT void toi_cluster_exit(void)
-{
-	int i;
-	toi_cluster_close_iface();
-
-	for (i = 0; i < MAX_LOCAL_NODES; i++)
-		toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
-				&node_array[i].sysfs_data);
-	toi_unregister_module(&toi_cluster_ops);
-}
-
-static int __init toi_cluster_iface_setup(char *iface)
-{
-	toi_cluster_ops.enabled = (*iface &&
-			strcmp(iface, "off"));
-
-	if (toi_cluster_ops.enabled)
-		strncpy(toi_cluster_iface, iface, strlen(iface));
-}
-
-__setup("toi_cluster=", toi_cluster_iface_setup);
diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h
deleted file mode 100644
index 84356b304..000000000
--- a/kernel/power/tuxonice_cluster.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * kernel/power/tuxonice_cluster.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifdef CONFIG_TOI_CLUSTER
-extern int toi_cluster_init(void);
-extern void toi_cluster_exit(void);
-extern void toi_initiate_cluster_hibernate(void);
-#else
-static inline int toi_cluster_init(void) { return 0; }
-static inline void toi_cluster_exit(void) { }
-static inline void toi_initiate_cluster_hibernate(void) { }
-#endif
-
diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
deleted file mode 100644
index d118568b7..000000000
--- a/kernel/power/tuxonice_compress.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * kernel/power/compression.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data compression routines for TuxOnIce,
- * using cryptoapi.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-
-#include "tuxonice_builtin.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-
-static int toi_expected_compression;
-
-static struct toi_module_ops toi_compression_ops;
-static struct toi_module_ops *next_driver;
-
-static char toi_compressor_name[32] = "lzo";
-
-static DEFINE_MUTEX(stats_lock);
-
-struct cpu_context {
-	u8 *page_buffer;
-	struct crypto_comp *transform;
-	unsigned int len;
-	u8 *buffer_start;
-	u8 *output_buffer;
-};
-
-#define OUT_BUF_SIZE (2 * PAGE_SIZE)
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-
-/*
- * toi_crypto_prepare
- *
- * Prepare to do some work by allocating buffers and transforms.
- */
-static int toi_compress_crypto_prepare(void)
-{
-	int cpu;
-
-	if (!*toi_compressor_name) {
-		printk(KERN_INFO "TuxOnIce: Compression enabled but no "
-				"compressor name set.\n");
-		return 1;
-	}
-
-	for_each_online_cpu(cpu) {
-		struct cpu_context *this = &per_cpu(contexts, cpu);
-		this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0);
-		if (IS_ERR(this->transform)) {
-			printk(KERN_INFO "TuxOnIce: Failed to initialise the "
-					"%s compression transform.\n",
-					toi_compressor_name);
-			this->transform = NULL;
-			return 1;
-		}
-
-		this->page_buffer =
-			(char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP);
-
-		if (!this->page_buffer) {
-			printk(KERN_ERR
-			  "Failed to allocate a page buffer for TuxOnIce "
-			  "compression driver.\n");
-			return -ENOMEM;
-		}
-
-		this->output_buffer =
-			(char *) vmalloc_32(OUT_BUF_SIZE);
-
-		if (!this->output_buffer) {
-			printk(KERN_ERR
-			  "Failed to allocate a output buffer for TuxOnIce "
-			  "compression driver.\n");
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-
-static int toi_compress_rw_cleanup(int writing)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		struct cpu_context *this = &per_cpu(contexts, cpu);
-		if (this->transform) {
-			crypto_free_comp(this->transform);
-			this->transform = NULL;
-		}
-
-		if (this->page_buffer)
-			toi_free_page(16, (unsigned long) this->page_buffer);
-
-		this->page_buffer = NULL;
-
-		if (this->output_buffer)
-			vfree(this->output_buffer);
-
-		this->output_buffer = NULL;
-	}
-
-	return 0;
-}
-
-/*
- * toi_compress_init
- */
-
-static int toi_compress_init(int toi_or_resume)
-{
-	if (!toi_or_resume)
-		return 0;
-
-	toi_compress_bytes_in = 0;
-	toi_compress_bytes_out = 0;
-
-	next_driver = toi_get_next_filter(&toi_compression_ops);
-
-	return next_driver ? 0 : -ECHILD;
-}
-
-/*
- * toi_compress_rw_init()
- */
-
-static int toi_compress_rw_init(int rw, int stream_number)
-{
-	if (toi_compress_crypto_prepare()) {
-		printk(KERN_ERR "Failed to initialise compression "
-				"algorithm.\n");
-		if (rw == READ) {
-			printk(KERN_INFO "Unable to read the image.\n");
-			return -ENODEV;
-		} else {
-			printk(KERN_INFO "Continuing without "
-				"compressing the image.\n");
-			toi_compression_ops.enabled = 0;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * toi_compress_write_page()
- *
- * Compress a page of data, buffering output and passing on filled
- * pages to the next module in the pipeline.
- *
- * Buffer_page:	Pointer to a buffer of size PAGE_SIZE, containing
- * data to be compressed.
- *
- * Returns:	0 on success. Otherwise the error is that returned by later
- * 		modules, -ECHILD if we have a broken pipeline or -EIO if
- * 		zlib errs.
- */
-static int toi_compress_write_page(unsigned long index, int buf_type,
-		void *buffer_page, unsigned int buf_size)
-{
-	int ret = 0, cpu = smp_processor_id();
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-	u8* output_buffer = buffer_page;
-	int output_len = buf_size;
-	int out_buf_type = buf_type;
-
-	if (ctx->transform) {
-
-		ctx->buffer_start = TOI_MAP(buf_type, buffer_page);
-		ctx->len = OUT_BUF_SIZE;
-
-		ret = crypto_comp_compress(ctx->transform,
-			ctx->buffer_start, buf_size,
-			ctx->output_buffer, &ctx->len);
-
-		TOI_UNMAP(buf_type, buffer_page);
-
-		toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
-				"CPU %d, index %lu: %d bytes",
-				cpu, index, ctx->len);
-
-		if (!ret && ctx->len < buf_size) { /* some compression */
-			output_buffer = ctx->output_buffer;
-			output_len = ctx->len;
-			out_buf_type = TOI_VIRT;
-		}
-
-	}
-
-	mutex_lock(&stats_lock);
-
-	toi_compress_bytes_in += buf_size;
-	toi_compress_bytes_out += output_len;
-
-	mutex_unlock(&stats_lock);
-
-	if (!ret)
-		ret = next_driver->write_page(index, out_buf_type,
-				output_buffer, output_len);
-
-	return ret;
-}
-
-/*
- * toi_compress_read_page()
- * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
- *
- * Retrieve data from later modules and decompress it until the input buffer
- * is filled.
- * Zero if successful. Error condition from me or from downstream on failure.
- */
-static int toi_compress_read_page(unsigned long *index, int buf_type,
-		void *buffer_page, unsigned int *buf_size)
-{
-	int ret, cpu = smp_processor_id();
-	unsigned int len;
-	unsigned int outlen = PAGE_SIZE;
-	char *buffer_start;
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
-	if (!ctx->transform)
-		return next_driver->read_page(index, TOI_PAGE, buffer_page,
-				buf_size);
-
-	/*
-	 * All our reads must be synchronous - we can't decompress
-	 * data that hasn't been read yet.
-	 */
-
-	ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len);
-
-	buffer_start = kmap(buffer_page);
-
-	/* Error or uncompressed data */
-	if (ret || len == PAGE_SIZE) {
-		memcpy(buffer_start, ctx->page_buffer, len);
-		goto out;
-	}
-
-	ret = crypto_comp_decompress(
-			ctx->transform,
-			ctx->page_buffer,
-			len, buffer_start, &outlen);
-
-	toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
-			"CPU %d, index %lu: %d=>%d (%d).",
-			cpu, *index, len, outlen, ret);
-
-	if (ret)
-		abort_hibernate(TOI_FAILED_IO,
-			"Compress_read returned %d.\n", ret);
-	else if (outlen != PAGE_SIZE) {
-		abort_hibernate(TOI_FAILED_IO,
-			"Decompression yielded %d bytes instead of %ld.\n",
-			outlen, PAGE_SIZE);
-		printk(KERN_ERR "Decompression yielded %d bytes instead of "
-				"%ld.\n", outlen, PAGE_SIZE);
-		ret = -EIO;
-		*buf_size = outlen;
-	}
-out:
-	TOI_UNMAP(buf_type, buffer_page);
-	return ret;
-}
-
-/*
- * toi_compress_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_compress_print_debug_stats(char *buffer, int size)
-{
-	unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT,
-		      pages_out = toi_compress_bytes_out >> PAGE_SHIFT;
-	int len;
-
-	/* Output the compression ratio achieved. */
-	if (*toi_compressor_name)
-		len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
-				toi_compressor_name);
-	else
-		len = scnprintf(buffer, size, "- Compressor is not set.\n");
-
-	if (pages_in)
-		len += scnprintf(buffer+len, size - len, "  Compressed "
-			"%lu bytes into %lu (%ld percent compression).\n",
-		  toi_compress_bytes_in,
-		  toi_compress_bytes_out,
-		  (pages_in - pages_out) * 100 / pages_in);
-	return len;
-}
-
-/*
- * toi_compress_compression_memory_needed
- *
- * Tell the caller how much memory we need to operate during hibernate/resume.
- * Returns: Unsigned long. Maximum number of bytes of memory required for
- * operation.
- */
-static int toi_compress_memory_needed(void)
-{
-	return 2 * PAGE_SIZE;
-}
-
-static int toi_compress_storage_needed(void)
-{
-	return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
-		strlen(toi_compressor_name) + 1;
-}
-
-/*
- * toi_compress_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save informaton needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_compress_save_config_info(char *buffer)
-{
-	int len = strlen(toi_compressor_name) + 1, offset = 0;
-
-	*((unsigned long *) buffer) = toi_compress_bytes_in;
-	offset += sizeof(unsigned long);
-	*((unsigned long *) (buffer + offset)) = toi_compress_bytes_out;
-	offset += sizeof(unsigned long);
-	*((int *) (buffer + offset)) = toi_expected_compression;
-	offset += sizeof(int);
-	*((int *) (buffer + offset)) = len;
-	offset += sizeof(int);
-	strncpy(buffer + offset, toi_compressor_name, len);
-	return offset + len;
-}
-
-/* toi_compress_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Description:	Reload information needed for decompressing the image at
- * resume time.
- */
-static void toi_compress_load_config_info(char *buffer, int size)
-{
-	int len, offset = 0;
-
-	toi_compress_bytes_in = *((unsigned long *) buffer);
-	offset += sizeof(unsigned long);
-	toi_compress_bytes_out = *((unsigned long *) (buffer + offset));
-	offset += sizeof(unsigned long);
-	toi_expected_compression = *((int *) (buffer + offset));
-	offset += sizeof(int);
-	len = *((int *) (buffer + offset));
-	offset += sizeof(int);
-	strncpy(toi_compressor_name, buffer + offset, len);
-}
-
-static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
-	bkd->compress_bytes_in = toi_compress_bytes_in;
-	bkd->compress_bytes_out = toi_compress_bytes_out;
-}
-
-static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
-	toi_compress_bytes_in = bkd->compress_bytes_in;
-	toi_compress_bytes_out = bkd->compress_bytes_out;
-}
-
-/*
- * toi_expected_compression_ratio
- *
- * Description:	Returns the expected ratio between data passed into this module
- * 		and the amount of data output when writing.
- * Returns:	100 if the module is disabled. Otherwise the value set by the
- * 		user via our sysfs entry.
- */
-
-static int toi_compress_expected_ratio(void)
-{
-	if (!toi_compression_ops.enabled)
-		return 100;
-	else
-		return 100 - toi_expected_compression;
-}
-
-/*
- * data for our sysfs entries.
- */
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression,
-			0, 99, 0, NULL),
-	SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0,
-			NULL),
-	SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL),
-};
-
-/*
- * Ops structure.
- */
-static struct toi_module_ops toi_compression_ops = {
-	.type			= FILTER_MODULE,
-	.name			= "compression",
-	.directory		= "compression",
-	.module			= THIS_MODULE,
-	.initialise		= toi_compress_init,
-	.memory_needed 		= toi_compress_memory_needed,
-	.print_debug_info	= toi_compress_print_debug_stats,
-	.save_config_info	= toi_compress_save_config_info,
-	.load_config_info	= toi_compress_load_config_info,
-	.storage_needed		= toi_compress_storage_needed,
-	.expected_compression	= toi_compress_expected_ratio,
-
-	.pre_atomic_restore	= toi_compress_pre_atomic_restore,
-	.post_atomic_restore	= toi_compress_post_atomic_restore,
-
-	.rw_init		= toi_compress_rw_init,
-	.rw_cleanup		= toi_compress_rw_cleanup,
-
-	.write_page		= toi_compress_write_page,
-	.read_page		= toi_compress_read_page,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-static __init int toi_compress_load(void)
-{
-	return toi_register_module(&toi_compression_ops);
-}
-
-late_initcall(toi_compress_load);
diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c
deleted file mode 100644
index dc02a4acf..000000000
--- a/kernel/power/tuxonice_copy_before_write.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * kernel/power/tuxonice_copy_before_write.c
- *
- * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines (apart from the fault handling code) to deal with allocating memory
- * for copying pages before they are modified, restoring the contents and getting
- * the contents written to disk.
- */
-
-#include <linux/percpu-defs.h>
-#include <linux/sched.h>
-#include <linux/tuxonice.h>
-#include "tuxonice_alloc.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-
-DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states);
-#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw))
-#define toi_cbw_pool_size 100
-
-static void _toi_free_cbw_data(struct toi_cbw_state *state)
-{
-    struct toi_cbw *page_ptr, *ptr, *next;
-
-    page_ptr = ptr = state->first;
-
-    while(ptr) {
-        next = ptr->next;
-
-        if (ptr->virt) {
-            toi__free_page(40, virt_to_page(ptr->virt));
-        }
-        if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) {
-            /* Must be on a new page - free the previous one. */
-            toi__free_page(40, virt_to_page(page_ptr));
-            page_ptr = ptr;
-        }
-        ptr = next;
-    }
-
-    if (page_ptr) {
-        toi__free_page(40, virt_to_page(page_ptr));
-    }
-
-    state->first = state->next = state->last = NULL;
-    state->size = 0;
-}
-
-void toi_free_cbw_data(void)
-{
-    int i;
-
-    for_each_online_cpu(i) {
-        struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
-
-        if (!state->first)
-            continue;
-
-        state->enabled = 0;
-
-        while (state->active) {
-            schedule();
-        }
-
-        _toi_free_cbw_data(state);
-    }
-}
-
-static int _toi_allocate_cbw_data(struct toi_cbw_state *state)
-{
-    while(state->size < toi_cbw_pool_size) {
-        int i;
-        struct toi_cbw *ptr;
-
-        ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL);
-
-        if (!ptr) {
-            return -ENOMEM;
-        }
-
-        if (!state->first) {
-            state->first = state->next = state->last = ptr;
-        }
-
-        for (i = 0; i < CBWS_PER_PAGE; i++) {
-            struct toi_cbw *cbw = &ptr[i];
-
-            cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL);
-            if (!cbw->virt) {
-                state->size += i;
-                printk("Out of memory allocating CBW pages.\n");
-                return -ENOMEM;
-            }
-
-            if (cbw == state->first)
-                continue;
-
-            state->last->next = cbw;
-            state->last = cbw;
-        }
-
-        state->size += CBWS_PER_PAGE;
-    }
-
-    state->enabled = 1;
-
-    return 0;
-}
-
-
-int toi_allocate_cbw_data(void)
-{
-    int i, result;
-
-    for_each_online_cpu(i) {
-        struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
-
-        result = _toi_allocate_cbw_data(state);
-
-        if (result)
-            return result;
-    }
-
-    return 0;
-}
-
-void toi_cbw_restore(void)
-{
-    if (!toi_keeping_image)
-        return;
-
-}
-
-void toi_cbw_write(void)
-{
-    if (!toi_keeping_image)
-        return;
-
-}
-
-/**
- * toi_cbw_test_read - Test copy before write on one page
- *
- * Allocate copy before write buffers, then make one page only copy-before-write
- * and attempt to write to it. We should then be able to retrieve the original
- * version from the cbw buffer and the modified version from the page itself.
- */
-static int toi_cbw_test_read(const char *buffer, int count)
-{
-    unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL);
-    char *original = "Original contents";
-    char *modified = "Modified material";
-    struct page *page = virt_to_page(virt);
-    int i, len = 0, found = 0, pfn = page_to_pfn(page);
-
-    if (!page) {
-        printk("toi_cbw_test_read: Unable to allocate a page for testing.\n");
-        return -ENOMEM;
-    }
-
-    memcpy((char *) virt, original, strlen(original));
-
-    if (toi_allocate_cbw_data()) {
-        printk("toi_cbw_test_read: Unable to allocate cbw data.\n");
-        return -ENOMEM;
-    }
-
-    toi_reset_dirtiness_one(pfn, 0);
-
-    SetPageTOI_CBW(page);
-
-    memcpy((char *) virt, modified, strlen(modified));
-
-    if (strncmp((char *) virt, modified, strlen(modified))) {
-        len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n");
-    }
-
-    for_each_online_cpu(i) {
-        struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
-        struct toi_cbw *ptr = state->first, *last_ptr = ptr;
-
-        if (!found) {
-            while (ptr) {
-                if (ptr->pfn == pfn) {
-                    found = 1;
-                    if (strncmp(ptr->virt, original, strlen(original))) {
-                        len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n");
-                    } else {
-                        len += sprintf((char *) buffer + len, "Test passed. Buffer changed and original contents preserved.\n");
-                    }
-                    break;
-                }
-
-                last_ptr = ptr;
-                ptr = ptr->next;
-            }
-        }
-
-        if (!last_ptr)
-            len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i);
-    }
-
-    if (!found)
-        len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n");
-
-    toi_free_cbw_data();
-
-    return len;
-}
-
-/*
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read,
-			NULL, SYSFS_NEEDS_SM_FOR_READ, NULL),
-};
-
-static struct toi_module_ops toi_cbw_ops = {
-	.type					= MISC_HIDDEN_MODULE,
-	.name					= "copy_before_write debugging",
-	.directory				= "cbw",
-	.module					= THIS_MODULE,
-	.early					= 1,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-int toi_cbw_init(void)
-{
-	int result = toi_register_module(&toi_cbw_ops);
-	return result;
-}
diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c
deleted file mode 100644
index 3b558b220..000000000
--- a/kernel/power/tuxonice_extent.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * kernel/power/tuxonice_extent.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * These functions encapsulate the manipulation of storage metadata.
- */
-
-#include <linux/suspend.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-#include "tuxonice.h"
-
-/**
- * toi_get_extent - return a free extent
- *
- * May fail, returning NULL instead.
- **/
-static struct hibernate_extent *toi_get_extent(void)
-{
-	return (struct hibernate_extent *) toi_kzalloc(2,
-			sizeof(struct hibernate_extent), TOI_ATOMIC_GFP);
-}
-
-/**
- * toi_put_extent_chain - free a chain of extents starting from value 'from'
- * @chain:	Chain to free.
- *
- * Note that 'from' is an extent value, and may be part way through an extent.
- * In this case, the extent should be truncated (if necessary) and following
- * extents freed.
- **/
-void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from)
-{
-	struct hibernate_extent *this;
-
-	this = chain->first;
-
-	while (this) {
-		struct hibernate_extent *next = this->next;
-
-                // Delete the whole extent?
-                if (this->start >= from) {
-                    chain->size -= (this->end - this->start + 1);
-                    if (chain->first == this)
-                        chain->first = next;
-                    if (chain->last_touched == this)
-                        chain->last_touched = NULL;
-                    if (chain->current_extent == this)
-                        chain->current_extent = NULL;
-                    toi_kfree(2, this, sizeof(*this));
-                    chain->num_extents--;
-                } else if (this->end >= from) {
-                    // Delete part of the extent
-                    chain->size -= (this->end - from + 1);
-                    this->start = from;
-                }
-		this = next;
-	}
-}
-
-/**
- * toi_put_extent_chain - free a whole chain of extents
- * @chain:	Chain to free.
- **/
-void toi_put_extent_chain(struct hibernate_extent_chain *chain)
-{
-    toi_put_extent_chain_from(chain, 0);
-}
-
-/**
- * toi_add_to_extent_chain - add an extent to an existing chain
- * @chain:	Chain to which the extend should be added
- * @start:	Start of the extent (first physical block)
- * @end:	End of the extent (last physical block)
- *
- * The chain information is updated if the insertion is successful.
- **/
-int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
-		unsigned long start, unsigned long end)
-{
-	struct hibernate_extent *new_ext = NULL, *cur_ext = NULL;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0,
-		"Adding extent %lu-%lu to chain %p.\n", start, end, chain);
-
-	/* Find the right place in the chain */
-	if (chain->last_touched && chain->last_touched->start < start)
-		cur_ext = chain->last_touched;
-	else if (chain->first && chain->first->start < start)
-		cur_ext = chain->first;
-
-	if (cur_ext) {
-		while (cur_ext->next && cur_ext->next->start < start)
-			cur_ext = cur_ext->next;
-
-		if (cur_ext->end == (start - 1)) {
-			struct hibernate_extent *next_ext = cur_ext->next;
-			cur_ext->end = end;
-
-			/* Merge with the following one? */
-			if (next_ext && cur_ext->end + 1 == next_ext->start) {
-				cur_ext->end = next_ext->end;
-				cur_ext->next = next_ext->next;
-				toi_kfree(2, next_ext, sizeof(*next_ext));
-				chain->num_extents--;
-			}
-
-			chain->last_touched = cur_ext;
-			chain->size += (end - start + 1);
-
-			return 0;
-		}
-	}
-
-	new_ext = toi_get_extent();
-	if (!new_ext) {
-		printk(KERN_INFO "Error unable to append a new extent to the "
-				"chain.\n");
-		return -ENOMEM;
-	}
-
-	chain->num_extents++;
-	chain->size += (end - start + 1);
-	new_ext->start = start;
-	new_ext->end = end;
-
-	chain->last_touched = new_ext;
-
-	if (cur_ext) {
-		new_ext->next = cur_ext->next;
-		cur_ext->next = new_ext;
-	} else {
-		if (chain->first)
-			new_ext->next = chain->first;
-		chain->first = new_ext;
-	}
-
-	return 0;
-}
diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h
deleted file mode 100644
index cf1289efc..000000000
--- a/kernel/power/tuxonice_extent.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * kernel/power/tuxonice_extent.h
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations related to extents. Extents are
- * TuxOnIce's method of storing some of the metadata for the image.
- * See tuxonice_extent.c for more info.
- *
- */
-
-#include "tuxonice_modules.h"
-
-#ifndef EXTENT_H
-#define EXTENT_H
-
-struct hibernate_extent {
-	unsigned long start, end;
-	struct hibernate_extent *next;
-};
-
-struct hibernate_extent_chain {
-	unsigned long size; /* size of the chain ie sum (max-min+1) */
-	int num_extents;
-	struct hibernate_extent *first, *last_touched;
-	struct hibernate_extent *current_extent;
-	unsigned long current_offset;
-};
-
-/* Simplify iterating through all the values in an extent chain */
-#define toi_extent_for_each(extent_chain, extentpointer, value) \
-if ((extent_chain)->first) \
-	for ((extentpointer) = (extent_chain)->first, (value) = \
-			(extentpointer)->start; \
-	     ((extentpointer) && ((extentpointer)->next || (value) <= \
-				 (extentpointer)->end)); \
-	     (((value) == (extentpointer)->end) ? \
-		((extentpointer) = (extentpointer)->next, (value) = \
-		 ((extentpointer) ? (extentpointer)->start : 0)) : \
-			(value)++))
-
-extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from);
-#endif
diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c
deleted file mode 100644
index 607246051..000000000
--- a/kernel/power/tuxonice_file.c
+++ /dev/null
@@ -1,484 +0,0 @@
-/*
- * kernel/power/tuxonice_file.c
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file encapsulates functions for usage of a simple file as a
- * backing store. It is based upon the swapallocator, and shares the
- * same basic working. Here, though, we have nothing to do with
- * swapspace, and only one device to worry about.
- *
- * The user can just
- *
- * echo TuxOnIce > /path/to/my_file
- *
- * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file
- *
- * and
- *
- * echo /path/to/my_file > /sys/power/tuxonice/file/target
- *
- * then put what they find in /sys/power/tuxonice/resume
- * as their resume= parameter in lilo.conf (and rerun lilo if using it).
- *
- * Having done this, they're ready to hibernate and resume.
- *
- * TODO:
- * - File resizing.
- */
-
-#include <linux/blkdev.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_io.h"
-
-#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
-
-static struct toi_module_ops toi_fileops;
-
-static struct file *target_file;
-static struct block_device *toi_file_target_bdev;
-static unsigned long pages_available, pages_allocated;
-static char toi_file_target[256];
-static struct inode *target_inode;
-static int file_target_priority;
-static int used_devt;
-static int target_claim;
-static dev_t toi_file_dev_t;
-static int sig_page_index;
-
-/* For test_toi_file_target */
-static struct toi_bdev_info *file_chain;
-
-static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num)
-{
-	int j;
-	sector_t last = 0;
-
-	for (j = 0; j < dev_info->blocks_per_page; j++) {
-		sector_t this = bmap(target_inode,
-				page_num * dev_info->blocks_per_page + j);
-
-		if (!this || (last && (last + 1) != this))
-			break;
-
-		last = this;
-	}
-
-	return j == dev_info->blocks_per_page;
-}
-
-static unsigned long get_usable_pages(struct toi_bdev_info *dev_info)
-{
-	unsigned long result = 0;
-	struct block_device *bdev = dev_info->bdev;
-	int i;
-
-	switch (target_inode->i_mode & S_IFMT) {
-	case S_IFSOCK:
-	case S_IFCHR:
-	case S_IFIFO: /* Socket, Char, Fifo */
-		return -1;
-	case S_IFREG: /* Regular file: current size - holes + free
-			 space on part */
-		for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) {
-			if (has_contiguous_blocks(dev_info, i))
-				result++;
-		}
-		break;
-	case S_IFBLK: /* Block device */
-		if (!bdev->bd_disk) {
-			toi_message(TOI_IO, TOI_VERBOSE, 0,
-					"bdev->bd_disk null.");
-			return 0;
-		}
-
-		result = (bdev->bd_part ?
-			bdev->bd_part->nr_sects :
-			get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9);
-	}
-
-
-	return result;
-}
-
-static int toi_file_register_storage(void)
-{
-	struct toi_bdev_info *devinfo;
-	int result = 0;
-	struct fs_info *fs_info;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage.");
-	if (!strlen(toi_file_target)) {
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: "
-				"No target filename set.");
-		return 0;
-	}
-
-	target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0);
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.",
-			toi_file_target, target_file);
-
-	if (IS_ERR(target_file) || !target_file) {
-		target_file = NULL;
-		toi_file_dev_t = name_to_dev_t(toi_file_target);
-		if (!toi_file_dev_t) {
-			struct kstat stat;
-			int error = vfs_stat(toi_file_target, &stat);
-			printk(KERN_INFO "Open file %s returned %p and "
-					"name_to_devt failed.\n",
-					toi_file_target, target_file);
-			if (error) {
-				printk(KERN_INFO "Stating the file also failed."
-					" Nothing more we can do.\n");
-				return 0;
-			} else
-				toi_file_dev_t = stat.rdev;
-		}
-
-		toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t);
-		if (IS_ERR(toi_file_target_bdev)) {
-			printk(KERN_INFO "Got a dev_num (%lx) but failed to "
-					"open it.\n",
-					(unsigned long) toi_file_dev_t);
-			toi_file_target_bdev = NULL;
-			return 0;
-		}
-		used_devt = 1;
-		target_inode = toi_file_target_bdev->bd_inode;
-	} else
-		target_inode = target_file->f_mapping->host;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target.");
-	if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) ||
-	    S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) {
-		printk(KERN_INFO "File support works with regular files,"
-				" character files and block devices.\n");
-		/* Cleanup routine will undo the above */
-		return 0;
-	}
-
-	if (!used_devt) {
-		if (S_ISBLK(target_inode->i_mode)) {
-			toi_file_target_bdev = I_BDEV(target_inode);
-			if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE |
-						FMODE_READ, NULL))
-				target_claim = 1;
-		} else
-			toi_file_target_bdev = target_inode->i_sb->s_bdev;
-		if (!toi_file_target_bdev) {
-			printk(KERN_INFO "%s is not a valid file allocator "
-					"target.\n", toi_file_target);
-			return 0;
-		}
-		toi_file_dev_t = toi_file_target_bdev->bd_dev;
-	}
-
-	devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC);
-	if (!devinfo) {
-		printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n");
-		return -ENOMEM;
-	}
-
-	devinfo->bdev = toi_file_target_bdev;
-	devinfo->allocator = &toi_fileops;
-	devinfo->allocator_index = 0;
-
-	fs_info = fs_info_from_block_dev(toi_file_target_bdev);
-	if (fs_info && !IS_ERR(fs_info)) {
-		memcpy(devinfo->uuid, &fs_info->uuid, 16);
-		free_fs_info(fs_info);
-	} else
-		result = (int) PTR_ERR(fs_info);
-
-	/* Unlike swap code, only complain if fs_info_from_block_dev returned
-	 * -ENOMEM. The 'file' might be a full partition, so might validly not
-	 * have an identifiable type, UUID etc.
-	 */
-	if (result)
-		printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n",
-				result);
-	devinfo->dev_t = toi_file_dev_t;
-	devinfo->prio = file_target_priority;
-	devinfo->bmap_shift = target_inode->i_blkbits - 9;
-	devinfo->blocks_per_page =
-		(1 << (PAGE_SHIFT - target_inode->i_blkbits));
-	sprintf(devinfo->name, "file %s", toi_file_target);
-	file_chain = devinfo;
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap "
-			"shift is %d. Blocks per page %d.",
-			devinfo->dev_t, devinfo->prio, devinfo->bmap_shift,
-			devinfo->blocks_per_page);
-
-	/* Keep one aside for the signature */
-	pages_available = get_usable_pages(devinfo) - 1;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu "
-			"pages.", pages_available);
-
-	toi_bio_ops.register_storage(devinfo);
-	return 0;
-}
-
-static unsigned long toi_file_storage_available(void)
-{
-	return pages_available;
-}
-
-static int toi_file_allocate_storage(struct toi_bdev_info *chain,
-		unsigned long request)
-{
-	unsigned long available = pages_available - pages_allocated;
-	unsigned long to_add = min(available, request);
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated "
-		"is %lu. Allocating %lu pages from file.",
-		pages_available, pages_allocated, to_add);
-	pages_allocated += to_add;
-
-	return to_add;
-}
-
-/**
- * __populate_block_list - add an extent to the chain
- * @min:	Start of the extent (first physical block = sector)
- * @max:	End of the extent (last physical block = sector)
- *
- * If TOI_TEST_BIO is set, print a debug message, outputting the min and max
- * fs block numbers.
- **/
-static int __populate_block_list(struct toi_bdev_info *chain, int min, int max)
-{
-	if (test_action_state(TOI_TEST_BIO))
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.",
-			min << chain->bmap_shift,
-			((max + 1) << chain->bmap_shift) - 1);
-
-	return toi_add_to_extent_chain(&chain->blocks, min, max);
-}
-
-static int get_main_pool_phys_params(struct toi_bdev_info *chain)
-{
-	int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0;
-	unsigned long pages_mapped = 0;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks.");
-
-	if (chain->blocks.first)
-		toi_put_extent_chain(&chain->blocks);
-
-	if (!target_is_normal_file()) {
-		result = (pages_available > 0) ?
-			__populate_block_list(chain, chain->blocks_per_page,
-				(pages_allocated + 1) *
-				chain->blocks_per_page - 1) : 0;
-		return result;
-	}
-
-	/*
-	 * FIXME: We are assuming the first page is contiguous. Is that
-	 * assumption always right?
-	 */
-
-	for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
-		sector_t new_sector;
-
-		if (!has_contiguous_blocks(chain, i))
-			continue;
-
-		if (!have_sig_page) {
-			have_sig_page = 1;
-			sig_page_index = i;
-			continue;
-		}
-
-		pages_mapped++;
-
-		/* Ignore first page - it has the header */
-		if (pages_mapped == 1)
-			continue;
-
-		new_sector = bmap(target_inode, (i * chain->blocks_per_page));
-
-		/*
-		 * I'd love to be able to fill in holes and resize
-		 * files, but not yet...
-		 */
-
-		if (new_sector == extent_max + 1)
-			extent_max += chain->blocks_per_page;
-		else {
-			if (extent_min > -1) {
-				result = __populate_block_list(chain,
-						extent_min, extent_max);
-				if (result)
-					return result;
-			}
-
-			extent_min = new_sector;
-			extent_max = extent_min +
-				chain->blocks_per_page - 1;
-		}
-
-		if (pages_mapped == pages_allocated)
-			break;
-	}
-
-	if (extent_min > -1) {
-		result = __populate_block_list(chain, extent_min, extent_max);
-		if (result)
-			return result;
-	}
-
-	return 0;
-}
-
-static void toi_file_free_storage(struct toi_bdev_info *chain)
-{
-	pages_allocated = 0;
-	file_chain = NULL;
-}
-
-/**
- * toi_file_print_debug_stats - print debug info
- * @buffer:	Buffer to data to populate
- * @size:	Size of the buffer
- **/
-static int toi_file_print_debug_stats(char *buffer, int size)
-{
-	int len = scnprintf(buffer, size, "- File Allocator active.\n");
-
-	len += scnprintf(buffer+len, size-len, "  Storage available for "
-			"image: %lu pages.\n", pages_available);
-
-	return len;
-}
-
-static void toi_file_cleanup(int finishing_cycle)
-{
-	if (toi_file_target_bdev) {
-		if (target_claim) {
-			blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
-			target_claim = 0;
-		}
-
-		if (used_devt) {
-			blkdev_put(toi_file_target_bdev,
-					FMODE_READ | FMODE_NDELAY);
-			used_devt = 0;
-		}
-		toi_file_target_bdev = NULL;
-		target_inode = NULL;
-	}
-
-	if (target_file) {
-		filp_close(target_file, NULL);
-		target_file = NULL;
-	}
-
-	pages_available = 0;
-}
-
-/**
- * test_toi_file_target - sysfs callback for /sys/power/tuxonince/file/target
- *
- * Test wheter the target file is valid for hibernating.
- **/
-static void test_toi_file_target(void)
-{
-	int result = toi_file_register_storage();
-	sector_t sector;
-	char buf[50];
-	struct fs_info *fs_info;
-
-	if (result || !file_chain)
-		return;
-
-	/* This doesn't mean we're in business. Is any storage available? */
-	if (!pages_available)
-		goto out;
-
-	toi_file_allocate_storage(file_chain, 1);
-	result = get_main_pool_phys_params(file_chain);
-	if (result)
-		goto out;
-
-
-	sector = bmap(target_inode, sig_page_index *
-			file_chain->blocks_per_page) << file_chain->bmap_shift;
-
-	/* Use the uuid, or the dev_t if that fails */
-	fs_info = fs_info_from_block_dev(toi_file_target_bdev);
-	if (!fs_info || IS_ERR(fs_info)) {
-		bdevname(toi_file_target_bdev, buf);
-		sprintf(resume_file, "/dev/%s:%llu", buf,
-				(unsigned long long) sector);
-	} else {
-		int i;
-		hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0);
-
-		/* Remove the spaces */
-		for (i = 1; i < 16; i++) {
-			buf[2 * i] = buf[3 * i];
-			buf[2 * i + 1] = buf[3 * i + 1];
-		}
-		buf[32] = 0;
-		sprintf(resume_file, "UUID=%s:0x%llx", buf,
-				(unsigned long long) sector);
-		free_fs_info(fs_info);
-	}
-
-	toi_attempt_to_parse_resume_device(0);
-out:
-	toi_file_free_storage(file_chain);
-	toi_bio_ops.free_storage();
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256,
-		SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target),
-	SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL),
-	SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095,
-			4096, 0, NULL),
-};
-
-static struct toi_bio_allocator_ops toi_bio_fileops = {
-	.register_storage			= toi_file_register_storage,
-	.storage_available			= toi_file_storage_available,
-	.allocate_storage			= toi_file_allocate_storage,
-	.bmap					= get_main_pool_phys_params,
-	.free_storage				= toi_file_free_storage,
-};
-
-static struct toi_module_ops toi_fileops = {
-	.type					= BIO_ALLOCATOR_MODULE,
-	.name					= "file storage",
-	.directory				= "file",
-	.module					= THIS_MODULE,
-	.print_debug_info			= toi_file_print_debug_stats,
-	.cleanup				= toi_file_cleanup,
-	.bio_allocator_ops			= &toi_bio_fileops,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-static __init int toi_file_load(void)
-{
-	return toi_register_module(&toi_fileops);
-}
-
-late_initcall(toi_file_load);
diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
deleted file mode 100644
index bdcd832f3..000000000
--- a/kernel/power/tuxonice_highlevel.c
+++ /dev/null
@@ -1,1413 +0,0 @@
-/*
- * kernel/power/tuxonice_highlevel.c
- */
-/** \mainpage TuxOnIce.
- *
- * TuxOnIce provides support for saving and restoring an image of
- * system memory to an arbitrary storage device, either on the local computer,
- * or across some network. The support is entirely OS based, so TuxOnIce
- * works without requiring BIOS, APM or ACPI support. The vast majority of the
- * code is also architecture independant, so it should be very easy to port
- * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem
- * and preemption. Initramfses and initrds are also supported.
- *
- * TuxOnIce uses a modular design, in which the method of storing the image is
- * completely abstracted from the core code, as are transformations on the data
- * such as compression and/or encryption (multiple 'modules' can be used to
- * provide arbitrary combinations of functionality). The user interface is also
- * modular, so that arbitrarily simple or complex interfaces can be used to
- * provide anything from debugging information through to eye candy.
- *
- * \section Copyright
- *
- * TuxOnIce is released under the GPLv2.
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR>
- *
- * \section Credits
- *
- * Nigel would like to thank the following people for their work:
- *
- * Bernard Blackham <bernard@blackham.com.au><BR>
- * Web page & Wiki administration, some coding. A person without whom
- * TuxOnIce would not be where it is.
- *
- * Michael Frank <mhf@linuxmail.org><BR>
- * Extensive testing and help with improving stability. I was constantly
- * amazed by the quality and quantity of Michael's help.
- *
- * Pavel Machek <pavel@ucw.cz><BR>
- * Modifications, defectiveness pointing, being with Gabor at the very
- * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and
- * 2.5.17. Even though Pavel and I disagree on the direction suspend to
- * disk should take, I appreciate the valuable work he did in helping Gabor
- * get the concept working.
- *
- * ..and of course the myriads of TuxOnIce users who have helped diagnose
- * and fix bugs, made suggestions on how to improve the code, proofread
- * documentation, and donated time and money.
- *
- * Thanks also to corporate sponsors:
- *
- * <B>Redhat.</B>Sometime employer from May 2006 (my fault, not Redhat's!).
- *
- * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who
- * allowed him to work on TuxOnIce and PM related issues on company time.
- *
- * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct
- * 2003 to Jan 2004.
- *
- * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing
- * maintenance of SMP and Highmem support.
- *
- * <B>OSDL.</B> Provided access to various hardware configurations, make
- * occasional small donations to the project.
- */
-
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include <linux/freezer.h>
-#include <generated/utsrelease.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/writeback.h>
-#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */
-#include <linux/bio.h>
-#include <linux/kgdb.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_cluster.h"
-
-/*! Pageset metadata. */
-struct pagedir pagedir2 = {2};
-
-static mm_segment_t oldfs;
-static DEFINE_MUTEX(tuxonice_in_use);
-static int block_dump_save;
-
-int toi_trace_index;
-
-/* Binary signature if an image is present */
-char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c";
-
-unsigned long boot_kernel_data_buffer;
-
-static char *result_strings[] = {
-	"Hibernation was aborted",
-	"The user requested that we cancel the hibernation",
-	"No storage was available",
-	"Insufficient storage was available",
-	"Freezing filesystems and/or tasks failed",
-	"A pre-existing image was used",
-	"We would free memory, but image size limit doesn't allow this",
-	"Unable to free enough memory to hibernate",
-	"Unable to obtain the Power Management Semaphore",
-	"A device suspend/resume returned an error",
-	"A system device suspend/resume returned an error",
-	"The extra pages allowance is too small",
-	"We were unable to successfully prepare an image",
-	"TuxOnIce module initialisation failed",
-	"TuxOnIce module cleanup failed",
-	"I/O errors were encountered",
-	"Ran out of memory",
-	"An error was encountered while reading the image",
-	"Platform preparation failed",
-	"CPU Hotplugging failed",
-	"Architecture specific preparation failed",
-	"Pages needed resaving, but we were told to abort if this happens",
-	"We can't hibernate at the moment (invalid resume= or filewriter "
-		"target?)",
-	"A hibernation preparation notifier chain member cancelled the "
-		"hibernation",
-	"Pre-snapshot preparation failed",
-	"Pre-restore preparation failed",
-	"Failed to disable usermode helpers",
-	"Can't resume from alternate image",
-	"Header reservation too small",
-	"Device Power Management Preparation failed",
-};
-
-/**
- * toi_finish_anything - cleanup after doing anything
- * @hibernate_or_resume:	Whether finishing a cycle or attempt at
- *				resuming.
- *
- * This is our basic clean-up routine, matching start_anything below. We
- * call cleanup routines, drop module references and restore process fs and
- * cpus allowed masks, together with the global block_dump variable's value.
- **/
-void toi_finish_anything(int hibernate_or_resume)
-{
-	toi_running = 0;
-	toi_cleanup_modules(hibernate_or_resume);
-	toi_put_modules();
-	if (hibernate_or_resume) {
-		block_dump = block_dump_save;
-		set_cpus_allowed_ptr(current, cpu_all_mask);
-		toi_alloc_print_debug_stats();
-		atomic_inc(&snapshot_device_available);
-    unlock_system_sleep();
-	}
-
-	set_fs(oldfs);
-	mutex_unlock(&tuxonice_in_use);
-}
-
-/**
- * toi_start_anything - basic initialisation for TuxOnIce
- * @toi_or_resume:	Whether starting a cycle or attempt at resuming.
- *
- * Our basic initialisation routine. Take references on modules, use the
- * kernel segment, recheck resume= if no active allocator is set, initialise
- * modules, save and reset block_dump and ensure we're running on CPU0.
- **/
-int toi_start_anything(int hibernate_or_resume)
-{
-	mutex_lock(&tuxonice_in_use);
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-
-        toi_trace_index = 0;
-
-	if (hibernate_or_resume) {
-    lock_system_sleep();
-
-		if (!atomic_add_unless(&snapshot_device_available, -1, 0))
-			goto snapshotdevice_unavailable;
-	}
-
-	if (hibernate_or_resume == SYSFS_HIBERNATE)
-		toi_print_modules();
-
-	if (toi_get_modules()) {
-		printk(KERN_INFO "TuxOnIce: Get modules failed!\n");
-		goto prehibernate_err;
-	}
-
-	if (hibernate_or_resume) {
-		block_dump_save = block_dump;
-		block_dump = 0;
-		set_cpus_allowed_ptr(current,
-				cpumask_of(cpumask_first(cpu_online_mask)));
-	}
-
-	if (toi_initialise_modules_early(hibernate_or_resume))
-		goto early_init_err;
-
-	if (!toiActiveAllocator)
-		toi_attempt_to_parse_resume_device(!hibernate_or_resume);
-
-        if (!toi_initialise_modules_late(hibernate_or_resume)) {
-            toi_running = 1; /* For the swsusp code we use :< */
-            return 0;
-        }
-
-	toi_cleanup_modules(hibernate_or_resume);
-early_init_err:
-	if (hibernate_or_resume) {
-		block_dump_save = block_dump;
-		set_cpus_allowed_ptr(current, cpu_all_mask);
-	}
-	toi_put_modules();
-prehibernate_err:
-	if (hibernate_or_resume)
-		atomic_inc(&snapshot_device_available);
-snapshotdevice_unavailable:
-	if (hibernate_or_resume)
-		mutex_unlock(&pm_mutex);
-	set_fs(oldfs);
-	mutex_unlock(&tuxonice_in_use);
-	return -EBUSY;
-}
-
-/*
- * Nosave page tracking.
- *
- * Here rather than in prepare_image because we want to do it once only at the
- * start of a cycle.
- */
-
-/**
- * mark_nosave_pages - set up our Nosave bitmap
- *
- * Build a bitmap of Nosave pages from the list. The bitmap allows faster
- * use when preparing the image.
- **/
-static void mark_nosave_pages(void)
-{
-	struct nosave_region *region;
-
-	list_for_each_entry(region, &nosave_regions, list) {
-		unsigned long pfn;
-
-		for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
-			if (pfn_valid(pfn)) {
-				SetPageNosave(pfn_to_page(pfn));
-                        }
-	}
-}
-
-/**
- * allocate_bitmaps - allocate bitmaps used to record page states
- *
- * Allocate the bitmaps we use to record the various TuxOnIce related
- * page states.
- **/
-static int allocate_bitmaps(void)
-{
-	if (toi_alloc_bitmap(&pageset1_map) ||
-	    toi_alloc_bitmap(&pageset1_copy_map) ||
-	    toi_alloc_bitmap(&pageset2_map) ||
-	    toi_alloc_bitmap(&io_map) ||
-	    toi_alloc_bitmap(&nosave_map) ||
-	    toi_alloc_bitmap(&free_map) ||
-	    toi_alloc_bitmap(&compare_map) ||
-	    toi_alloc_bitmap(&page_resave_map))
-		return 1;
-
-	return 0;
-}
-
-/**
- * free_bitmaps - free the bitmaps used to record page states
- *
- * Free the bitmaps allocated above. It is not an error to call
- * memory_bm_free on a bitmap that isn't currently allocated.
- **/
-static void free_bitmaps(void)
-{
-	toi_free_bitmap(&pageset1_map);
-	toi_free_bitmap(&pageset1_copy_map);
-	toi_free_bitmap(&pageset2_map);
-	toi_free_bitmap(&io_map);
-	toi_free_bitmap(&nosave_map);
-	toi_free_bitmap(&free_map);
-	toi_free_bitmap(&compare_map);
-	toi_free_bitmap(&page_resave_map);
-}
-
-/**
- * io_MB_per_second - return the number of MB/s read or written
- * @write:	Whether to return the speed at which we wrote.
- *
- * Calculate the number of megabytes per second that were read or written.
- **/
-static int io_MB_per_second(int write)
-{
-	return (toi_bkd.toi_io_time[write][1]) ?
-		MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ /
-		toi_bkd.toi_io_time[write][1] : 0;
-}
-
-#define SNPRINTF(a...) 	do { len += scnprintf(((char *) buffer) + len, \
-		count - len - 1, ## a); } while (0)
-
-/**
- * get_debug_info - fill a buffer with debugging information
- * @buffer:	The buffer to be filled.
- * @count:	The size of the buffer, in bytes.
- *
- * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
- * either printk or return via sysfs.
- **/
-static int get_toi_debug_info(const char *buffer, int count)
-{
-	int len = 0, i, first_result = 1;
-
-	SNPRINTF("TuxOnIce debugging info:\n");
-	SNPRINTF("- TuxOnIce core  : " TOI_CORE_VERSION "\n");
-	SNPRINTF("- Kernel Version : " UTS_RELEASE "\n");
-	SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__);
-	SNPRINTF("- Attempt number : %d\n", nr_hibernates);
-	SNPRINTF("- Parameters     : %ld %ld %ld %d %ld %ld\n",
-			toi_result,
-			toi_bkd.toi_action,
-			toi_bkd.toi_debug_state,
-			toi_bkd.toi_default_console_level,
-			image_size_limit,
-			toi_poweroff_method);
-	SNPRINTF("- Overall expected compression percentage: %d.\n",
-			100 - toi_expected_compression_ratio());
-	len += toi_print_module_debug_info(((char *) buffer) + len,
-			count - len - 1);
-	if (toi_bkd.toi_io_time[0][1]) {
-		if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) {
-			SNPRINTF("- I/O speed: Write %ld KB/s",
-			  (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
-			  toi_bkd.toi_io_time[0][1]));
-			if (toi_bkd.toi_io_time[1][1])
-				SNPRINTF(", Read %ld KB/s",
-				  (KB((unsigned long)
-				      toi_bkd.toi_io_time[1][0]) * HZ /
-				  toi_bkd.toi_io_time[1][1]));
-		} else {
-			SNPRINTF("- I/O speed: Write %ld MB/s",
-			 (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
-			  toi_bkd.toi_io_time[0][1]));
-			if (toi_bkd.toi_io_time[1][1])
-				SNPRINTF(", Read %ld MB/s",
-				 (MB((unsigned long)
-				     toi_bkd.toi_io_time[1][0]) * HZ /
-				  toi_bkd.toi_io_time[1][1]));
-		}
-		SNPRINTF(".\n");
-	} else
-		SNPRINTF("- No I/O speed stats available.\n");
-	SNPRINTF("- Extra pages    : %lu used/%lu.\n",
-			extra_pd1_pages_used, extra_pd1_pages_allowance);
-
-	for (i = 0; i < TOI_NUM_RESULT_STATES; i++)
-		if (test_result_state(i)) {
-			SNPRINTF("%s: %s.\n", first_result ?
-					"- Result         " :
-					"                 ",
-					result_strings[i]);
-			first_result = 0;
-		}
-	if (first_result)
-		SNPRINTF("- Result         : %s.\n", nr_hibernates ?
-			"Succeeded" :
-			"No hibernation attempts so far");
-	return len;
-}
-
-#ifdef CONFIG_TOI_INCREMENTAL
-/**
- * get_toi_page_state - fill a buffer with page state information
- * @buffer:	The buffer to be filled.
- * @count:	The size of the buffer, in bytes.
- *
- * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
- * either printk or return via sysfs.
- **/
-static int get_toi_page_state(const char *buffer, int count)
-{
-    int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0;
-    int len = 0;
-    struct zone *zone;
-    int allocated_bitmaps = 0;
-
-    set_cpus_allowed_ptr(current,
-            cpumask_of(cpumask_first(cpu_online_mask)));
-
-    if (!free_map) {
-        BUG_ON(toi_alloc_bitmap(&free_map));
-        allocated_bitmaps = 1;
-    }
-
-    toi_generate_free_page_map();
-
-    for_each_populated_zone(zone) {
-        unsigned long loop;
-
-        total += zone->spanned_pages;
-
-        for (loop = 0; loop < zone->spanned_pages; loop++) {
-            unsigned long pfn = zone->zone_start_pfn + loop;
-            struct page *page;
-            int chunk_size;
-
-            if (!pfn_valid(pfn)) {
-                continue;
-            }
-
-            chunk_size = toi_size_of_free_region(zone, pfn);
-            if (chunk_size) {
-                /*
-                 * If the page gets allocated, it will be need
-                 * saving in an image.
-                 * Don't bother with explicitly removing any
-                 * RO protection applied below.
-                 * We'll SetPageTOI_Dirty(page) if/when it
-                 * gets allocated.
-                 */
-                free += chunk_size;
-                loop += chunk_size - 1;
-                continue;
-            }
-
-            page = pfn_to_page(pfn);
-
-            if (PageTOI_Untracked(page)) {
-                untracked++;
-            } else if (PageTOI_RO(page)) {
-                ro++;
-            } else if (PageTOI_Dirty(page)) {
-                dirty++;
-            } else {
-                printk("Page %ld state 'other'.\n", pfn);
-                other++;
-            }
-        }
-    }
-
-    if (allocated_bitmaps) {
-        toi_free_bitmap(&free_map);
-    }
-
-    set_cpus_allowed_ptr(current, cpu_all_mask);
-
-    SNPRINTF("TuxOnIce page breakdown:\n");
-    SNPRINTF("- Free           : %d\n", free);
-    SNPRINTF("- Untracked      : %d\n", untracked);
-    SNPRINTF("- Read only      : %d\n", ro);
-    SNPRINTF("- Dirty          : %d\n", dirty);
-    SNPRINTF("- Other          : %d\n", other);
-    SNPRINTF("- Invalid        : %d\n", invalid);
-    SNPRINTF("- Total          : %d\n", total);
-    return len;
-}
-#endif
-
-/**
- * do_cleanup - cleanup after attempting to hibernate or resume
- * @get_debug_info:	Whether to allocate and return debugging info.
- *
- * Cleanup after attempting to hibernate or resume, possibly getting
- * debugging info as we do so.
- **/
-static void do_cleanup(int get_debug_info, int restarting)
-{
-	int i = 0;
-	char *buffer = NULL;
-
-	trap_non_toi_io = 0;
-
-	if (get_debug_info)
-		toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up...");
-
-	free_checksum_pages();
-
-        toi_cbw_restore();
-        toi_free_cbw_data();
-
-	if (get_debug_info)
-		buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP);
-
-	if (buffer)
-		i = get_toi_debug_info(buffer, PAGE_SIZE);
-
-	toi_free_extra_pagedir_memory();
-
-	pagedir1.size = 0;
-	pagedir2.size = 0;
-	set_highmem_size(pagedir1, 0);
-	set_highmem_size(pagedir2, 0);
-
-	if (boot_kernel_data_buffer) {
-		if (!test_toi_state(TOI_BOOT_KERNEL))
-			toi_free_page(37, boot_kernel_data_buffer);
-		boot_kernel_data_buffer = 0;
-	}
-
-	if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) {
-		unlock_device_hotplug();
-		clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
-	}
-
-	clear_toi_state(TOI_BOOT_KERNEL);
-	if (current->flags & PF_SUSPEND_TASK)
-		thaw_processes();
-
-	if (!restarting)
-		toi_stop_other_threads();
-
-	if (toi_keeping_image &&
-	    !test_result_state(TOI_ABORTED)) {
-		toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
-			"TuxOnIce: Not invalidating the image due "
-			"to Keep Image or Incremental Image being enabled.");
-		set_result_state(TOI_KEPT_IMAGE);
-
-                /*
-                 * For an incremental image, free unused storage so
-                 * swap (if any) can be used for normal system operation,
-                 * if so desired.
-                 */
-
-                toiActiveAllocator->free_unused_storage();
-	} else
-		if (toiActiveAllocator)
-			toiActiveAllocator->remove_image();
-
-	free_bitmaps();
-	usermodehelper_enable();
-
-	if (test_toi_state(TOI_NOTIFIERS_PREPARE)) {
-		pm_notifier_call_chain(PM_POST_HIBERNATION);
-		clear_toi_state(TOI_NOTIFIERS_PREPARE);
-	}
-
-	if (buffer && i) {
-		/* Printk can only handle 1023 bytes, including
-		 * its level mangling. */
-		for (i = 0; i < 3; i++)
-			printk(KERN_ERR "%s", buffer + (1023 * i));
-		toi_free_page(20, (unsigned long) buffer);
-	}
-
-	if (!restarting)
-		toi_cleanup_console();
-
-	free_attention_list();
-
-	if (!restarting)
-		toi_deactivate_storage(0);
-
-	clear_toi_state(TOI_IGNORE_LOGLEVEL);
-	clear_toi_state(TOI_TRYING_TO_RESUME);
-	clear_toi_state(TOI_NOW_RESUMING);
-}
-
-/**
- * check_still_keeping_image - we kept an image; check whether to reuse it.
- *
- * We enter this routine when we have kept an image. If the user has said they
- * want to still keep it, all we need to do is powerdown. If powering down
- * means hibernating to ram and the power doesn't run out, we'll return 1.
- * If we do power off properly or the battery runs out, we'll resume via the
- * normal paths.
- *
- * If the user has said they want to remove the previously kept image, we
- * remove it, and return 0. We'll then store a new image.
- **/
-static int check_still_keeping_image(void)
-{
-    if (toi_keeping_image) {
-        if (!test_action_state(TOI_INCREMENTAL_IMAGE)) {
-            printk(KERN_INFO "Image already stored: powering down "
-                    "immediately.");
-            do_toi_step(STEP_HIBERNATE_POWERDOWN);
-            return 1;
-        }
-        /**
-         * Incremental image - need to write new part.
-         * We detect that we're writing an incremental image by looking
-         * at test_result_state(TOI_KEPT_IMAGE)
-         **/
-        return 0;
-    }
-
-    printk(KERN_INFO "Invalidating previous image.\n");
-    toiActiveAllocator->remove_image();
-
-    return 0;
-}
-
-/**
- * toi_init - prepare to hibernate to disk
- *
- * Initialise variables & data structures, in preparation for
- * hibernating to disk.
- **/
-static int toi_init(int restarting)
-{
-	int result, i, j;
-
-	toi_result = 0;
-
-	printk(KERN_INFO "Initiating a hibernation cycle.\n");
-
-	nr_hibernates++;
-
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < 2; j++)
-			toi_bkd.toi_io_time[i][j] = 0;
-
-	if (!test_toi_state(TOI_CAN_HIBERNATE) ||
-	    allocate_bitmaps())
-		return 1;
-
-	mark_nosave_pages();
-
-	if (!restarting)
-		toi_prepare_console();
-
-	result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
-	if (result) {
-		set_result_state(TOI_NOTIFIERS_PREPARE_FAILED);
-		return 1;
-	}
-	set_toi_state(TOI_NOTIFIERS_PREPARE);
-
-	if (!restarting) {
-		printk(KERN_ERR "Starting other threads.");
-		toi_start_other_threads();
-	}
-
-	result = usermodehelper_disable();
-	if (result) {
-		printk(KERN_ERR "TuxOnIce: Failed to disable usermode "
-				"helpers\n");
-		set_result_state(TOI_USERMODE_HELPERS_ERR);
-		return 1;
-	}
-
-	boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP);
-	if (!boot_kernel_data_buffer) {
-		printk(KERN_ERR "TuxOnIce: Failed to allocate "
-				"boot_kernel_data_buffer.\n");
-		set_result_state(TOI_OUT_OF_MEMORY);
-		return 1;
-	}
-
-        toi_allocate_cbw_data();
-
-	return 0;
-}
-
-/**
- * can_hibernate - perform basic 'Can we hibernate?' tests
- *
- * Perform basic tests that must pass if we're going to be able to hibernate:
- * Can we get the pm_mutex? Is resume= valid (we need to know where to write
- * the image header).
- **/
-static int can_hibernate(void)
-{
-	if (!test_toi_state(TOI_CAN_HIBERNATE))
-		toi_attempt_to_parse_resume_device(0);
-
-	if (!test_toi_state(TOI_CAN_HIBERNATE)) {
-		printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n"
-			"This may be because you haven't put something along "
-			"the lines of\n\nresume=swap:/dev/hda1\n\n"
-			"in lilo.conf or equivalent. (Where /dev/hda1 is your "
-			"swap partition).\n");
-		set_abort_result(TOI_CANT_SUSPEND);
-		return 0;
-	}
-
-	if (strlen(alt_resume_param)) {
-		attempt_to_parse_alt_resume_param();
-
-		if (!strlen(alt_resume_param)) {
-			printk(KERN_INFO "Alternate resume parameter now "
-					"invalid. Aborting.\n");
-			set_abort_result(TOI_CANT_USE_ALT_RESUME);
-			return 0;
-		}
-	}
-
-	return 1;
-}
-
-/**
- * do_post_image_write - having written an image, figure out what to do next
- *
- * After writing an image, we might load an alternate image or power down.
- * Powering down might involve hibernating to ram, in which case we also
- * need to handle reloading pageset2.
- **/
-static int do_post_image_write(void)
-{
-	/* If switching images fails, do normal powerdown */
-	if (alt_resume_param[0])
-		do_toi_step(STEP_RESUME_ALT_IMAGE);
-
-	toi_power_down();
-
-	barrier();
-	mb();
-	return 0;
-}
-
-/**
- * __save_image - do the hard work of saving the image
- *
- * High level routine for getting the image saved. The key assumptions made
- * are that processes have been frozen and sufficient memory is available.
- *
- * We also exit through here at resume time, coming back from toi_hibernate
- * after the atomic restore. This is the reason for the toi_in_hibernate
- * test.
- **/
-static int __save_image(void)
-{
-	int temp_result, did_copy = 0;
-
-	toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image..");
-
-	toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
-		" - Final values: %d and %d.",
-		pagedir1.size, pagedir2.size);
-
-	toi_cond_pause(1, "About to write pagedir2.");
-
-	temp_result = write_pageset(&pagedir2);
-
-	if (temp_result == -1 || test_result_state(TOI_ABORTED))
-		return 1;
-
-	toi_cond_pause(1, "About to copy pageset 1.");
-
-	if (test_result_state(TOI_ABORTED))
-		return 1;
-
-	toi_deactivate_storage(1);
-
-	toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
-
-	toi_in_hibernate = 1;
-
-	if (toi_go_atomic(PMSG_FREEZE, 1))
-		goto Failed;
-
-	temp_result = toi_hibernate();
-
-#ifdef CONFIG_KGDB
-	if (test_action_state(TOI_POST_RESUME_BREAKPOINT))
-		kgdb_breakpoint();
-#endif
-
-	if (!temp_result)
-		did_copy = 1;
-
-	/* We return here at resume time too! */
-	toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result);
-
-Failed:
-	if (toi_activate_storage(1))
-		panic("Failed to reactivate our storage.");
-
-	/* Resume time? */
-	if (!toi_in_hibernate) {
-		copyback_post();
-		return 0;
-	}
-
-	/* Nope. Hibernating. So, see if we can save the image... */
-
-	if (temp_result || test_result_state(TOI_ABORTED)) {
-		if (did_copy)
-			goto abort_reloading_pagedir_two;
-		else
-			return 1;
-	}
-
-	toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size,
-			NULL);
-
-	if (test_result_state(TOI_ABORTED))
-		goto abort_reloading_pagedir_two;
-
-	toi_cond_pause(1, "About to write pageset1.");
-
-	toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1");
-
-	temp_result = write_pageset(&pagedir1);
-
-	/* We didn't overwrite any memory, so no reread needs to be done. */
-	if (test_action_state(TOI_TEST_FILTER_SPEED) ||
-	    test_action_state(TOI_TEST_BIO))
-		return 1;
-
-	if (temp_result == 1 || test_result_state(TOI_ABORTED))
-		goto abort_reloading_pagedir_two;
-
-	toi_cond_pause(1, "About to write header.");
-
-	if (test_result_state(TOI_ABORTED))
-		goto abort_reloading_pagedir_two;
-
-	temp_result = write_image_header();
-
-	if (!temp_result && !test_result_state(TOI_ABORTED))
-		return 0;
-
-abort_reloading_pagedir_two:
-	temp_result = read_pageset2(1);
-
-	/* If that failed, we're sunk. Panic! */
-	if (temp_result)
-		panic("Attempt to reload pagedir 2 while aborting "
-				"a hibernate failed.");
-
-	return 1;
-}
-
-static void map_ps2_pages(int enable)
-{
-	unsigned long pfn = 0;
-
-        memory_bm_position_reset(pageset2_map);
-	pfn = memory_bm_next_pfn(pageset2_map, 0);
-
-	while (pfn != BM_END_OF_MAP) {
-		struct page *page = pfn_to_page(pfn);
-		kernel_map_pages(page, 1, enable);
-		pfn = memory_bm_next_pfn(pageset2_map, 0);
-	}
-}
-
-/**
- * do_save_image - save the image and handle the result
- *
- * Save the prepared image. If we fail or we're in the path returning
- * from the atomic restore, cleanup.
- **/
-static int do_save_image(void)
-{
-	int result;
-	map_ps2_pages(0);
-	result = __save_image();
-	map_ps2_pages(1);
-	return result;
-}
-
-/**
- * do_prepare_image - try to prepare an image
- *
- * Seek to initialise and prepare an image to be saved. On failure,
- * cleanup.
- **/
-static int do_prepare_image(void)
-{
-	int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
-
-	if (!restarting && toi_activate_storage(0))
-		return 1;
-
-	/*
-         * If kept image and still keeping image and hibernating to RAM, (non
-         * incremental image case) we will return 1 after hibernating and
-         * resuming (provided the power doesn't run out. In that case, we skip
-         * directly to cleaning up and exiting.
-	 */
-
-	if (!can_hibernate() ||
-	    (test_result_state(TOI_KEPT_IMAGE) &&
-	     check_still_keeping_image()))
-		return 1;
-
-	if (toi_init(restarting) || toi_prepare_image() ||
-			test_result_state(TOI_ABORTED))
-		return 1;
-
-	trap_non_toi_io = 1;
-
-	return 0;
-}
-
-/**
- * do_check_can_resume - find out whether an image has been stored
- *
- * Read whether an image exists. We use the same routine as the
- * image_exists sysfs entry, and just look to see whether the
- * first character in the resulting buffer is a '1'.
- **/
-int do_check_can_resume(void)
-{
-	int result = -1;
-
-	if (toi_activate_storage(0))
-		return -1;
-
-	if (!test_toi_state(TOI_RESUME_DEVICE_OK))
-		toi_attempt_to_parse_resume_device(1);
-
-	if (toiActiveAllocator)
-		result = toiActiveAllocator->image_exists(1);
-
-	toi_deactivate_storage(0);
-	return result;
-}
-
-/**
- * do_load_atomic_copy - load the first part of an image, if it exists
- *
- * Check whether we have an image. If one exists, do sanity checking
- * (possibly invalidating the image or even rebooting if the user
- * requests that) before loading it into memory in preparation for the
- * atomic restore.
- *
- * If and only if we have an image loaded and ready to restore, we return 1.
- **/
-static int do_load_atomic_copy(void)
-{
-	int read_image_result = 0;
-
-	if (sizeof(swp_entry_t) != sizeof(long)) {
-		printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size"
-			" of long. Please report this!\n");
-		return 1;
-	}
-
-	if (!resume_file[0])
-		printk(KERN_WARNING "TuxOnIce: "
-			"You need to use a resume= command line parameter to "
-			"tell TuxOnIce where to look for an image.\n");
-
-	toi_activate_storage(0);
-
-	if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) &&
-		!toi_attempt_to_parse_resume_device(0)) {
-		/*
-		 * Without a usable storage device we can do nothing -
-		 * even if noresume is given
-		 */
-
-		if (!toiNumAllocators)
-			printk(KERN_ALERT "TuxOnIce: "
-			  "No storage allocators have been registered.\n");
-		else
-			printk(KERN_ALERT "TuxOnIce: "
-				"Missing or invalid storage location "
-				"(resume= parameter). Please correct and "
-				"rerun lilo (or equivalent) before "
-				"hibernating.\n");
-		toi_deactivate_storage(0);
-		return 1;
-	}
-
-	if (allocate_bitmaps())
-		return 1;
-
-	read_image_result = read_pageset1(); /* non fatal error ignored */
-
-	if (test_toi_state(TOI_NORESUME_SPECIFIED))
-		clear_toi_state(TOI_NORESUME_SPECIFIED);
-
-	toi_deactivate_storage(0);
-
-	if (read_image_result)
-		return 1;
-
-	return 0;
-}
-
-/**
- * prepare_restore_load_alt_image - save & restore alt image variables
- *
- * Save and restore the pageset1 maps, when loading an alternate image.
- **/
-static void prepare_restore_load_alt_image(int prepare)
-{
-	static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save;
-
-	if (prepare) {
-		pageset1_map_save = pageset1_map;
-		pageset1_map = NULL;
-		pageset1_copy_map_save = pageset1_copy_map;
-		pageset1_copy_map = NULL;
-		set_toi_state(TOI_LOADING_ALT_IMAGE);
-		toi_reset_alt_image_pageset2_pfn();
-	} else {
-		toi_free_bitmap(&pageset1_map);
-		pageset1_map = pageset1_map_save;
-		toi_free_bitmap(&pageset1_copy_map);
-		pageset1_copy_map = pageset1_copy_map_save;
-		clear_toi_state(TOI_NOW_RESUMING);
-		clear_toi_state(TOI_LOADING_ALT_IMAGE);
-	}
-}
-
-/**
- * do_toi_step - perform a step in hibernating or resuming
- *
- * Perform a step in hibernating or resuming an image. This abstraction
- * is in preparation for implementing cluster support, and perhaps replacing
- * uswsusp too (haven't looked whether that's possible yet).
- **/
-int do_toi_step(int step)
-{
-	switch (step) {
-	case STEP_HIBERNATE_PREPARE_IMAGE:
-		return do_prepare_image();
-	case STEP_HIBERNATE_SAVE_IMAGE:
-		return do_save_image();
-	case STEP_HIBERNATE_POWERDOWN:
-		return do_post_image_write();
-	case STEP_RESUME_CAN_RESUME:
-		return do_check_can_resume();
-	case STEP_RESUME_LOAD_PS1:
-		return do_load_atomic_copy();
-	case STEP_RESUME_DO_RESTORE:
-		/*
-		 * If we succeed, this doesn't return.
-		 * Instead, we return from do_save_image() in the
-		 * hibernated kernel.
-		 */
-		return toi_atomic_restore();
-	case STEP_RESUME_ALT_IMAGE:
-		printk(KERN_INFO "Trying to resume alternate image.\n");
-		toi_in_hibernate = 0;
-		save_restore_alt_param(SAVE, NOQUIET);
-		prepare_restore_load_alt_image(1);
-		if (!do_check_can_resume()) {
-			printk(KERN_INFO "Nothing to resume from.\n");
-			goto out;
-		}
-		if (!do_load_atomic_copy())
-			toi_atomic_restore();
-
-		printk(KERN_INFO "Failed to load image.\n");
-out:
-		prepare_restore_load_alt_image(0);
-		save_restore_alt_param(RESTORE, NOQUIET);
-		break;
-	case STEP_CLEANUP:
-		do_cleanup(1, 0);
-		break;
-	case STEP_QUIET_CLEANUP:
-		do_cleanup(0, 0);
-		break;
-	}
-
-	return 0;
-}
-
-/* -- Functions for kickstarting a hibernate or resume --- */
-
-/**
- * toi_try_resume - try to do the steps in resuming
- *
- * Check if we have an image and if so try to resume. Clear the status
- * flags too.
- **/
-void toi_try_resume(void)
-{
-	set_toi_state(TOI_TRYING_TO_RESUME);
-	resume_attempted = 1;
-
-	current->flags |= PF_MEMALLOC;
-	toi_start_other_threads();
-
-	if (do_toi_step(STEP_RESUME_CAN_RESUME) &&
-			!do_toi_step(STEP_RESUME_LOAD_PS1))
-		do_toi_step(STEP_RESUME_DO_RESTORE);
-
-	toi_stop_other_threads();
-	do_cleanup(0, 0);
-
-	current->flags &= ~PF_MEMALLOC;
-
-	clear_toi_state(TOI_IGNORE_LOGLEVEL);
-	clear_toi_state(TOI_TRYING_TO_RESUME);
-	clear_toi_state(TOI_NOW_RESUMING);
-}
-
-/**
- * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume
- *
- * Wrapper for when __toi_try_resume is called from swsusp resume path,
- * rather than from echo > /sys/power/tuxonice/do_resume.
- **/
-static void toi_sys_power_disk_try_resume(void)
-{
-	resume_attempted = 1;
-
-	/*
-	 * There's a comment in kernel/power/disk.c that indicates
-	 * we should be able to use mutex_lock_nested below. That
-	 * doesn't seem to cut it, though, so let's just turn lockdep
-	 * off for now.
-	 */
-	lockdep_off();
-
-	if (toi_start_anything(SYSFS_RESUMING))
-		goto out;
-
-	toi_try_resume();
-
-	/*
-	 * For initramfs, we have to clear the boot time
-	 * flag after trying to resume
-	 */
-	clear_toi_state(TOI_BOOT_TIME);
-
-	toi_finish_anything(SYSFS_RESUMING);
-out:
-	lockdep_on();
-}
-
-/**
- * toi_try_hibernate - try to start a hibernation cycle
- *
- * Start a hibernation cycle, coming in from either
- * echo > /sys/power/tuxonice/do_suspend
- *
- * or
- *
- * echo disk > /sys/power/state
- *
- * In the later case, we come in without pm_sem taken; in the
- * former, it has been taken.
- **/
-int toi_try_hibernate(void)
-{
-	int result = 0, sys_power_disk = 0, retries = 0;
-
-	if (!mutex_is_locked(&tuxonice_in_use)) {
-		/* Came in via /sys/power/disk */
-		if (toi_start_anything(SYSFS_HIBERNATING))
-			return -EBUSY;
-		sys_power_disk = 1;
-	}
-
-	current->flags |= PF_MEMALLOC;
-
-	if (test_toi_state(TOI_CLUSTER_MODE)) {
-		toi_initiate_cluster_hibernate();
-		goto out;
-	}
-
-prepare:
-	result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
-
-	if (result)
-		goto out;
-
-	if (test_action_state(TOI_FREEZER_TEST))
-		goto out_restore_gfp_mask;
-
-	result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
-
-	if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) {
-		if (retries < 2) {
-			do_cleanup(0, 1);
-			retries++;
-			clear_result_state(TOI_ABORTED);
-			extra_pd1_pages_allowance = extra_pd1_pages_used + 500;
-			printk(KERN_INFO "Automatically adjusting the extra"
-				" pages allowance to %ld and restarting.\n",
-				extra_pd1_pages_allowance);
-			pm_restore_gfp_mask();
-			goto prepare;
-		}
-
-		printk(KERN_INFO "Adjusted extra pages allowance twice and "
-			"still couldn't hibernate successfully. Giving up.");
-	}
-
-	/* This code runs at resume time too! */
-	if (!result && toi_in_hibernate)
-		result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
-
-out_restore_gfp_mask:
-	pm_restore_gfp_mask();
-out:
-	do_cleanup(1, 0);
-	current->flags &= ~PF_MEMALLOC;
-
-	if (sys_power_disk)
-		toi_finish_anything(SYSFS_HIBERNATING);
-
-	return result;
-}
-
-/*
- * channel_no: If !0, -c <channel_no> is added to args (userui).
- */
-int toi_launch_userspace_program(char *command, int channel_no,
-		int wait, int debug)
-{
-	int retval;
-	static char *envp[] = {
-			"HOME=/",
-			"TERM=linux",
-			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
-			NULL };
-	static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
-		};
-	char *channel = NULL;
-	int arg = 0, size;
-	char test_read[255];
-	char *orig_posn = command;
-
-	if (!strlen(orig_posn))
-		return 1;
-
-	if (channel_no) {
-		channel = toi_kzalloc(4, 6, GFP_KERNEL);
-		if (!channel) {
-			printk(KERN_INFO "Failed to allocate memory in "
-				"preparing to launch userspace program.\n");
-			return 1;
-		}
-	}
-
-	/* Up to 6 args supported */
-	while (arg < 6) {
-		sscanf(orig_posn, "%s", test_read);
-		size = strlen(test_read);
-		if (!(size))
-			break;
-		argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP);
-		strcpy(argv[arg], test_read);
-		orig_posn += size + 1;
-		*test_read = 0;
-		arg++;
-	}
-
-	if (channel_no) {
-		sprintf(channel, "-c%d", channel_no);
-		argv[arg] = channel;
-	} else
-		arg--;
-
-	if (debug) {
-		argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP);
-		strcpy(argv[arg], "--debug");
-	}
-
-	retval = call_usermodehelper(argv[0], argv, envp, wait);
-
-	/*
-	 * If the program reports an error, retval = 256. Don't complain
-	 * about that here.
-	 */
-	if (retval && retval != 256)
-		printk(KERN_ERR "Failed to launch userspace program '%s': "
-				"Error %d\n", command, retval);
-
-	{
-		int i;
-		for (i = 0; i < arg; i++)
-			if (argv[i] && argv[i] != channel)
-				toi_kfree(5, argv[i], sizeof(*argv[i]));
-	}
-
-	toi_kfree(4, channel, sizeof(*channel));
-
-	return retval;
-}
-
-/*
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_LONG("extra_pages_allowance", SYSFS_RW,
-			&extra_pd1_pages_allowance, 0, LONG_MAX, 0),
-	SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read,
-			image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL),
-	SYSFS_STRING("resume", SYSFS_RW, resume_file, 255,
-			SYSFS_NEEDS_SM_FOR_WRITE,
-			attempt_to_parse_resume_device2),
-	SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255,
-			SYSFS_NEEDS_SM_FOR_WRITE,
-			attempt_to_parse_alt_resume_param),
-	SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0,
-			NULL),
-	SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_IGNORE_ROOTFS, 0),
-	SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2,
-			INT_MAX, 0),
-	SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0),
-	SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_NO_MULTITHREADED_IO, 0),
-	SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_NO_FLUSHER_THREAD, 0),
-	SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_PAGESET2_FULL, 0),
-	SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0),
-	SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_REPLACE_SWSUSP, 0),
-	SYSFS_STRING("resume_commandline", SYSFS_RW,
-			toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0,
-			NULL),
-	SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL),
-	SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_FREEZER_TEST, 0),
-	SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0),
-	SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_TEST_FILTER_SPEED, 0),
-	SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_NO_PAGESET2, 0),
-	SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_NO_PS2_IF_UNNEEDED, 0),
-	SYSFS_STRING("binary_signature", SYSFS_READONLY,
-			tuxonice_signature, 9, 0, NULL),
-	SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0,
-			NULL),
-#ifdef CONFIG_KGDB
-	SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_POST_RESUME_BREAKPOINT, 0),
-#endif
-	SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_NO_READAHEAD, 0),
-	SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_TRACE_DEBUG_ON, 0),
-#ifdef CONFIG_TOI_KEEP_IMAGE
-	SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE,
-			0),
-#endif
-#ifdef CONFIG_TOI_INCREMENTAL
-	SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0,
-			NULL),
-	SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_INCREMENTAL_IMAGE, 1),
-#endif
-};
-
-static struct toi_core_fns my_fns = {
-	.get_nonconflicting_page = __toi_get_nonconflicting_page,
-	.post_context_save = __toi_post_context_save,
-	.try_hibernate = toi_try_hibernate,
-	.try_resume = toi_sys_power_disk_try_resume,
-};
-
-/**
- * core_load - initialisation of TuxOnIce core
- *
- * Initialise the core, beginning with sysfs. Checksum and so on are part of
- * the core, but have their own initialisation routines because they either
- * aren't compiled in all the time or have their own subdirectories.
- **/
-static __init int core_load(void)
-{
-	int i,
-	    numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
-
-	printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION
-			" (http://tuxonice.net)\n");
-
-        if (!hibernation_available()) {
-          printk(KERN_INFO "TuxOnIce disabled due to request for hibernation"
-              " to be disabled in this kernel.\n");
-          return 1;
-        }
-
-	if (toi_sysfs_init())
-		return 1;
-
-	for (i = 0; i < numfiles; i++)
-		toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-
-	toi_core_fns = &my_fns;
-
-	if (toi_alloc_init())
-		return 1;
-	if (toi_checksum_init())
-		return 1;
-	if (toi_usm_init())
-		return 1;
-	if (toi_ui_init())
-		return 1;
-	if (toi_poweroff_init())
-		return 1;
-	if (toi_cluster_init())
-		return 1;
-	if (toi_cbw_init())
-		return 1;
-
-	return 0;
-}
-
-late_initcall(core_load);
diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c
deleted file mode 100644
index c5a09789e..000000000
--- a/kernel/power/tuxonice_incremental.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * kernel/power/tuxonice_incremental.c
- *
- * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines related to storing incremental images - that
- * is, retaining an image after an initial cycle and then storing incremental
- * changes on subsequent hibernations.
- *
- * Based in part on on...
- *
- * Debug helper to dump the current kernel pagetables of the system
- * so that we can see what the various memory ranges are set to.
- *
- * (C) Copyright 2008 Intel Corporation
- *
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/mm.h>
-#include <linux/tuxonice.h>
-#include <linux/sched.h>
-#include <asm/pgtable.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/page.h>
-#include "tuxonice_pageflags.h"
-#include "tuxonice_builtin.h"
-#include "power.h"
-
-int toi_do_incremental_initcall;
-
-extern void kdb_init(int level);
-extern noinline void kgdb_breakpoint(void);
-
-#undef pr_debug
-#if 0
-#define pr_debug(a, b...) do { printk(a, ##b); } while(0)
-#else
-#define pr_debug(a, b...) do { } while(0)
-#endif
-
-/* Multipliers for offsets within the PTEs */
-#define PTE_LEVEL_MULT (PAGE_SIZE)
-#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
-#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
-
-/*
- * This function gets called on a break in a continuous series
- * of PTE entries; the next one is different so we need to
- * print what we collected so far.
- */
-static void note_page(void *addr)
-{
-    static struct page *lastpage;
-    struct page *page;
-
-    page = virt_to_page(addr);
-
-    if (page != lastpage) {
-        unsigned int level;
-        pte_t *pte = lookup_address((unsigned long) addr, &level);
-        struct page *pt_page2 = pte_page(*pte);
-        //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2));
-        SetPageTOI_Untracked(pt_page2);
-        lastpage = page;
-    }
-}
-
-static void walk_pte_level(pmd_t addr)
-{
-	int i;
-	pte_t *start;
-
-	start = (pte_t *) pmd_page_vaddr(addr);
-	for (i = 0; i < PTRS_PER_PTE; i++) {
-		note_page(start);
-		start++;
-	}
-}
-
-#if PTRS_PER_PMD > 1
-
-static void walk_pmd_level(pud_t addr)
-{
-	int i;
-	pmd_t *start;
-
-	start = (pmd_t *) pud_page_vaddr(addr);
-	for (i = 0; i < PTRS_PER_PMD; i++) {
-		if (!pmd_none(*start)) {
-			if (pmd_large(*start) || !pmd_present(*start))
-				note_page(start);
-			else
-				walk_pte_level(*start);
-		} else
-			note_page(start);
-		start++;
-	}
-}
-
-#else
-#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a)))
-#define pud_large(a) pmd_large(__pmd(pud_val(a)))
-#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
-#endif
-
-#if PTRS_PER_PUD > 1
-
-static void walk_pud_level(pgd_t addr)
-{
-	int i;
-	pud_t *start;
-
-	start = (pud_t *) pgd_page_vaddr(addr);
-
-	for (i = 0; i < PTRS_PER_PUD; i++) {
-		if (!pud_none(*start)) {
-			if (pud_large(*start) || !pud_present(*start))
-				note_page(start);
-			else
-				walk_pmd_level(*start);
-		} else
-			note_page(start);
-
-		start++;
-	}
-}
-
-#else
-#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a)))
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a)  pud_none(__pud(pgd_val(a)))
-#endif
-
-/*
- * Not static in the original at the time of writing, so needs renaming here.
- */
-static void toi_ptdump_walk_pgd_level(pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
-	pgd_t *start = (pgd_t *) &init_level4_pgt;
-#else
-	pgd_t *start = swapper_pg_dir;
-#endif
-	int i;
-	if (pgd) {
-		start = pgd;
-	}
-
-	for (i = 0; i < PTRS_PER_PGD; i++) {
-		if (!pgd_none(*start)) {
-			if (pgd_large(*start) || !pgd_present(*start))
-				note_page(start);
-			else
-				walk_pud_level(*start);
-		} else
-			note_page(start);
-
-		start++;
-	}
-
-	/* Flush out the last page */
-	note_page(start);
-}
-
-#ifdef CONFIG_PARAVIRT
-extern struct pv_info pv_info;
-
-static void toi_set_paravirt_ops_untracked(void) {
-    int i;
-
-    unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)),
-                  pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end));
-    //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end);
-    for (i = pvpfn; i <= pvpfn_end; i++) {
-        SetPageTOI_Untracked(pfn_to_page(i));
-    }
-}
-#else
-#define toi_set_paravirt_ops_untracked() { do { } while(0) }
-#endif
-
-extern void toi_mark_per_cpus_pages_untracked(void);
-
-void toi_untrack_stack(unsigned long *stack)
-{
-    int i;
-    struct page *stack_page = virt_to_page(stack);
-
-    for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) {
-        pr_debug("Untrack stack page %p.\n", page_address(stack_page + i));
-        SetPageTOI_Untracked(stack_page + i);
-    }
-}
-void toi_untrack_process(struct task_struct *p)
-{
-    SetPageTOI_Untracked(virt_to_page(p));
-    pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p)));
-
-    toi_untrack_stack(p->stack);
-}
-
-void toi_generate_untracked_map(void)
-{
-    struct task_struct *p, *t;
-    struct page *page;
-    pte_t *pte;
-    int i;
-    unsigned int level;
-    static int been_here = 0;
-
-    if (been_here)
-        return;
-
-    been_here = 1;
-
-    /* Pagetable pages */
-    toi_ptdump_walk_pgd_level(NULL);
-
-    /* Printk buffer - not normally needed but can be helpful for debugging. */
-    //toi_set_logbuf_untracked();
-
-    /* Paravirt ops */
-    toi_set_paravirt_ops_untracked();
-
-    /* Task structs and stacks */
-    for_each_process_thread(p, t) {
-        toi_untrack_process(p);
-        //toi_untrack_stack((unsigned long *) t->thread.sp);
-    }
-
-    for (i = 0; i < NR_CPUS; i++) {
-        struct task_struct *idle = idle_task(i);
-
-        if (idle) {
-            pr_debug("Untrack idle process for CPU %d.\n", i);
-            toi_untrack_process(idle);
-        }
-
-        /* IRQ stack */
-        pr_debug("Untrack IRQ stack for CPU %d.\n", i);
-        toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
-    }
-
-    /* Per CPU data */
-    //pr_debug("Untracking per CPU variable pages.\n");
-    toi_mark_per_cpus_pages_untracked();
-
-    /* Init stack - for bringing up secondary CPUs */
-    page = virt_to_page(init_stack);
-    for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
-        SetPageTOI_Untracked(page + i);
-    }
-
-    pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
-    SetPageTOI_Untracked(pte_page(*pte));
-    SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
-}
-
-/**
- * toi_reset_dirtiness_one
- */
-
-void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
-{
-    struct page *page = pfn_to_page(pfn);
-
-    /**
-     * Don't worry about whether the Dirty flag is
-     * already set. If this is our first call, it
-     * won't be.
-     */
-
-    preempt_disable();
-
-    ClearPageTOI_Dirty(page);
-    SetPageTOI_RO(page);
-    if (verbose)
-        printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page));
-
-    set_memory_ro((unsigned long) page_address(page), 1);
-
-    preempt_enable();
-}
-
-/**
- * TuxOnIce's incremental image support works by marking all memory apart from
- * the page tables read-only, then in the page-faults that result enabling
- * writing if appropriate and flagging the page as dirty. Free pages are also
- * marked as dirty and not protected so that if allocated, they will be included
- * in the image without further processing.
- *
- * toi_reset_dirtiness is called when and image exists and incremental images are
- * enabled, and each time we resume thereafter. It is not invoked on a fresh boot.
- *
- * This routine should be called from a single-cpu-running context to avoid races in setting
- * page dirty/read only flags.
- *
- * TODO: Make "it is not invoked on a fresh boot" true  when I've finished developing it!
- *
- * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
- **/
-
-int toi_reset_dirtiness(int verbose)
-{
-	struct zone *zone;
-	unsigned long loop;
-        int allocated_map = 0;
-
-        toi_generate_untracked_map();
-
-        if (!free_map) {
-            if (!toi_alloc_bitmap(&free_map))
-                return -ENOMEM;
-            allocated_map = 1;
-        }
-
-	toi_generate_free_page_map();
-
-        pr_debug(KERN_EMERG "Reset dirtiness.\n");
-        for_each_populated_zone(zone) {
-            // 64 bit only. No need to worry about highmem.
-            for (loop = 0; loop < zone->spanned_pages; loop++) {
-                unsigned long pfn = zone->zone_start_pfn + loop;
-                struct page *page;
-                int chunk_size;
-
-                if (!pfn_valid(pfn)) {
-                    continue;
-                }
-
-                chunk_size = toi_size_of_free_region(zone, pfn);
-                if (chunk_size) {
-                    loop += chunk_size - 1;
-                    continue;
-                }
-
-                page = pfn_to_page(pfn);
-
-                if (PageNosave(page) || !saveable_page(zone, pfn)) {
-                    continue;
-                }
-
-                if (PageTOI_Untracked(page)) {
-                    continue;
-                }
-
-                /**
-                 * Do we need to (re)protect the page?
-                 * If it is already protected (PageTOI_RO), there is
-                 * nothing to do - skip the following.
-                 * If it is marked as dirty (PageTOI_Dirty), it was
-                 * either free and has been allocated or has been
-                 * written to and marked dirty. Reset the dirty flag
-                 * and (re)apply the protection.
-                 */
-                if (!PageTOI_RO(page)) {
-                    toi_reset_dirtiness_one(pfn, verbose);
-                }
-            }
-        }
-
-        pr_debug(KERN_EMERG "Done resetting dirtiness.\n");
-
-        if (allocated_map) {
-            toi_free_bitmap(&free_map);
-        }
-        return 0;
-}
-
-static int toi_reset_dirtiness_initcall(void)
-{
-    if (toi_do_incremental_initcall) {
-        pr_info("TuxOnIce: Enabling dirty page tracking.\n");
-        toi_reset_dirtiness(0);
-    }
-    return 1;
-}
-extern void toi_generate_untracked_map(void);
-
-// Leave early_initcall for pages to register untracked sections.
-early_initcall(toi_reset_dirtiness_initcall);
-
-static int __init toi_incremental_initcall_setup(char *str)
-{
-	int value;
-
-	if (sscanf(str, "=%d", &value) && value)
-		toi_do_incremental_initcall = value;
-
-	return 1;
-}
-__setup("toi_incremental_initcall", toi_incremental_initcall_setup);
diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c
deleted file mode 100644
index 91b0c4fd0..000000000
--- a/kernel/power/tuxonice_io.c
+++ /dev/null
@@ -1,1932 +0,0 @@
-/*
- * kernel/power/tuxonice_io.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains high level IO routines for hibernating.
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/version.h>
-#include <linux/utsname.h>
-#include <linux/mount.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-#include <linux/cpu.h>
-#include <linux/fs_struct.h>
-#include <linux/bio.h>
-#include <linux/fs_uuid.h>
-#include <linux/kmod.h>
-#include <asm/tlbflush.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_alloc.h"
-char alt_resume_param[256];
-
-/* Version read from image header at resume */
-static int toi_image_header_version;
-
-#define read_if_version(VERS, VAR, DESC, ERR_ACT) do {					\
-	if (likely(toi_image_header_version >= VERS))				\
-		if (toiActiveAllocator->rw_header_chunk(READ, NULL,		\
-					(char *) &VAR, sizeof(VAR))) {		\
-			abort_hibernate(TOI_FAILED_IO, "Failed to read DESC.");	\
-			ERR_ACT;					\
-		}								\
-} while(0)									\
-
-/* Variables shared between threads and updated under the mutex */
-static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result;
-static int io_index, io_nextupdate, io_pc, io_pc_step;
-static DEFINE_MUTEX(io_mutex);
-static DEFINE_PER_CPU(struct page *, last_sought);
-static DEFINE_PER_CPU(struct page *, last_high_page);
-static DEFINE_PER_CPU(char *, checksum_locn);
-static DEFINE_PER_CPU(struct pbe *, last_low_page);
-static atomic_t io_count;
-atomic_t toi_io_workers;
-
-static int using_flusher;
-
-DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher);
-
-int toi_bio_queue_flusher_should_finish;
-
-int toi_max_workers;
-
-static char *image_version_error = "The image header version is newer than " \
-	"this kernel supports.";
-
-struct toi_module_ops *first_filter;
-
-static atomic_t toi_num_other_threads;
-static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue);
-enum toi_worker_commands {
-	TOI_IO_WORKER_STOP,
-	TOI_IO_WORKER_RUN,
-	TOI_IO_WORKER_EXIT
-};
-static enum toi_worker_commands toi_worker_command;
-
-/**
- * toi_attempt_to_parse_resume_device - determine if we can hibernate
- *
- * Can we hibernate, using the current resume= parameter?
- **/
-int toi_attempt_to_parse_resume_device(int quiet)
-{
-	struct list_head *Allocator;
-	struct toi_module_ops *thisAllocator;
-	int result, returning = 0;
-
-	if (toi_activate_storage(0))
-		return 0;
-
-	toiActiveAllocator = NULL;
-	clear_toi_state(TOI_RESUME_DEVICE_OK);
-	clear_toi_state(TOI_CAN_RESUME);
-	clear_result_state(TOI_ABORTED);
-
-	if (!toiNumAllocators) {
-		if (!quiet)
-			printk(KERN_INFO "TuxOnIce: No storage allocators have "
-				"been registered. Hibernating will be "
-				"disabled.\n");
-		goto cleanup;
-	}
-
-	list_for_each(Allocator, &toiAllocators) {
-		thisAllocator = list_entry(Allocator, struct toi_module_ops,
-								type_list);
-
-		/*
-		 * Not sure why you'd want to disable an allocator, but
-		 * we should honour the flag if we're providing it
-		 */
-		if (!thisAllocator->enabled)
-			continue;
-
-		result = thisAllocator->parse_sig_location(
-				resume_file, (toiNumAllocators == 1),
-				quiet);
-
-		switch (result) {
-		case -EINVAL:
-			/* For this allocator, but not a valid
-			 * configuration. Error already printed. */
-			goto cleanup;
-
-		case 0:
-			/* For this allocator and valid. */
-			toiActiveAllocator = thisAllocator;
-
-			set_toi_state(TOI_RESUME_DEVICE_OK);
-			set_toi_state(TOI_CAN_RESUME);
-			returning = 1;
-			goto cleanup;
-		}
-	}
-	if (!quiet)
-		printk(KERN_INFO "TuxOnIce: No matching enabled allocator "
-				"found. Resuming disabled.\n");
-cleanup:
-	toi_deactivate_storage(0);
-	return returning;
-}
-
-void attempt_to_parse_resume_device2(void)
-{
-	toi_prepare_usm();
-	toi_attempt_to_parse_resume_device(0);
-	toi_cleanup_usm();
-}
-
-void save_restore_alt_param(int replace, int quiet)
-{
-	static char resume_param_save[255];
-	static unsigned long toi_state_save;
-
-	if (replace) {
-		toi_state_save = toi_state;
-		strcpy(resume_param_save, resume_file);
-		strcpy(resume_file, alt_resume_param);
-	} else {
-		strcpy(resume_file, resume_param_save);
-		toi_state = toi_state_save;
-	}
-	toi_attempt_to_parse_resume_device(quiet);
-}
-
-void attempt_to_parse_alt_resume_param(void)
-{
-	int ok = 0;
-
-	/* Temporarily set resume_param to the poweroff value */
-	if (!strlen(alt_resume_param))
-		return;
-
-	printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n");
-	save_restore_alt_param(SAVE, NOQUIET);
-	if (test_toi_state(TOI_CAN_RESUME))
-		ok = 1;
-
-	printk(KERN_INFO "=== Done ===\n");
-	save_restore_alt_param(RESTORE, QUIET);
-
-	/* If not ok, clear the string */
-	if (ok)
-		return;
-
-	printk(KERN_INFO "Can't resume from that location; clearing "
-			"alt_resume_param.\n");
-	alt_resume_param[0] = '\0';
-}
-
-/**
- * noresume_reset_modules - reset data structures in case of non resuming
- *
- * When we read the start of an image, modules (and especially the
- * active allocator) might need to reset data structures if we
- * decide to remove the image rather than resuming from it.
- **/
-static void noresume_reset_modules(void)
-{
-	struct toi_module_ops *this_filter;
-
-	list_for_each_entry(this_filter, &toi_filters, type_list)
-		if (this_filter->noresume_reset)
-			this_filter->noresume_reset();
-
-	if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
-		toiActiveAllocator->noresume_reset();
-}
-
-/**
- * fill_toi_header - fill the hibernate header structure
- * @struct toi_header: Header data structure to be filled.
- **/
-static int fill_toi_header(struct toi_header *sh)
-{
-	int i, error;
-
-	error = init_header((struct swsusp_info *) sh);
-	if (error)
-		return error;
-
-	sh->pagedir = pagedir1;
-	sh->pageset_2_size = pagedir2.size;
-	sh->param0 = toi_result;
-	sh->param1 = toi_bkd.toi_action;
-	sh->param2 = toi_bkd.toi_debug_state;
-	sh->param3 = toi_bkd.toi_default_console_level;
-	sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
-	for (i = 0; i < 4; i++)
-		sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
-	sh->bkd = boot_kernel_data_buffer;
-	return 0;
-}
-
-/**
- * rw_init_modules - initialize modules
- * @rw:		Whether we are reading of writing an image.
- * @which:	Section of the image being processed.
- *
- * Iterate over modules, preparing the ones that will be used to read or write
- * data.
- **/
-static int rw_init_modules(int rw, int which)
-{
-	struct toi_module_ops *this_module;
-	/* Initialise page transformers */
-	list_for_each_entry(this_module, &toi_filters, type_list) {
-		if (!this_module->enabled)
-			continue;
-		if (this_module->rw_init && this_module->rw_init(rw, which)) {
-			abort_hibernate(TOI_FAILED_MODULE_INIT,
-				"Failed to initialize the %s filter.",
-				this_module->name);
-			return 1;
-		}
-	}
-
-	/* Initialise allocator */
-	if (toiActiveAllocator->rw_init(rw, which)) {
-		abort_hibernate(TOI_FAILED_MODULE_INIT,
-				"Failed to initialise the allocator.");
-		return 1;
-	}
-
-	/* Initialise other modules */
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled ||
-		    this_module->type == FILTER_MODULE ||
-		    this_module->type == WRITER_MODULE)
-			continue;
-		if (this_module->rw_init && this_module->rw_init(rw, which)) {
-			set_abort_result(TOI_FAILED_MODULE_INIT);
-			printk(KERN_INFO "Setting aborted flag due to module "
-					"init failure.\n");
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * rw_cleanup_modules - cleanup modules
- * @rw:	Whether we are reading of writing an image.
- *
- * Cleanup components after reading or writing a set of pages.
- * Only the allocator may fail.
- **/
-static int rw_cleanup_modules(int rw)
-{
-	struct toi_module_ops *this_module;
-	int result = 0;
-
-	/* Cleanup other modules */
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled ||
-		    this_module->type == FILTER_MODULE ||
-		    this_module->type == WRITER_MODULE)
-			continue;
-		if (this_module->rw_cleanup)
-			result |= this_module->rw_cleanup(rw);
-	}
-
-	/* Flush data and cleanup */
-	list_for_each_entry(this_module, &toi_filters, type_list) {
-		if (!this_module->enabled)
-			continue;
-		if (this_module->rw_cleanup)
-			result |= this_module->rw_cleanup(rw);
-	}
-
-	result |= toiActiveAllocator->rw_cleanup(rw);
-
-	return result;
-}
-
-static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high)
-{
-	int index, min, max;
-	struct page *high_page = NULL,
-		    **my_last_high_page = raw_cpu_ptr(&last_high_page),
-		    **my_last_sought = raw_cpu_ptr(&last_sought);
-	struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page);
-	void *compare;
-
-	if (is_high) {
-		if (*my_last_sought && *my_last_high_page &&
-				*my_last_sought < orig_page)
-			high_page = *my_last_high_page;
-		else
-			high_page = (struct page *) restore_highmem_pblist;
-		this = (struct pbe *) kmap(high_page);
-		compare = orig_page;
-	} else {
-		if (*my_last_sought && *my_last_low_page &&
-				*my_last_sought < orig_page)
-			this = *my_last_low_page;
-		else
-			this = restore_pblist;
-		compare = page_address(orig_page);
-	}
-
-	*my_last_sought = orig_page;
-
-	/* Locate page containing pbe */
-	while (this[PBES_PER_PAGE - 1].next &&
-			this[PBES_PER_PAGE - 1].orig_address < compare) {
-		if (is_high) {
-			struct page *next_high_page = (struct page *)
-				this[PBES_PER_PAGE - 1].next;
-			kunmap(high_page);
-			this = kmap(next_high_page);
-			high_page = next_high_page;
-		} else
-			this = this[PBES_PER_PAGE - 1].next;
-	}
-
-	/* Do a binary search within the page */
-	min = 0;
-	max = PBES_PER_PAGE;
-	index = PBES_PER_PAGE / 2;
-	while (max - min) {
-		if (!this[index].orig_address ||
-		    this[index].orig_address > compare)
-			max = index;
-		else if (this[index].orig_address == compare) {
-			if (is_high) {
-				struct page *page = this[index].address;
-				*my_last_high_page = high_page;
-				kunmap(high_page);
-				return page;
-			}
-			*my_last_low_page = this;
-			return virt_to_page(this[index].address);
-		} else
-			min = index;
-		index = ((max + min) / 2);
-	};
-
-	if (is_high)
-		kunmap(high_page);
-
-	abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for"
-		" orig page %p. This[min].orig_address=%p.\n", orig_page,
-		this[index].orig_address);
-	return NULL;
-}
-
-/**
- * write_next_page - write the next page in a pageset
- * @data_pfn: The pfn where the next data to write is located.
- * @my_io_index: The index of the page in the pageset.
- * @write_pfn: The pfn number to write in the image (where the data belongs).
- *
- * Get the pfn of the next page to write, map the page if necessary and do the
- * write.
- **/
-static int write_next_page(unsigned long *data_pfn, int *my_io_index,
-		unsigned long *write_pfn)
-{
-	struct page *page;
-	char **my_checksum_locn = raw_cpu_ptr(&checksum_locn);
-	int result = 0, was_present;
-
-	*data_pfn = memory_bm_next_pfn(io_map, 0);
-
-	/* Another thread could have beaten us to it. */
-	if (*data_pfn == BM_END_OF_MAP) {
-		if (atomic_read(&io_count)) {
-			printk(KERN_INFO "Ran out of pfns but io_count is "
-					"still %d.\n", atomic_read(&io_count));
-			BUG();
-		}
-		mutex_unlock(&io_mutex);
-		return -ENODATA;
-	}
-
-	*my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
-
-	memory_bm_clear_bit(io_map, 0, *data_pfn);
-	page = pfn_to_page(*data_pfn);
-
-	was_present = kernel_page_present(page);
-	if (!was_present)
-		kernel_map_pages(page, 1, 1);
-
-	if (io_pageset == 1)
-		*write_pfn = memory_bm_next_pfn(pageset1_map, 0);
-	else {
-		*write_pfn = *data_pfn;
-		*my_checksum_locn = tuxonice_get_next_checksum();
-	}
-
-	TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index);
-
-	mutex_unlock(&io_mutex);
-
-	if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn))
-		return 1;
-
-	result = first_filter->write_page(*write_pfn, TOI_PAGE, page,
-			PAGE_SIZE);
-
-	if (!was_present)
-		kernel_map_pages(page, 1, 0);
-
-	return result;
-}
-
-/**
- * read_next_page - read the next page in a pageset
- * @my_io_index: The index of the page in the pageset.
- * @write_pfn: The pfn in which the data belongs.
- *
- * Read a page of the image into our buffer. It can happen (here and in the
- * write routine) that threads don't get run until after other CPUs have done
- * all the work. This was the cause of the long standing issue with
- * occasionally getting -ENODATA errors at the end of reading the image. We
- * therefore need to check there's actually a page to read before trying to
- * retrieve one.
- **/
-
-static int read_next_page(int *my_io_index, unsigned long *write_pfn,
-		struct page *buffer)
-{
-	unsigned int buf_size = PAGE_SIZE;
-	unsigned long left = atomic_read(&io_count);
-
-	if (!left)
-		return -ENODATA;
-
-	/* Start off assuming the page we read isn't resaved */
-	*my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
-
-	mutex_unlock(&io_mutex);
-
-	/*
-	 * Are we aborting? If so, don't submit any more I/O as
-	 * resetting the resume_attempted flag (from ui.c) will
-	 * clear the bdev flags, making this thread oops.
-	 */
-	if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
-		atomic_dec(&toi_io_workers);
-		if (!atomic_read(&toi_io_workers)) {
-			/*
-			 * So we can be sure we'll have memory for
-			 * marking that we haven't resumed.
-			 */
-			rw_cleanup_modules(READ);
-			set_toi_state(TOI_IO_STOPPED);
-		}
-		while (1)
-			schedule();
-	}
-
-	/*
-	 * See toi_bio_read_page in tuxonice_bio.c:
-	 * read the next page in the image.
-	 */
-	return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size);
-}
-
-static void use_read_page(unsigned long write_pfn, struct page *buffer)
-{
-	struct page *final_page = pfn_to_page(write_pfn),
-		    *copy_page = final_page;
-	char *virt, *buffer_virt;
-	int was_present, cpu = smp_processor_id();
-	unsigned long idx = 0;
-
-	if (io_pageset == 1 && (!pageset1_copy_map ||
-			!memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) {
-		int is_high = PageHighMem(final_page);
-		copy_page = copy_page_from_orig_page(is_high ? (void *) write_pfn : final_page, is_high);
-	}
-
-	if (!memory_bm_test_bit(io_map, cpu, write_pfn)) {
-	        int test = !memory_bm_test_bit(io_map, cpu, write_pfn);
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test);
-		mutex_lock(&io_mutex);
-		idx = atomic_add_return(1, &io_count);
-		mutex_unlock(&io_mutex);
-		return;
-	}
-
-	virt = kmap(copy_page);
-	buffer_virt = kmap(buffer);
-	was_present = kernel_page_present(copy_page);
-	if (!was_present)
-		kernel_map_pages(copy_page, 1, 1);
-	memcpy(virt, buffer_virt, PAGE_SIZE);
-	if (!was_present)
-		kernel_map_pages(copy_page, 1, 0);
-	kunmap(copy_page);
-	kunmap(buffer);
-	memory_bm_clear_bit(io_map, cpu, write_pfn);
-	TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset);
-}
-
-static unsigned long status_update(int writing, unsigned long done,
-		unsigned long ticks)
-{
-	int cs_index = writing ? 0 : 1;
-	unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks;
-	unsigned long msec = jiffies_to_msecs(abs(ticks_so_far));
-	unsigned long pgs_per_s, estimate = 0, pages_left;
-
-	if (msec) {
-		pages_left = io_barmax - done;
-		pgs_per_s = 1000 * done / msec;
-		if (pgs_per_s)
-			estimate = DIV_ROUND_UP(pages_left, pgs_per_s);
-	}
-
-	if (estimate && ticks > HZ / 2)
-		return toi_update_status(done, io_barmax,
-			" %d/%d MB (%lu sec left)",
-			MB(done+1), MB(io_barmax), estimate);
-
-	return toi_update_status(done, io_barmax, " %d/%d MB",
-		MB(done+1), MB(io_barmax));
-}
-
-/**
- * worker_rw_loop - main loop to read/write pages
- *
- * The main I/O loop for reading or writing pages. The io_map bitmap is used to
- * track the pages to read/write.
- * If we are reading, the pages are loaded to their final (mapped) pfn.
- * Data is non zero iff this is a thread started via start_other_threads.
- * In that case, we stay in here until told to quit.
- **/
-static int worker_rw_loop(void *data)
-{
-	unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4,
-		      jif_index = 1, start_time = jiffies, thread_num;
-	int result = 0, my_io_index = 0, last_worker;
-	struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP);
-	cpumask_var_t orig_mask;
-
-        if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) {
-		printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data);
-                result = -ENOMEM;
-                goto out;
-        }
-
-	cpumask_copy(orig_mask, tsk_cpus_allowed(current));
-
-	current->flags |= PF_NOFREEZE;
-
-top:
-	mutex_lock(&io_mutex);
-	thread_num = atomic_read(&toi_io_workers);
-
-	cpumask_copy(tsk_cpus_allowed(current), orig_mask);
-	schedule();
-
-	atomic_inc(&toi_io_workers);
-
-	while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) &&
-		!(io_write && test_result_state(TOI_ABORTED)) &&
-		toi_worker_command == TOI_IO_WORKER_RUN) {
-		if (!thread_num && jiffies > next_jiffies) {
-			next_jiffies += HZ / 4;
-			if (toiActiveAllocator->update_throughput_throttle)
-				toiActiveAllocator->update_throughput_throttle(
-						jif_index);
-			jif_index++;
-		}
-
-		/*
-		 * What page to use? If reading, don't know yet which page's
-		 * data will be read, so always use the buffer. If writing,
-		 * use the copy (Pageset1) or original page (Pageset2), but
-		 * always write the pfn of the original page.
-		 */
-		if (io_write)
-			result = write_next_page(&data_pfn, &my_io_index,
-					&write_pfn);
-		else /* Reading */
-			result = read_next_page(&my_io_index, &write_pfn,
-					buffer);
-
-		if (result) {
-			mutex_lock(&io_mutex);
-			/* Nothing to do? */
-			if (result == -ENODATA) {
-				toi_message(TOI_IO, TOI_VERBOSE, 0,
-					"Thread %d has no more work.",
-					smp_processor_id());
-				break;
-			}
-
-			io_result = result;
-
-			if (io_write) {
-				printk(KERN_INFO "Write chunk returned %d.\n",
-						result);
-				abort_hibernate(TOI_FAILED_IO,
-					"Failed to write a chunk of the "
-					"image.");
-				break;
-			}
-
-			if (io_pageset == 1) {
-				printk(KERN_ERR "\nBreaking out of I/O loop "
-					"because of result code %d.\n", result);
-				break;
-			}
-			panic("Read chunk returned (%d)", result);
-		}
-
-		/*
-		 * Discard reads of resaved pages while reading ps2
-		 * and unwanted pages while rereading ps2 when aborting.
-		 */
-		if (!io_write) {
-			if (!PageResave(pfn_to_page(write_pfn)))
-				use_read_page(write_pfn, buffer);
-			else {
-				mutex_lock(&io_mutex);
-				toi_message(TOI_IO, TOI_VERBOSE, 0,
-						"Resaved %ld.", write_pfn);
-				atomic_inc(&io_count);
-				mutex_unlock(&io_mutex);
-			}
-		}
-
-		if (!thread_num) {
-			if(my_io_index + io_base > io_nextupdate)
-				io_nextupdate = status_update(io_write,
-						my_io_index + io_base,
-						jiffies - start_time);
-
-			if (my_io_index > io_pc) {
-				printk(KERN_CONT "...%d%%", 20 * io_pc_step);
-				io_pc_step++;
-				io_pc = io_finish_at * io_pc_step / 5;
-			}
-		}
-
-		toi_cond_pause(0, NULL);
-
-		/*
-		 * Subtle: If there's less I/O still to be done than threads
-		 * running, quit. This stops us doing I/O beyond the end of
-		 * the image when reading.
-		 *
-		 * Possible race condition. Two threads could do the test at
-		 * the same time; one should exit and one should continue.
-		 * Therefore we take the mutex before comparing and exiting.
-		 */
-
-		mutex_lock(&io_mutex);
-	}
-
-	last_worker = atomic_dec_and_test(&toi_io_workers);
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers));
-	mutex_unlock(&io_mutex);
-
-	if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) {
-		/* Were we the last thread and we're using a flusher thread? */
-		if (last_worker && using_flusher) {
-			toiActiveAllocator->finish_all_io();
-		}
-		/* First, if we're doing I/O, wait for it to finish */
-		wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN);
-		/* Then wait to be told what to do next */
-		wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP);
-		if (toi_worker_command == TOI_IO_WORKER_RUN)
-			goto top;
-	}
-
-	if (thread_num)
-		atomic_dec(&toi_num_other_threads);
-
-out:
-	toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num);
-	toi__free_page(28, buffer);
-	free_cpumask_var(orig_mask);
-
-	return result;
-}
-
-int toi_start_other_threads(void)
-{
-	int cpu;
-	struct task_struct *p;
-	int to_start = (toi_max_workers ? toi_max_workers : num_online_cpus()) - 1;
-  unsigned long num_started = 0;
-
-	if (test_action_state(TOI_NO_MULTITHREADED_IO))
-		return 0;
-
-	toi_worker_command = TOI_IO_WORKER_STOP;
-
-	for_each_online_cpu(cpu) {
-		if (num_started == to_start)
-			break;
-
-		if (cpu == smp_processor_id())
-			continue;
-
-		p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1,
-				cpu_to_node(cpu), "ktoi_io/%d", cpu);
-		if (IS_ERR(p)) {
-			printk(KERN_ERR "ktoi_io for %i failed\n", cpu);
-			continue;
-		}
-		kthread_bind(p, cpu);
-		p->flags |= PF_MEMALLOC;
-		wake_up_process(p);
-		num_started++;
-		atomic_inc(&toi_num_other_threads);
-	}
-
-	toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started);
-	return num_started;
-}
-
-void toi_stop_other_threads(void)
-{
-	toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads.");
-	toi_worker_command = TOI_IO_WORKER_EXIT;
-	wake_up(&toi_worker_wait_queue);
-}
-
-/**
- * do_rw_loop - main highlevel function for reading or writing pages
- *
- * Create the io_map bitmap and call worker_rw_loop to perform I/O operations.
- **/
-static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags,
-		int base, int barmax, int pageset)
-{
-	int index = 0, cpu, result = 0, workers_started;
-	unsigned long pfn, next;
-
-	first_filter = toi_get_next_filter(NULL);
-
-	if (!finish_at)
-		return 0;
-
-	io_write = write;
-	io_finish_at = finish_at;
-	io_base = base;
-	io_barmax = barmax;
-	io_pageset = pageset;
-	io_index = 0;
-	io_pc = io_finish_at / 5;
-	io_pc_step = 1;
-	io_result = 0;
-	io_nextupdate = base + 1;
-	toi_bio_queue_flusher_should_finish = 0;
-
-	for_each_online_cpu(cpu) {
-		per_cpu(last_sought, cpu) = NULL;
-		per_cpu(last_low_page, cpu) = NULL;
-		per_cpu(last_high_page, cpu) = NULL;
-	}
-
-	/* Ensure all bits clear */
-	memory_bm_clear(io_map);
-
-        memory_bm_position_reset(io_map);
-        next = memory_bm_next_pfn(io_map, 0);
-
-        BUG_ON(next != BM_END_OF_MAP);
-
-	/* Set the bits for the pages to write */
-	memory_bm_position_reset(pageflags);
-
-	pfn = memory_bm_next_pfn(pageflags, 0);
-        toi_trace_index++;
-
-	while (pfn != BM_END_OF_MAP && index < finish_at) {
-		TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at);
-		memory_bm_set_bit(io_map, 0, pfn);
-		pfn = memory_bm_next_pfn(pageflags, 0);
-		index++;
-	}
-
-        BUG_ON(next != BM_END_OF_MAP || index < finish_at);
-
-        memory_bm_position_reset(io_map);
-        toi_trace_index++;
-
-	atomic_set(&io_count, finish_at);
-
-	memory_bm_position_reset(pageset1_map);
-
-	mutex_lock(&io_mutex);
-
-	clear_toi_state(TOI_IO_STOPPED);
-
-	using_flusher = (atomic_read(&toi_num_other_threads) &&
-			 toiActiveAllocator->io_flusher &&
-			 !test_action_state(TOI_NO_FLUSHER_THREAD));
-
-	workers_started = atomic_read(&toi_num_other_threads);
-
-	memory_bm_position_reset(io_map);
-	memory_bm_position_reset(pageset1_copy_map);
-
-	toi_worker_command = TOI_IO_WORKER_RUN;
-	wake_up(&toi_worker_wait_queue);
-
-	mutex_unlock(&io_mutex);
-
-	if (using_flusher)
-		result = toiActiveAllocator->io_flusher(write);
-	else
-		worker_rw_loop(NULL);
-
-	while (atomic_read(&toi_io_workers))
-		schedule();
-
-	printk(KERN_CONT "\n");
-
-	toi_worker_command = TOI_IO_WORKER_STOP;
-	wake_up(&toi_worker_wait_queue);
-
-	if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
-		if (!atomic_read(&toi_io_workers)) {
-			rw_cleanup_modules(READ);
-			set_toi_state(TOI_IO_STOPPED);
-		}
-		while (1)
-			schedule();
-	}
-	set_toi_state(TOI_IO_STOPPED);
-
-	if (!io_result && !result && !test_result_state(TOI_ABORTED)) {
-		unsigned long next;
-
-		toi_update_status(io_base + io_finish_at, io_barmax,
-				" %d/%d MB ",
-				MB(io_base + io_finish_at), MB(io_barmax));
-
-		memory_bm_position_reset(io_map);
-		next = memory_bm_next_pfn(io_map, 0);
-		if  (next != BM_END_OF_MAP) {
-			printk(KERN_INFO "Finished I/O loop but still work to "
-					"do?\nFinish at = %d. io_count = %d.\n",
-					finish_at, atomic_read(&io_count));
-			printk(KERN_INFO "I/O bitmap still records work to do."
-					"%ld.\n", next);
-			BUG();
-			do {
-				cpu_relax();
-			} while (0);
-		}
-	}
-
-	return io_result ? io_result : result;
-}
-
-/**
- * write_pageset - write a pageset to disk.
- * @pagedir:	Which pagedir to write.
- *
- * Returns:
- *	Zero on success or -1 on failure.
- **/
-int write_pageset(struct pagedir *pagedir)
-{
-	int finish_at, base = 0;
-	int barmax = pagedir1.size + pagedir2.size;
-	long error = 0;
-	struct memory_bitmap *pageflags;
-	unsigned long start_time, end_time;
-
-	/*
-	 * Even if there is nothing to read or write, the allocator
-	 * may need the init/cleanup for it's housekeeping.  (eg:
-	 * Pageset1 may start where pageset2 ends when writing).
-	 */
-	finish_at = pagedir->size;
-
-	if (pagedir->id == 1) {
-		toi_prepare_status(DONT_CLEAR_BAR,
-				"Writing kernel & process data...");
-		base = pagedir2.size;
-		if (test_action_state(TOI_TEST_FILTER_SPEED) ||
-		    test_action_state(TOI_TEST_BIO))
-			pageflags = pageset1_map;
-		else
-			pageflags = pageset1_copy_map;
-	} else {
-		toi_prepare_status(DONT_CLEAR_BAR, "Writing caches...");
-		pageflags = pageset2_map;
-	}
-
-	start_time = jiffies;
-
-	if (rw_init_modules(WRITE, pagedir->id)) {
-		abort_hibernate(TOI_FAILED_MODULE_INIT,
-				"Failed to initialise modules for writing.");
-		error = 1;
-	}
-
-	if (!error)
-		error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax,
-				pagedir->id);
-
-	if (rw_cleanup_modules(WRITE) && !error) {
-		abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
-				"Failed to cleanup after writing.");
-		error = 1;
-	}
-
-	end_time = jiffies;
-
-	if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
-		toi_bkd.toi_io_time[0][0] += finish_at,
-		toi_bkd.toi_io_time[0][1] += (end_time - start_time);
-	}
-
-	return error;
-}
-
-/**
- * read_pageset - highlevel function to read a pageset from disk
- * @pagedir:			pageset to read
- * @overwrittenpagesonly:	Whether to read the whole pageset or
- *				only part of it.
- *
- * Returns:
- *	Zero on success or -1 on failure.
- **/
-static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly)
-{
-	int result = 0, base = 0;
-	int finish_at = pagedir->size;
-	int barmax = pagedir1.size + pagedir2.size;
-	struct memory_bitmap *pageflags;
-	unsigned long start_time, end_time;
-
-	if (pagedir->id == 1) {
-		toi_prepare_status(DONT_CLEAR_BAR,
-				"Reading kernel & process data...");
-		pageflags = pageset1_map;
-	} else {
-		toi_prepare_status(DONT_CLEAR_BAR, "Reading caches...");
-		if (overwrittenpagesonly) {
-			barmax = min(pagedir1.size, pagedir2.size);
-			finish_at = min(pagedir1.size, pagedir2.size);
-		} else
-			base = pagedir1.size;
-		pageflags = pageset2_map;
-	}
-
-	start_time = jiffies;
-
-	if (rw_init_modules(READ, pagedir->id)) {
-		toiActiveAllocator->remove_image();
-		result = 1;
-	} else
-		result = do_rw_loop(READ, finish_at, pageflags, base, barmax,
-				pagedir->id);
-
-	if (rw_cleanup_modules(READ) && !result) {
-		abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
-				"Failed to cleanup after reading.");
-		result = 1;
-	}
-
-	/* Statistics */
-	end_time = jiffies;
-
-	if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
-		toi_bkd.toi_io_time[1][0] += finish_at,
-		toi_bkd.toi_io_time[1][1] += (end_time - start_time);
-	}
-
-	return result;
-}
-
-/**
- * write_module_configs - store the modules configuration
- *
- * The configuration for each module is stored in the image header.
- * Returns: Int
- *	Zero on success, Error value otherwise.
- **/
-static int write_module_configs(void)
-{
-	struct toi_module_ops *this_module;
-	char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP);
-	int len, index = 1;
-	struct toi_module_header toi_module_header;
-
-	if (!buffer) {
-		printk(KERN_INFO "Failed to allocate a buffer for saving "
-				"module configuration info.\n");
-		return -ENOMEM;
-	}
-
-	/*
-	 * We have to know which data goes with which module, so we at
-	 * least write a length of zero for a module. Note that we are
-	 * also assuming every module's config data takes <= PAGE_SIZE.
-	 */
-
-	/* For each module (in registration order) */
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled || !this_module->storage_needed ||
-		    (this_module->type == WRITER_MODULE &&
-		     toiActiveAllocator != this_module))
-			continue;
-
-		/* Get the data from the module */
-		len = 0;
-		if (this_module->save_config_info)
-			len = this_module->save_config_info(buffer);
-
-		/* Save the details of the module */
-		toi_module_header.enabled = this_module->enabled;
-		toi_module_header.type = this_module->type;
-		toi_module_header.index = index++;
-		strncpy(toi_module_header.name, this_module->name,
-					sizeof(toi_module_header.name));
-		toiActiveAllocator->rw_header_chunk(WRITE,
-				this_module,
-				(char *) &toi_module_header,
-				sizeof(toi_module_header));
-
-		/* Save the size of the data and any data returned */
-		toiActiveAllocator->rw_header_chunk(WRITE,
-				this_module,
-				(char *) &len, sizeof(int));
-		if (len)
-			toiActiveAllocator->rw_header_chunk(
-				WRITE, this_module, buffer, len);
-	}
-
-	/* Write a blank header to terminate the list */
-	toi_module_header.name[0] = '\0';
-	toiActiveAllocator->rw_header_chunk(WRITE, NULL,
-			(char *) &toi_module_header, sizeof(toi_module_header));
-
-	toi_free_page(22, (unsigned long) buffer);
-	return 0;
-}
-
-/**
- * read_one_module_config - read and configure one module
- *
- * Read the configuration for one module, and configure the module
- * to match if it is loaded.
- *
- * Returns: Int
- *	Zero on success, Error value otherwise.
- **/
-static int read_one_module_config(struct toi_module_header *header)
-{
-	struct toi_module_ops *this_module;
-	int result, len;
-	char *buffer;
-
-	/* Find the module */
-	this_module = toi_find_module_given_name(header->name);
-
-	if (!this_module) {
-		if (header->enabled) {
-			toi_early_boot_message(1, TOI_CONTINUE_REQ,
-				"It looks like we need module %s for reading "
-				"the image but it hasn't been registered.\n",
-				header->name);
-			if (!(test_toi_state(TOI_CONTINUE_REQ)))
-				return -EINVAL;
-		} else
-			printk(KERN_INFO "Module %s configuration data found, "
-				"but the module hasn't registered. Looks like "
-				"it was disabled, so we're ignoring its data.",
-				header->name);
-	}
-
-	/* Get the length of the data (if any) */
-	result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len,
-			sizeof(int));
-	if (result) {
-		printk(KERN_ERR "Failed to read the length of the module %s's"
-				" configuration data.\n",
-				header->name);
-		return -EINVAL;
-	}
-
-	/* Read any data and pass to the module (if we found one) */
-	if (!len)
-		return 0;
-
-	buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP);
-
-	if (!buffer) {
-		printk(KERN_ERR "Failed to allocate a buffer for reloading "
-				"module configuration info.\n");
-		return -ENOMEM;
-	}
-
-	toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len);
-
-	if (!this_module)
-		goto out;
-
-	if (!this_module->save_config_info)
-		printk(KERN_ERR "Huh? Module %s appears to have a "
-				"save_config_info, but not a load_config_info "
-				"function!\n", this_module->name);
-	else
-		this_module->load_config_info(buffer, len);
-
-	/*
-	 * Now move this module to the tail of its lists. This will put it in
-	 * order. Any new modules will end up at the top of the lists. They
-	 * should have been set to disabled when loaded (people will
-	 * normally not edit an initrd to load a new module and then hibernate
-	 * without using it!).
-	 */
-
-	toi_move_module_tail(this_module);
-
-	this_module->enabled = header->enabled;
-
-out:
-	toi_free_page(23, (unsigned long) buffer);
-	return 0;
-}
-
-/**
- * read_module_configs - reload module configurations from the image header.
- *
- * Returns: Int
- *	Zero on success or an error code.
- **/
-static int read_module_configs(void)
-{
-	int result = 0;
-	struct toi_module_header toi_module_header;
-	struct toi_module_ops *this_module;
-
-	/* All modules are initially disabled. That way, if we have a module
-	 * loaded now that wasn't loaded when we hibernated, it won't be used
-	 * in trying to read the data.
-	 */
-	list_for_each_entry(this_module, &toi_modules, module_list)
-		this_module->enabled = 0;
-
-	/* Get the first module header */
-	result = toiActiveAllocator->rw_header_chunk(READ, NULL,
-			(char *) &toi_module_header,
-			sizeof(toi_module_header));
-	if (result) {
-		printk(KERN_ERR "Failed to read the next module header.\n");
-		return -EINVAL;
-	}
-
-	/* For each module (in registration order) */
-	while (toi_module_header.name[0]) {
-		result = read_one_module_config(&toi_module_header);
-
-		if (result)
-			return -EINVAL;
-
-		/* Get the next module header */
-		result = toiActiveAllocator->rw_header_chunk(READ, NULL,
-				(char *) &toi_module_header,
-				sizeof(toi_module_header));
-
-		if (result) {
-			printk(KERN_ERR "Failed to read the next module "
-					"header.\n");
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-
-static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev)
-{
-	return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 0 : 1;
-}
-
-int fs_info_space_needed(void)
-{
-	const struct super_block *sb;
-	int result = sizeof(int);
-
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		struct fs_info *fs;
-
-		if (!sb->s_bdev)
-			continue;
-
-		fs = fs_info_from_block_dev(sb->s_bdev);
-		if (save_fs_info(fs, sb->s_bdev))
-			result += 16 + sizeof(dev_t) + sizeof(int) +
-				fs->last_mount_size;
-		free_fs_info(fs);
-	}
-	return result;
-}
-
-static int fs_info_num_to_save(void)
-{
-	const struct super_block *sb;
-	int to_save = 0;
-
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		struct fs_info *fs;
-
-		if (!sb->s_bdev)
-			continue;
-
-		fs = fs_info_from_block_dev(sb->s_bdev);
-		if (save_fs_info(fs, sb->s_bdev))
-			to_save++;
-		free_fs_info(fs);
-	}
-
-	return to_save;
-}
-
-static int fs_info_save(void)
-{
-	const struct super_block *sb;
-	int to_save = fs_info_num_to_save();
-
-	if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save,
-				sizeof(int))) {
-		abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info"
-				" to save.");
-		return -EIO;
-	}
-
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		struct fs_info *fs;
-
-		if (!sb->s_bdev)
-			continue;
-
-		fs = fs_info_from_block_dev(sb->s_bdev);
-		if (save_fs_info(fs, sb->s_bdev)) {
-			if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
-					&fs->uuid[0], 16)) {
-				abort_hibernate(TOI_FAILED_IO, "Failed to "
-						"write uuid.");
-				return -EIO;
-			}
-			if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
-					(char *) &fs->dev_t, sizeof(dev_t))) {
-				abort_hibernate(TOI_FAILED_IO, "Failed to "
-						"write dev_t.");
-				return -EIO;
-			}
-			if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
-					(char *) &fs->last_mount_size, sizeof(int))) {
-				abort_hibernate(TOI_FAILED_IO, "Failed to "
-						"write last mount length.");
-				return -EIO;
-			}
-			if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
-					fs->last_mount, fs->last_mount_size)) {
-				abort_hibernate(TOI_FAILED_IO, "Failed to "
-						"write uuid.");
-				return -EIO;
-			}
-		}
-		free_fs_info(fs);
-	}
-	return 0;
-}
-
-static int fs_info_load_and_check_one(void)
-{
-	char uuid[16], *last_mount;
-	int result = 0, ln;
-	dev_t dev_t;
-	struct block_device *dev;
-	struct fs_info *fs_info, seek;
-
-	if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) {
-		abort_hibernate(TOI_FAILED_IO, "Failed to read uuid.");
-		return -EIO;
-	}
-
-	read_if_version(3, dev_t, "uuid dev_t field", return -EIO);
-
-	if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln,
-				sizeof(int))) {
-		abort_hibernate(TOI_FAILED_IO,
-				"Failed to read last mount size.");
-		return -EIO;
-	}
-
-	last_mount = kzalloc(ln, GFP_KERNEL);
-
-	if (!last_mount)
-		return -ENOMEM;
-
-	if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount,	ln)) {
-		abort_hibernate(TOI_FAILED_IO,
-				"Failed to read last mount timestamp.");
-		result = -EIO;
-		goto out_lmt;
-	}
-
-	strncpy((char *) &seek.uuid, uuid, 16);
-	seek.dev_t = dev_t;
-	seek.last_mount_size = ln;
-	seek.last_mount = last_mount;
-	dev_t = blk_lookup_fs_info(&seek);
-	if (!dev_t)
-		goto out_lmt;
-
-	dev = toi_open_by_devnum(dev_t);
-
-	fs_info = fs_info_from_block_dev(dev);
-	if (fs_info && !IS_ERR(fs_info)) {
-		if (ln != fs_info->last_mount_size) {
-			printk(KERN_EMERG "Found matching uuid but last mount "
-					"time lengths differ?! "
-					"(%d vs %d).\n", ln,
-					fs_info->last_mount_size);
-			result = -EINVAL;
-		} else {
-			char buf[BDEVNAME_SIZE];
-			result = !!memcmp(fs_info->last_mount, last_mount, ln);
-			if (result)
-				printk(KERN_EMERG "Last mount time for %s has "
-					"changed!\n", bdevname(dev, buf));
-		}
-	}
-	toi_close_bdev(dev);
-	free_fs_info(fs_info);
-out_lmt:
-	kfree(last_mount);
-	return result;
-}
-
-static int fs_info_load_and_check(void)
-{
-	int to_do, result = 0;
-
-	if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do,
-				sizeof(int))) {
-		abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info "
-				"to load.");
-		return -EIO;
-	}
-
-	while(to_do--)
-		result |= fs_info_load_and_check_one();
-
-	return result;
-}
-
-/**
- * write_image_header - write the image header after write the image proper
- *
- * Returns: Int
- *	Zero on success, error value otherwise.
- **/
-int write_image_header(void)
-{
-	int ret;
-	int total = pagedir1.size + pagedir2.size+2;
-	char *header_buffer = NULL;
-
-	/* Now prepare to write the header */
-	ret = toiActiveAllocator->write_header_init();
-	if (ret) {
-		abort_hibernate(TOI_FAILED_MODULE_INIT,
-				"Active allocator's write_header_init"
-				" function failed.");
-		goto write_image_header_abort;
-	}
-
-	/* Get a buffer */
-	header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
-	if (!header_buffer) {
-		abort_hibernate(TOI_OUT_OF_MEMORY,
-			"Out of memory when trying to get page for header!");
-		goto write_image_header_abort;
-	}
-
-	/* Write hibernate header */
-	if (fill_toi_header((struct toi_header *) header_buffer)) {
-		abort_hibernate(TOI_OUT_OF_MEMORY,
-			"Failure to fill header information!");
-		goto write_image_header_abort;
-	}
-
-	if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
-			header_buffer, sizeof(struct toi_header))) {
-		abort_hibernate(TOI_OUT_OF_MEMORY,
-			"Failure to write header info.");
-		goto write_image_header_abort;
-	}
-
-	if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
-			(char *) &toi_max_workers, sizeof(toi_max_workers))) {
-		abort_hibernate(TOI_OUT_OF_MEMORY,
-			"Failure to number of workers to use.");
-		goto write_image_header_abort;
-	}
-
-	/* Write filesystem info */
-	if (fs_info_save())
-		goto write_image_header_abort;
-
-	/* Write module configurations */
-	ret = write_module_configs();
-	if (ret) {
-		abort_hibernate(TOI_FAILED_IO,
-				"Failed to write module configs.");
-		goto write_image_header_abort;
-	}
-
-	if (memory_bm_write(pageset1_map,
-				toiActiveAllocator->rw_header_chunk)) {
-		abort_hibernate(TOI_FAILED_IO,
-				"Failed to write bitmaps.");
-		goto write_image_header_abort;
-	}
-
-	/* Flush data and let allocator cleanup */
-	if (toiActiveAllocator->write_header_cleanup()) {
-		abort_hibernate(TOI_FAILED_IO,
-				"Failed to cleanup writing header.");
-		goto write_image_header_abort_no_cleanup;
-	}
-
-	if (test_result_state(TOI_ABORTED))
-		goto write_image_header_abort_no_cleanup;
-
-	toi_update_status(total, total, NULL);
-
-out:
-	if (header_buffer)
-		toi_free_page(24, (unsigned long) header_buffer);
-	return ret;
-
-write_image_header_abort:
-	toiActiveAllocator->write_header_cleanup();
-write_image_header_abort_no_cleanup:
-	ret = -1;
-	goto out;
-}
-
-/**
- * sanity_check - check the header
- * @sh:	the header which was saved at hibernate time.
- *
- * Perform a few checks, seeking to ensure that the kernel being
- * booted matches the one hibernated. They need to match so we can
- * be _sure_ things will work. It is not absolutely impossible for
- * resuming from a different kernel to work, just not assured.
- **/
-static char *sanity_check(struct toi_header *sh)
-{
-	char *reason = check_image_kernel((struct swsusp_info *) sh);
-
-	if (reason)
-		return reason;
-
-	if (!test_action_state(TOI_IGNORE_ROOTFS)) {
-		const struct super_block *sb;
-		list_for_each_entry(sb, &super_blocks, s_list) {
-			if ((!(sb->s_flags & MS_RDONLY)) &&
-			    (sb->s_type->fs_flags & FS_REQUIRES_DEV))
-				return "Device backed fs has been mounted "
-					"rw prior to resume or initrd/ramfs "
-					"is mounted rw.";
-		}
-	}
-
-	return NULL;
-}
-
-static DECLARE_WAIT_QUEUE_HEAD(freeze_wait);
-
-#define FREEZE_IN_PROGRESS (~0)
-
-static int freeze_result;
-
-static void do_freeze(struct work_struct *dummy)
-{
-	freeze_result = freeze_processes();
-	wake_up(&freeze_wait);
-	trap_non_toi_io = 1;
-}
-
-static DECLARE_WORK(freeze_work, do_freeze);
-
-/**
- * __read_pageset1 - test for the existence of an image and attempt to load it
- *
- * Returns:	Int
- *	Zero if image found and pageset1 successfully loaded.
- *	Error if no image found or loaded.
- **/
-static int __read_pageset1(void)
-{
-	int i, result = 0;
-	char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP),
-	     *sanity_error = NULL;
-	struct toi_header *toi_header;
-
-	if (!header_buffer) {
-		printk(KERN_INFO "Unable to allocate a page for reading the "
-				"signature.\n");
-		return -ENOMEM;
-	}
-
-	/* Check for an image */
-	result = toiActiveAllocator->image_exists(1);
-	if (result == 3) {
-		result = -ENODATA;
-		toi_early_boot_message(1, 0, "The signature from an older "
-				"version of TuxOnIce has been detected.");
-		goto out_remove_image;
-	}
-
-	if (result != 1) {
-		result = -ENODATA;
-		noresume_reset_modules();
-		printk(KERN_INFO "TuxOnIce: No image found.\n");
-		goto out;
-	}
-
-	/*
-	 * Prepare the active allocator for reading the image header. The
-	 * activate allocator might read its own configuration.
-	 *
-	 * NB: This call may never return because there might be a signature
-	 * for a different image such that we warn the user and they choose
-	 * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the
-	 * location of the image might be unavailable if it was stored on a
-	 * network connection).
-	 */
-
-	result = toiActiveAllocator->read_header_init();
-	if (result) {
-		printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the "
-				"image header.\n");
-		goto out_remove_image;
-	}
-
-	/* Check for noresume command line option */
-	if (test_toi_state(TOI_NORESUME_SPECIFIED)) {
-		printk(KERN_INFO "TuxOnIce: Noresume on command line. Removed "
-				"image.\n");
-		goto out_remove_image;
-	}
-
-	/* Check whether we've resumed before */
-	if (test_toi_state(TOI_RESUMED_BEFORE)) {
-		toi_early_boot_message(1, 0, NULL);
-		if (!(test_toi_state(TOI_CONTINUE_REQ))) {
-			printk(KERN_INFO "TuxOnIce: Tried to resume before: "
-					"Invalidated image.\n");
-			goto out_remove_image;
-		}
-	}
-
-	clear_toi_state(TOI_CONTINUE_REQ);
-
-	toi_image_header_version = toiActiveAllocator->get_header_version();
-
-	if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) {
-		toi_early_boot_message(1, 0, image_version_error);
-		if (!(test_toi_state(TOI_CONTINUE_REQ))) {
-			printk(KERN_INFO "TuxOnIce: Header version too new: "
-					"Invalidated image.\n");
-			goto out_remove_image;
-		}
-	}
-
-	/* Read hibernate header */
-	result = toiActiveAllocator->rw_header_chunk(READ, NULL,
-			header_buffer, sizeof(struct toi_header));
-	if (result < 0) {
-		printk(KERN_ERR "TuxOnIce: Failed to read the image "
-				"signature.\n");
-		goto out_remove_image;
-	}
-
-	toi_header = (struct toi_header *) header_buffer;
-
-	/*
-	 * NB: This call may also result in a reboot rather than returning.
-	 */
-
-	sanity_error = sanity_check(toi_header);
-	if (sanity_error) {
-		toi_early_boot_message(1, TOI_CONTINUE_REQ,
-				sanity_error);
-		printk(KERN_INFO "TuxOnIce: Sanity check failed.\n");
-		goto out_remove_image;
-	}
-
-	/*
-	 * We have an image and it looks like it will load okay.
-	 *
-	 * Get metadata from header. Don't override commandline parameters.
-	 *
-	 * We don't need to save the image size limit because it's not used
-	 * during resume and will be restored with the image anyway.
-	 */
-
-	memcpy((char *) &pagedir1,
-		(char *) &toi_header->pagedir, sizeof(pagedir1));
-	toi_result = toi_header->param0;
-	if (!toi_bkd.toi_debug_state) {
-		toi_bkd.toi_action =
-			(toi_header->param1 & ~toi_bootflags_mask) |
-			(toi_bkd.toi_action & toi_bootflags_mask);
-		toi_bkd.toi_debug_state = toi_header->param2;
-		toi_bkd.toi_default_console_level = toi_header->param3;
-	}
-	clear_toi_state(TOI_IGNORE_LOGLEVEL);
-	pagedir2.size = toi_header->pageset_2_size;
-	for (i = 0; i < 4; i++)
-		toi_bkd.toi_io_time[i/2][i%2] =
-			toi_header->io_time[i/2][i%2];
-
-	set_toi_state(TOI_BOOT_KERNEL);
-	boot_kernel_data_buffer = toi_header->bkd;
-
-	read_if_version(1, toi_max_workers, "TuxOnIce max workers",
-			goto out_remove_image);
-
-	/* Read filesystem info */
-	if (fs_info_load_and_check()) {
-		printk(KERN_EMERG "TuxOnIce: File system mount time checks "
-			"failed. Refusing to corrupt your filesystems!\n");
-		goto out_remove_image;
-	}
-
-	/* Read module configurations */
-	result = read_module_configs();
-	if (result) {
-		pagedir1.size = 0;
-		pagedir2.size = 0;
-		printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module "
-				"configurations.\n");
-		clear_action_state(TOI_KEEP_IMAGE);
-		goto out_remove_image;
-	}
-
-	toi_prepare_console();
-
-	set_toi_state(TOI_NOW_RESUMING);
-
-	result = pm_notifier_call_chain(PM_RESTORE_PREPARE);
-	if (result)
-		goto out_notifier_call_chain;;
-
-	if (usermodehelper_disable())
-		goto out_enable_usermodehelper;
-
-	current->flags |= PF_NOFREEZE;
-	freeze_result = FREEZE_IN_PROGRESS;
-
-	schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work);
-
-	toi_cond_pause(1, "About to read original pageset1 locations.");
-
-	/*
-	 * See _toi_rw_header_chunk in tuxonice_bio.c:
-	 * Initialize pageset1_map by reading the map from the image.
-	 */
-	if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk))
-		goto out_thaw;
-
-	/*
-	 * See toi_rw_cleanup in tuxonice_bio.c:
-	 * Clean up after reading the header.
-	 */
-	result = toiActiveAllocator->read_header_cleanup();
-	if (result) {
-		printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the "
-				"image header.\n");
-		goto out_thaw;
-	}
-
-	toi_cond_pause(1, "About to read pagedir.");
-
-	/*
-	 * Get the addresses of pages into which we will load the kernel to
-	 * be copied back and check if they conflict with the ones we are using.
-	 */
-	if (toi_get_pageset1_load_addresses()) {
-		printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
-				"pageset1.\n");
-		goto out_thaw;
-	}
-
-	/* Read the original kernel back */
-	toi_cond_pause(1, "About to read pageset 1.");
-
-	/* Given the pagemap, read back the data from disk */
-	if (read_pageset(&pagedir1, 0)) {
-		toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1.");
-		result = -EIO;
-		goto out_thaw;
-	}
-
-	toi_cond_pause(1, "About to restore original kernel.");
-	result = 0;
-
-	if (!toi_keeping_image &&
-	    toiActiveAllocator->mark_resume_attempted)
-		toiActiveAllocator->mark_resume_attempted(1);
-
-	wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
-out:
-	current->flags &= ~PF_NOFREEZE;
-	toi_free_page(25, (unsigned long) header_buffer);
-	return result;
-
-out_thaw:
-	wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
-	trap_non_toi_io = 0;
-	thaw_processes();
-out_enable_usermodehelper:
-	usermodehelper_enable();
-out_notifier_call_chain:
-        pm_notifier_call_chain(PM_POST_RESTORE);
-	toi_cleanup_console();
-out_remove_image:
-	result = -EINVAL;
-	if (!toi_keeping_image)
-		toiActiveAllocator->remove_image();
-	toiActiveAllocator->read_header_cleanup();
-	noresume_reset_modules();
-	goto out;
-}
-
-/**
- * read_pageset1 - highlevel function to read the saved pages
- *
- * Attempt to read the header and pageset1 of a hibernate image.
- * Handle the outcome, complaining where appropriate.
- **/
-int read_pageset1(void)
-{
-	int error;
-
-	error = __read_pageset1();
-
-	if (error && error != -ENODATA && error != -EINVAL &&
-					!test_result_state(TOI_ABORTED))
-		abort_hibernate(TOI_IMAGE_ERROR,
-			"TuxOnIce: Error %d resuming\n", error);
-
-	return error;
-}
-
-/**
- * get_have_image_data - check the image header
- **/
-static char *get_have_image_data(void)
-{
-	char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
-	struct toi_header *toi_header;
-
-	if (!output_buffer) {
-		printk(KERN_INFO "Output buffer null.\n");
-		return NULL;
-	}
-
-	/* Check for an image */
-	if (!toiActiveAllocator->image_exists(1) ||
-	    toiActiveAllocator->read_header_init() ||
-	    toiActiveAllocator->rw_header_chunk(READ, NULL,
-			output_buffer, sizeof(struct toi_header))) {
-		sprintf(output_buffer, "0\n");
-		/*
-		 * From an initrd/ramfs, catting have_image and
-		 * getting a result of 0 is sufficient.
-		 */
-		clear_toi_state(TOI_BOOT_TIME);
-		goto out;
-	}
-
-	toi_header = (struct toi_header *) output_buffer;
-
-	sprintf(output_buffer, "1\n%s\n%s\n",
-			toi_header->uts.machine,
-			toi_header->uts.version);
-
-	/* Check whether we've resumed before */
-	if (test_toi_state(TOI_RESUMED_BEFORE))
-		strcat(output_buffer, "Resumed before.\n");
-
-out:
-	noresume_reset_modules();
-	return output_buffer;
-}
-
-/**
- * read_pageset2 - read second part of the image
- * @overwrittenpagesonly:	Read only pages which would have been
- *				verwritten by pageset1?
- *
- * Read in part or all of pageset2 of an image, depending upon
- * whether we are hibernating and have only overwritten a portion
- * with pageset1 pages, or are resuming and need to read them
- * all.
- *
- * Returns: Int
- *	Zero if no error, otherwise the error value.
- **/
-int read_pageset2(int overwrittenpagesonly)
-{
-	int result = 0;
-
-	if (!pagedir2.size)
-		return 0;
-
-	result = read_pageset(&pagedir2, overwrittenpagesonly);
-
-	toi_cond_pause(1, "Pagedir 2 read.");
-
-	return result;
-}
-
-/**
- * image_exists_read - has an image been found?
- * @page:	Output buffer
- *
- * Store 0 or 1 in page, depending on whether an image is found.
- * Incoming buffer is PAGE_SIZE and result is guaranteed
- * to be far less than that, so we don't worry about
- * overflow.
- **/
-int image_exists_read(const char *page, int count)
-{
-	int len = 0;
-	char *result;
-
-	if (toi_activate_storage(0))
-		return count;
-
-	if (!test_toi_state(TOI_RESUME_DEVICE_OK))
-		toi_attempt_to_parse_resume_device(0);
-
-	if (!toiActiveAllocator) {
-		len = sprintf((char *) page, "-1\n");
-	} else {
-		result = get_have_image_data();
-		if (result) {
-			len = sprintf((char *) page, "%s",  result);
-			toi_free_page(26, (unsigned long) result);
-		}
-	}
-
-	toi_deactivate_storage(0);
-
-	return len;
-}
-
-/**
- * image_exists_write - invalidate an image if one exists
- **/
-int image_exists_write(const char *buffer, int count)
-{
-	if (toi_activate_storage(0))
-		return count;
-
-	if (toiActiveAllocator && toiActiveAllocator->image_exists(1))
-		toiActiveAllocator->remove_image();
-
-	toi_deactivate_storage(0);
-
-	clear_result_state(TOI_KEPT_IMAGE);
-
-	return count;
-}
diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h
deleted file mode 100644
index 56645a5c6..000000000
--- a/kernel/power/tuxonice_io.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * kernel/power/tuxonice_io.h
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains high level IO routines for hibernating.
- *
- */
-
-#include <linux/utsname.h>
-#include "tuxonice_pagedir.h"
-
-/* Non-module data saved in our image header */
-struct toi_header {
-	/*
-	 * Mirror struct swsusp_info, but without
-	 * the page aligned attribute
-	 */
-	struct new_utsname uts;
-	u32 version_code;
-	unsigned long num_physpages;
-	int cpus;
-	unsigned long image_pages;
-	unsigned long pages;
-	unsigned long size;
-
-	/* Our own data */
-	unsigned long orig_mem_free;
-	int page_size;
-	int pageset_2_size;
-	int param0;
-	int param1;
-	int param2;
-	int param3;
-	int progress0;
-	int progress1;
-	int progress2;
-	int progress3;
-	int io_time[2][2];
-	struct pagedir pagedir;
-	dev_t root_fs;
-	unsigned long bkd; /* Boot kernel data locn */
-};
-
-extern int write_pageset(struct pagedir *pagedir);
-extern int write_image_header(void);
-extern int read_pageset1(void);
-extern int read_pageset2(int overwrittenpagesonly);
-
-extern int toi_attempt_to_parse_resume_device(int quiet);
-extern void attempt_to_parse_resume_device2(void);
-extern void attempt_to_parse_alt_resume_param(void);
-int image_exists_read(const char *page, int count);
-int image_exists_write(const char *buffer, int count);
-extern void save_restore_alt_param(int replace, int quiet);
-extern atomic_t toi_io_workers;
-
-/* Args to save_restore_alt_param */
-#define RESTORE 0
-#define SAVE 1
-
-#define NOQUIET 0
-#define QUIET 1
-
-extern wait_queue_head_t toi_io_queue_flusher;
-extern int toi_bio_queue_flusher_should_finish;
-
-int fs_info_space_needed(void);
-
-extern int toi_max_workers;
diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
deleted file mode 100644
index 18f22bdb6..000000000
--- a/kernel/power/tuxonice_modules.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- * kernel/power/tuxonice_modules.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_ui.h"
-
-LIST_HEAD(toi_filters);
-LIST_HEAD(toiAllocators);
-
-LIST_HEAD(toi_modules);
-
-struct toi_module_ops *toiActiveAllocator;
-
-static int toi_num_filters;
-int toiNumAllocators, toi_num_modules;
-
-/*
- * toi_header_storage_for_modules
- *
- * Returns the amount of space needed to store configuration
- * data needed by the modules prior to copying back the original
- * kernel. We can exclude data for pageset2 because it will be
- * available anyway once the kernel is copied back.
- */
-long toi_header_storage_for_modules(void)
-{
-	struct toi_module_ops *this_module;
-	int bytes = 0;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled ||
-		    (this_module->type == WRITER_MODULE &&
-		     toiActiveAllocator != this_module))
-			continue;
-		if (this_module->storage_needed) {
-			int this = this_module->storage_needed() +
-				sizeof(struct toi_module_header) +
-				sizeof(int);
-			this_module->header_requested = this;
-			bytes += this;
-		}
-	}
-
-	/* One more for the empty terminator */
-	return bytes + sizeof(struct toi_module_header);
-}
-
-void print_toi_header_storage_for_modules(void)
-{
-	struct toi_module_ops *this_module;
-	int bytes = 0;
-
-	printk(KERN_DEBUG "Header storage:\n");
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled ||
-		    (this_module->type == WRITER_MODULE &&
-		     toiActiveAllocator != this_module))
-			continue;
-		if (this_module->storage_needed) {
-			int this = this_module->storage_needed() +
-				sizeof(struct toi_module_header) +
-				sizeof(int);
-			this_module->header_requested = this;
-			bytes += this;
-			printk(KERN_DEBUG "+ %16s : %-4d/%d.\n",
-					this_module->name,
-					this_module->header_used, this);
-		}
-	}
-
-	printk(KERN_DEBUG "+ empty terminator : %zu.\n",
-			sizeof(struct toi_module_header));
-	printk(KERN_DEBUG "                     ====\n");
-	printk(KERN_DEBUG "                     %zu\n",
-			bytes + sizeof(struct toi_module_header));
-}
-
-/*
- * toi_memory_for_modules
- *
- * Returns the amount of memory requested by modules for
- * doing their work during the cycle.
- */
-
-long toi_memory_for_modules(int print_parts)
-{
-	long bytes = 0, result;
-	struct toi_module_ops *this_module;
-
-	if (print_parts)
-		printk(KERN_INFO "Memory for modules:\n===================\n");
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		int this;
-		if (!this_module->enabled)
-			continue;
-		if (this_module->memory_needed) {
-			this = this_module->memory_needed();
-			if (print_parts)
-				printk(KERN_INFO "%10d bytes (%5ld pages) for "
-						"module '%s'.\n", this,
-						DIV_ROUND_UP(this, PAGE_SIZE),
-						this_module->name);
-			bytes += this;
-		}
-	}
-
-	result = DIV_ROUND_UP(bytes, PAGE_SIZE);
-	if (print_parts)
-		printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result);
-
-	return result;
-}
-
-/*
- * toi_expected_compression_ratio
- *
- * Returns the compression ratio expected when saving the image.
- */
-
-int toi_expected_compression_ratio(void)
-{
-	int ratio = 100;
-	struct toi_module_ops *this_module;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled)
-			continue;
-		if (this_module->expected_compression)
-			ratio = ratio * this_module->expected_compression()
-				/ 100;
-	}
-
-	return ratio;
-}
-
-/* toi_find_module_given_dir
- * Functionality :	Return a module (if found), given a pointer
- * 			to its directory name
- */
-
-static struct toi_module_ops *toi_find_module_given_dir(char *name)
-{
-	struct toi_module_ops *this_module, *found_module = NULL;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!strcmp(name, this_module->directory)) {
-			found_module = this_module;
-			break;
-		}
-	}
-
-	return found_module;
-}
-
-/* toi_find_module_given_name
- * Functionality :	Return a module (if found), given a pointer
- * 			to its name
- */
-
-struct toi_module_ops *toi_find_module_given_name(char *name)
-{
-	struct toi_module_ops *this_module, *found_module = NULL;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!strcmp(name, this_module->name)) {
-			found_module = this_module;
-			break;
-		}
-	}
-
-	return found_module;
-}
-
-/*
- * toi_print_module_debug_info
- * Functionality   : Get debugging info from modules into a buffer.
- */
-int toi_print_module_debug_info(char *buffer, int buffer_size)
-{
-	struct toi_module_ops *this_module;
-	int len = 0;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled)
-			continue;
-		if (this_module->print_debug_info) {
-			int result;
-			result = this_module->print_debug_info(buffer + len,
-					buffer_size - len);
-			len += result;
-		}
-	}
-
-	/* Ensure null terminated */
-	buffer[buffer_size] = 0;
-
-	return len;
-}
-
-/*
- * toi_register_module
- *
- * Register a module.
- */
-int toi_register_module(struct toi_module_ops *module)
-{
-	int i;
-	struct kobject *kobj;
-
-        if (!hibernation_available())
-          return -ENODEV;
-
-	module->enabled = 1;
-
-	if (toi_find_module_given_name(module->name)) {
-		printk(KERN_INFO "TuxOnIce: Trying to load module %s,"
-				" which is already registered.\n",
-				module->name);
-		return -EBUSY;
-	}
-
-	switch (module->type) {
-	case FILTER_MODULE:
-		list_add_tail(&module->type_list, &toi_filters);
-		toi_num_filters++;
-		break;
-	case WRITER_MODULE:
-		list_add_tail(&module->type_list, &toiAllocators);
-		toiNumAllocators++;
-		break;
-	case MISC_MODULE:
-	case MISC_HIDDEN_MODULE:
-	case BIO_ALLOCATOR_MODULE:
-		break;
-	default:
-		printk(KERN_ERR "Hmmm. Module '%s' has an invalid type."
-			" It has been ignored.\n", module->name);
-		return -EINVAL;
-	}
-	list_add_tail(&module->module_list, &toi_modules);
-	toi_num_modules++;
-
-	if ((!module->directory && !module->shared_directory) ||
-			!module->sysfs_data || !module->num_sysfs_entries)
-		return 0;
-
-	/*
-	 * Modules may share a directory, but those with shared_dir
-	 * set must be loaded (via symbol dependencies) after parents
-	 * and unloaded beforehand.
-	 */
-	if (module->shared_directory) {
-		struct toi_module_ops *shared =
-			toi_find_module_given_dir(module->shared_directory);
-		if (!shared) {
-			printk(KERN_ERR "TuxOnIce: Module %s wants to share "
-					"%s's directory but %s isn't loaded.\n",
-					module->name, module->shared_directory,
-					module->shared_directory);
-			toi_unregister_module(module);
-			return -ENODEV;
-		}
-		kobj = shared->dir_kobj;
-	} else {
-		if (!strncmp(module->directory, "[ROOT]", 6))
-			kobj = tuxonice_kobj;
-		else
-			kobj = make_toi_sysdir(module->directory);
-	}
-	module->dir_kobj = kobj;
-	for (i = 0; i < module->num_sysfs_entries; i++) {
-		int result = toi_register_sysfs_file(kobj,
-				&module->sysfs_data[i]);
-		if (result)
-			return result;
-	}
-	return 0;
-}
-
-/*
- * toi_unregister_module
- *
- * Remove a module.
- */
-void toi_unregister_module(struct toi_module_ops *module)
-{
-	int i;
-
-	if (module->dir_kobj)
-		for (i = 0; i < module->num_sysfs_entries; i++)
-			toi_unregister_sysfs_file(module->dir_kobj,
-					&module->sysfs_data[i]);
-
-	if (!module->shared_directory && module->directory &&
-			strncmp(module->directory, "[ROOT]", 6))
-		remove_toi_sysdir(module->dir_kobj);
-
-	switch (module->type) {
-	case FILTER_MODULE:
-		list_del(&module->type_list);
-		toi_num_filters--;
-		break;
-	case WRITER_MODULE:
-		list_del(&module->type_list);
-		toiNumAllocators--;
-		if (toiActiveAllocator == module) {
-			toiActiveAllocator = NULL;
-			clear_toi_state(TOI_CAN_RESUME);
-			clear_toi_state(TOI_CAN_HIBERNATE);
-		}
-		break;
-	case MISC_MODULE:
-	case MISC_HIDDEN_MODULE:
-	case BIO_ALLOCATOR_MODULE:
-		break;
-	default:
-		printk(KERN_ERR "Module '%s' has an invalid type."
-			" It has been ignored.\n", module->name);
-		return;
-	}
-	list_del(&module->module_list);
-	toi_num_modules--;
-}
-
-/*
- * toi_move_module_tail
- *
- * Rearrange modules when reloading the config.
- */
-void toi_move_module_tail(struct toi_module_ops *module)
-{
-	switch (module->type) {
-	case FILTER_MODULE:
-		if (toi_num_filters > 1)
-			list_move_tail(&module->type_list, &toi_filters);
-		break;
-	case WRITER_MODULE:
-		if (toiNumAllocators > 1)
-			list_move_tail(&module->type_list, &toiAllocators);
-		break;
-	case MISC_MODULE:
-	case MISC_HIDDEN_MODULE:
-	case BIO_ALLOCATOR_MODULE:
-		break;
-	default:
-		printk(KERN_ERR "Module '%s' has an invalid type."
-			" It has been ignored.\n", module->name);
-		return;
-	}
-	if ((toi_num_filters + toiNumAllocators) > 1)
-		list_move_tail(&module->module_list, &toi_modules);
-}
-
-/*
- * toi_initialise_modules
- *
- * Get ready to do some work!
- */
-int toi_initialise_modules(int starting_cycle, int early)
-{
-	struct toi_module_ops *this_module;
-	int result;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		this_module->header_requested = 0;
-		this_module->header_used = 0;
-		if (!this_module->enabled)
-			continue;
-		if (this_module->early != early)
-			continue;
-		if (this_module->initialise) {
-			result = this_module->initialise(starting_cycle);
-			if (result) {
-				toi_cleanup_modules(starting_cycle);
-				return result;
-			}
-			this_module->initialised = 1;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * toi_cleanup_modules
- *
- * Tell modules the work is done.
- */
-void toi_cleanup_modules(int finishing_cycle)
-{
-	struct toi_module_ops *this_module;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled || !this_module->initialised)
-			continue;
-		if (this_module->cleanup)
-			this_module->cleanup(finishing_cycle);
-		this_module->initialised = 0;
-	}
-}
-
-/*
- * toi_pre_atomic_restore_modules
- *
- * Get ready to do some work!
- */
-void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
-{
-	struct toi_module_ops *this_module;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (this_module->enabled && this_module->pre_atomic_restore)
-			this_module->pre_atomic_restore(bkd);
-	}
-}
-
-/*
- * toi_post_atomic_restore_modules
- *
- * Get ready to do some work!
- */
-void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
-{
-	struct toi_module_ops *this_module;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (this_module->enabled && this_module->post_atomic_restore)
-			this_module->post_atomic_restore(bkd);
-	}
-}
-
-/*
- * toi_get_next_filter
- *
- * Get the next filter in the pipeline.
- */
-struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought)
-{
-	struct toi_module_ops *last_filter = NULL, *this_filter = NULL;
-
-	list_for_each_entry(this_filter, &toi_filters, type_list) {
-		if (!this_filter->enabled)
-			continue;
-		if ((last_filter == filter_sought) || (!filter_sought))
-			return this_filter;
-		last_filter = this_filter;
-	}
-
-	return toiActiveAllocator;
-}
-
-/**
- * toi_show_modules: Printk what support is loaded.
- */
-void toi_print_modules(void)
-{
-	struct toi_module_ops *this_module;
-	int prev = 0;
-
-	printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for");
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (this_module->type == MISC_HIDDEN_MODULE)
-			continue;
-		printk("%s %s%s%s", prev ? "," : "",
-				this_module->enabled ? "" : "[",
-				this_module->name,
-				this_module->enabled ? "" : "]");
-		prev = 1;
-	}
-
-	printk(".\n");
-}
-
-/* toi_get_modules
- *
- * Take a reference to modules so they can't go away under us.
- */
-
-int toi_get_modules(void)
-{
-	struct toi_module_ops *this_module;
-
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		struct toi_module_ops *this_module2;
-
-		if (try_module_get(this_module->module))
-			continue;
-
-		/* Failed! Reverse gets and return error */
-		list_for_each_entry(this_module2, &toi_modules,
-				module_list) {
-			if (this_module == this_module2)
-				return -EINVAL;
-			module_put(this_module2->module);
-		}
-	}
-	return 0;
-}
-
-/* toi_put_modules
- *
- * Release our references to modules we used.
- */
-
-void toi_put_modules(void)
-{
-	struct toi_module_ops *this_module;
-
-	list_for_each_entry(this_module, &toi_modules, module_list)
-		module_put(this_module->module);
-}
diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h
deleted file mode 100644
index 34ffe2ee3..000000000
--- a/kernel/power/tuxonice_modules.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * kernel/power/tuxonice_modules.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations for modules. Modules are additions to
- * TuxOnIce that provide facilities such as image compression or
- * encryption, backends for storage of the image and user interfaces.
- *
- */
-
-#ifndef TOI_MODULES_H
-#define TOI_MODULES_H
-
-/* This is the maximum size we store in the image header for a module name */
-#define TOI_MAX_MODULE_NAME_LENGTH 30
-
-struct toi_boot_kernel_data;
-
-/* Per-module metadata */
-struct toi_module_header {
-	char name[TOI_MAX_MODULE_NAME_LENGTH];
-	int enabled;
-	int type;
-	int index;
-	int data_length;
-	unsigned long signature;
-};
-
-enum {
-	FILTER_MODULE,
-	WRITER_MODULE,
-	BIO_ALLOCATOR_MODULE,
-	MISC_MODULE,
-	MISC_HIDDEN_MODULE,
-};
-
-enum {
-	TOI_ASYNC,
-	TOI_SYNC
-};
-
-enum {
-	TOI_VIRT,
-	TOI_PAGE,
-};
-
-#define TOI_MAP(type, addr) \
- (type == TOI_PAGE ? kmap(addr) : addr)
-
-#define TOI_UNMAP(type, addr) \
- do { \
-   if (type == TOI_PAGE) \
-     kunmap(addr); \
- } while(0)
-
-struct toi_module_ops {
-	/* Functions common to all modules */
-	int type;
-	char *name;
-	char *directory;
-	char *shared_directory;
-	struct kobject *dir_kobj;
-	struct module *module;
-	int enabled, early, initialised;
-	struct list_head module_list;
-
-	/* List of filters or allocators */
-	struct list_head list, type_list;
-
-	/*
-	 * Requirements for memory and storage in
-	 * the image header..
-	 */
-	int (*memory_needed) (void);
-	int (*storage_needed) (void);
-
-	int header_requested, header_used;
-
-	int (*expected_compression) (void);
-
-	/*
-	 * Debug info
-	 */
-	int (*print_debug_info) (char *buffer, int size);
-	int (*save_config_info) (char *buffer);
-	void (*load_config_info) (char *buffer, int len);
-
-	/*
-	 * Initialise & cleanup - general routines called
-	 * at the start and end of a cycle.
-	 */
-	int (*initialise) (int starting_cycle);
-	void (*cleanup) (int finishing_cycle);
-
-	void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd);
-	void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd);
-
-	/*
-	 * Calls for allocating storage (allocators only).
-	 *
-	 * Header space is requested separately and cannot fail, but the
-	 * reservation is only applied when main storage is allocated.
-	 * The header space reservation is thus always set prior to
-	 * requesting the allocation of storage - and prior to querying
-	 * how much storage is available.
-	 */
-
-	unsigned long (*storage_available) (void);
-	void (*reserve_header_space) (unsigned long space_requested);
-	int (*register_storage) (void);
-	int (*allocate_storage) (unsigned long space_requested);
-	unsigned long (*storage_allocated) (void);
-	void (*free_unused_storage) (void);
-
-	/*
-	 * Routines used in image I/O.
-	 */
-	int (*rw_init) (int rw, int stream_number);
-	int (*rw_cleanup) (int rw);
-	int (*write_page) (unsigned long index, int buf_type, void *buf,
-			unsigned int buf_size);
-	int (*read_page) (unsigned long *index, int buf_type, void *buf,
-			unsigned int *buf_size);
-	int (*io_flusher) (int rw);
-
-	/* Reset module if image exists but reading aborted */
-	void (*noresume_reset) (void);
-
-	/* Read and write the metadata */
-	int (*write_header_init) (void);
-	int (*write_header_cleanup) (void);
-
-	int (*read_header_init) (void);
-	int (*read_header_cleanup) (void);
-
-	/* To be called after read_header_init */
-	int (*get_header_version) (void);
-
-	int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
-			char *buffer_start, int buffer_size);
-
-	int (*rw_header_chunk_noreadahead) (int rw,
-			struct toi_module_ops *owner, char *buffer_start,
-			int buffer_size);
-
-	/* Attempt to parse an image location */
-	int (*parse_sig_location) (char *buffer, int only_writer, int quiet);
-
-	/* Throttle I/O according to throughput */
-	void (*update_throughput_throttle) (int jif_index);
-
-	/* Flush outstanding I/O */
-	int (*finish_all_io) (void);
-
-	/* Determine whether image exists that we can restore */
-	int (*image_exists) (int quiet);
-
-	/* Mark the image as having tried to resume */
-	int (*mark_resume_attempted) (int);
-
-	/* Destroy image if one exists */
-	int (*remove_image) (void);
-
-	/* Sysfs Data */
-	struct toi_sysfs_data *sysfs_data;
-	int num_sysfs_entries;
-
-	/* Block I/O allocator */
-	struct toi_bio_allocator_ops *bio_allocator_ops;
-};
-
-extern int toi_num_modules, toiNumAllocators;
-
-extern struct toi_module_ops *toiActiveAllocator;
-extern struct list_head toi_filters, toiAllocators, toi_modules;
-
-extern void toi_prepare_console_modules(void);
-extern void toi_cleanup_console_modules(void);
-
-extern struct toi_module_ops *toi_find_module_given_name(char *name);
-extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *);
-
-extern int toi_register_module(struct toi_module_ops *module);
-extern void toi_move_module_tail(struct toi_module_ops *module);
-
-extern long toi_header_storage_for_modules(void);
-extern long toi_memory_for_modules(int print_parts);
-extern void print_toi_header_storage_for_modules(void);
-extern int toi_expected_compression_ratio(void);
-
-extern int toi_print_module_debug_info(char *buffer, int buffer_size);
-extern int toi_register_module(struct toi_module_ops *module);
-extern void toi_unregister_module(struct toi_module_ops *module);
-
-extern int toi_initialise_modules(int starting_cycle, int early);
-#define toi_initialise_modules_early(starting) \
-	toi_initialise_modules(starting, 1)
-#define toi_initialise_modules_late(starting) \
-	toi_initialise_modules(starting, 0)
-extern void toi_cleanup_modules(int finishing_cycle);
-
-extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
-extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
-
-extern void toi_print_modules(void);
-
-int toi_get_modules(void);
-void toi_put_modules(void);
-#endif
diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c
deleted file mode 100644
index 0db58af8b..000000000
--- a/kernel/power/tuxonice_netlink.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * kernel/power/tuxonice_netlink.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Functions for communicating with a userspace helper via netlink.
- */
-
-#include <linux/suspend.h>
-#include <linux/sched.h>
-#include <linux/kmod.h>
-#include "tuxonice_netlink.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-
-static struct user_helper_data *uhd_list;
-
-/*
- * Refill our pool of SKBs for use in emergencies (eg, when eating memory and
- * none can be allocated).
- */
-static void toi_fill_skb_pool(struct user_helper_data *uhd)
-{
-	while (uhd->pool_level < uhd->pool_limit) {
-		struct sk_buff *new_skb =
-			alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
-
-		if (!new_skb)
-			break;
-
-		new_skb->next = uhd->emerg_skbs;
-		uhd->emerg_skbs = new_skb;
-		uhd->pool_level++;
-	}
-}
-
-/*
- * Try to allocate a single skb. If we can't get one, try to use one from
- * our pool.
- */
-static struct sk_buff *toi_get_skb(struct user_helper_data *uhd)
-{
-	struct sk_buff *skb =
-		alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
-
-	if (skb)
-		return skb;
-
-	skb = uhd->emerg_skbs;
-	if (skb) {
-		uhd->pool_level--;
-		uhd->emerg_skbs = skb->next;
-		skb->next = NULL;
-	}
-
-	return skb;
-}
-
-void toi_send_netlink_message(struct user_helper_data *uhd,
-		int type, void *params, size_t len)
-{
-	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	void *dest;
-	struct task_struct *t;
-
-	if (uhd->pid == -1)
-		return;
-
-	if (uhd->debug)
-		printk(KERN_ERR "toi_send_netlink_message: Send "
-				"message type %d.\n", type);
-
-	skb = toi_get_skb(uhd);
-	if (!skb) {
-		printk(KERN_INFO "toi_netlink: Can't allocate skb!\n");
-		return;
-	}
-
-	nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0);
-	uhd->sock_seq++;
-
-	dest = NLMSG_DATA(nlh);
-	if (params && len > 0)
-		memcpy(dest, params, len);
-
-	netlink_unicast(uhd->nl, skb, uhd->pid, 0);
-
-	toi_read_lock_tasklist();
-	t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
-	if (!t) {
-		toi_read_unlock_tasklist();
-		if (uhd->pid > -1)
-			printk(KERN_INFO "Hmm. Can't find the userspace task"
-				" %d.\n", uhd->pid);
-		return;
-	}
-	wake_up_process(t);
-	toi_read_unlock_tasklist();
-
-	yield();
-}
-
-static void send_whether_debugging(struct user_helper_data *uhd)
-{
-	static u8 is_debugging = 1;
-
-	toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
-			&is_debugging, sizeof(u8));
-}
-
-/*
- * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we
- * are hibernating.
- */
-static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid)
-{
-	struct task_struct *t;
-
-	if (uhd->debug)
-		printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid);
-
-	toi_read_lock_tasklist();
-	t = find_task_by_pid_ns(pid, &init_pid_ns);
-	if (!t) {
-		toi_read_unlock_tasklist();
-		printk(KERN_INFO "Strange. Can't find the userspace task %d.\n",
-				pid);
-		return -EINVAL;
-	}
-
-	t->flags |= PF_NOFREEZE;
-
-	toi_read_unlock_tasklist();
-	uhd->pid = pid;
-
-	toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
-
-	return 0;
-}
-
-/*
- * Called when the userspace process has informed us that it's ready to roll.
- */
-static int nl_ready(struct user_helper_data *uhd, u32 version)
-{
-	if (version != uhd->interface_version) {
-		printk(KERN_INFO "%s userspace process using invalid interface"
-				" version (%d - kernel wants %d). Trying to "
-				"continue without it.\n",
-				uhd->name, version, uhd->interface_version);
-		if (uhd->not_ready)
-			uhd->not_ready();
-		return -EINVAL;
-	}
-
-	complete(&uhd->wait_for_process);
-
-	return 0;
-}
-
-void toi_netlink_close_complete(struct user_helper_data *uhd)
-{
-	if (uhd->nl) {
-		netlink_kernel_release(uhd->nl);
-		uhd->nl = NULL;
-	}
-
-	while (uhd->emerg_skbs) {
-		struct sk_buff *next = uhd->emerg_skbs->next;
-		kfree_skb(uhd->emerg_skbs);
-		uhd->emerg_skbs = next;
-	}
-
-	uhd->pid = -1;
-}
-
-static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd,
-		struct sk_buff *skb, struct nlmsghdr *nlh)
-{
-	int type = nlh->nlmsg_type;
-	int *data;
-	int err;
-
-	if (uhd->debug)
-		printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n",
-				type);
-
-	/* Let the more specific handler go first. It returns
-	 * 1 for valid messages that it doesn't know. */
-	err = uhd->rcv_msg(skb, nlh);
-	if (err != 1)
-		return err;
-
-	/* Only allow one task to receive NOFREEZE privileges */
-	if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) {
-		printk(KERN_INFO "Received extra nofreeze me requests.\n");
-		return -EBUSY;
-	}
-
-	data = NLMSG_DATA(nlh);
-
-	switch (type) {
-	case NETLINK_MSG_NOFREEZE_ME:
-		return nl_set_nofreeze(uhd, nlh->nlmsg_pid);
-	case NETLINK_MSG_GET_DEBUGGING:
-		send_whether_debugging(uhd);
-		return 0;
-	case NETLINK_MSG_READY:
-		if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) {
-			printk(KERN_INFO "Invalid ready mesage.\n");
-			if (uhd->not_ready)
-				uhd->not_ready();
-			return -EINVAL;
-		}
-		return nl_ready(uhd, (u32) *data);
-	case NETLINK_MSG_CLEANUP:
-		toi_netlink_close_complete(uhd);
-		return 0;
-	}
-
-	return -EINVAL;
-}
-
-static void toi_user_rcv_skb(struct sk_buff *skb)
-{
-	int err;
-	struct nlmsghdr *nlh;
-	struct user_helper_data *uhd = uhd_list;
-
-	while (uhd && uhd->netlink_id != skb->sk->sk_protocol)
-		uhd = uhd->next;
-
-	if (!uhd)
-		return;
-
-	while (skb->len >= NLMSG_SPACE(0)) {
-		u32 rlen;
-
-		nlh = (struct nlmsghdr *) skb->data;
-		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
-			return;
-
-		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
-		if (rlen > skb->len)
-			rlen = skb->len;
-
-		err = toi_nl_gen_rcv_msg(uhd, skb, nlh);
-		if (err)
-			netlink_ack(skb, nlh, err);
-		else if (nlh->nlmsg_flags & NLM_F_ACK)
-			netlink_ack(skb, nlh, 0);
-		skb_pull(skb, rlen);
-	}
-}
-
-static int netlink_prepare(struct user_helper_data *uhd)
-{
-	struct netlink_kernel_cfg cfg = {
-		.groups = 0,
-		.input = toi_user_rcv_skb,
-	};
-
-	uhd->next = uhd_list;
-	uhd_list = uhd;
-
-	uhd->sock_seq = 0x42c0ffee;
-	uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg);
-	if (!uhd->nl) {
-		printk(KERN_INFO "Failed to allocate netlink socket for %s.\n",
-				uhd->name);
-		return -ENOMEM;
-	}
-
-	toi_fill_skb_pool(uhd);
-
-	return 0;
-}
-
-void toi_netlink_close(struct user_helper_data *uhd)
-{
-	struct task_struct *t;
-
-	toi_read_lock_tasklist();
-	t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
-	if (t)
-		t->flags &= ~PF_NOFREEZE;
-	toi_read_unlock_tasklist();
-
-	toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0);
-}
-int toi_netlink_setup(struct user_helper_data *uhd)
-{
-	/* In case userui didn't cleanup properly on us */
-	toi_netlink_close_complete(uhd);
-
-	if (netlink_prepare(uhd) < 0) {
-		printk(KERN_INFO "Netlink prepare failed.\n");
-		return 1;
-	}
-
-	if (toi_launch_userspace_program(uhd->program, uhd->netlink_id,
-				UMH_WAIT_EXEC, uhd->debug) < 0) {
-		printk(KERN_INFO "Launch userspace program failed.\n");
-		toi_netlink_close_complete(uhd);
-		return 1;
-	}
-
-	/* Wait 2 seconds for the userspace process to make contact */
-	wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
-
-	if (uhd->pid == -1) {
-		printk(KERN_INFO "%s: Failed to contact userspace process.\n",
-				uhd->name);
-		toi_netlink_close_complete(uhd);
-		return 1;
-	}
-
-	return 0;
-}
diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h
deleted file mode 100644
index 89e154599..000000000
--- a/kernel/power/tuxonice_netlink.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * kernel/power/tuxonice_netlink.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Declarations for functions for communicating with a userspace helper
- * via netlink.
- */
-
-#include <linux/netlink.h>
-#include <net/sock.h>
-
-#define NETLINK_MSG_BASE 0x10
-
-#define NETLINK_MSG_READY 0x10
-#define	NETLINK_MSG_NOFREEZE_ME 0x16
-#define NETLINK_MSG_GET_DEBUGGING 0x19
-#define NETLINK_MSG_CLEANUP 0x24
-#define NETLINK_MSG_NOFREEZE_ACK 0x27
-#define NETLINK_MSG_IS_DEBUGGING 0x28
-
-struct user_helper_data {
-	int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh);
-	void (*not_ready) (void);
-	struct sock *nl;
-	u32 sock_seq;
-	pid_t pid;
-	char *comm;
-	char program[256];
-	int pool_level;
-	int pool_limit;
-	struct sk_buff *emerg_skbs;
-	int skb_size;
-	int netlink_id;
-	char *name;
-	struct user_helper_data *next;
-	struct completion wait_for_process;
-	u32 interface_version;
-	int must_init;
-	int debug;
-};
-
-#ifdef CONFIG_NET
-int toi_netlink_setup(struct user_helper_data *uhd);
-void toi_netlink_close(struct user_helper_data *uhd);
-void toi_send_netlink_message(struct user_helper_data *uhd,
-		int type, void *params, size_t len);
-void toi_netlink_close_complete(struct user_helper_data *uhd);
-#else
-static inline int toi_netlink_setup(struct user_helper_data *uhd)
-{
-	return 0;
-}
-
-static inline void toi_netlink_close(struct user_helper_data *uhd) { };
-static inline void toi_send_netlink_message(struct user_helper_data *uhd,
-		int type, void *params, size_t len) { };
-static inline void toi_netlink_close_complete(struct user_helper_data *uhd)
-	{ };
-#endif
diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c
deleted file mode 100644
index 9ea185af1..000000000
--- a/kernel/power/tuxonice_pagedir.c
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * kernel/power/tuxonice_pagedir.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for handling pagesets.
- * Note that pbes aren't actually stored as such. They're stored as
- * bitmaps and extents.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/bootmem.h>
-#include <linux/hardirq.h>
-#include <linux/sched.h>
-#include <linux/cpu.h>
-#include <asm/tlbflush.h>
-
-#include "tuxonice_pageflags.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_alloc.h"
-
-static int ptoi_pfn;
-static struct pbe *this_low_pbe;
-static struct pbe **last_low_pbe_ptr;
-
-void toi_reset_alt_image_pageset2_pfn(void)
-{
-  memory_bm_position_reset(pageset2_map);
-}
-
-static struct page *first_conflicting_page;
-
-/*
- * free_conflicting_pages
- */
-
-static void free_conflicting_pages(void)
-{
-	while (first_conflicting_page) {
-		struct page *next =
-			*((struct page **) kmap(first_conflicting_page));
-		kunmap(first_conflicting_page);
-		toi__free_page(29, first_conflicting_page);
-		first_conflicting_page = next;
-	}
-}
-
-/* __toi_get_nonconflicting_page
- *
- * Description: Gets order zero pages that won't be overwritten
- *		while copying the original pages.
- */
-
-struct page *___toi_get_nonconflicting_page(int can_be_highmem)
-{
-	struct page *page;
-	gfp_t flags = TOI_ATOMIC_GFP;
-	if (can_be_highmem)
-		flags |= __GFP_HIGHMEM;
-
-
-	if (test_toi_state(TOI_LOADING_ALT_IMAGE) &&
-			pageset2_map && ptoi_pfn) {
-		do {
-			ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0);
-			if (ptoi_pfn != BM_END_OF_MAP) {
-				page = pfn_to_page(ptoi_pfn);
-				if (!PagePageset1(page) &&
-				    (can_be_highmem || !PageHighMem(page)))
-					return page;
-			}
-		} while (ptoi_pfn);
-	}
-
-	do {
-		page = toi_alloc_page(29, flags | __GFP_ZERO);
-		if (!page) {
-			printk(KERN_INFO "Failed to get nonconflicting "
-					"page.\n");
-			return NULL;
-		}
-		if (PagePageset1(page)) {
-			struct page **next = (struct page **) kmap(page);
-			*next = first_conflicting_page;
-			first_conflicting_page = page;
-			kunmap(page);
-		}
-	} while (PagePageset1(page));
-
-	return page;
-}
-
-unsigned long __toi_get_nonconflicting_page(void)
-{
-	struct page *page = ___toi_get_nonconflicting_page(0);
-	return page ? (unsigned long) page_address(page) : 0;
-}
-
-static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe,
-		int highmem)
-{
-	if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1))
-		     + 2 * sizeof(struct pbe)) > PAGE_SIZE) {
-		struct page *new_page =
-			___toi_get_nonconflicting_page(highmem);
-		if (!new_page)
-			return ERR_PTR(-ENOMEM);
-		this_pbe = (struct pbe *) kmap(new_page);
-		memset(this_pbe, 0, PAGE_SIZE);
-		*page_ptr = new_page;
-	} else
-		this_pbe++;
-
-	return this_pbe;
-}
-
-/**
- * get_pageset1_load_addresses - generate pbes for conflicting pages
- *
- * We check here that pagedir & pages it points to won't collide
- * with pages where we're going to restore from the loaded pages
- * later.
- *
- * Returns:
- *	Zero on success, one if couldn't find enough pages (shouldn't
- *	happen).
- **/
-int toi_get_pageset1_load_addresses(void)
-{
-	int pfn, highallocd = 0, lowallocd = 0;
-	int low_needed = pagedir1.size - get_highmem_size(pagedir1);
-	int high_needed = get_highmem_size(pagedir1);
-	int low_pages_for_highmem = 0;
-	gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM;
-	struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL,
-		    *low_pbe_page, *last_low_pbe_page = NULL;
-	struct pbe **last_high_pbe_ptr = &restore_highmem_pblist,
-		   *this_high_pbe = NULL;
-	unsigned long orig_low_pfn, orig_high_pfn;
-	int high_pbes_done = 0, low_pbes_done = 0;
-	int low_direct = 0, high_direct = 0, result = 0, i;
-	int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0;
-
-        toi_trace_index++;
-
-	memory_bm_position_reset(pageset1_map);
-	memory_bm_position_reset(pageset1_copy_map);
-
-	last_low_pbe_ptr = &restore_pblist;
-
-	/* First, allocate pages for the start of our pbe lists. */
-	if (high_needed) {
-		high_pbe_page = ___toi_get_nonconflicting_page(1);
-		if (!high_pbe_page) {
-			result = -ENOMEM;
-			goto out;
-		}
-		this_high_pbe = (struct pbe *) kmap(high_pbe_page);
-		memset(this_high_pbe, 0, PAGE_SIZE);
-	}
-
-	low_pbe_page = ___toi_get_nonconflicting_page(0);
-	if (!low_pbe_page) {
-		result = -ENOMEM;
-		goto out;
-	}
-	this_low_pbe = (struct pbe *) page_address(low_pbe_page);
-
-	/*
-	 * Next, allocate the number of pages we need.
-	 */
-
-	i = low_needed + high_needed;
-
-	do {
-		int is_high;
-
-		if (i == low_needed)
-			flags &= ~__GFP_HIGHMEM;
-
-		page = toi_alloc_page(30, flags);
-		BUG_ON(!page);
-
-		SetPagePageset1Copy(page);
-		is_high = PageHighMem(page);
-
-		if (PagePageset1(page)) {
-			if (is_high)
-				high_direct++;
-			else
-				low_direct++;
-		} else {
-			if (is_high)
-				highallocd++;
-			else
-				lowallocd++;
-		}
-	} while (--i);
-
-	high_needed -= high_direct;
-	low_needed -= low_direct;
-
-	/*
-	 * Do we need to use some lowmem pages for the copies of highmem
-	 * pages?
-	 */
-	if (high_needed > highallocd) {
-		low_pages_for_highmem = high_needed - highallocd;
-		high_needed -= low_pages_for_highmem;
-		low_needed += low_pages_for_highmem;
-	}
-
-	/*
-	 * Now generate our pbes (which will be used for the atomic restore),
-	 * and free unneeded pages.
-	 */
-	memory_bm_position_reset(pageset1_copy_map);
-	for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP;
-			pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) {
-		int is_high;
-		page = pfn_to_page(pfn);
-		is_high = PageHighMem(page);
-
-		if (PagePageset1(page))
-			continue;
-
-		/* Nope. We're going to use this page. Add a pbe. */
-		if (is_high || low_pages_for_highmem) {
-			struct page *orig_page;
-			high_pbes_done++;
-			if (!is_high)
-				low_pages_for_highmem--;
-			do {
-				orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0);
-				BUG_ON(orig_high_pfn == BM_END_OF_MAP);
-				orig_page = pfn_to_page(orig_high_pfn);
-			} while (!PageHighMem(orig_page) ||
-					PagePageset1Copy(orig_page));
-
-			this_high_pbe->orig_address = (void *) orig_high_pfn;
-			this_high_pbe->address = page;
-			this_high_pbe->next = NULL;
-			toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p",
-					high_page, high_offset, page, orig_high_pfn, orig_page);
-			if (last_high_pbe_page != high_pbe_page) {
-				*last_high_pbe_ptr =
-					(struct pbe *) high_pbe_page;
-				if (last_high_pbe_page) {
-					kunmap(last_high_pbe_page);
-					high_page++;
-					high_offset = 0;
-				} else
-					high_offset++;
-				last_high_pbe_page = high_pbe_page;
-			} else {
-				*last_high_pbe_ptr = this_high_pbe;
-				high_offset++;
-			}
-			last_high_pbe_ptr = &this_high_pbe->next;
-			this_high_pbe = get_next_pbe(&high_pbe_page,
-					this_high_pbe, 1);
-			if (IS_ERR(this_high_pbe)) {
-				printk(KERN_INFO
-						"This high pbe is an error.\n");
-				return -ENOMEM;
-			}
-		} else {
-			struct page *orig_page;
-			low_pbes_done++;
-			do {
-				orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0);
-				BUG_ON(orig_low_pfn == BM_END_OF_MAP);
-				orig_page = pfn_to_page(orig_low_pfn);
-			} while (PageHighMem(orig_page) ||
-					PagePageset1Copy(orig_page));
-
-			this_low_pbe->orig_address = page_address(orig_page);
-			this_low_pbe->address = page_address(page);
-			this_low_pbe->next = NULL;
-			toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p",
-					low_page, low_offset, this_low_pbe->orig_address,
-					orig_low_pfn, this_low_pbe->address);
-                        TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address);
-			*last_low_pbe_ptr = this_low_pbe;
-			last_low_pbe_ptr = &this_low_pbe->next;
-			this_low_pbe = get_next_pbe(&low_pbe_page,
-					this_low_pbe, 0);
-			if (low_pbe_page != last_low_pbe_page) {
-				if (last_low_pbe_page) {
-					low_page++;
-					low_offset = 0;
-				} else {
-                                    low_offset++;
-                                }
-				last_low_pbe_page = low_pbe_page;
-			} else
-				low_offset++;
-			if (IS_ERR(this_low_pbe)) {
-				printk(KERN_INFO "this_low_pbe is an error.\n");
-				return -ENOMEM;
-			}
-		}
-	}
-
-	if (high_pbe_page)
-		kunmap(high_pbe_page);
-
-	if (last_high_pbe_page != high_pbe_page) {
-		if (last_high_pbe_page)
-			kunmap(last_high_pbe_page);
-		toi__free_page(29, high_pbe_page);
-	}
-
-	free_conflicting_pages();
-
-out:
-	return result;
-}
-
-int add_boot_kernel_data_pbe(void)
-{
-	this_low_pbe->address = (char *) __toi_get_nonconflicting_page();
-	if (!this_low_pbe->address) {
-		printk(KERN_INFO "Failed to get bkd atomic restore buffer.");
-		return -ENOMEM;
-	}
-
-	toi_bkd.size = sizeof(toi_bkd);
-	memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd));
-
-	*last_low_pbe_ptr = this_low_pbe;
-	this_low_pbe->orig_address = (char *) boot_kernel_data_buffer;
-	this_low_pbe->next = NULL;
-	return 0;
-}
diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h
deleted file mode 100644
index 80d1a3d8c..000000000
--- a/kernel/power/tuxonice_pagedir.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * kernel/power/tuxonice_pagedir.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Declarations for routines for handling pagesets.
- */
-
-#ifndef KERNEL_POWER_PAGEDIR_H
-#define KERNEL_POWER_PAGEDIR_H
-
-/* Pagedir
- *
- * Contains the metadata for a set of pages saved in the image.
- */
-
-struct pagedir {
-	int id;
-	unsigned long size;
-#ifdef CONFIG_HIGHMEM
-	unsigned long size_high;
-#endif
-};
-
-#ifdef CONFIG_HIGHMEM
-#define get_highmem_size(pagedir) (pagedir.size_high)
-#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0)
-#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0)
-#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high)
-#else
-#define get_highmem_size(pagedir) (0)
-#define set_highmem_size(pagedir, sz) do { } while (0)
-#define inc_highmem_size(pagedir) do { } while (0)
-#define get_lowmem_size(pagedir) (pagedir.size)
-#endif
-
-extern struct pagedir pagedir1, pagedir2;
-
-extern void toi_copy_pageset1(void);
-
-extern int toi_get_pageset1_load_addresses(void);
-
-extern unsigned long __toi_get_nonconflicting_page(void);
-struct page *___toi_get_nonconflicting_page(int can_be_highmem);
-
-extern void toi_reset_alt_image_pageset2_pfn(void);
-extern int add_boot_kernel_data_pbe(void);
-#endif
diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c
deleted file mode 100644
index 307d09f33..000000000
--- a/kernel/power/tuxonice_pageflags.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * kernel/power/tuxonice_pageflags.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for serialising and relocating pageflags in which we
- * store our image metadata.
- */
-
-#include "tuxonice_pageflags.h"
-#include "power.h"
-
-int toi_pageflags_space_needed(void)
-{
-	return memory_bm_space_needed(pageset1_map);
-}
diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h
deleted file mode 100644
index 30ee577c3..000000000
--- a/kernel/power/tuxonice_pageflags.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * kernel/power/tuxonice_pageflags.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H
-#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H
-
-struct  memory_bitmap;
-void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
-void memory_bm_clear(struct memory_bitmap *bm);
-
-int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
-unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index);
-void memory_bm_position_reset(struct memory_bitmap *bm);
-void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
-int toi_alloc_bitmap(struct memory_bitmap **bm);
-void toi_free_bitmap(struct memory_bitmap **bm);
-void memory_bm_clear(struct memory_bitmap *bm);
-void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
-
-struct toi_module_ops;
-int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
-	(int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
-int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
-	(int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
-int memory_bm_space_needed(struct memory_bitmap *bm);
-
-extern struct memory_bitmap *pageset1_map;
-extern struct memory_bitmap *pageset1_copy_map;
-extern struct memory_bitmap *pageset2_map;
-extern struct memory_bitmap *page_resave_map;
-extern struct memory_bitmap *io_map;
-extern struct memory_bitmap *nosave_map;
-extern struct memory_bitmap *free_map;
-extern struct memory_bitmap *compare_map;
-
-#define PagePageset1(page) \
-	(pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset1(page) \
-	(memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset1(page) \
-	(memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PagePageset1Copy(page) \
-	(memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset1Copy(page) \
-	(memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset1Copy(page) \
-	(memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PagePageset2(page) \
-	(memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset2(page) \
-	(memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset2(page) \
-	(memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageWasRW(page) \
-	(memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPageWasRW(page) \
-	(memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageWasRW(page) \
-	(memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageResave(page) (page_resave_map ? \
-	memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageResave(page) \
-	(memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageResave(page) \
-	(memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageNosave(page) (nosave_map ? \
-	memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageNosave(page) \
-	(mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageNosave(page) \
-	(memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageNosaveFree(page) (free_map ? \
-		memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageNosaveFree(page) \
-	(memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageNosaveFree(page) \
-	(memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageCompareChanged(page) (compare_map ? \
-		memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageCompareChanged(page) \
-	(memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageCompareChanged(page) \
-	(memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
-
-extern void save_pageflags(struct memory_bitmap *pagemap);
-extern int load_pageflags(struct memory_bitmap *pagemap);
-extern int toi_pageflags_space_needed(void);
-#endif
diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c
deleted file mode 100644
index f8e969625..000000000
--- a/kernel/power/tuxonice_power_off.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * kernel/power/tuxonice_power_off.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Support for powering down.
- */
-
-#include <linux/device.h>
-#include <linux/suspend.h>
-#include <linux/mm.h>
-#include <linux/pm.h>
-#include <linux/reboot.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/fs.h>
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-
-unsigned long toi_poweroff_method; /* 0 - Kernel power off */
-
-static int wake_delay;
-static char lid_state_file[256], wake_alarm_dir[256];
-static struct file *lid_file, *alarm_file, *epoch_file;
-static int post_wake_state = -1;
-
-static int did_suspend_to_both;
-
-/*
- * __toi_power_down
- * Functionality   : Powers down or reboots the computer once the image
- *                   has been written to disk.
- * Key Assumptions : Able to reboot/power down via code called or that
- *                   the warning emitted if the calls fail will be visible
- *                   to the user (ie printk resumes devices).
- */
-
-static void __toi_power_down(int method)
-{
-	int error;
-
-	toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." :
-			"Powering down.");
-
-	if (test_result_state(TOI_ABORTED))
-		goto out;
-
-	if (test_action_state(TOI_REBOOT))
-		kernel_restart(NULL);
-
-	switch (method) {
-	case 0:
-		break;
-	case 3:
-		/*
-		 * Re-read the overwritten part of pageset2 to make post-resume
-		 * faster.
-		 */
-		if (read_pageset2(1))
-			panic("Attempt to reload pagedir 2 failed. "
-					"Try rebooting.");
-
-		pm_prepare_console();
-
-		error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
-		if (!error) {
-			pm_restore_gfp_mask();
-			error = suspend_devices_and_enter(PM_SUSPEND_MEM);
-			pm_restrict_gfp_mask();
-			if (!error)
-				did_suspend_to_both = 1;
-		}
-		pm_notifier_call_chain(PM_POST_SUSPEND);
-		pm_restore_console();
-
-		/* Success - we're now post-resume-from-ram */
-		if (did_suspend_to_both)
-			return;
-
-		/* Failed to suspend to ram - do normal power off */
-		break;
-	case 4:
-		/*
-		 * If succeeds, doesn't return. If fails, do a simple
-		 * powerdown.
-		 */
-		hibernation_platform_enter();
-		break;
-	case 5:
-		/* Historic entry only now */
-		break;
-	}
-
-	if (method && method != 5)
-		toi_cond_pause(1,
-			"Falling back to alternate power off method.");
-
-	if (test_result_state(TOI_ABORTED))
-		goto out;
-
-        if (pm_power_off)
-            kernel_power_off();
-	kernel_halt();
-	toi_cond_pause(1, "Powerdown failed.");
-	while (1)
-		cpu_relax();
-
-out:
-	if (read_pageset2(1))
-		panic("Attempt to reload pagedir 2 failed. Try rebooting.");
-	return;
-}
-
-#define CLOSE_FILE(file) \
-	if (file) { \
-		filp_close(file, NULL); file = NULL; \
-	}
-
-static void powerdown_cleanup(int toi_or_resume)
-{
-	if (!toi_or_resume)
-		return;
-
-	CLOSE_FILE(lid_file);
-	CLOSE_FILE(alarm_file);
-	CLOSE_FILE(epoch_file);
-}
-
-static void open_file(char *format, char *arg, struct file **var, int mode,
-		char *desc)
-{
-	char buf[256];
-
-	if (strlen(arg)) {
-		sprintf(buf, format, arg);
-		*var = filp_open(buf, mode, 0);
-		if (IS_ERR(*var) || !*var) {
-			printk(KERN_INFO "Failed to open %s file '%s' (%p).\n",
-				desc, buf, *var);
-			*var = NULL;
-		}
-	}
-}
-
-static int powerdown_init(int toi_or_resume)
-{
-	if (!toi_or_resume)
-		return 0;
-
-	did_suspend_to_both = 0;
-
-	open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file,
-			O_RDONLY, "lid");
-
-	if (strlen(wake_alarm_dir)) {
-		open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir,
-				&alarm_file, O_WRONLY, "alarm");
-
-		open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir,
-				&epoch_file, O_RDONLY, "epoch");
-	}
-
-	return 0;
-}
-
-static int lid_closed(void)
-{
-	char array[25];
-	ssize_t size;
-	loff_t pos = 0;
-
-	if (!lid_file)
-		return 0;
-
-	size = vfs_read(lid_file, (char __user *) array, 25, &pos);
-	if ((int) size < 1) {
-		printk(KERN_INFO "Failed to read lid state file (%d).\n",
-			(int) size);
-		return 0;
-	}
-
-	if (!strcmp(array, "state:      closed\n"))
-		return 1;
-
-	return 0;
-}
-
-static void write_alarm_file(int value)
-{
-	ssize_t size;
-	char buf[40];
-	loff_t pos = 0;
-
-	if (!alarm_file)
-		return;
-
-	sprintf(buf, "%d\n", value);
-
-	size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos);
-
-	if (size < 0)
-		printk(KERN_INFO "Error %d writing alarm value %s.\n",
-				(int) size, buf);
-}
-
-/**
- * toi_check_resleep: See whether to powerdown again after waking.
- *
- * After waking, check whether we should powerdown again in a (usually
- * different) way. We only do this if the lid switch is still closed.
- */
-void toi_check_resleep(void)
-{
-	/* We only return if we suspended to ram and woke. */
-	if (lid_closed() && post_wake_state >= 0)
-		__toi_power_down(post_wake_state);
-}
-
-void toi_power_down(void)
-{
-	if (alarm_file && wake_delay) {
-		char array[25];
-		loff_t pos = 0;
-		size_t size = vfs_read(epoch_file, (char __user *) array, 25,
-				&pos);
-
-		if (((int) size) < 1)
-			printk(KERN_INFO "Failed to read epoch file (%d).\n",
-					(int) size);
-		else {
-			unsigned long since_epoch;
-			if (!kstrtoul(array, 0, &since_epoch)) {
-				/* Clear any wakeup time. */
-				write_alarm_file(0);
-
-				/* Set new wakeup time. */
-				write_alarm_file(since_epoch + wake_delay);
-			}
-		}
-	}
-
-	__toi_power_down(toi_poweroff_method);
-
-	toi_check_resleep();
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_ACPI)
-	SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL),
-	SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL),
-	SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL),
-	SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0,
-			NULL),
-	SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0),
-	SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both,
-		0, 0, 0, NULL)
-#endif
-};
-
-static struct toi_module_ops powerdown_ops = {
-	.type				= MISC_HIDDEN_MODULE,
-	.name				= "poweroff",
-	.initialise			= powerdown_init,
-	.cleanup			= powerdown_cleanup,
-	.directory			= "[ROOT]",
-	.module				= THIS_MODULE,
-	.sysfs_data			= sysfs_params,
-	.num_sysfs_entries		= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-int toi_poweroff_init(void)
-{
-	return toi_register_module(&powerdown_ops);
-}
-
-void toi_poweroff_exit(void)
-{
-	toi_unregister_module(&powerdown_ops);
-}
diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h
deleted file mode 100644
index 6e1d8bb39..000000000
--- a/kernel/power/tuxonice_power_off.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * kernel/power/tuxonice_power_off.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Support for the powering down.
- */
-
-int toi_pm_state_finish(void);
-void toi_power_down(void);
-extern unsigned long toi_poweroff_method;
-int toi_poweroff_init(void);
-void toi_poweroff_exit(void);
-void toi_check_resleep(void);
-
-extern int platform_begin(int platform_mode);
-extern int platform_pre_snapshot(int platform_mode);
-extern void platform_leave(int platform_mode);
-extern void platform_end(int platform_mode);
-extern void platform_finish(int platform_mode);
-extern int platform_pre_restore(int platform_mode);
-extern void platform_restore_cleanup(int platform_mode);
diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c
deleted file mode 100644
index e0593252f..000000000
--- a/kernel/power/tuxonice_prepare_image.c
+++ /dev/null
@@ -1,1080 +0,0 @@
-/*
- * kernel/power/tuxonice_prepare_image.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * We need to eat memory until we can:
- * 1. Perform the save without changing anything (RAM_NEEDED < #pages)
- * 2. Fit it all in available space (toiActiveAllocator->available_space() >=
- *    main_storage_needed())
- * 3. Reload the pagedir and pageset1 to places that don't collide with their
- *    final destinations, not knowing to what extent the resumed kernel will
- *    overlap with the one loaded at boot time. I think the resumed kernel
- *    should overlap completely, but I don't want to rely on this as it is
- *    an unproven assumption. We therefore assume there will be no overlap at
- *    all (worse case).
- * 4. Meet the user's requested limit (if any) on the size of the image.
- *    The limit is in MB, so pages/256 (assuming 4K pages).
- *
- */
-
-#include <linux/highmem.h>
-#include <linux/freezer.h>
-#include <linux/hardirq.h>
-#include <linux/mmzone.h>
-#include <linux/console.h>
-#include <linux/tuxonice.h>
-
-#include "tuxonice_pageflags.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_builtin.h"
-
-static unsigned long num_nosave, main_storage_allocated, storage_limit,
-	    header_storage_needed;
-unsigned long extra_pd1_pages_allowance =
-	CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE;
-long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT;
-static int no_ps2_needed;
-
-struct attention_list {
-	struct task_struct *task;
-	struct attention_list *next;
-};
-
-static struct attention_list *attention_list;
-
-#define PAGESET1 0
-#define PAGESET2 1
-
-void free_attention_list(void)
-{
-	struct attention_list *last = NULL;
-
-	while (attention_list) {
-		last = attention_list;
-		attention_list = attention_list->next;
-		toi_kfree(6, last, sizeof(*last));
-	}
-}
-
-static int build_attention_list(void)
-{
-	int i, task_count = 0;
-	struct task_struct *p;
-	struct attention_list *next;
-
-	/*
-	 * Count all userspace process (with task->mm) marked PF_NOFREEZE.
-	 */
-	toi_read_lock_tasklist();
-	for_each_process(p)
-		if ((p->flags & PF_NOFREEZE) || p == current)
-			task_count++;
-	toi_read_unlock_tasklist();
-
-	/*
-	 * Allocate attention list structs.
-	 */
-	for (i = 0; i < task_count; i++) {
-		struct attention_list *this =
-			toi_kzalloc(6, sizeof(struct attention_list),
-					TOI_WAIT_GFP);
-		if (!this) {
-			printk(KERN_INFO "Failed to allocate slab for "
-					"attention list.\n");
-			free_attention_list();
-			return 1;
-		}
-		this->next = NULL;
-		if (attention_list)
-			this->next = attention_list;
-		attention_list = this;
-	}
-
-	next = attention_list;
-	toi_read_lock_tasklist();
-	for_each_process(p)
-		if ((p->flags & PF_NOFREEZE) || p == current) {
-			next->task = p;
-			next = next->next;
-		}
-	toi_read_unlock_tasklist();
-	return 0;
-}
-
-static void pageset2_full(void)
-{
-	struct zone *zone;
-	struct page *page;
-	unsigned long flags;
-	int i;
-
-        toi_trace_index++;
-
-	for_each_populated_zone(zone) {
-		spin_lock_irqsave(&zone->lru_lock, flags);
-		for_each_lru(i) {
-			if (!zone_page_state(zone, NR_LRU_BASE + i))
-				continue;
-
-			list_for_each_entry(page, &zone->lruvec.lists[i], lru) {
-				struct address_space *mapping;
-
-				mapping = page_mapping(page);
-				if (!mapping || !mapping->host ||
-				    !(mapping->host->i_flags & S_ATOMIC_COPY)) {
-                                    if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
-                                        TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified.");
-                                    } else {
-                                        TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full.");
-                                        SetPagePageset2(page);
-                                    }
-                                }
-			}
-		}
-		spin_unlock_irqrestore(&zone->lru_lock, flags);
-	}
-}
-
-/*
- * toi_mark_task_as_pageset
- * Functionality   : Marks all the saveable pages belonging to a given process
- * 		     as belonging to a particular pageset.
- */
-
-static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2)
-{
-	struct vm_area_struct *vma;
-	struct mm_struct *mm;
-
-	mm = t->active_mm;
-
-	if (!mm || !mm->mmap)
-		return;
-
-        toi_trace_index++;
-
-	if (!irqs_disabled())
-		down_read(&mm->mmap_sem);
-
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		unsigned long posn;
-
-		if (!vma->vm_start ||
-		    vma->vm_flags & VM_PFNMAP)
-			continue;
-
-		for (posn = vma->vm_start; posn < vma->vm_end;
-				posn += PAGE_SIZE) {
-			struct page *page = follow_page(vma, posn, 0);
-			struct address_space *mapping;
-
-			if (!page || !pfn_valid(page_to_pfn(page)))
-				continue;
-
-			mapping = page_mapping(page);
-			if (mapping && mapping->host &&
-			    mapping->host->i_flags & S_ATOMIC_COPY && pageset2)
-				continue;
-
-                        if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
-                                TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2);
-                                continue;
-                        }
-
-			if (pageset2) {
-                                TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1");
-				SetPagePageset2(page);
-                        } else {
-                                TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2");
-				ClearPagePageset2(page);
-				SetPagePageset1(page);
-			}
-		}
-	}
-
-	if (!irqs_disabled())
-		up_read(&mm->mmap_sem);
-}
-
-static void mark_tasks(int pageset)
-{
-	struct task_struct *p;
-
-	toi_read_lock_tasklist();
-	for_each_process(p) {
-		if (!p->mm)
-			continue;
-
-		if (p->flags & PF_KTHREAD)
-			continue;
-
-		toi_mark_task_as_pageset(p, pageset);
-	}
-	toi_read_unlock_tasklist();
-
-}
-
-/* mark_pages_for_pageset2
- *
- * Description:	Mark unshared pages in processes not needed for hibernate as
- * 		being able to be written out in a separate pagedir.
- * 		HighMem pages are simply marked as pageset2. They won't be
- * 		needed during hibernate.
- */
-
-static void toi_mark_pages_for_pageset2(void)
-{
-	struct attention_list *this = attention_list;
-
-	memory_bm_clear(pageset2_map);
-
-	if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed)
-		return;
-
-	if (test_action_state(TOI_PAGESET2_FULL))
-		pageset2_full();
-	else
-		mark_tasks(PAGESET2);
-
-	/*
-	 * Because the tasks in attention_list are ones related to hibernating,
-	 * we know that they won't go away under us.
-	 */
-
-	while (this) {
-		if (!test_result_state(TOI_ABORTED))
-			toi_mark_task_as_pageset(this->task, PAGESET1);
-		this = this->next;
-	}
-}
-
-/*
- * The atomic copy of pageset1 is stored in pageset2 pages.
- * But if pageset1 is larger (normally only just after boot),
- * we need to allocate extra pages to store the atomic copy.
- * The following data struct and functions are used to handle
- * the allocation and freeing of that memory.
- */
-
-static unsigned long extra_pages_allocated;
-
-struct extras {
-	struct page *page;
-	int order;
-	struct extras *next;
-};
-
-static struct extras *extras_list;
-
-/* toi_free_extra_pagedir_memory
- *
- * Description:	Free previously allocated extra pagedir memory.
- */
-void toi_free_extra_pagedir_memory(void)
-{
-	/* Free allocated pages */
-	while (extras_list) {
-		struct extras *this = extras_list;
-		int i;
-
-		extras_list = this->next;
-
-		for (i = 0; i < (1 << this->order); i++)
-			ClearPageNosave(this->page + i);
-
-		toi_free_pages(9, this->page, this->order);
-		toi_kfree(7, this, sizeof(*this));
-	}
-
-	extra_pages_allocated = 0;
-}
-
-/* toi_allocate_extra_pagedir_memory
- *
- * Description:	Allocate memory for making the atomic copy of pagedir1 in the
- * 		case where it is bigger than pagedir2.
- * Arguments:	int	num_to_alloc: Number of extra pages needed.
- * Result:	int. 	Number of extra pages we now have allocated.
- */
-static int toi_allocate_extra_pagedir_memory(int extra_pages_needed)
-{
-	int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated;
-	gfp_t flags = TOI_ATOMIC_GFP;
-
-	if (num_to_alloc < 1)
-		return 0;
-
-	order = fls(num_to_alloc);
-	if (order >= MAX_ORDER)
-		order = MAX_ORDER - 1;
-
-	while (num_to_alloc) {
-		struct page *newpage;
-		unsigned long virt;
-		struct extras *extras_entry;
-
-		while ((1 << order) > num_to_alloc)
-			order--;
-
-		extras_entry = (struct extras *) toi_kzalloc(7,
-			sizeof(struct extras), TOI_ATOMIC_GFP);
-
-		if (!extras_entry)
-			return extra_pages_allocated;
-
-		virt = toi_get_free_pages(9, flags, order);
-		while (!virt && order) {
-			order--;
-			virt = toi_get_free_pages(9, flags, order);
-		}
-
-		if (!virt) {
-			toi_kfree(7, extras_entry, sizeof(*extras_entry));
-			return extra_pages_allocated;
-		}
-
-		newpage = virt_to_page(virt);
-
-		extras_entry->page = newpage;
-		extras_entry->order = order;
-		extras_entry->next = extras_list;
-
-		extras_list = extras_entry;
-
-		for (j = 0; j < (1 << order); j++) {
-			SetPageNosave(newpage + j);
-			SetPagePageset1Copy(newpage + j);
-		}
-
-		extra_pages_allocated += (1 << order);
-		num_to_alloc -= (1 << order);
-	}
-
-	return extra_pages_allocated;
-}
-
-/*
- * real_nr_free_pages: Count pcp pages for a zone type or all zones
- * (-1 for all, otherwise zone_idx() result desired).
- */
-unsigned long real_nr_free_pages(unsigned long zone_idx_mask)
-{
-	struct zone *zone;
-	int result = 0, cpu;
-
-	/* PCP lists */
-	for_each_populated_zone(zone) {
-		if (!(zone_idx_mask & (1 << zone_idx(zone))))
-			continue;
-
-		for_each_online_cpu(cpu) {
-			struct per_cpu_pageset *pset =
-				per_cpu_ptr(zone->pageset, cpu);
-			struct per_cpu_pages *pcp = &pset->pcp;
-			result += pcp->count;
-		}
-
-		result += zone_page_state(zone, NR_FREE_PAGES);
-	}
-	return result;
-}
-
-/*
- * Discover how much extra memory will be required by the drivers
- * when they're asked to hibernate. We can then ensure that amount
- * of memory is available when we really want it.
- */
-static void get_extra_pd1_allowance(void)
-{
-	unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final;
-
-	toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
-
-	if (toi_go_atomic(PMSG_FREEZE, 1))
-		return;
-
-	final = real_nr_free_pages(all_zones_mask);
-	toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0);
-
-	extra_pd1_pages_allowance = (orig_num_free > final) ?
-		orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE :
-		MIN_EXTRA_PAGES_ALLOWANCE;
-}
-
-/*
- * Amount of storage needed, possibly taking into account the
- * expected compression ratio and possibly also ignoring our
- * allowance for extra pages.
- */
-static unsigned long main_storage_needed(int use_ecr,
-		int ignore_extra_pd1_allow)
-{
-	return (pagedir1.size + pagedir2.size +
-	  (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
-	 (use_ecr ? toi_expected_compression_ratio() : 100) / 100;
-}
-
-/*
- * Storage needed for the image header, in bytes until the return.
- */
-unsigned long get_header_storage_needed(void)
-{
-	unsigned long bytes = sizeof(struct toi_header) +
-			toi_header_storage_for_modules() +
-			toi_pageflags_space_needed() +
-			fs_info_space_needed();
-
-	return DIV_ROUND_UP(bytes, PAGE_SIZE);
-}
-
-/*
- * When freeing memory, pages from either pageset might be freed.
- *
- * When seeking to free memory to be able to hibernate, for every ps1 page
- * freed, we need 2 less pages for the atomic copy because there is one less
- * page to copy and one more page into which data can be copied.
- *
- * Freeing ps2 pages saves us nothing directly. No more memory is available
- * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but
- * that's too much work to figure out.
- *
- * => ps1_to_free functions
- *
- * Of course if we just want to reduce the image size, because of storage
- * limitations or an image size limit either ps will do.
- *
- * => any_to_free function
- */
-
-static unsigned long lowpages_usable_for_highmem_copy(void)
-{
-	unsigned long needed = get_lowmem_size(pagedir1) +
-			extra_pd1_pages_allowance + MIN_FREE_RAM +
-			toi_memory_for_modules(0),
-		available = get_lowmem_size(pagedir2) +
-			 real_nr_free_low_pages() + extra_pages_allocated;
-
-	return available > needed ? available - needed : 0;
-}
-
-static unsigned long highpages_ps1_to_free(void)
-{
-	unsigned long need = get_highmem_size(pagedir1),
-		      available = get_highmem_size(pagedir2) +
-			      real_nr_free_high_pages() +
-			      lowpages_usable_for_highmem_copy();
-
-	return need > available ? DIV_ROUND_UP(need - available, 2) : 0;
-}
-
-static unsigned long lowpages_ps1_to_free(void)
-{
-	unsigned long needed = get_lowmem_size(pagedir1) +
-			extra_pd1_pages_allowance + MIN_FREE_RAM +
-			toi_memory_for_modules(0),
-		available = get_lowmem_size(pagedir2) +
-			 real_nr_free_low_pages() + extra_pages_allocated;
-
-	return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0;
-}
-
-static unsigned long current_image_size(void)
-{
-	return pagedir1.size + pagedir2.size + header_storage_needed;
-}
-
-static unsigned long storage_still_required(void)
-{
-	unsigned long needed = main_storage_needed(1, 1);
-	return needed > storage_limit ? needed - storage_limit : 0;
-}
-
-static unsigned long ram_still_required(void)
-{
-	unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) +
-		2 * extra_pd1_pages_allowance,
-		  available = real_nr_free_low_pages() + extra_pages_allocated;
-	return needed > available ? needed - available : 0;
-}
-
-unsigned long any_to_free(int use_image_size_limit)
-{
-	int use_soft_limit = use_image_size_limit && image_size_limit > 0;
-	unsigned long current_size = current_image_size(),
-		      soft_limit = use_soft_limit ? (image_size_limit << 8) : 0,
-		      to_free = use_soft_limit ? (current_size > soft_limit ?
-				      current_size - soft_limit : 0) : 0,
-		      storage_limit = storage_still_required(),
-		      ram_limit = ram_still_required(),
-		      first_max = max(to_free, storage_limit);
-
-	return max(first_max, ram_limit);
-}
-
-static int need_pageset2(void)
-{
-	return (real_nr_free_low_pages() + extra_pages_allocated -
-		2 * extra_pd1_pages_allowance - MIN_FREE_RAM -
-		 toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size;
-}
-
-/* amount_needed
- *
- * Calculates the amount by which the image size needs to be reduced to meet
- * our constraints.
- */
-static unsigned long amount_needed(int use_image_size_limit)
-{
-	return max(highpages_ps1_to_free() + lowpages_ps1_to_free(),
-			any_to_free(use_image_size_limit));
-}
-
-static int image_not_ready(int use_image_size_limit)
-{
-	toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
-		"Amount still needed (%lu) > 0:%u,"
-		" Storage allocd: %lu < %lu: %u.\n",
-			amount_needed(use_image_size_limit),
-			(amount_needed(use_image_size_limit) > 0),
-			main_storage_allocated,
-			main_storage_needed(1, 1),
-			main_storage_allocated < main_storage_needed(1, 1));
-
-	toi_cond_pause(0, NULL);
-
-	return (amount_needed(use_image_size_limit) > 0) ||
-		 main_storage_allocated < main_storage_needed(1, 1);
-}
-
-static void display_failure_reason(int tries_exceeded)
-{
-	unsigned long storage_required = storage_still_required(),
-	    ram_required = ram_still_required(),
-	    high_ps1 = highpages_ps1_to_free(),
-	    low_ps1 = lowpages_ps1_to_free();
-
-	printk(KERN_INFO "Failed to prepare the image because...\n");
-
-	if (!storage_limit) {
-		printk(KERN_INFO "- You need some storage available to be "
-				"able to hibernate.\n");
-		return;
-	}
-
-	if (tries_exceeded)
-		printk(KERN_INFO "- The maximum number of iterations was "
-				"reached without successfully preparing the "
-				"image.\n");
-
-	if (storage_required) {
-		printk(KERN_INFO " - We need at least %lu pages of storage "
-				"(ignoring the header), but only have %lu.\n",
-				main_storage_needed(1, 1),
-				main_storage_allocated);
-		set_abort_result(TOI_INSUFFICIENT_STORAGE);
-	}
-
-	if (ram_required) {
-		printk(KERN_INFO " - We need %lu more free pages of low "
-				"memory.\n", ram_required);
-		printk(KERN_INFO "     Minimum free     : %8d\n", MIN_FREE_RAM);
-		printk(KERN_INFO "   + Reqd. by modules : %8lu\n",
-				toi_memory_for_modules(0));
-		printk(KERN_INFO "   + 2 * extra allow  : %8lu\n",
-				2 * extra_pd1_pages_allowance);
-		printk(KERN_INFO "   - Currently free   : %8lu\n",
-				real_nr_free_low_pages());
-		printk(KERN_INFO "   - Pages allocd     : %8lu\n",
-				extra_pages_allocated);
-		printk(KERN_INFO "                      : ========\n");
-		printk(KERN_INFO "     Still needed     : %8lu\n",
-				ram_required);
-
-		/* Print breakdown of memory needed for modules */
-		toi_memory_for_modules(1);
-		set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
-	}
-
-	if (high_ps1) {
-		printk(KERN_INFO "- We need to free %lu highmem pageset 1 "
-				"pages.\n", high_ps1);
-		set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
-	}
-
-	if (low_ps1) {
-		printk(KERN_INFO " - We need to free %ld lowmem pageset 1 "
-				"pages.\n", low_ps1);
-		set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
-	}
-}
-
-static void display_stats(int always, int sub_extra_pd1_allow)
-{
-	char buffer[255];
-	snprintf(buffer, 254,
-		"Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). "
-		"Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). "
-		"Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n",
-
-		/* Free */
-		real_nr_free_pages(all_zones_mask),
-		real_nr_free_low_pages(),
-
-		/* Sets */
-		pagedir1.size, pagedir1.size - get_highmem_size(pagedir1),
-		pagedir2.size, pagedir2.size - get_highmem_size(pagedir2),
-
-		/* Nosave */
-		num_nosave, extra_pages_allocated,
-		num_nosave - extra_pages_allocated,
-
-		/* Storage */
-		main_storage_allocated,
-		storage_limit,
-		main_storage_needed(1, sub_extra_pd1_allow),
-		main_storage_needed(1, 1),
-
-		/* Needed */
-		lowpages_ps1_to_free(), highpages_ps1_to_free(),
-		any_to_free(1),
-		MIN_FREE_RAM, toi_memory_for_modules(0),
-		extra_pd1_pages_allowance,
-		image_size_limit,
-
-		need_pageset2() ? "yes" : "no");
-
-	if (always)
-		printk("%s", buffer);
-	else
-		toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer);
-}
-
-/* flag_image_pages
- *
- * This routine generates our lists of pages to be stored in each
- * pageset. Since we store the data using extents, and adding new
- * extents might allocate a new extent page, this routine may well
- * be called more than once.
- */
-static void flag_image_pages(int atomic_copy)
-{
-	int num_free = 0, num_unmodified = 0;
-	unsigned long loop;
-	struct zone *zone;
-
-	pagedir1.size = 0;
-	pagedir2.size = 0;
-
-	set_highmem_size(pagedir1, 0);
-	set_highmem_size(pagedir2, 0);
-
-	num_nosave = 0;
-        toi_trace_index++;
-
-	memory_bm_clear(pageset1_map);
-
-	toi_generate_free_page_map();
-
-	/*
-	 * Pages not to be saved are marked Nosave irrespective of being
-	 * reserved.
-	 */
-	for_each_populated_zone(zone) {
-		int highmem = is_highmem(zone);
-
-		for (loop = 0; loop < zone->spanned_pages; loop++) {
-			unsigned long pfn = zone->zone_start_pfn + loop;
-			struct page *page;
-			int chunk_size;
-
-			if (!pfn_valid(pfn)) {
-                            TOI_TRACE_DEBUG(pfn, "_Flag Invalid");
-                            continue;
-                        }
-
-			chunk_size = toi_size_of_free_region(zone, pfn);
-			if (chunk_size) {
-                            unsigned long y;
-                            for (y = pfn; y < pfn + chunk_size; y++) {
-                                page = pfn_to_page(y);
-                                TOI_TRACE_DEBUG(y, "_Flag Free");
-                                ClearPagePageset1(page);
-                                ClearPagePageset2(page);
-                            }
-				num_free += chunk_size;
-				loop += chunk_size - 1;
-				continue;
-			}
-
-			page = pfn_to_page(pfn);
-
-			if (PageNosave(page)) {
-                            char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave";
-                            TOI_TRACE_DEBUG(pfn, "_Flag %s", desc);
-                            num_nosave++;
-                            continue;
-                        }
-
-			page = highmem ? saveable_highmem_page(zone, pfn) :
-				saveable_page(zone, pfn);
-
-			if (!page) {
-                                TOI_TRACE_DEBUG(pfn, "_Flag Nosave2");
-				num_nosave++;
-				continue;
-			}
-
-                        if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
-                            TOI_TRACE_DEBUG(pfn, "_Unmodified");
-                            num_unmodified++;
-                            continue;
-                        }
-
-			if (PagePageset2(page)) {
-				pagedir2.size++;
-                                TOI_TRACE_DEBUG(pfn, "_Flag PS2");
-				if (PageHighMem(page))
-					inc_highmem_size(pagedir2);
-				else
-					SetPagePageset1Copy(page);
-				if (PageResave(page)) {
-					SetPagePageset1(page);
-					ClearPagePageset1Copy(page);
-					pagedir1.size++;
-					if (PageHighMem(page))
-						inc_highmem_size(pagedir1);
-				}
-			} else {
-				pagedir1.size++;
-                                TOI_TRACE_DEBUG(pfn, "_Flag PS1");
-				SetPagePageset1(page);
-				if (PageHighMem(page))
-					inc_highmem_size(pagedir1);
-			}
-		}
-	}
-
-	if (!atomic_copy)
-		toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0,
-			"Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)"
-				    " + Unmodified (%d) + NumFree (%d) = %d.\n",
-			pagedir1.size, pagedir2.size, num_nosave, num_unmodified,
-                        num_free, pagedir1.size + pagedir2.size + num_nosave + num_free);
-}
-
-void toi_recalculate_image_contents(int atomic_copy)
-{
-	memory_bm_clear(pageset1_map);
-	if (!atomic_copy) {
-		unsigned long pfn;
-		memory_bm_position_reset(pageset2_map);
-		for (pfn = memory_bm_next_pfn(pageset2_map, 0);
-				pfn != BM_END_OF_MAP;
-				pfn = memory_bm_next_pfn(pageset2_map, 0))
-			ClearPagePageset1Copy(pfn_to_page(pfn));
-		/* Need to call this before getting pageset1_size! */
-		toi_mark_pages_for_pageset2();
-	}
-        memory_bm_position_reset(pageset2_map);
-	flag_image_pages(atomic_copy);
-
-	if (!atomic_copy) {
-		storage_limit = toiActiveAllocator->storage_available();
-		display_stats(0, 0);
-	}
-}
-
-int try_allocate_extra_memory(void)
-{
-	unsigned long wanted = pagedir1.size +  extra_pd1_pages_allowance -
-		get_lowmem_size(pagedir2);
-	if (wanted > extra_pages_allocated) {
-		unsigned long got = toi_allocate_extra_pagedir_memory(wanted);
-		if (wanted < got) {
-			toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
-				"Want %d extra pages for pageset1, got %d.\n",
-				wanted, got);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/* update_image
- *
- * Allocate [more] memory and storage for the image.
- */
-static void update_image(int ps2_recalc)
-{
-	int old_header_req;
-	unsigned long seek;
-
-	if (try_allocate_extra_memory())
-		return;
-
-	if (ps2_recalc)
-		goto recalc;
-
-	thaw_kernel_threads();
-
-	/*
-	 * Allocate remaining storage space, if possible, up to the
-	 * maximum we know we'll need. It's okay to allocate the
-	 * maximum if the writer is the swapwriter, but
-	 * we don't want to grab all available space on an NFS share.
-	 * We therefore ignore the expected compression ratio here,
-	 * thereby trying to allocate the maximum image size we could
-	 * need (assuming compression doesn't expand the image), but
-	 * don't complain if we can't get the full amount we're after.
-	 */
-
-	do {
-		int result;
-
-		old_header_req = header_storage_needed;
-		toiActiveAllocator->reserve_header_space(header_storage_needed);
-
-		/* How much storage is free with the reservation applied? */
-		storage_limit = toiActiveAllocator->storage_available();
-		seek = min(storage_limit, main_storage_needed(0, 0));
-
-		result = toiActiveAllocator->allocate_storage(seek);
-		if (result)
-			printk("Failed to allocate storage (%d).\n", result);
-
-		main_storage_allocated =
-			toiActiveAllocator->storage_allocated();
-
-		/* Need more header because more storage allocated? */
-		header_storage_needed = get_header_storage_needed();
-
-	} while (header_storage_needed > old_header_req);
-
-	if (freeze_kernel_threads())
-		set_abort_result(TOI_FREEZING_FAILED);
-
-recalc:
-	toi_recalculate_image_contents(0);
-}
-
-/* attempt_to_freeze
- *
- * Try to freeze processes.
- */
-
-static int attempt_to_freeze(void)
-{
-	int result;
-
-	/* Stop processes before checking again */
-	toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
-			"filesystems.");
-	result = freeze_processes();
-
-	if (result)
-		set_abort_result(TOI_FREEZING_FAILED);
-
-	result = freeze_kernel_threads();
-
-	if (result)
-		set_abort_result(TOI_FREEZING_FAILED);
-
-	return result;
-}
-
-/* eat_memory
- *
- * Try to free some memory, either to meet hard or soft constraints on the image
- * characteristics.
- *
- * Hard constraints:
- * - Pageset1 must be < half of memory;
- * - We must have enough memory free at resume time to have pageset1
- *   be able to be loaded in pages that don't conflict with where it has to
- *   be restored.
- * Soft constraints
- * - User specificied image size limit.
- */
-static void eat_memory(void)
-{
-	unsigned long amount_wanted = 0;
-	int did_eat_memory = 0;
-
-	/*
-	 * Note that if we have enough storage space and enough free memory, we
-	 * may exit without eating anything. We give up when the last 10
-	 * iterations ate no extra pages because we're not going to get much
-	 * more anyway, but the few pages we get will take a lot of time.
-	 *
-	 * We freeze processes before beginning, and then unfreeze them if we
-	 * need to eat memory until we think we have enough. If our attempts
-	 * to freeze fail, we give up and abort.
-	 */
-
-	amount_wanted = amount_needed(1);
-
-	switch (image_size_limit) {
-	case -1: /* Don't eat any memory */
-		if (amount_wanted > 0) {
-			set_abort_result(TOI_WOULD_EAT_MEMORY);
-			return;
-		}
-		break;
-	case -2:  /* Free caches only */
-		drop_pagecache();
-		toi_recalculate_image_contents(0);
-		amount_wanted = amount_needed(1);
-		break;
-	default:
-		break;
-	}
-
-	if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) &&
-			image_size_limit != -1) {
-		unsigned long request = amount_wanted;
-		unsigned long high_req = max(highpages_ps1_to_free(),
-				any_to_free(1));
-		unsigned long low_req = lowpages_ps1_to_free();
-		unsigned long got = 0;
-
-		toi_prepare_status(CLEAR_BAR,
-				"Seeking to free %ldMB of memory.",
-				MB(amount_wanted));
-
-		thaw_kernel_threads();
-
-		/*
-		 * Ask for too many because shrink_memory_mask doesn't
-		 * currently return enough most of the time.
-		 */
-		
-		if (low_req)
-			got = shrink_memory_mask(low_req, GFP_KERNEL);
-		if (high_req)
-			shrink_memory_mask(high_req - got, GFP_HIGHUSER);
-
-		did_eat_memory = 1;
-
-		toi_recalculate_image_contents(0);
-
-		amount_wanted = amount_needed(1);
-
-		printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &"
-				" %ld pages from anywhere, got %ld.\n",
-				high_req, low_req,
-				request - amount_wanted);
-
-		toi_cond_pause(0, NULL);
-
-		if (freeze_kernel_threads())
-			set_abort_result(TOI_FREEZING_FAILED);
-	}
-
-	if (did_eat_memory)
-		toi_recalculate_image_contents(0);
-}
-
-/* toi_prepare_image
- *
- * Entry point to the whole image preparation section.
- *
- * We do four things:
- * - Freeze processes;
- * - Ensure image size constraints are met;
- * - Complete all the preparation for saving the image,
- *   including allocation of storage. The only memory
- *   that should be needed when we're finished is that
- *   for actually storing the image (and we know how
- *   much is needed for that because the modules tell
- *   us).
- * - Make sure that all dirty buffers are written out.
- */
-#define MAX_TRIES 2
-int toi_prepare_image(void)
-{
-	int result = 1, tries = 1;
-
-	main_storage_allocated = 0;
-	no_ps2_needed = 0;
-
-	if (attempt_to_freeze())
-		return 1;
-
-	lock_device_hotplug();
-	set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
-
-	if (!extra_pd1_pages_allowance)
-		get_extra_pd1_allowance();
-
-	storage_limit = toiActiveAllocator->storage_available();
-
-	if (!storage_limit) {
-		printk(KERN_INFO "No storage available. Didn't try to prepare "
-				"an image.\n");
-		display_failure_reason(0);
-		set_abort_result(TOI_NOSTORAGE_AVAILABLE);
-		return 1;
-	}
-
-	if (build_attention_list()) {
-		abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
-				"Unable to successfully prepare the image.\n");
-		return 1;
-	}
-
-	toi_recalculate_image_contents(0);
-
-	do {
-		toi_prepare_status(CLEAR_BAR,
-				"Preparing Image. Try %d.", tries);
-
-		eat_memory();
-
-		if (test_result_state(TOI_ABORTED))
-			break;
-
-		update_image(0);
-
-		tries++;
-
-	} while (image_not_ready(1) && tries <= MAX_TRIES &&
-			!test_result_state(TOI_ABORTED));
-
-	result = image_not_ready(0);
-
-        /* TODO: Handle case where need to remove existing image and resave
-         * instead of adding to incremental image. */
-
-	if (!test_result_state(TOI_ABORTED)) {
-		if (result) {
-			display_stats(1, 0);
-			display_failure_reason(tries > MAX_TRIES);
-			abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
-				"Unable to successfully prepare the image.\n");
-		} else {
-			/* Pageset 2 needed? */
-			if (!need_pageset2() &&
-				  test_action_state(TOI_NO_PS2_IF_UNNEEDED)) {
-				no_ps2_needed = 1;
-				toi_recalculate_image_contents(0);
-				update_image(1);
-			}
-
-			toi_cond_pause(1, "Image preparation complete.");
-		}
-	}
-
-	return result ? result : allocate_checksum_pages();
-}
diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h
deleted file mode 100644
index af6769ee2..000000000
--- a/kernel/power/tuxonice_prepare_image.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * kernel/power/tuxonice_prepare_image.h
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <asm/sections.h>
-
-extern int toi_prepare_image(void);
-extern void toi_recalculate_image_contents(int storage_available);
-extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask);
-extern long image_size_limit;
-extern void toi_free_extra_pagedir_memory(void);
-extern unsigned long extra_pd1_pages_allowance;
-extern void free_attention_list(void);
-
-#define MIN_FREE_RAM 100
-#define MIN_EXTRA_PAGES_ALLOWANCE 500
-
-#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1))
-#ifdef CONFIG_HIGHMEM
-#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM))
-#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \
-						(1 << ZONE_HIGHMEM)))
-#else
-#define real_nr_free_high_pages() (0)
-#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask))
-
-/* For eat_memory function */
-#define ZONE_HIGHMEM (MAX_NR_ZONES + 1)
-#endif
-
-unsigned long get_header_storage_needed(void);
-unsigned long any_to_free(int use_image_size_limit);
-int try_allocate_extra_memory(void);
diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c
deleted file mode 100644
index 710e48dee..000000000
--- a/kernel/power/tuxonice_prune.c
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * kernel/power/tuxonice_prune.c
- *
- * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file implements a TuxOnIce module that seeks to prune the
- * amount of data written to disk. It builds a table of hashes
- * of the uncompressed data, and writes the pfn of the previous page
- * with the same contents instead of repeating the data when a match
- * is found.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-#include <linux/scatterlist.h>
-#include <crypto/hash.h>
-
-#include "tuxonice_builtin.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-
-/*
- * We never write a page bigger than PAGE_SIZE, so use a large number
- * to indicate that data is a PFN.
- */
-#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100)
-
-static unsigned long toi_pruned_pages;
-
-static struct toi_module_ops toi_prune_ops;
-static struct toi_module_ops *next_driver;
-
-static char toi_prune_hash_algo_name[32] = "sha1";
-
-static DEFINE_MUTEX(stats_lock);
-
-struct cpu_context {
-	struct shash_desc desc;
-	char *digest;
-};
-
-#define OUT_BUF_SIZE (2 * PAGE_SIZE)
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-
-/*
- * toi_crypto_prepare
- *
- * Prepare to do some work by allocating buffers and transforms.
- */
-static int toi_prune_crypto_prepare(void)
-{
-	int cpu, ret, digestsize;
-
-	if (!*toi_prune_hash_algo_name) {
-		printk(KERN_INFO "TuxOnIce: Pruning enabled but no "
-				"hash algorithm set.\n");
-		return 1;
-	}
-
-	for_each_online_cpu(cpu) {
-		struct cpu_context *this = &per_cpu(contexts, cpu);
-		this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0);
-		if (IS_ERR(this->desc.tfm)) {
-			printk(KERN_INFO "TuxOnIce: Failed to allocate the "
-					"%s prune hash algorithm.\n",
-					toi_prune_hash_algo_name);
-			this->desc.tfm = NULL;
-			return 1;
-		}
-
-		if (!digestsize)
-			digestsize = crypto_shash_digestsize(this->desc.tfm);
-
-		this->digest = kmalloc(digestsize, GFP_KERNEL);
-		if (!this->digest) {
-			printk(KERN_INFO "TuxOnIce: Failed to allocate space "
-					"for digest output.\n");
-			crypto_free_shash(this->desc.tfm);
-			this->desc.tfm = NULL;
-		}
-
-		this->desc.flags = 0;
-
-		ret = crypto_shash_init(&this->desc);
-		if (ret < 0) {
-			printk(KERN_INFO "TuxOnIce: Failed to initialise the "
-					"%s prune hash algorithm.\n",
-					toi_prune_hash_algo_name);
-			kfree(this->digest);
-			this->digest = NULL;
-			crypto_free_shash(this->desc.tfm);
-			this->desc.tfm = NULL;
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-static int toi_prune_rw_cleanup(int writing)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		struct cpu_context *this = &per_cpu(contexts, cpu);
-		if (this->desc.tfm) {
-			crypto_free_shash(this->desc.tfm);
-			this->desc.tfm = NULL;
-		}
-
-		if (this->digest) {
-			kfree(this->digest);
-			this->digest = NULL;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * toi_prune_init
- */
-
-static int toi_prune_init(int toi_or_resume)
-{
-	if (!toi_or_resume)
-		return 0;
-
-	toi_pruned_pages = 0;
-
-	next_driver = toi_get_next_filter(&toi_prune_ops);
-
-	return next_driver ? 0 : -ECHILD;
-}
-
-/*
- * toi_prune_rw_init()
- */
-
-static int toi_prune_rw_init(int rw, int stream_number)
-{
-	if (toi_prune_crypto_prepare()) {
-		printk(KERN_ERR "Failed to initialise prune "
-				"algorithm.\n");
-		if (rw == READ) {
-			printk(KERN_INFO "Unable to read the image.\n");
-			return -ENODEV;
-		} else {
-			printk(KERN_INFO "Continuing without "
-				"pruning the image.\n");
-			toi_prune_ops.enabled = 0;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * toi_prune_write_page()
- *
- * Compress a page of data, buffering output and passing on filled
- * pages to the next module in the pipeline.
- *
- * Buffer_page:	Pointer to a buffer of size PAGE_SIZE, containing
- * data to be checked.
- *
- * Returns:	0 on success. Otherwise the error is that returned by later
- * 		modules, -ECHILD if we have a broken pipeline or -EIO if
- * 		zlib errs.
- */
-static int toi_prune_write_page(unsigned long index, int buf_type,
-		void *buffer_page, unsigned int buf_size)
-{
-	int ret = 0, cpu = smp_processor_id(), write_data = 1;
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-	u8* output_buffer = buffer_page;
-	int output_len = buf_size;
-	int out_buf_type = buf_type;
-	void *buffer_start;
-	u32 buf[4];
-
-	if (ctx->desc.tfm) {
-
-		buffer_start = TOI_MAP(buf_type, buffer_page);
-		ctx->len = OUT_BUF_SIZE;
-
-		ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, &ctx->digest);
-		if (ret) {
-			printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret);
-		} else {
-			mutex_lock(&stats_lock);
-
-			toi_pruned_pages++;
-
-			mutex_unlock(&stats_lock);
-
-		}
-
-		TOI_UNMAP(buf_type, buffer_page);
-	}
-
-	if (write_data)
-		ret = next_driver->write_page(index, out_buf_type,
-				output_buffer, output_len);
-	else
-		ret = next_driver->write_page(index, out_buf_type,
-				output_buffer, output_len);
-
-	return ret;
-}
-
-/*
- * toi_prune_read_page()
- * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
- *
- * Retrieve data from later modules or from a previously loaded page and
- * fill the input buffer.
- * Zero if successful. Error condition from me or from downstream on failure.
- */
-static int toi_prune_read_page(unsigned long *index, int buf_type,
-		void *buffer_page, unsigned int *buf_size)
-{
-	int ret, cpu = smp_processor_id();
-	unsigned int len;
-	char *buffer_start;
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
-	if (!ctx->desc.tfm)
-		return next_driver->read_page(index, TOI_PAGE, buffer_page,
-				buf_size);
-
-	/*
-	 * All our reads must be synchronous - we can't handle
-	 * data that hasn't been read yet.
-	 */
-
-	ret = next_driver->read_page(index, buf_type, buffer_page, &len);
-
-	if (len == PRUNE_DATA_IS_PFN) {
-		buffer_start = kmap(buffer_page);
-	}
-
-	return ret;
-}
-
-/*
- * toi_prune_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_prune_print_debug_stats(char *buffer, int size)
-{
-	int len;
-
-	/* Output the number of pages pruned. */
-	if (*toi_prune_hash_algo_name)
-		len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
-				toi_prune_hash_algo_name);
-	else
-		len = scnprintf(buffer, size, "- Compressor is not set.\n");
-
-	if (toi_pruned_pages)
-		len += scnprintf(buffer+len, size - len, "  Pruned "
-			"%lu pages).\n",
-		  toi_pruned_pages);
-	return len;
-}
-
-/*
- * toi_prune_memory_needed
- *
- * Tell the caller how much memory we need to operate during hibernate/resume.
- * Returns: Unsigned long. Maximum number of bytes of memory required for
- * operation.
- */
-static int toi_prune_memory_needed(void)
-{
-	return 2 * PAGE_SIZE;
-}
-
-static int toi_prune_storage_needed(void)
-{
-	return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
-		strlen(toi_prune_hash_algo_name) + 1;
-}
-
-/*
- * toi_prune_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save informaton needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_prune_save_config_info(char *buffer)
-{
-	int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0;
-
-	*((unsigned long *) buffer) = toi_pruned_pages;
-	offset += sizeof(unsigned long);
-	*((int *) (buffer + offset)) = len;
-	offset += sizeof(int);
-	strncpy(buffer + offset, toi_prune_hash_algo_name, len);
-	return offset + len;
-}
-
-/* toi_prune_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Description:	Reload information needed for passing back to the
- * resumed kernel.
- */
-static void toi_prune_load_config_info(char *buffer, int size)
-{
-	int len, offset = 0;
-
-	toi_pruned_pages = *((unsigned long *) buffer);
-	offset += sizeof(unsigned long);
-	len = *((int *) (buffer + offset));
-	offset += sizeof(int);
-	strncpy(toi_prune_hash_algo_name, buffer + offset, len);
-}
-
-static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
-	bkd->pruned_pages = toi_pruned_pages;
-}
-
-static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
-	toi_pruned_pages = bkd->pruned_pages;
-}
-
-/*
- * toi_expected_ratio
- *
- * Description:	Returns the expected ratio between data passed into this module
- * 		and the amount of data output when writing.
- * Returns:	100 - we have no idea how many pages will be pruned.
- */
-
-static int toi_prune_expected_ratio(void)
-{
-	return 100;
-}
-
-/*
- * data for our sysfs entries.
- */
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0,
-			NULL),
-	SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL),
-};
-
-/*
- * Ops structure.
- */
-static struct toi_module_ops toi_prune_ops = {
-	.type			= FILTER_MODULE,
-	.name			= "prune",
-	.directory		= "prune",
-	.module			= THIS_MODULE,
-	.initialise		= toi_prune_init,
-	.memory_needed 		= toi_prune_memory_needed,
-	.print_debug_info	= toi_prune_print_debug_stats,
-	.save_config_info	= toi_prune_save_config_info,
-	.load_config_info	= toi_prune_load_config_info,
-	.storage_needed		= toi_prune_storage_needed,
-	.expected_compression	= toi_prune_expected_ratio,
-
-	.pre_atomic_restore	= toi_prune_pre_atomic_restore,
-	.post_atomic_restore	= toi_prune_post_atomic_restore,
-
-	.rw_init		= toi_prune_rw_init,
-	.rw_cleanup		= toi_prune_rw_cleanup,
-
-	.write_page		= toi_prune_write_page,
-	.read_page		= toi_prune_read_page,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-static __init int toi_prune_load(void)
-{
-	return toi_register_module(&toi_prune_ops);
-}
-
-late_initcall(toi_prune_load);
diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c
deleted file mode 100644
index e99f6e24f..000000000
--- a/kernel/power/tuxonice_storage.c
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * kernel/power/tuxonice_storage.c
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for talking to a userspace program that manages storage.
- *
- * The kernel side:
- * - starts the userspace program;
- * - sends messages telling it when to open and close the connection;
- * - tells it when to quit;
- *
- * The user space side:
- * - passes messages regarding status;
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/freezer.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_ui.h"
-
-static struct user_helper_data usm_helper_data;
-static struct toi_module_ops usm_ops;
-static int message_received, usm_prepare_count;
-static int storage_manager_last_action, storage_manager_action;
-
-static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
-	int type;
-	int *data;
-
-	type = nlh->nlmsg_type;
-
-	/* A control message: ignore them */
-	if (type < NETLINK_MSG_BASE)
-		return 0;
-
-	/* Unknown message: reply with EINVAL */
-	if (type >= USM_MSG_MAX)
-		return -EINVAL;
-
-	/* All operations require privileges, even GET */
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
-	/* Only allow one task to receive NOFREEZE privileges */
-	if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
-		return -EBUSY;
-
-	data = (int *) NLMSG_DATA(nlh);
-
-	switch (type) {
-	case USM_MSG_SUCCESS:
-	case USM_MSG_FAILED:
-		message_received = type;
-		complete(&usm_helper_data.wait_for_process);
-		break;
-	default:
-		printk(KERN_INFO "Storage manager doesn't recognise "
-				"message %d.\n", type);
-	}
-
-	return 1;
-}
-
-#ifdef CONFIG_NET
-static int activations;
-
-int toi_activate_storage(int force)
-{
-	int tries = 1;
-
-	if (usm_helper_data.pid == -1 || !usm_ops.enabled)
-		return 0;
-
-	message_received = 0;
-	activations++;
-
-	if (activations > 1 && !force)
-		return 0;
-
-	while ((!message_received || message_received == USM_MSG_FAILED) &&
-			tries < 2) {
-		toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt "
-				"%d.\n", tries);
-
-		init_completion(&usm_helper_data.wait_for_process);
-
-		toi_send_netlink_message(&usm_helper_data,
-			USM_MSG_CONNECT,
-			NULL, 0);
-
-		/* Wait 2 seconds for the userspace process to make contact */
-		wait_for_completion_timeout(&usm_helper_data.wait_for_process,
-				2*HZ);
-
-		tries++;
-	}
-
-	return 0;
-}
-
-int toi_deactivate_storage(int force)
-{
-	if (usm_helper_data.pid == -1 || !usm_ops.enabled)
-		return 0;
-
-	message_received = 0;
-	activations--;
-
-	if (activations && !force)
-		return 0;
-
-	init_completion(&usm_helper_data.wait_for_process);
-
-	toi_send_netlink_message(&usm_helper_data,
-			USM_MSG_DISCONNECT,
-			NULL, 0);
-
-	wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
-
-	if (!message_received || message_received == USM_MSG_FAILED) {
-		printk(KERN_INFO "Returning failure disconnecting storage.\n");
-		return 1;
-	}
-
-	return 0;
-}
-#endif
-
-static void storage_manager_simulate(void)
-{
-	printk(KERN_INFO "--- Storage manager simulate ---\n");
-	toi_prepare_usm();
-	schedule();
-	printk(KERN_INFO "--- Activate storage 1 ---\n");
-	toi_activate_storage(1);
-	schedule();
-	printk(KERN_INFO "--- Deactivate storage 1 ---\n");
-	toi_deactivate_storage(1);
-	schedule();
-	printk(KERN_INFO "--- Cleanup usm ---\n");
-	toi_cleanup_usm();
-	schedule();
-	printk(KERN_INFO "--- Storage manager simulate ends ---\n");
-}
-
-static int usm_storage_needed(void)
-{
-	return sizeof(int) + strlen(usm_helper_data.program) + 1;
-}
-
-static int usm_save_config_info(char *buf)
-{
-	int len = strlen(usm_helper_data.program);
-	memcpy(buf, usm_helper_data.program, len + 1);
-	return sizeof(int) + len + 1;
-}
-
-static void usm_load_config_info(char *buf, int size)
-{
-	/* Don't load the saved path if one has already been set */
-	if (usm_helper_data.program[0])
-		return;
-
-	memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf));
-}
-
-static int usm_memory_needed(void)
-{
-	/* ball park figure of 32 pages */
-	return 32 * PAGE_SIZE;
-}
-
-/* toi_prepare_usm
- */
-int toi_prepare_usm(void)
-{
-	usm_prepare_count++;
-
-	if (usm_prepare_count > 1 || !usm_ops.enabled)
-		return 0;
-
-	usm_helper_data.pid = -1;
-
-	if (!*usm_helper_data.program)
-		return 0;
-
-	toi_netlink_setup(&usm_helper_data);
-
-	if (usm_helper_data.pid == -1)
-		printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't"
-				" start it.\n");
-
-	toi_activate_storage(0);
-
-	return usm_helper_data.pid != -1;
-}
-
-void toi_cleanup_usm(void)
-{
-	usm_prepare_count--;
-
-	if (usm_helper_data.pid > -1 && !usm_prepare_count) {
-		toi_deactivate_storage(0);
-		toi_netlink_close(&usm_helper_data);
-	}
-}
-
-static void storage_manager_activate(void)
-{
-	if (storage_manager_action == storage_manager_last_action)
-		return;
-
-	if (storage_manager_action)
-		toi_prepare_usm();
-	else
-		toi_cleanup_usm();
-
-	storage_manager_last_action = storage_manager_action;
-}
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate),
-	SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL),
-	SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0,
-		NULL),
-	SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1,
-			0, storage_manager_activate)
-};
-
-static struct toi_module_ops usm_ops = {
-	.type				= MISC_MODULE,
-	.name				= "usm",
-	.directory			= "storage_manager",
-	.module				= THIS_MODULE,
-	.storage_needed			= usm_storage_needed,
-	.save_config_info		= usm_save_config_info,
-	.load_config_info		= usm_load_config_info,
-	.memory_needed			= usm_memory_needed,
-
-	.sysfs_data			= sysfs_params,
-	.num_sysfs_entries		= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/* toi_usm_sysfs_init
- * Description: Boot time initialisation for user interface.
- */
-int toi_usm_init(void)
-{
-	usm_helper_data.nl = NULL;
-	usm_helper_data.program[0] = '\0';
-	usm_helper_data.pid = -1;
-	usm_helper_data.skb_size = 0;
-	usm_helper_data.pool_limit = 6;
-	usm_helper_data.netlink_id = NETLINK_TOI_USM;
-	usm_helper_data.name = "userspace storage manager";
-	usm_helper_data.rcv_msg = usm_user_rcv_msg;
-	usm_helper_data.interface_version = 2;
-	usm_helper_data.must_init = 0;
-	init_completion(&usm_helper_data.wait_for_process);
-
-	return toi_register_module(&usm_ops);
-}
-
-void toi_usm_exit(void)
-{
-	toi_netlink_close_complete(&usm_helper_data);
-	toi_unregister_module(&usm_ops);
-}
diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h
deleted file mode 100644
index 1ed9ab156..000000000
--- a/kernel/power/tuxonice_storage.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * kernel/power/tuxonice_storage.h
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifdef CONFIG_NET
-int toi_prepare_usm(void);
-void toi_cleanup_usm(void);
-
-int toi_activate_storage(int force);
-int toi_deactivate_storage(int force);
-extern int toi_usm_init(void);
-extern void toi_usm_exit(void);
-#else
-static inline int toi_usm_init(void) { return 0; }
-static inline void toi_usm_exit(void) { }
-
-static inline int toi_activate_storage(int force)
-{
-	return 0;
-}
-
-static inline int toi_deactivate_storage(int force)
-{
-	return 0;
-}
-
-static inline int toi_prepare_usm(void) { return 0; }
-static inline void toi_cleanup_usm(void) { }
-#endif
-
-enum {
-	USM_MSG_BASE = 0x10,
-
-	/* Kernel -> Userspace */
-	USM_MSG_CONNECT = 0x30,
-	USM_MSG_DISCONNECT = 0x31,
-	USM_MSG_SUCCESS = 0x40,
-	USM_MSG_FAILED = 0x41,
-
-	USM_MSG_MAX,
-};
diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c
deleted file mode 100644
index ce3215033..000000000
--- a/kernel/power/tuxonice_swap.c
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- * kernel/power/tuxonice_swap.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file encapsulates functions for usage of swap space as a
- * backing store.
- */
-
-#include <linux/suspend.h>
-#include <linux/blkdev.h>
-#include <linux/swapops.h>
-#include <linux/swap.h>
-#include <linux/syscalls.h>
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-
-static struct toi_module_ops toi_swapops;
-
-/* For swapfile automatically swapon/off'd. */
-static char swapfilename[255] = "";
-static int toi_swapon_status;
-
-/* Swap Pages */
-static unsigned long swap_allocated;
-
-static struct sysinfo swapinfo;
-
-static int is_ram_backed(struct swap_info_struct *si)
-{
-	if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) ||
-	    !strncmp(si->bdev->bd_disk->disk_name, "zram", 4))
-		return 1;
-
-	return 0;
-}
-
-/**
- * enable_swapfile: Swapon the user specified swapfile prior to hibernating.
- *
- * Activate the given swapfile if it wasn't already enabled. Remember whether
- * we really did swapon it for swapoffing later.
- */
-static void enable_swapfile(void)
-{
-	int activateswapresult = -EINVAL;
-
-	if (swapfilename[0]) {
-		/* Attempt to swap on with maximum priority */
-		activateswapresult = sys_swapon(swapfilename, 0xFFFF);
-		if (activateswapresult && activateswapresult != -EBUSY)
-			printk(KERN_ERR "TuxOnIce: The swapfile/partition "
-				"specified by /sys/power/tuxonice/swap/swapfile"
-				" (%s) could not be turned on (error %d). "
-				"Attempting to continue.\n",
-				swapfilename, activateswapresult);
-		if (!activateswapresult)
-			toi_swapon_status = 1;
-	}
-}
-
-/**
- * disable_swapfile: Swapoff any file swaponed at the start of the cycle.
- *
- * If we did successfully swapon a file at the start of the cycle, swapoff
- * it now (finishing up).
- */
-static void disable_swapfile(void)
-{
-	if (!toi_swapon_status)
-		return;
-
-	sys_swapoff(swapfilename);
-	toi_swapon_status = 0;
-}
-
-static int add_blocks_to_extent_chain(struct toi_bdev_info *chain,
-		unsigned long start, unsigned long end)
-{
-	if (test_action_state(TOI_TEST_BIO))
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to "
-				"chain %p.", start << chain->bmap_shift,
-				end << chain->bmap_shift, chain);
-
-	return toi_add_to_extent_chain(&chain->blocks, start, end);
-}
-
-
-static int get_main_pool_phys_params(struct toi_bdev_info *chain)
-{
-	struct hibernate_extent *extentpointer = NULL;
-	unsigned long address, extent_min = 0, extent_max = 0;
-	int empty = 1;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for "
-			"chain %d.", chain->allocator_index);
-
-	if (!chain->allocations.first)
-		return 0;
-
-	if (chain->blocks.first)
-		toi_put_extent_chain(&chain->blocks);
-
-	toi_extent_for_each(&chain->allocations, extentpointer, address) {
-		swp_entry_t swap_address = (swp_entry_t) { address };
-		struct block_device *bdev;
-		sector_t new_sector = map_swap_entry(swap_address, &bdev);
-
-		if (empty) {
-			empty = 0;
-			extent_min = extent_max = new_sector;
-			continue;
-		}
-
-		if (new_sector == extent_max + 1) {
-			extent_max++;
-			continue;
-		}
-
-		if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
-			printk(KERN_ERR "Out of memory while making block "
-					"chains.\n");
-			return -ENOMEM;
-		}
-
-		extent_min = new_sector;
-		extent_max = new_sector;
-	}
-
-	if (!empty &&
-	    add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
-		printk(KERN_ERR "Out of memory while making block chains.\n");
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-/*
- * Like si_swapinfo, except that we don't include ram backed swap (compcache!)
- * and don't need to use the spinlocks (userspace is stopped when this
- * function is called).
- */
-void si_swapinfo_no_compcache(void)
-{
-	unsigned int i;
-
-	si_swapinfo(&swapinfo);
-	swapinfo.freeswap = 0;
-	swapinfo.totalswap = 0;
-
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		struct swap_info_struct *si = get_swap_info_struct(i);
-		if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) {
-			swapinfo.totalswap += si->inuse_pages;
-			swapinfo.freeswap += si->pages - si->inuse_pages;
-		}
-	}
-}
-/*
- * We can't just remember the value from allocation time, because other
- * processes might have allocated swap in the mean time.
- */
-static unsigned long toi_swap_storage_available(void)
-{
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available.");
-	si_swapinfo_no_compcache();
-	return swapinfo.freeswap + swap_allocated;
-}
-
-static int toi_swap_initialise(int starting_cycle)
-{
-	if (!starting_cycle)
-		return 0;
-
-	enable_swapfile();
-	return 0;
-}
-
-static void toi_swap_cleanup(int ending_cycle)
-{
-	if (!ending_cycle)
-		return;
-
-	disable_swapfile();
-}
-
-static void toi_swap_free_storage(struct toi_bdev_info *chain)
-{
-	/* Free swap entries */
-	struct hibernate_extent *extentpointer;
-	unsigned long extentvalue;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.",
-			chain);
-
-	swap_allocated -= chain->allocations.size;
-	toi_extent_for_each(&chain->allocations, extentpointer, extentvalue)
-		swap_free((swp_entry_t) { extentvalue });
-
-	toi_put_extent_chain(&chain->allocations);
-}
-
-static void free_swap_range(unsigned long min, unsigned long max)
-{
-	int j;
-
-	for (j = min; j <= max; j++)
-		swap_free((swp_entry_t) { j });
-	swap_allocated -= (max - min + 1);
-}
-
-/*
- * Allocation of a single swap type. Swap priorities are handled at the higher
- * level.
- */
-static int toi_swap_allocate_storage(struct toi_bdev_info *chain,
-		unsigned long request)
-{
-	unsigned long gotten = 0;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "  Swap allocate storage: Asked to"
-			" allocate %lu pages from device %d.", request,
-			chain->allocator_index);
-
-	while (gotten < request) {
-		swp_entry_t start, end;
-                if (0) {
-                    /* Broken at the moment for SSDs */
-                    get_swap_range_of_type(chain->allocator_index, &start, &end,
-                            request - gotten + 1);
-                } else {
-                    start = end = get_swap_page_of_type(chain->allocator_index);
-                }
-		if (start.val) {
-			int added = end.val - start.val + 1;
-			if (toi_add_to_extent_chain(&chain->allocations,
-						start.val, end.val)) {
-				printk(KERN_INFO "Failed to allocate extent for "
-					"%lu-%lu.\n", start.val, end.val);
-				free_swap_range(start.val, end.val);
-				break;
-			}
-			gotten += added;
-			swap_allocated += added;
-		} else
-			break;
-	}
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "  Allocated %lu pages.", gotten);
-	return gotten;
-}
-
-static int toi_swap_register_storage(void)
-{
-	int i, result = 0;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage.");
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		struct swap_info_struct *si = get_swap_info_struct(i);
-		struct toi_bdev_info *devinfo;
-		unsigned char *p;
-		unsigned char buf[256];
-		struct fs_info *fs_info;
-
-		if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si))
-			continue;
-
-		devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info),
-				GFP_ATOMIC);
-		if (!devinfo) {
-			printk("Failed to allocate devinfo struct for swap "
-					"device %d.\n", i);
-			return -ENOMEM;
-		}
-
-		devinfo->bdev = si->bdev;
-		devinfo->allocator = &toi_swapops;
-		devinfo->allocator_index = i;
-
-		fs_info = fs_info_from_block_dev(si->bdev);
-		if (fs_info && !IS_ERR(fs_info)) {
-			memcpy(devinfo->uuid, &fs_info->uuid, 16);
-			free_fs_info(fs_info);
-		} else
-			result = (int) PTR_ERR(fs_info);
-
-		if (!fs_info)
-			printk("fs_info from block dev returned %d.\n", result);
-		devinfo->dev_t = si->bdev->bd_dev;
-		devinfo->prio = si->prio;
-		devinfo->bmap_shift = 3;
-		devinfo->blocks_per_page = 1;
-
-		p = d_path(&si->swap_file->f_path, buf, sizeof(buf));
-		sprintf(devinfo->name, "swap on %s", p);
-
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:"
-				" Device %d (%lx), prio %d.", i,
-				(unsigned long) devinfo->dev_t, devinfo->prio);
-		toi_bio_ops.register_storage(devinfo);
-	}
-
-	return 0;
-}
-
-static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used)
-{
-    struct hibernate_extent *extentpointer = NULL;
-    unsigned long extentvalue;
-    unsigned long i = 0, first_freed = 0;
-
-    toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) {
-        i++;
-        if (i > used) {
-            swap_free((swp_entry_t) { extentvalue });
-            if (!first_freed)
-                first_freed = extentvalue;
-        }
-    }
-
-    return first_freed;
-}
-
-/*
- * workspace_size
- *
- * Description:
- * Returns the number of bytes of RAM needed for this
- * code to do its work. (Used when calculating whether
- * we have enough memory to be able to hibernate & resume).
- *
- */
-static int toi_swap_memory_needed(void)
-{
-	return 1;
-}
-
-/*
- * Print debug info
- *
- * Description:
- */
-static int toi_swap_print_debug_stats(char *buffer, int size)
-{
-	int len = 0;
-
-	len = scnprintf(buffer, size, "- Swap Allocator enabled.\n");
-	if (swapfilename[0])
-		len += scnprintf(buffer+len, size-len,
-			"  Attempting to automatically swapon: %s.\n",
-			swapfilename);
-
-	si_swapinfo_no_compcache();
-
-	len += scnprintf(buffer+len, size-len,
-			"  Swap available for image: %lu pages.\n",
-			swapinfo.freeswap + swap_allocated);
-
-	return len;
-}
-
-static int header_locations_read_sysfs(const char *page, int count)
-{
-	int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
-	struct inode *swapf = NULL;
-	int zone;
-	char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL);
-	char *path, *output = (char *) page;
-	int path_len;
-
-	if (!page)
-		return 0;
-
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		struct swap_info_struct *si =  get_swap_info_struct(i);
-
-		if (!si || !(si->flags & SWP_WRITEOK))
-			continue;
-
-		if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) {
-			haveswap = 1;
-			if (!printedpartitionsmessage) {
-				len += sprintf(output + len,
-					"For swap partitions, simply use the "
-					"format: resume=swap:/dev/hda1.\n");
-				printedpartitionsmessage = 1;
-			}
-		} else {
-			path_len = 0;
-
-			path = d_path(&si->swap_file->f_path, path_page,
-					PAGE_SIZE);
-			path_len = snprintf(path_page, PAGE_SIZE, "%s", path);
-
-			haveswap = 1;
-			swapf = si->swap_file->f_mapping->host;
-			zone = bmap(swapf, 0);
-			if (!zone) {
-				len += sprintf(output + len,
-					"Swapfile %s has been corrupted. Reuse"
-					" mkswap on it and try again.\n",
-					path_page);
-			} else {
-				char name_buffer[BDEVNAME_SIZE];
-				len += sprintf(output + len,
-					"For swapfile `%s`,"
-					" use resume=swap:/dev/%s:0x%x.\n",
-					path_page,
-					bdevname(si->bdev, name_buffer),
-					zone << (swapf->i_blkbits - 9));
-			}
-		}
-	}
-
-	if (!haveswap)
-		len = sprintf(output, "You need to turn on swap partitions "
-				"before examining this file.\n");
-
-	toi_free_page(10, (unsigned long) path_page);
-	return len;
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL),
-	SYSFS_CUSTOM("headerlocations", SYSFS_READONLY,
-			header_locations_read_sysfs, NULL, 0, NULL),
-	SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0,
-			attempt_to_parse_resume_device2),
-};
-
-static struct toi_bio_allocator_ops toi_bio_swapops = {
-	.register_storage			= toi_swap_register_storage,
-	.storage_available			= toi_swap_storage_available,
-	.allocate_storage			= toi_swap_allocate_storage,
-	.bmap					= get_main_pool_phys_params,
-	.free_storage				= toi_swap_free_storage,
-        .free_unused_storage                    = toi_swap_free_unused_storage,
-};
-
-static struct toi_module_ops toi_swapops = {
-	.type					= BIO_ALLOCATOR_MODULE,
-	.name					= "swap storage",
-	.directory				= "swap",
-	.module					= THIS_MODULE,
-	.memory_needed				= toi_swap_memory_needed,
-	.print_debug_info			= toi_swap_print_debug_stats,
-	.initialise				= toi_swap_initialise,
-	.cleanup				= toi_swap_cleanup,
-	.bio_allocator_ops			= &toi_bio_swapops,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-static __init int toi_swap_load(void)
-{
-	return toi_register_module(&toi_swapops);
-}
-
-late_initcall(toi_swap_load);
diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c
deleted file mode 100644
index 79c9315b6..000000000
--- a/kernel/power/tuxonice_sysfs.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * kernel/power/tuxonice_sysfs.c
- *
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains support for sysfs entries for tuning TuxOnIce.
- *
- * We have a generic handler that deals with the most common cases, and
- * hooks for special handlers to use.
- */
-
-#include <linux/suspend.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_alloc.h"
-
-static int toi_sysfs_initialised;
-
-static void toi_initialise_sysfs(void);
-
-static struct toi_sysfs_data sysfs_params[];
-
-#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr)
-
-static void toi_main_wrapper(void)
-{
-	toi_try_hibernate();
-}
-
-static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr,
-			      char *page)
-{
-	struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
-	int len = 0;
-	int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ;
-
-	if (full_prep && toi_start_anything(0))
-		return -EBUSY;
-
-	if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
-		toi_prepare_usm();
-
-	switch (sysfs_data->type) {
-	case TOI_SYSFS_DATA_CUSTOM:
-		len = (sysfs_data->data.special.read_sysfs) ?
-			(sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE)
-			: 0;
-		break;
-	case TOI_SYSFS_DATA_BIT:
-		len = sprintf(page, "%d\n",
-			-test_bit(sysfs_data->data.bit.bit,
-				sysfs_data->data.bit.bit_vector));
-		break;
-	case TOI_SYSFS_DATA_INTEGER:
-		len = sprintf(page, "%d\n",
-			*(sysfs_data->data.integer.variable));
-		break;
-	case TOI_SYSFS_DATA_LONG:
-		len = sprintf(page, "%ld\n",
-			*(sysfs_data->data.a_long.variable));
-		break;
-	case TOI_SYSFS_DATA_UL:
-		len = sprintf(page, "%lu\n",
-			*(sysfs_data->data.ul.variable));
-		break;
-	case TOI_SYSFS_DATA_STRING:
-		len = sprintf(page, "%s\n",
-			sysfs_data->data.string.variable);
-		break;
-	}
-
-	if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
-		toi_cleanup_usm();
-
-	if (full_prep)
-		toi_finish_anything(0);
-
-	return len;
-}
-
-#define BOUND(_variable, _type) do { \
-	if (*_variable < sysfs_data->data._type.minimum) \
-		*_variable = sysfs_data->data._type.minimum; \
-	else if (*_variable > sysfs_data->data._type.maximum) \
-		*_variable = sysfs_data->data._type.maximum; \
-} while (0)
-
-static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr,
-		const char *my_buf, size_t count)
-{
-	int assigned_temp_buffer = 0, result = count;
-	struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
-
-	if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME)))
-		return -EBUSY;
-
-	((char *) my_buf)[count] = 0;
-
-	if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
-		toi_prepare_usm();
-
-	switch (sysfs_data->type) {
-	case TOI_SYSFS_DATA_CUSTOM:
-		if (sysfs_data->data.special.write_sysfs)
-			result = (sysfs_data->data.special.write_sysfs)(my_buf,
-					count);
-		break;
-	case TOI_SYSFS_DATA_BIT:
-		{
-		unsigned long value;
-		result = kstrtoul(my_buf, 0, &value);
-		if (result)
-			break;
-		if (value)
-			set_bit(sysfs_data->data.bit.bit,
-				(sysfs_data->data.bit.bit_vector));
-		else
-			clear_bit(sysfs_data->data.bit.bit,
-				(sysfs_data->data.bit.bit_vector));
-		}
-		break;
-	case TOI_SYSFS_DATA_INTEGER:
-		{
-			long temp;
-			result = kstrtol(my_buf, 0, &temp);
-			if (result)
-				break;
-			*(sysfs_data->data.integer.variable) = (int) temp;
-			BOUND(sysfs_data->data.integer.variable, integer);
-			break;
-		}
-	case TOI_SYSFS_DATA_LONG:
-		{
-			long *variable =
-				sysfs_data->data.a_long.variable;
-			result = kstrtol(my_buf, 0, variable);
-			if (result)
-				break;
-			BOUND(variable, a_long);
-			break;
-		}
-	case TOI_SYSFS_DATA_UL:
-		{
-			unsigned long *variable =
-				sysfs_data->data.ul.variable;
-			result = kstrtoul(my_buf, 0, variable);
-			if (result)
-				break;
-			BOUND(variable, ul);
-			break;
-		}
-		break;
-	case TOI_SYSFS_DATA_STRING:
-		{
-			int copy_len = count;
-			char *variable =
-				sysfs_data->data.string.variable;
-
-			if (sysfs_data->data.string.max_length &&
-			    (copy_len > sysfs_data->data.string.max_length))
-				copy_len = sysfs_data->data.string.max_length;
-
-			if (!variable) {
-				variable = (char *) toi_get_zeroed_page(31,
-						TOI_ATOMIC_GFP);
-				sysfs_data->data.string.variable = variable;
-				assigned_temp_buffer = 1;
-			}
-			strncpy(variable, my_buf, copy_len);
-			if (copy_len && my_buf[copy_len - 1] == '\n')
-				variable[count - 1] = 0;
-			variable[count] = 0;
-		}
-		break;
-	}
-
-	if (!result)
-		result = count;
-
-	/* Side effect routine? */
-	if (result == count && sysfs_data->write_side_effect)
-		sysfs_data->write_side_effect();
-
-	/* Free temporary buffers */
-	if (assigned_temp_buffer) {
-		toi_free_page(31,
-			(unsigned long) sysfs_data->data.string.variable);
-		sysfs_data->data.string.variable = NULL;
-	}
-
-	if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
-		toi_cleanup_usm();
-
-	toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME);
-
-	return result;
-}
-
-static struct sysfs_ops toi_sysfs_ops = {
-	.show	= &toi_attr_show,
-	.store	= &toi_attr_store,
-};
-
-static struct kobj_type toi_ktype = {
-	.sysfs_ops	= &toi_sysfs_ops,
-};
-
-struct kobject *tuxonice_kobj;
-
-/* Non-module sysfs entries.
- *
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL,
-		SYSFS_HIBERNATING, toi_main_wrapper),
-	SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL,
-		SYSFS_RESUMING, toi_try_resume)
-};
-
-void remove_toi_sysdir(struct kobject *kobj)
-{
-	if (!kobj)
-		return;
-
-	kobject_put(kobj);
-}
-
-struct kobject *make_toi_sysdir(char *name)
-{
-	struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj);
-
-	if (!kobj) {
-		printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs "
-				"dir!\n");
-		return NULL;
-	}
-
-	kobj->ktype = &toi_ktype;
-
-	return kobj;
-}
-
-/* toi_register_sysfs_file
- *
- * Helper for registering a new /sysfs/tuxonice entry.
- */
-
-int toi_register_sysfs_file(
-		struct kobject *kobj,
-		struct toi_sysfs_data *toi_sysfs_data)
-{
-	int result;
-
-	if (!toi_sysfs_initialised)
-		toi_initialise_sysfs();
-
-	result = sysfs_create_file(kobj, &toi_sysfs_data->attr);
-	if (result)
-		printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s "
-			"returned %d.\n",
-			toi_sysfs_data->attr.name, result);
-	kobj->ktype = &toi_ktype;
-
-	return result;
-}
-
-/* toi_unregister_sysfs_file
- *
- * Helper for removing unwanted /sys/power/tuxonice entries.
- *
- */
-void toi_unregister_sysfs_file(struct kobject *kobj,
-		struct toi_sysfs_data *toi_sysfs_data)
-{
-	sysfs_remove_file(kobj, &toi_sysfs_data->attr);
-}
-
-void toi_cleanup_sysfs(void)
-{
-	int i,
-	    numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
-
-	if (!toi_sysfs_initialised)
-		return;
-
-	for (i = 0; i < numfiles; i++)
-		toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-
-	kobject_put(tuxonice_kobj);
-	toi_sysfs_initialised = 0;
-}
-
-/* toi_initialise_sysfs
- *
- * Initialise the /sysfs/tuxonice directory.
- */
-
-static void toi_initialise_sysfs(void)
-{
-	int i;
-	int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
-
-	if (toi_sysfs_initialised)
-		return;
-
-	/* Make our TuxOnIce directory a child of /sys/power */
-	tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj);
-	if (!tuxonice_kobj)
-		return;
-
-	toi_sysfs_initialised = 1;
-
-	for (i = 0; i < numfiles; i++)
-		toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-}
-
-int toi_sysfs_init(void)
-{
-	toi_initialise_sysfs();
-	return 0;
-}
-
-void toi_sysfs_exit(void)
-{
-	toi_cleanup_sysfs();
-}
diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h
deleted file mode 100644
index 5b331b19a..000000000
--- a/kernel/power/tuxonice_sysfs.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * kernel/power/tuxonice_sysfs.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/sysfs.h>
-
-struct toi_sysfs_data {
-	struct attribute attr;
-	int type;
-	int flags;
-	union {
-		struct {
-			unsigned long *bit_vector;
-			int bit;
-		} bit;
-		struct {
-			int *variable;
-			int minimum;
-			int maximum;
-		} integer;
-		struct {
-			long *variable;
-			long minimum;
-			long maximum;
-		} a_long;
-		struct {
-			unsigned long *variable;
-			unsigned long minimum;
-			unsigned long maximum;
-		} ul;
-		struct {
-			char *variable;
-			int max_length;
-		} string;
-		struct {
-			int (*read_sysfs) (const char *buffer, int count);
-			int (*write_sysfs) (const char *buffer, int count);
-			void *data;
-		} special;
-	} data;
-
-	/* Side effects routine. Used, eg, for reparsing the
-	 * resume= entry when it changes */
-	void (*write_side_effect) (void);
-	struct list_head sysfs_data_list;
-};
-
-enum {
-	TOI_SYSFS_DATA_NONE = 1,
-	TOI_SYSFS_DATA_CUSTOM,
-	TOI_SYSFS_DATA_BIT,
-	TOI_SYSFS_DATA_INTEGER,
-	TOI_SYSFS_DATA_UL,
-	TOI_SYSFS_DATA_LONG,
-	TOI_SYSFS_DATA_STRING
-};
-
-#define SYSFS_WRITEONLY 0200
-#define SYSFS_READONLY 0444
-#define SYSFS_RW 0644
-
-#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \
-	.attr = {.name  = _name , .mode   = _mode }, \
-	.type = TOI_SYSFS_DATA_BIT, \
-	.flags = _flags, \
-	.data = { .bit = { .bit_vector = _ul, .bit = _bit } } }
-
-#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \
-	.attr = {.name  = _name , .mode   = _mode }, \
-	.type = TOI_SYSFS_DATA_INTEGER, \
-	.flags = _flags, \
-	.data = { .integer = { .variable = _int, .minimum = _min, \
-			.maximum = _max } }, \
-	.write_side_effect = _wse }
-
-#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \
-	.attr = {.name  = _name , .mode   = _mode }, \
-	.type = TOI_SYSFS_DATA_UL, \
-	.flags = _flags, \
-	.data = { .ul = { .variable = _ul, .minimum = _min, \
-			.maximum = _max } } }
-
-#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \
-	.attr = {.name  = _name , .mode   = _mode }, \
-	.type = TOI_SYSFS_DATA_LONG, \
-	.flags = _flags, \
-	.data = { .a_long = { .variable = _long, .minimum = _min, \
-			.maximum = _max } } }
-
-#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \
-	.attr = {.name  = _name , .mode   = _mode }, \
-	.type = TOI_SYSFS_DATA_STRING, \
-	.flags = _flags, \
-	.data = { .string = { .variable = _string, .max_length = _max_len } }, \
-	.write_side_effect = _wse }
-
-#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \
-	.attr = {.name  = _name , .mode   = _mode }, \
-	.type = TOI_SYSFS_DATA_CUSTOM, \
-	.flags = _flags, \
-	.data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \
-	.write_side_effect = _wse }
-
-#define SYSFS_NONE(_name, _wse) { \
-	.attr = {.name  = _name , .mode   = SYSFS_WRITEONLY }, \
-	.type = TOI_SYSFS_DATA_NONE, \
-	.write_side_effect = _wse, \
-}
-
-/* Flags */
-#define SYSFS_NEEDS_SM_FOR_READ 1
-#define SYSFS_NEEDS_SM_FOR_WRITE 2
-#define SYSFS_HIBERNATE 4
-#define SYSFS_RESUME 8
-#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME)
-#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE)
-#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE)
-#define SYSFS_NEEDS_SM_FOR_BOTH \
- (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE)
-
-int toi_register_sysfs_file(struct kobject *kobj,
-		struct toi_sysfs_data *toi_sysfs_data);
-void toi_unregister_sysfs_file(struct kobject *kobj,
-		struct toi_sysfs_data *toi_sysfs_data);
-
-extern struct kobject *tuxonice_kobj;
-
-struct kobject *make_toi_sysdir(char *name);
-void remove_toi_sysdir(struct kobject *obj);
-extern void toi_cleanup_sysfs(void);
-
-extern int toi_sysfs_init(void);
-extern void toi_sysfs_exit(void);
diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c
deleted file mode 100644
index c405f9b9a..000000000
--- a/kernel/power/tuxonice_ui.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * kernel/power/tuxonice_ui.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for TuxOnIce's user interface.
- *
- * The user interface code talks to a userspace program via a
- * netlink socket.
- *
- * The kernel side:
- * - starts the userui program;
- * - sends text messages and progress bar status;
- *
- * The user space side:
- * - passes messages regarding user requests (abort, toggle reboot etc)
- *
- */
-
-#define __KERNEL_SYSCALLS__
-
-#include <linux/reboot.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_builtin.h"
-
-static char local_printf_buf[1024];	/* Same as printk - should be safe */
-struct ui_ops *toi_current_ui;
-
-/**
- * toi_wait_for_keypress - Wait for keypress via userui or /dev/console.
- *
- * @timeout: Maximum time to wait.
- *
- * Wait for a keypress, either from userui or /dev/console if userui isn't
- * available. The non-userui path is particularly for at boot-time, prior
- * to userui being started, when we have an important warning to give to
- * the user.
- */
-static char toi_wait_for_keypress(int timeout)
-{
-	if (toi_current_ui && toi_current_ui->wait_for_key(timeout))
-		return ' ';
-
-	return toi_wait_for_keypress_dev_console(timeout);
-}
-
-/* toi_early_boot_message()
- * Description:	Handle errors early in the process of booting.
- * 		The user may press C to continue booting, perhaps
- * 		invalidating the image,  or space to reboot.
- * 		This works from either the serial console or normally
- * 		attached keyboard.
- *
- * 		Note that we come in here from init, while the kernel is
- * 		locked. If we want to get events from the serial console,
- * 		we need to temporarily unlock the kernel.
- *
- * 		toi_early_boot_message may also be called post-boot.
- * 		In this case, it simply printks the message and returns.
- *
- * Arguments:	int	Whether we are able to erase the image.
- * 		int	default_answer. What to do when we timeout. This
- * 			will normally be continue, but the user might
- * 			provide command line options (__setup) to override
- * 			particular cases.
- * 		Char *. Pointer to a string explaining why we're moaning.
- */
-
-#define say(message, a...) printk(KERN_EMERG message, ##a)
-
-void toi_early_boot_message(int message_detail, int default_answer,
-	char *warning_reason, ...)
-{
-#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
-	unsigned long orig_state = get_toi_state(), continue_req = 0;
-	unsigned long orig_loglevel = console_loglevel;
-	int can_ask = 1;
-#else
-	int can_ask = 0;
-#endif
-
-	va_list args;
-	int printed_len;
-
-	if (!toi_wait) {
-		set_toi_state(TOI_CONTINUE_REQ);
-		can_ask = 0;
-	}
-
-	if (warning_reason) {
-		va_start(args, warning_reason);
-		printed_len = vsnprintf(local_printf_buf,
-				sizeof(local_printf_buf),
-				warning_reason,
-				args);
-		va_end(args);
-	}
-
-	if (!test_toi_state(TOI_BOOT_TIME)) {
-		printk("TuxOnIce: %s\n", local_printf_buf);
-		return;
-	}
-
-	if (!can_ask) {
-		continue_req = !!default_answer;
-		goto post_ask;
-	}
-
-#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
-	console_loglevel = 7;
-
-	say("=== TuxOnIce ===\n\n");
-	if (warning_reason) {
-		say("BIG FAT WARNING!! %s\n\n", local_printf_buf);
-		switch (message_detail) {
-		case 0:
-			say("If you continue booting, note that any image WILL"
-				"NOT BE REMOVED.\nTuxOnIce is unable to do so "
-				"because the appropriate modules aren't\n"
-				"loaded. You should manually remove the image "
-				"to avoid any\npossibility of corrupting your "
-				"filesystem(s) later.\n");
-			break;
-		case 1:
-			say("If you want to use the current TuxOnIce image, "
-				"reboot and try\nagain with the same kernel "
-				"that you hibernated from. If you want\n"
-				"to forget that image, continue and the image "
-				"will be erased.\n");
-			break;
-		}
-		say("Press SPACE to reboot or C to continue booting with "
-			"this kernel\n\n");
-		if (toi_wait > 0)
-			say("Default action if you don't select one in %d "
-				"seconds is: %s.\n",
-				toi_wait,
-				default_answer == TOI_CONTINUE_REQ ?
-				"continue booting" : "reboot");
-	} else {
-		say("BIG FAT WARNING!!\n\n"
-			"You have tried to resume from this image before.\n"
-			"If it failed once, it may well fail again.\n"
-			"Would you like to remove the image and boot "
-			"normally?\nThis will be equivalent to entering "
-			"noresume on the\nkernel command line.\n\n"
-			"Press SPACE to remove the image or C to continue "
-			"resuming.\n\n");
-		if (toi_wait > 0)
-			say("Default action if you don't select one in %d "
-				"seconds is: %s.\n", toi_wait,
-				!!default_answer ?
-				"continue resuming" : "remove the image");
-	}
-	console_loglevel = orig_loglevel;
-
-	set_toi_state(TOI_SANITY_CHECK_PROMPT);
-	clear_toi_state(TOI_CONTINUE_REQ);
-
-	if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */
-		continue_req = !!default_answer;
-	else
-		continue_req = test_toi_state(TOI_CONTINUE_REQ);
-
-#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */
-
-post_ask:
-	if ((warning_reason) && (!continue_req))
-		kernel_restart(NULL);
-
-	restore_toi_state(orig_state);
-	if (continue_req)
-		set_toi_state(TOI_CONTINUE_REQ);
-}
-
-#undef say
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
-	SYSFS_INT("default_console_level", SYSFS_RW,
-			&toi_bkd.toi_default_console_level, 0, 7, 0, NULL),
-	SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0,
-			1 << 30, 0),
-	SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL,
-			0)
-#endif
-};
-
-static struct toi_module_ops userui_ops = {
-	.type				= MISC_HIDDEN_MODULE,
-	.name				= "printk ui",
-	.directory			= "user_interface",
-	.module				= THIS_MODULE,
-	.sysfs_data			= sysfs_params,
-	.num_sysfs_entries		= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-int toi_register_ui_ops(struct ui_ops *this_ui)
-{
-	if (toi_current_ui) {
-		printk(KERN_INFO "Only one TuxOnIce user interface module can "
-				"be loaded at a time.");
-		return -EBUSY;
-	}
-
-	toi_current_ui = this_ui;
-
-	return 0;
-}
-
-void toi_remove_ui_ops(struct ui_ops *this_ui)
-{
-	if (toi_current_ui != this_ui)
-		return;
-
-	toi_current_ui = NULL;
-}
-
-/* toi_console_sysfs_init
- * Description: Boot time initialisation for user interface.
- */
-
-int toi_ui_init(void)
-{
-	return toi_register_module(&userui_ops);
-}
-
-void toi_ui_exit(void)
-{
-	toi_unregister_module(&userui_ops);
-}
diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h
deleted file mode 100644
index d71c607f6..000000000
--- a/kernel/power/tuxonice_ui.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * kernel/power/tuxonice_ui.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- */
-
-enum {
-	DONT_CLEAR_BAR,
-	CLEAR_BAR
-};
-
-enum {
-	/* Userspace -> Kernel */
-	USERUI_MSG_ABORT = 0x11,
-	USERUI_MSG_SET_STATE = 0x12,
-	USERUI_MSG_GET_STATE = 0x13,
-	USERUI_MSG_GET_DEBUG_STATE = 0x14,
-	USERUI_MSG_SET_DEBUG_STATE = 0x15,
-	USERUI_MSG_SPACE = 0x18,
-	USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A,
-	USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B,
-	USERUI_MSG_GET_LOGLEVEL = 0x1C,
-	USERUI_MSG_SET_LOGLEVEL = 0x1D,
-	USERUI_MSG_PRINTK = 0x1E,
-
-	/* Kernel -> Userspace */
-	USERUI_MSG_MESSAGE = 0x21,
-	USERUI_MSG_PROGRESS = 0x22,
-	USERUI_MSG_POST_ATOMIC_RESTORE = 0x25,
-
-	USERUI_MSG_MAX,
-};
-
-struct userui_msg_params {
-	u32 a, b, c, d;
-	char text[255];
-};
-
-struct ui_ops {
-	char (*wait_for_key) (int timeout);
-	u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...);
-	void (*prepare_status) (int clearbar, const char *fmt, ...);
-	void (*cond_pause) (int pause, char *message);
-	void (*abort)(int result_code, const char *fmt, ...);
-	void (*prepare)(void);
-	void (*cleanup)(void);
-	void (*message)(u32 section, u32 level, u32 normally_logged,
-			const char *fmt, ...);
-};
-
-extern struct ui_ops *toi_current_ui;
-
-#define toi_update_status(val, max, fmt, args...) \
- (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \
-	max)
-
-#define toi_prepare_console(void) \
-	do { if (toi_current_ui) \
-		(toi_current_ui->prepare)(); \
-	} while (0)
-
-#define toi_cleanup_console(void) \
-	do { if (toi_current_ui) \
-		(toi_current_ui->cleanup)(); \
-	} while (0)
-
-#define abort_hibernate(result, fmt, args...) \
-	do { if (toi_current_ui) \
-		(toi_current_ui->abort)(result, fmt, ##args); \
-	     else { \
-		set_abort_result(result); \
-	     } \
-	} while (0)
-
-#define toi_cond_pause(pause, message) \
-	do { if (toi_current_ui) \
-		(toi_current_ui->cond_pause)(pause, message); \
-	} while (0)
-
-#define toi_prepare_status(clear, fmt, args...) \
-	do { if (toi_current_ui) \
-		(toi_current_ui->prepare_status)(clear, fmt, ##args); \
-	     else \
-		printk(KERN_INFO fmt "%s", ##args, "\n"); \
-	} while (0)
-
-#define toi_message(sn, lev, log, fmt, a...) \
-do { \
-	if (toi_current_ui && (!sn || test_debug_state(sn))) \
-		toi_current_ui->message(sn, lev, log, fmt, ##a); \
-} while (0)
-
-__exit void toi_ui_cleanup(void);
-extern int toi_ui_init(void);
-extern void toi_ui_exit(void);
-extern int toi_register_ui_ops(struct ui_ops *this_ui);
-extern void toi_remove_ui_ops(struct ui_ops *this_ui);
diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c
deleted file mode 100644
index edc885c72..000000000
--- a/kernel/power/tuxonice_userui.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * kernel/power/user_ui.c
- *
- * Copyright (C) 2005-2007 Bernard Blackham
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for TuxOnIce's user interface.
- *
- * The user interface code talks to a userspace program via a
- * netlink socket.
- *
- * The kernel side:
- * - starts the userui program;
- * - sends text messages and progress bar status;
- *
- * The user space side:
- * - passes messages regarding user requests (abort, toggle reboot etc)
- *
- */
-
-#define __KERNEL_SYSCALLS__
-
-#include <linux/suspend.h>
-#include <linux/freezer.h>
-#include <linux/console.h>
-#include <linux/ctype.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>
-#include <linux/reboot.h>
-#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/vt.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_power_off.h"
-
-static char local_printf_buf[1024];	/* Same as printk - should be safe */
-
-static struct user_helper_data ui_helper_data;
-static struct toi_module_ops userui_ops;
-static int orig_kmsg;
-
-static char lastheader[512];
-static int lastheader_message_len;
-static int ui_helper_changed; /* Used at resume-time so don't overwrite value
-				set from initrd/ramfs. */
-
-/* Number of distinct progress amounts that userspace can display */
-static int progress_granularity = 30;
-
-static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
-static int userui_wait_should_wake;
-
-#define toi_stop_waiting_for_userui_key() \
-{ \
-	userui_wait_should_wake = true; \
-	wake_up_interruptible(&userui_wait_for_key); \
-}
-
-/**
- * ui_nl_set_state - Update toi_action based on a message from userui.
- *
- * @n: The bit (1 << bit) to set.
- */
-static void ui_nl_set_state(int n)
-{
-	/* Only let them change certain settings */
-	static const u32 toi_action_mask =
-		(1 << TOI_REBOOT) | (1 << TOI_PAUSE) |
-		(1 << TOI_LOGALL) |
-		(1 << TOI_SINGLESTEP) |
-		(1 << TOI_PAUSE_NEAR_PAGESET_END);
-	static unsigned long new_action;
-
-	new_action = (toi_bkd.toi_action & (~toi_action_mask)) |
-		(n & toi_action_mask);
-
-	printk(KERN_DEBUG "n is %x. Action flags being changed from %lx "
-			"to %lx.", n, toi_bkd.toi_action, new_action);
-	toi_bkd.toi_action = new_action;
-
-	if (!test_action_state(TOI_PAUSE) &&
-			!test_action_state(TOI_SINGLESTEP))
-		toi_stop_waiting_for_userui_key();
-}
-
-/**
- * userui_post_atomic_restore - Tell userui that atomic restore just happened.
- *
- * Tell userui that atomic restore just occured, so that it can do things like
- * redrawing the screen, re-getting settings and so on.
- */
-static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
-	toi_send_netlink_message(&ui_helper_data,
-			USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0);
-}
-
-/**
- * userui_storage_needed - Report how much memory in image header is needed.
- */
-static int userui_storage_needed(void)
-{
-	return sizeof(ui_helper_data.program) + 1 + sizeof(int);
-}
-
-/**
- * userui_save_config_info - Fill buffer with config info for image header.
- *
- * @buf: Buffer into which to put the config info we want to save.
- */
-static int userui_save_config_info(char *buf)
-{
-	*((int *) buf) = progress_granularity;
-	memcpy(buf + sizeof(int), ui_helper_data.program,
-			sizeof(ui_helper_data.program));
-	return sizeof(ui_helper_data.program) + sizeof(int) + 1;
-}
-
-/**
- * userui_load_config_info - Restore config info from buffer.
- *
- * @buf: Buffer containing header info loaded.
- * @size: Size of data loaded for this module.
- */
-static void userui_load_config_info(char *buf, int size)
-{
-	progress_granularity = *((int *) buf);
-	size -= sizeof(int);
-
-	/* Don't load the saved path if one has already been set */
-	if (ui_helper_changed)
-		return;
-
-	if (size > sizeof(ui_helper_data.program))
-		size = sizeof(ui_helper_data.program);
-
-	memcpy(ui_helper_data.program, buf + sizeof(int), size);
-	ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0';
-}
-
-/**
- * set_ui_program_set: Record that userui program was changed.
- *
- * Side effect routine for when the userui program is set. In an initrd or
- * ramfs, the user may set a location for the userui program. If this happens,
- * we don't want to reload the value that was saved in the image header. This
- * routine allows us to flag that we shouldn't restore the program name from
- * the image header.
- */
-static void set_ui_program_set(void)
-{
-	ui_helper_changed = 1;
-}
-
-/**
- * userui_memory_needed - Tell core how much memory to reserve for us.
- */
-static int userui_memory_needed(void)
-{
-	/* ball park figure of 128 pages */
-	return 128 * PAGE_SIZE;
-}
-
-/**
- * userui_update_status - Update the progress bar and (if on) in-bar message.
- *
- * @value: Current progress percentage numerator.
- * @maximum: Current progress percentage denominator.
- * @fmt: Message to be displayed in the middle of the progress bar.
- *
- * Note that a NULL message does not mean that any previous message is erased!
- * For that, you need toi_prepare_status with clearbar on.
- *
- * Returns an unsigned long, being the next numerator (as determined by the
- * maximum and progress granularity) where status needs to be updated.
- * This is to reduce unnecessary calls to update_status.
- */
-static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...)
-{
-	static u32 last_step = 9999;
-	struct userui_msg_params msg;
-	u32 this_step, next_update;
-	int bitshift;
-
-	if (ui_helper_data.pid == -1)
-		return 0;
-
-	if ((!maximum) || (!progress_granularity))
-		return maximum;
-
-	if (value < 0)
-		value = 0;
-
-	if (value > maximum)
-		value = maximum;
-
-	/* Try to avoid math problems - we can't do 64 bit math here
-	 * (and shouldn't need it - anyone got screen resolution
-	 * of 65536 pixels or more?) */
-	bitshift = fls(maximum) - 16;
-	if (bitshift > 0) {
-		u32 temp_maximum = maximum >> bitshift;
-		u32 temp_value = value >> bitshift;
-		this_step = (u32)
-			(temp_value * progress_granularity / temp_maximum);
-		next_update = (((this_step + 1) * temp_maximum /
-					progress_granularity) + 1) << bitshift;
-	} else {
-		this_step = (u32) (value * progress_granularity / maximum);
-		next_update = ((this_step + 1) * maximum /
-				progress_granularity) + 1;
-	}
-
-	if (this_step == last_step)
-		return next_update;
-
-	memset(&msg, 0, sizeof(msg));
-
-	msg.a = this_step;
-	msg.b = progress_granularity;
-
-	if (fmt) {
-		va_list args;
-		va_start(args, fmt);
-		vsnprintf(msg.text, sizeof(msg.text), fmt, args);
-		va_end(args);
-		msg.text[sizeof(msg.text)-1] = '\0';
-	}
-
-	toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
-			&msg, sizeof(msg));
-	last_step = this_step;
-
-	return next_update;
-}
-
-/**
- * userui_message - Display a message without necessarily logging it.
- *
- * @section: Type of message. Messages can be filtered by type.
- * @level: Degree of importance of the message. Lower values = higher priority.
- * @normally_logged: Whether logged even if log_everything is off.
- * @fmt: Message (and parameters).
- *
- * This function is intended to do the same job as printk, but without normally
- * logging what is printed. The point is to be able to get debugging info on
- * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M"
- *
- * It may be called from an interrupt context - can't sleep!
- */
-static void userui_message(u32 section, u32 level, u32 normally_logged,
-		const char *fmt, ...)
-{
-	struct userui_msg_params msg;
-
-	if ((level) && (level > console_loglevel))
-		return;
-
-	memset(&msg, 0, sizeof(msg));
-
-	msg.a = section;
-	msg.b = level;
-	msg.c = normally_logged;
-
-	if (fmt) {
-		va_list args;
-		va_start(args, fmt);
-		vsnprintf(msg.text, sizeof(msg.text), fmt, args);
-		va_end(args);
-		msg.text[sizeof(msg.text)-1] = '\0';
-	}
-
-	if (test_action_state(TOI_LOGALL))
-		printk(KERN_INFO "%s\n", msg.text);
-
-	toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
-			&msg, sizeof(msg));
-}
-
-/**
- * wait_for_key_via_userui - Wait for userui to receive a keypress.
- */
-static void wait_for_key_via_userui(void)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue(&userui_wait_for_key, &wait);
-	set_current_state(TASK_INTERRUPTIBLE);
-
-	wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake);
-	userui_wait_should_wake = false;
-
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&userui_wait_for_key, &wait);
-}
-
-/**
- * userui_prepare_status - Display high level messages.
- *
- * @clearbar: Whether to clear the progress bar.
- * @fmt...: New message for the title.
- *
- * Prepare the 'nice display', drawing the header and version, along with the
- * current action and perhaps also resetting the progress bar.
- */
-static void userui_prepare_status(int clearbar, const char *fmt, ...)
-{
-	va_list args;
-
-	if (fmt) {
-		va_start(args, fmt);
-		lastheader_message_len = vsnprintf(lastheader, 512, fmt, args);
-		va_end(args);
-	}
-
-	if (clearbar)
-		toi_update_status(0, 1, NULL);
-
-	if (ui_helper_data.pid == -1)
-		printk(KERN_EMERG "%s\n", lastheader);
-	else
-		toi_message(0, TOI_STATUS, 1, lastheader, NULL);
-}
-
-/**
- * toi_wait_for_keypress - Wait for keypress via userui.
- *
- * @timeout: Maximum time to wait.
- *
- * Wait for a keypress from userui.
- *
- * FIXME: Implement timeout?
- */
-static char userui_wait_for_keypress(int timeout)
-{
-	char key = '\0';
-
-	if (ui_helper_data.pid != -1) {
-		wait_for_key_via_userui();
-		key = ' ';
-	}
-
-	return key;
-}
-
-/**
- * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it.
- *
- * @result_code: Reason why we're aborting (1 << bit).
- * @fmt: Message to display if telling the user what's going on.
- *
- * Abort a cycle. If this wasn't at the user's request (and we're displaying
- * output), tell the user why and wait for them to acknowledge the message.
- */
-static void userui_abort_hibernate(int result_code, const char *fmt, ...)
-{
-	va_list args;
-	int printed_len = 0;
-
-	set_result_state(result_code);
-
-	if (test_result_state(TOI_ABORTED))
-		return;
-
-	set_result_state(TOI_ABORTED);
-
-	if (test_result_state(TOI_ABORT_REQUESTED))
-		return;
-
-	va_start(args, fmt);
-	printed_len = vsnprintf(local_printf_buf,  sizeof(local_printf_buf),
-			fmt, args);
-	va_end(args);
-	if (ui_helper_data.pid != -1)
-		printed_len = sprintf(local_printf_buf + printed_len,
-					" (Press SPACE to continue)");
-
-	toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf);
-
-	if (ui_helper_data.pid != -1)
-		userui_wait_for_keypress(0);
-}
-
-/**
- * request_abort_hibernate - Abort hibernating or resuming at user request.
- *
- * Handle the user requesting the cancellation of a hibernation or resume by
- * pressing escape.
- */
-static void request_abort_hibernate(void)
-{
-	if (test_result_state(TOI_ABORT_REQUESTED) ||
-	   !test_action_state(TOI_CAN_CANCEL))
-		return;
-
-	if (test_toi_state(TOI_NOW_RESUMING)) {
-		toi_prepare_status(CLEAR_BAR, "Escape pressed. "
-					"Powering down again.");
-		set_toi_state(TOI_STOP_RESUME);
-		while (!test_toi_state(TOI_IO_STOPPED))
-			schedule();
-		if (toiActiveAllocator->mark_resume_attempted)
-			toiActiveAllocator->mark_resume_attempted(0);
-		toi_power_down();
-	}
-
-	toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :"
-					" ABORTING HIBERNATION ---");
-	set_abort_result(TOI_ABORT_REQUESTED);
-	toi_stop_waiting_for_userui_key();
-}
-
-/**
- * userui_user_rcv_msg - Receive a netlink message from userui.
- *
- * @skb: skb received.
- * @nlh: Netlink header received.
- */
-static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
-	int type;
-	int *data;
-
-	type = nlh->nlmsg_type;
-
-	/* A control message: ignore them */
-	if (type < NETLINK_MSG_BASE)
-		return 0;
-
-	/* Unknown message: reply with EINVAL */
-	if (type >= USERUI_MSG_MAX)
-		return -EINVAL;
-
-	/* All operations require privileges, even GET */
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
-	/* Only allow one task to receive NOFREEZE privileges */
-	if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) {
-		printk(KERN_INFO "Got NOFREEZE_ME request when "
-			"ui_helper_data.pid is %d.\n", ui_helper_data.pid);
-		return -EBUSY;
-	}
-
-	data = (int *) NLMSG_DATA(nlh);
-
-	switch (type) {
-	case USERUI_MSG_ABORT:
-		request_abort_hibernate();
-		return 0;
-	case USERUI_MSG_GET_STATE:
-		toi_send_netlink_message(&ui_helper_data,
-				USERUI_MSG_GET_STATE, &toi_bkd.toi_action,
-				sizeof(toi_bkd.toi_action));
-		return 0;
-	case USERUI_MSG_GET_DEBUG_STATE:
-		toi_send_netlink_message(&ui_helper_data,
-				USERUI_MSG_GET_DEBUG_STATE,
-				&toi_bkd.toi_debug_state,
-				sizeof(toi_bkd.toi_debug_state));
-		return 0;
-	case USERUI_MSG_SET_STATE:
-		if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
-			return -EINVAL;
-		ui_nl_set_state(*data);
-		return 0;
-	case USERUI_MSG_SET_DEBUG_STATE:
-		if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
-			return -EINVAL;
-		toi_bkd.toi_debug_state = (*data);
-		return 0;
-	case USERUI_MSG_SPACE:
-		toi_stop_waiting_for_userui_key();
-		return 0;
-	case USERUI_MSG_GET_POWERDOWN_METHOD:
-		toi_send_netlink_message(&ui_helper_data,
-				USERUI_MSG_GET_POWERDOWN_METHOD,
-				&toi_poweroff_method,
-				sizeof(toi_poweroff_method));
-		return 0;
-	case USERUI_MSG_SET_POWERDOWN_METHOD:
-		if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char)))
-			return -EINVAL;
-		toi_poweroff_method = (unsigned long)(*data);
-		return 0;
-	case USERUI_MSG_GET_LOGLEVEL:
-		toi_send_netlink_message(&ui_helper_data,
-				USERUI_MSG_GET_LOGLEVEL,
-				&toi_bkd.toi_default_console_level,
-				sizeof(toi_bkd.toi_default_console_level));
-		return 0;
-	case USERUI_MSG_SET_LOGLEVEL:
-		if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
-			return -EINVAL;
-		toi_bkd.toi_default_console_level = (*data);
-		return 0;
-	case USERUI_MSG_PRINTK:
-		printk(KERN_INFO "%s", (char *) data);
-		return 0;
-	}
-
-	/* Unhandled here */
-	return 1;
-}
-
-/**
- * userui_cond_pause - Possibly pause at user request.
- *
- * @pause: Whether to pause or just display the message.
- * @message: Message to display at the start of pausing.
- *
- * Potentially pause and wait for the user to tell us to continue. We normally
- * only pause when @pause is set. While paused, the user can do things like
- * changing the loglevel, toggling the display of debugging sections and such
- * like.
- */
-static void userui_cond_pause(int pause, char *message)
-{
-	int displayed_message = 0, last_key = 0;
-
-	while (last_key != 32 &&
-		ui_helper_data.pid != -1 &&
-		((test_action_state(TOI_PAUSE) && pause) ||
-		 (test_action_state(TOI_SINGLESTEP)))) {
-		if (!displayed_message) {
-			toi_prepare_status(DONT_CLEAR_BAR,
-			   "%s Press SPACE to continue.%s",
-			   message ? message : "",
-			   (test_action_state(TOI_SINGLESTEP)) ?
-			   " Single step on." : "");
-			displayed_message = 1;
-		}
-		last_key = userui_wait_for_keypress(0);
-	}
-	schedule();
-}
-
-/**
- * userui_prepare_console - Prepare the console for use.
- *
- * Prepare a console for use, saving current kmsg settings and attempting to
- * start userui. Console loglevel changes are handled by userui.
- */
-static void userui_prepare_console(void)
-{
-	orig_kmsg = vt_kmsg_redirect(fg_console + 1);
-
-	ui_helper_data.pid = -1;
-
-	if (!userui_ops.enabled) {
-		printk(KERN_INFO "TuxOnIce: Userui disabled.\n");
-		return;
-	}
-
-	if (*ui_helper_data.program)
-		toi_netlink_setup(&ui_helper_data);
-	else
-		printk(KERN_INFO "TuxOnIce: Userui program not configured.\n");
-}
-
-/**
- * userui_cleanup_console - Cleanup after a cycle.
- *
- * Tell userui to cleanup, and restore kmsg_redirect to its original value.
- */
-
-static void userui_cleanup_console(void)
-{
-	if (ui_helper_data.pid > -1)
-		toi_netlink_close(&ui_helper_data);
-
-	vt_kmsg_redirect(orig_kmsg);
-}
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
-	SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_CAN_CANCEL, 0),
-	SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action,
-			TOI_PAUSE, 0),
-	SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL),
-	SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1,
-			2048, 0, NULL),
-	SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0,
-			set_ui_program_set),
-	SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL)
-#endif
-};
-
-static struct toi_module_ops userui_ops = {
-	.type				= MISC_MODULE,
-	.name				= "userui",
-	.shared_directory		= "user_interface",
-	.module				= THIS_MODULE,
-	.storage_needed			= userui_storage_needed,
-	.save_config_info		= userui_save_config_info,
-	.load_config_info		= userui_load_config_info,
-	.memory_needed			= userui_memory_needed,
-	.post_atomic_restore		= userui_post_atomic_restore,
-	.sysfs_data			= sysfs_params,
-	.num_sysfs_entries		= sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-static struct ui_ops my_ui_ops = {
-	.update_status			= userui_update_status,
-	.message			= userui_message,
-	.prepare_status			= userui_prepare_status,
-	.abort				= userui_abort_hibernate,
-	.cond_pause			= userui_cond_pause,
-	.prepare			= userui_prepare_console,
-	.cleanup			= userui_cleanup_console,
-	.wait_for_key			= userui_wait_for_keypress,
-};
-
-/**
- * toi_user_ui_init - Boot time initialisation for user interface.
- *
- * Invoked from the core init routine.
- */
-static __init int toi_user_ui_init(void)
-{
-	int result;
-
-	ui_helper_data.nl = NULL;
-	strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255);
-	ui_helper_data.pid = -1;
-	ui_helper_data.skb_size = sizeof(struct userui_msg_params);
-	ui_helper_data.pool_limit = 6;
-	ui_helper_data.netlink_id = NETLINK_TOI_USERUI;
-	ui_helper_data.name = "userspace ui";
-	ui_helper_data.rcv_msg = userui_user_rcv_msg;
-	ui_helper_data.interface_version = 8;
-	ui_helper_data.must_init = 0;
-	ui_helper_data.not_ready = userui_cleanup_console;
-	init_completion(&ui_helper_data.wait_for_process);
-	result = toi_register_module(&userui_ops);
-        if (!result) {
-          result = toi_register_ui_ops(&my_ui_ops);
-          if (result)
-            toi_unregister_module(&userui_ops);
-        }
-
-	return result;
-}
-
-late_initcall(toi_user_ui_init);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 83cf08088..f62f2d3f9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -33,7 +33,6 @@
 #include <linux/bootmem.h>
 #include <linux/memblock.h>
 #include <linux/syscalls.h>
-#include <linux/suspend.h>
 #include <linux/kexec.h>
 #include <linux/kdb.h>
 #include <linux/ratelimit.h>
@@ -86,6 +85,18 @@ static struct lockdep_map console_lock_dep_map = {
 #endif
 
 /*
+ * Number of registered extended console drivers.
+ *
+ * If extended consoles are present, in-kernel cont reassembly is disabled
+ * and each fragment is stored as a separate log entry with proper
+ * continuation flag so that every emitted message has full metadata.  This
+ * doesn't change the result for regular consoles or /proc/kmsg.  For
+ * /dev/kmsg, as long as the reader concatenates messages according to
+ * consecutive continuation flags, the end result should be the same too.
+ */
+static int nr_ext_console_drivers;
+
+/*
  * Helper macros to handle lockdep when locking/unlocking console_sem. We use
  * macros instead of functions so that _RET_IP_ contains useful information.
  */
@@ -196,14 +207,14 @@ static int console_may_schedule;
  * need to be changed in the future, when the requirements change.
  *
  * /dev/kmsg exports the structured data in the following line format:
- *   "level,sequnum,timestamp;<message text>\n"
+ *   "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n"
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
  *
  * The optional key/value pairs are attached as continuation lines starting
  * with a space character and terminated by a newline. All possible
  * non-prinatable characters are escaped in the "\xff" notation.
- *
- * Users of the export format should ignore possible additional values
- * separated by ',', and find the message after the ';' character.
  */
 
 enum log_flags {
@@ -269,20 +280,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
 static char *log_buf = __log_buf;
 static u32 log_buf_len = __LOG_BUF_LEN;
 
-#ifdef CONFIG_TOI_INCREMENTAL
-void toi_set_logbuf_untracked(void)
-{
-    int i;
-    struct page *log_buf_start_page = virt_to_page(__log_buf);
-
-    printk("Not protecting kernel printk log buffer (%p-%p).\n",
-            __log_buf, __log_buf + __LOG_BUF_LEN);
-
-    for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++)
-        SetPageTOI_Untracked(log_buf_start_page + i);
-}
-#endif
-
 /* Return log buffer address */
 char *log_buf_addr_get(void)
 {
@@ -492,13 +489,13 @@ static int syslog_action_restricted(int type)
 	       type != SYSLOG_ACTION_SIZE_BUFFER;
 }
 
-int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, int source)
 {
 	/*
 	 * If this is from /proc/kmsg and we've already opened it, then we've
 	 * already done the capabilities checks at open time.
 	 */
-	if (from_file && type != SYSLOG_ACTION_OPEN)
+	if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN)
 		goto ok;
 
 	if (syslog_action_restricted(type)) {
@@ -521,6 +518,86 @@ ok:
 	return security_syslog(type);
 }
 
+static void append_char(char **pp, char *e, char c)
+{
+	if (*pp < e)
+		*(*pp)++ = c;
+}
+
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+				    struct printk_log *msg, u64 seq,
+				    enum log_flags prev_flags)
+{
+	u64 ts_usec = msg->ts_nsec;
+	char cont = '-';
+
+	do_div(ts_usec, 1000);
+
+	/*
+	 * If we couldn't merge continuation line fragments during the print,
+	 * export the stored flags to allow an optional external merge of the
+	 * records. Merging the records isn't always necessarily correct, like
+	 * when we hit a race during printing. In most cases though, it produces
+	 * better readable output. 'c' in the record flags marks the first
+	 * fragment of a line, '+' the following.
+	 */
+	if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT))
+		cont = 'c';
+	else if ((msg->flags & LOG_CONT) ||
+		 ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+		cont = '+';
+
+	return scnprintf(buf, size, "%u,%llu,%llu,%c;",
+		       (msg->facility << 3) | msg->level, seq, ts_usec, cont);
+}
+
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+				  char *dict, size_t dict_len,
+				  char *text, size_t text_len)
+{
+	char *p = buf, *e = buf + size;
+	size_t i;
+
+	/* escape non-printable characters */
+	for (i = 0; i < text_len; i++) {
+		unsigned char c = text[i];
+
+		if (c < ' ' || c >= 127 || c == '\\')
+			p += scnprintf(p, e - p, "\\x%02x", c);
+		else
+			append_char(&p, e, c);
+	}
+	append_char(&p, e, '\n');
+
+	if (dict_len) {
+		bool line = true;
+
+		for (i = 0; i < dict_len; i++) {
+			unsigned char c = dict[i];
+
+			if (line) {
+				append_char(&p, e, ' ');
+				line = false;
+			}
+
+			if (c == '\0') {
+				append_char(&p, e, '\n');
+				line = true;
+				continue;
+			}
+
+			if (c < ' ' || c >= 127 || c == '\\') {
+				p += scnprintf(p, e - p, "\\x%02x", c);
+				continue;
+			}
+
+			append_char(&p, e, c);
+		}
+		append_char(&p, e, '\n');
+	}
+
+	return p - buf;
+}
 
 /* /dev/kmsg - userspace message inject/listen interface */
 struct devkmsg_user {
@@ -528,7 +605,7 @@ struct devkmsg_user {
 	u32 idx;
 	enum log_flags prev;
 	struct mutex lock;
-	char buf[8192];
+	char buf[CONSOLE_EXT_LOG_MAX];
 };
 
 static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
@@ -586,9 +663,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 {
 	struct devkmsg_user *user = file->private_data;
 	struct printk_log *msg;
-	u64 ts_usec;
-	size_t i;
-	char cont = '-';
 	size_t len;
 	ssize_t ret;
 
@@ -624,66 +698,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	}
 
 	msg = log_from_idx(user->idx);
-	ts_usec = msg->ts_nsec;
-	do_div(ts_usec, 1000);
+	len = msg_print_ext_header(user->buf, sizeof(user->buf),
+				   msg, user->seq, user->prev);
+	len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
+				  log_dict(msg), msg->dict_len,
+				  log_text(msg), msg->text_len);
 
-	/*
-	 * If we couldn't merge continuation line fragments during the print,
-	 * export the stored flags to allow an optional external merge of the
-	 * records. Merging the records isn't always neccessarily correct, like
-	 * when we hit a race during printing. In most cases though, it produces
-	 * better readable output. 'c' in the record flags mark the first
-	 * fragment of a line, '+' the following.
-	 */
-	if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
-		cont = 'c';
-	else if ((msg->flags & LOG_CONT) ||
-		 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
-		cont = '+';
-
-	len = sprintf(user->buf, "%u,%llu,%llu,%c;",
-		      (msg->facility << 3) | msg->level,
-		      user->seq, ts_usec, cont);
 	user->prev = msg->flags;
-
-	/* escape non-printable characters */
-	for (i = 0; i < msg->text_len; i++) {
-		unsigned char c = log_text(msg)[i];
-
-		if (c < ' ' || c >= 127 || c == '\\')
-			len += sprintf(user->buf + len, "\\x%02x", c);
-		else
-			user->buf[len++] = c;
-	}
-	user->buf[len++] = '\n';
-
-	if (msg->dict_len) {
-		bool line = true;
-
-		for (i = 0; i < msg->dict_len; i++) {
-			unsigned char c = log_dict(msg)[i];
-
-			if (line) {
-				user->buf[len++] = ' ';
-				line = false;
-			}
-
-			if (c == '\0') {
-				user->buf[len++] = '\n';
-				line = true;
-				continue;
-			}
-
-			if (c < ' ' || c >= 127 || c == '\\') {
-				len += sprintf(user->buf + len, "\\x%02x", c);
-				continue;
-			}
-
-			user->buf[len++] = c;
-		}
-		user->buf[len++] = '\n';
-	}
-
 	user->idx = log_next(user->idx);
 	user->seq++;
 	raw_spin_unlock_irq(&logbuf_lock);
@@ -1269,13 +1290,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 	return len;
 }
 
-int do_syslog(int type, char __user *buf, int len, bool from_file)
+int do_syslog(int type, char __user *buf, int len, int source)
 {
 	bool clear = false;
 	static int saved_console_loglevel = LOGLEVEL_DEFAULT;
 	int error;
 
-	error = check_syslog_permissions(type, from_file);
+	error = check_syslog_permissions(type, source);
 	if (error)
 		goto out;
 
@@ -1358,7 +1379,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			syslog_prev = 0;
 			syslog_partial = 0;
 		}
-		if (from_file) {
+		if (source == SYSLOG_FROM_PROC) {
 			/*
 			 * Short-cut for poll(/"proc/kmsg") which simply checks
 			 * for pending data, not the size; return the count of
@@ -1405,7 +1426,9 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
  * log_buf[start] to log_buf[end - 1].
  * The console_lock must be held.
  */
-static void call_console_drivers(int level, const char *text, size_t len)
+static void call_console_drivers(int level,
+				 const char *ext_text, size_t ext_len,
+				 const char *text, size_t len)
 {
 	struct console *con;
 
@@ -1426,7 +1449,10 @@ static void call_console_drivers(int level, const char *text, size_t len)
 		if (!cpu_online(smp_processor_id()) &&
 		    !(con->flags & CON_ANYTIME))
 			continue;
-		con->write(con, text, len, level);
+		if (con->flags & CON_EXTENDED)
+			con->write(con, ext_text, ext_len, level);
+		else
+			con->write(con, text, len, level);
 	}
 }
 
@@ -1569,8 +1595,12 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
 	if (cont.len && cont.flushed)
 		return false;
 
-	if (cont.len + len > sizeof(cont.buf)) {
-		/* the line gets too long, split it up in separate records */
+	/*
+	 * If ext consoles are present, flush and skip in-kernel
+	 * continuation.  See nr_ext_console_drivers definition.  Also, if
+	 * the line gets too long, split it up in separate records.
+	 */
+	if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) {
 		cont_flush(LOG_CONT);
 		return false;
 	}
@@ -1905,9 +1935,19 @@ static struct cont {
 	u8 level;
 	bool flushed:1;
 } cont;
+static char *log_text(const struct printk_log *msg) { return NULL; }
+static char *log_dict(const struct printk_log *msg) { return NULL; }
 static struct printk_log *log_from_idx(u32 idx) { return NULL; }
 static u32 log_next(u32 idx) { return 0; }
-static void call_console_drivers(int level, const char *text, size_t len) {}
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+				    struct printk_log *msg, u64 seq,
+				    enum log_flags prev_flags) { return 0; }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+				  char *dict, size_t dict_len,
+				  char *text, size_t text_len) { return 0; }
+static void call_console_drivers(int level,
+				 const char *ext_text, size_t ext_len,
+				 const char *text, size_t len) {}
 static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
 			     bool syslog, char *buf, size_t size) { return 0; }
 static size_t cont_print_text(char *text, size_t size) { return 0; }
@@ -2160,7 +2200,7 @@ static void console_cont_flush(char *text, size_t size)
 	len = cont_print_text(text, size);
 	raw_spin_unlock(&logbuf_lock);
 	stop_critical_timings();
-	call_console_drivers(cont.level, text, len);
+	call_console_drivers(cont.level, NULL, 0, text, len);
 	start_critical_timings();
 	local_irq_restore(flags);
 	return;
@@ -2184,6 +2224,7 @@ out:
  */
 void console_unlock(void)
 {
+	static char ext_text[CONSOLE_EXT_LOG_MAX];
 	static char text[LOG_LINE_MAX + PREFIX_MAX];
 	static u64 seen_seq;
 	unsigned long flags;
@@ -2202,6 +2243,7 @@ void console_unlock(void)
 again:
 	for (;;) {
 		struct printk_log *msg;
+		size_t ext_len = 0;
 		size_t len;
 		int level;
 
@@ -2247,13 +2289,22 @@ skip:
 		level = msg->level;
 		len += msg_print_text(msg, console_prev, false,
 				      text + len, sizeof(text) - len);
+		if (nr_ext_console_drivers) {
+			ext_len = msg_print_ext_header(ext_text,
+						sizeof(ext_text),
+						msg, console_seq, console_prev);
+			ext_len += msg_print_ext_body(ext_text + ext_len,
+						sizeof(ext_text) - ext_len,
+						log_dict(msg), msg->dict_len,
+						log_text(msg), msg->text_len);
+		}
 		console_idx = log_next(console_idx);
 		console_seq++;
 		console_prev = msg->flags;
 		raw_spin_unlock(&logbuf_lock);
 
 		stop_critical_timings();	/* don't trace print latency */
-		call_console_drivers(level, text, len);
+		call_console_drivers(level, ext_text, ext_len, text, len);
 		start_critical_timings();
 		local_irq_restore(flags);
 	}
@@ -2509,6 +2560,11 @@ void register_console(struct console *newcon)
 		newcon->next = console_drivers->next;
 		console_drivers->next = newcon;
 	}
+
+	if (newcon->flags & CON_EXTENDED)
+		if (!nr_ext_console_drivers++)
+			pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n");
+
 	if (newcon->flags & CON_PRINTBUFFER) {
 		/*
 		 * console_unlock(); will print out the buffered messages
@@ -2581,6 +2637,9 @@ int unregister_console(struct console *console)
 		}
 	}
 
+	if (!res && (console->flags & CON_EXTENDED))
+		nr_ext_console_drivers--;
+
 	/*
 	 * If this isn't the last console and it has CON_CONSDEV set, we
 	 * need to set it on the next preferred console.
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8dbe27611..59e32684c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -241,6 +241,7 @@ rcu_torture_free(struct rcu_torture *p)
 struct rcu_torture_ops {
 	int ttype;
 	void (*init)(void);
+	void (*cleanup)(void);
 	int (*readlock)(void);
 	void (*read_delay)(struct torture_random_state *rrsp);
 	void (*readunlock)(int idx);
@@ -477,10 +478,12 @@ static struct rcu_torture_ops rcu_busted_ops = {
  */
 
 DEFINE_STATIC_SRCU(srcu_ctl);
+static struct srcu_struct srcu_ctld;
+static struct srcu_struct *srcu_ctlp = &srcu_ctl;
 
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
 {
-	return srcu_read_lock(&srcu_ctl);
+	return srcu_read_lock(srcu_ctlp);
 }
 
 static void srcu_read_delay(struct torture_random_state *rrsp)
@@ -499,49 +502,49 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
 		rcu_read_delay(rrsp);
 }
 
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
 {
-	srcu_read_unlock(&srcu_ctl, idx);
+	srcu_read_unlock(srcu_ctlp, idx);
 }
 
 static unsigned long srcu_torture_completed(void)
 {
-	return srcu_batches_completed(&srcu_ctl);
+	return srcu_batches_completed(srcu_ctlp);
 }
 
 static void srcu_torture_deferred_free(struct rcu_torture *rp)
 {
-	call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+	call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
 }
 
 static void srcu_torture_synchronize(void)
 {
-	synchronize_srcu(&srcu_ctl);
+	synchronize_srcu(srcu_ctlp);
 }
 
 static void srcu_torture_call(struct rcu_head *head,
 			      void (*func)(struct rcu_head *head))
 {
-	call_srcu(&srcu_ctl, head, func);
+	call_srcu(srcu_ctlp, head, func);
 }
 
 static void srcu_torture_barrier(void)
 {
-	srcu_barrier(&srcu_ctl);
+	srcu_barrier(srcu_ctlp);
 }
 
 static void srcu_torture_stats(void)
 {
 	int cpu;
-	int idx = srcu_ctl.completed & 0x1;
+	int idx = srcu_ctlp->completed & 0x1;
 
 	pr_alert("%s%s per-CPU(idx=%d):",
 		 torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
 		long c0, c1;
 
-		c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
-		c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
+		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
 		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
 	}
 	pr_cont("\n");
@@ -549,7 +552,7 @@ static void srcu_torture_stats(void)
 
 static void srcu_torture_synchronize_expedited(void)
 {
-	synchronize_srcu_expedited(&srcu_ctl);
+	synchronize_srcu_expedited(srcu_ctlp);
 }
 
 static struct rcu_torture_ops srcu_ops = {
@@ -569,6 +572,38 @@ static struct rcu_torture_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+static void srcu_torture_init(void)
+{
+	rcu_sync_torture_init();
+	WARN_ON(init_srcu_struct(&srcu_ctld));
+	srcu_ctlp = &srcu_ctld;
+}
+
+static void srcu_torture_cleanup(void)
+{
+	cleanup_srcu_struct(&srcu_ctld);
+	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
+}
+
+/* As above, but dynamically allocated. */
+static struct rcu_torture_ops srcud_ops = {
+	.ttype		= SRCU_FLAVOR,
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock,
+	.started	= NULL,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= srcu_torture_deferred_free,
+	.sync		= srcu_torture_synchronize,
+	.exp_sync	= srcu_torture_synchronize_expedited,
+	.call		= srcu_torture_call,
+	.cb_barrier	= srcu_torture_barrier,
+	.stats		= srcu_torture_stats,
+	.name		= "srcud"
+};
+
 /*
  * Definitions for sched torture testing.
  */
@@ -672,8 +707,8 @@ static void rcu_torture_boost_cb(struct rcu_head *head)
 	struct rcu_boost_inflight *rbip =
 		container_of(head, struct rcu_boost_inflight, rcu);
 
-	smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
-	rbip->inflight = 0;
+	/* Ensure RCU-core accesses precede clearing ->inflight */
+	smp_store_release(&rbip->inflight, 0);
 }
 
 static int rcu_torture_boost(void *arg)
@@ -710,9 +745,9 @@ static int rcu_torture_boost(void *arg)
 		call_rcu_time = jiffies;
 		while (ULONG_CMP_LT(jiffies, endtime)) {
 			/* If we don't have a callback in flight, post one. */
-			if (!rbi.inflight) {
-				smp_mb(); /* RCU core before ->inflight = 1. */
-				rbi.inflight = 1;
+			if (!smp_load_acquire(&rbi.inflight)) {
+				/* RCU core before ->inflight = 1. */
+				smp_store_release(&rbi.inflight, 1);
 				call_rcu(&rbi.rcu, rcu_torture_boost_cb);
 				if (jiffies - call_rcu_time >
 					 test_boost_duration * HZ - HZ / 2) {
@@ -751,11 +786,10 @@ checkwait:	stutter_wait("rcu_torture_boost");
 	} while (!torture_must_stop());
 
 	/* Clean up and exit. */
-	while (!kthread_should_stop() || rbi.inflight) {
+	while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
 		torture_shutdown_absorb("rcu_torture_boost");
 		schedule_timeout_uninterruptible(1);
 	}
-	smp_mb(); /* order accesses to ->inflight before stack-frame death. */
 	destroy_rcu_head_on_stack(&rbi.rcu);
 	torture_kthread_stopping("rcu_torture_boost");
 	return 0;
@@ -1054,7 +1088,7 @@ static void rcu_torture_timer(unsigned long unused)
 	p = rcu_dereference_check(rcu_torture_current,
 				  rcu_read_lock_bh_held() ||
 				  rcu_read_lock_sched_held() ||
-				  srcu_read_lock_held(&srcu_ctl));
+				  srcu_read_lock_held(srcu_ctlp));
 	if (p == NULL) {
 		/* Leave because rcu_torture_writer is not yet underway */
 		cur_ops->readunlock(idx);
@@ -1128,7 +1162,7 @@ rcu_torture_reader(void *arg)
 		p = rcu_dereference_check(rcu_torture_current,
 					  rcu_read_lock_bh_held() ||
 					  rcu_read_lock_sched_held() ||
-					  srcu_read_lock_held(&srcu_ctl));
+					  srcu_read_lock_held(srcu_ctlp));
 		if (p == NULL) {
 			/* Wait for rcu_torture_writer to get underway */
 			cur_ops->readunlock(idx);
@@ -1413,12 +1447,15 @@ static int rcu_torture_barrier_cbs(void *arg)
 	do {
 		wait_event(barrier_cbs_wq[myid],
 			   (newphase =
-			    ACCESS_ONCE(barrier_phase)) != lastphase ||
+			    smp_load_acquire(&barrier_phase)) != lastphase ||
 			   torture_must_stop());
 		lastphase = newphase;
-		smp_mb(); /* ensure barrier_phase load before ->call(). */
 		if (torture_must_stop())
 			break;
+		/*
+		 * The above smp_load_acquire() ensures barrier_phase load
+		 * is ordered before the following ->call().
+		 */
 		cur_ops->call(&rcu, rcu_torture_barrier_cbf);
 		if (atomic_dec_and_test(&barrier_cbs_count))
 			wake_up(&barrier_wq);
@@ -1439,8 +1476,8 @@ static int rcu_torture_barrier(void *arg)
 	do {
 		atomic_set(&barrier_cbs_invoked, 0);
 		atomic_set(&barrier_cbs_count, n_barrier_cbs);
-		smp_mb(); /* Ensure barrier_phase after prior assignments. */
-		barrier_phase = !barrier_phase;
+		/* Ensure barrier_phase ordered after prior assignments. */
+		smp_store_release(&barrier_phase, !barrier_phase);
 		for (i = 0; i < n_barrier_cbs; i++)
 			wake_up(&barrier_cbs_wq[i]);
 		wait_event(barrier_wq,
@@ -1588,10 +1625,14 @@ rcu_torture_cleanup(void)
 			rcutorture_booster_cleanup(i);
 	}
 
-	/* Wait for all RCU callbacks to fire.  */
-
+	/*
+	 * Wait for all RCU callbacks to fire, then do flavor-specific
+	 * cleanup operations.
+	 */
 	if (cur_ops->cb_barrier != NULL)
 		cur_ops->cb_barrier();
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
 
 	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
 
@@ -1668,8 +1709,8 @@ rcu_torture_init(void)
 	int cpu;
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] = {
-		&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
-		RCUTORTURE_TASKS_OPS
+		&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
+		&sched_ops, RCUTORTURE_TASKS_OPS
 	};
 
 	if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -1701,7 +1742,7 @@ rcu_torture_init(void)
 	if (nreaders >= 0) {
 		nrealreaders = nreaders;
 	} else {
-		nrealreaders = num_online_cpus() - 1;
+		nrealreaders = num_online_cpus() - 2 - nreaders;
 		if (nrealreaders <= 0)
 			nrealreaders = 1;
 	}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index cad76e76b..fb33d35ee 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
 		sum += t;
 	}
 	return sum;
@@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
 		sum += t;
 	}
 	return sum;
@@ -265,8 +265,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
-		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
 	}
 	return sum;
 }
@@ -296,7 +296,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
 {
 	int idx;
 
-	idx = ACCESS_ONCE(sp->completed) & 0x1;
+	idx = READ_ONCE(sp->completed) & 0x1;
 	preempt_disable();
 	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index ec3086879..c291bd65d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,7 +35,7 @@
 #include <linux/time.h>
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 
 #include "rcu.h"
 
@@ -49,39 +49,6 @@ static void __call_rcu(struct rcu_head *head,
 
 #include "tiny_plugin.h"
 
-/*
- * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode.
- */
-void rcu_idle_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-/*
- * Exit an interrupt handler towards idle.
- */
-void rcu_irq_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_exit);
-
-/*
- * Exit idle, so that we are no longer in an extended quiescent state.
- */
-void rcu_idle_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
-/*
- * Enter an interrupt handler, moving away from idle.
- */
-void rcu_irq_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_enter);
-
 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
 
 /*
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index f94e209a1..e492a5253 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 		return;
 	rcp->ticks_this_gp++;
 	j = jiffies;
-	js = ACCESS_ONCE(rcp->jiffies_stall);
+	js = READ_ONCE(rcp->jiffies_stall);
 	if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
 		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
 		       rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
 		       jiffies - rcp->gp_start, rcp->qlen);
 		dump_stack();
-		ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
-			3 * rcu_jiffies_till_stall_check() + 3;
+		WRITE_ONCE(rcp->jiffies_stall,
+			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	} else if (ULONG_CMP_GE(j, js)) {
-		ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+		WRITE_ONCE(rcp->jiffies_stall,
+			   jiffies + rcu_jiffies_till_stall_check());
 	}
 }
 
@@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
 {
 	rcp->ticks_this_gp = 0;
 	rcp->gp_start = jiffies;
-	ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+	WRITE_ONCE(rcp->jiffies_stall,
+		   jiffies + rcu_jiffies_till_stall_check());
 }
 
 static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8cf7304b2..65137bc28 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -54,7 +54,7 @@
 #include <linux/delay.h>
 #include <linux/stop_machine.h>
 #include <linux/random.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/suspend.h>
 
 #include "tree.h"
@@ -91,7 +91,7 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
 
 #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
 DEFINE_RCU_TPS(sname) \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
 struct rcu_state sname##_state = { \
 	.level = { &sname##_state.node[0] }, \
 	.rda = &sname##_data, \
@@ -110,11 +110,18 @@ struct rcu_state sname##_state = { \
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
 RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
 
-static struct rcu_state *rcu_state_p;
+static struct rcu_state *const rcu_state_p;
+static struct rcu_data __percpu *const rcu_data_p;
 LIST_HEAD(rcu_struct_flavors);
 
-/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
-static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+/* Dump rcu_node combining tree at boot to verify correct setup. */
+static bool dump_tree;
+module_param(dump_tree, bool, 0444);
+/* Control rcu_node-tree auto-balancing at boot time. */
+static bool rcu_fanout_exact;
+module_param(rcu_fanout_exact, bool, 0444);
+/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
 module_param(rcu_fanout_leaf, int, 0444);
 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
 static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
@@ -159,17 +166,46 @@ static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
 /* rcuc/rcub kthread realtime priority */
+#ifdef CONFIG_RCU_KTHREAD_PRIO
 static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
+static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
+#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
 module_param(kthread_prio, int, 0644);
 
 /* Delay in jiffies for grace-period initialization delays, debug only. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
+static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
+module_param(gp_preinit_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+static const int gp_preinit_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+
 #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
 static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
 module_param(gp_init_delay, int, 0644);
 #else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
 static const int gp_init_delay;
 #endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-#define PER_RCU_NODE_PERIOD 10	/* Number of grace periods between delays. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
+static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
+module_param(gp_cleanup_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static const int gp_cleanup_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+
+/*
+ * Number of grace periods between delays, normalized by the duration of
+ * the delay.  The longer the delay, the more the grace periods between
+ * each delay.  The reason for this normalization is that it means that,
+ * for non-zero delays, the overall slowdown of grace periods is constant
+ * regardless of the duration of the delay.  This arrangement balances
+ * the need for long delays to increase some race probabilities with the
+ * need for fast grace periods to increase other race probabilities.
+ */
+#define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays. */
 
 /*
  * Track the rcutorture test sequence number and the update version
@@ -191,17 +227,17 @@ unsigned long rcutorture_vernum;
  */
 unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
 {
-	return ACCESS_ONCE(rnp->qsmaskinitnext);
+	return READ_ONCE(rnp->qsmaskinitnext);
 }
 
 /*
- * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
+ * Return true if an RCU grace period is in progress.  The READ_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
  * structure's ->lock, but of course results can be subject to change.
  */
 static int rcu_gp_in_progress(struct rcu_state *rsp)
 {
-	return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+	return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
 }
 
 /*
@@ -278,8 +314,8 @@ static void rcu_momentary_dyntick_idle(void)
 		if (!(resched_mask & rsp->flavor_mask))
 			continue;
 		smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
-		if (ACCESS_ONCE(rdp->mynode->completed) !=
-		    ACCESS_ONCE(rdp->cond_resched_completed))
+		if (READ_ONCE(rdp->mynode->completed) !=
+		    READ_ONCE(rdp->cond_resched_completed))
 			continue;
 
 		/*
@@ -491,9 +527,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 		break;
 	}
 	if (rsp != NULL) {
-		*flags = ACCESS_ONCE(rsp->gp_flags);
-		*gpnum = ACCESS_ONCE(rsp->gpnum);
-		*completed = ACCESS_ONCE(rsp->completed);
+		*flags = READ_ONCE(rsp->gp_flags);
+		*gpnum = READ_ONCE(rsp->gpnum);
+		*completed = READ_ONCE(rsp->completed);
 		return;
 	}
 	*flags = 0;
@@ -539,10 +575,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 static int rcu_future_needs_gp(struct rcu_state *rsp)
 {
 	struct rcu_node *rnp = rcu_get_root(rsp);
-	int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+	int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
 	int *fp = &rnp->need_future_gp[idx];
 
-	return ACCESS_ONCE(*fp);
+	return READ_ONCE(*fp);
 }
 
 /*
@@ -565,7 +601,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 		return 1;  /* Yes, this CPU has newly registered callbacks. */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
 		if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
-		    ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+		    ULONG_CMP_LT(READ_ONCE(rsp->completed),
 				 rdp->nxtcompleted[i]))
 			return 1;  /* Yes, CBs for future grace period. */
 	return 0; /* No grace period needed. */
@@ -585,7 +621,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
 	trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
-	if (!user && !is_idle_task(current)) {
+	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+	    !user && !is_idle_task(current)) {
 		struct task_struct *idle __maybe_unused =
 			idle_task(smp_processor_id());
 
@@ -604,7 +641,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
 	smp_mb__before_atomic();  /* See above. */
 	atomic_inc(&rdtp->dynticks);
 	smp_mb__after_atomic();  /* Force ordering with next sojourn. */
-	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     atomic_read(&rdtp->dynticks) & 0x1);
 	rcu_dynticks_task_enter();
 
 	/*
@@ -630,7 +668,8 @@ static void rcu_eqs_enter(bool user)
 
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
-	WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     (oldval & DYNTICK_TASK_NEST_MASK) == 0);
 	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
 		rdtp->dynticks_nesting = 0;
 		rcu_eqs_enter_common(oldval, user);
@@ -703,7 +742,8 @@ void rcu_irq_exit(void)
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	rdtp->dynticks_nesting--;
-	WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     rdtp->dynticks_nesting < 0);
 	if (rdtp->dynticks_nesting)
 		trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
 	else
@@ -728,10 +768,12 @@ static void rcu_eqs_exit_common(long long oldval, int user)
 	atomic_inc(&rdtp->dynticks);
 	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
 	smp_mb__after_atomic();  /* See above. */
-	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     !(atomic_read(&rdtp->dynticks) & 0x1));
 	rcu_cleanup_after_idle();
 	trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
-	if (!user && !is_idle_task(current)) {
+	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+	    !user && !is_idle_task(current)) {
 		struct task_struct *idle __maybe_unused =
 			idle_task(smp_processor_id());
 
@@ -755,7 +797,7 @@ static void rcu_eqs_exit(bool user)
 
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
-	WARN_ON_ONCE(oldval < 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
 	if (oldval & DYNTICK_TASK_NEST_MASK) {
 		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
 	} else {
@@ -828,7 +870,8 @@ void rcu_irq_enter(void)
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	rdtp->dynticks_nesting++;
-	WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     rdtp->dynticks_nesting == 0);
 	if (oldval)
 		trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
 	else
@@ -1011,9 +1054,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		return 1;
 	} else {
-		if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
 				 rdp->mynode->gpnum))
-			ACCESS_ONCE(rdp->gpwrap) = true;
+			WRITE_ONCE(rdp->gpwrap, true);
 		return 0;
 	}
 }
@@ -1093,12 +1136,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 	if (ULONG_CMP_GE(jiffies,
 			 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
 	    ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
-		if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
-			ACCESS_ONCE(rdp->cond_resched_completed) =
-				ACCESS_ONCE(rdp->mynode->completed);
+		if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+			WRITE_ONCE(rdp->cond_resched_completed,
+				   READ_ONCE(rdp->mynode->completed));
 			smp_mb(); /* ->cond_resched_completed before *rcrmp. */
-			ACCESS_ONCE(*rcrmp) =
-				ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+			WRITE_ONCE(*rcrmp,
+				   READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
 			resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
 			rdp->rsp->jiffies_resched += 5; /* Enable beating. */
 		} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
@@ -1119,9 +1162,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
 	rsp->gp_start = j;
 	smp_wmb(); /* Record start time before stall time. */
 	j1 = rcu_jiffies_till_stall_check();
-	ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+	WRITE_ONCE(rsp->jiffies_stall, j + j1);
 	rsp->jiffies_resched = j + j1 / 2;
-	rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+	rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
 }
 
 /*
@@ -1133,10 +1176,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
 	unsigned long j;
 
 	j = jiffies;
-	gpa = ACCESS_ONCE(rsp->gp_activity);
+	gpa = READ_ONCE(rsp->gp_activity);
 	if (j - gpa > 2 * HZ)
-		pr_err("%s kthread starved for %ld jiffies!\n",
-		       rsp->name, j - gpa);
+		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+		       rsp->name, j - gpa,
+		       rsp->gpnum, rsp->completed, rsp->gp_flags);
 }
 
 /*
@@ -1173,12 +1217,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 	/* Only let one CPU complain about others per time interval. */
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+	delta = jiffies - READ_ONCE(rsp->jiffies_stall);
 	if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
-	ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+	WRITE_ONCE(rsp->jiffies_stall,
+		   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
@@ -1212,12 +1257,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 	if (ndetected) {
 		rcu_dump_cpu_stacks(rsp);
 	} else {
-		if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
-		    ACCESS_ONCE(rsp->completed) == gpnum) {
+		if (READ_ONCE(rsp->gpnum) != gpnum ||
+		    READ_ONCE(rsp->completed) == gpnum) {
 			pr_err("INFO: Stall ended before state dump start\n");
 		} else {
 			j = jiffies;
-			gpa = ACCESS_ONCE(rsp->gp_activity);
+			gpa = READ_ONCE(rsp->gp_activity);
 			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
 			       rsp->name, j - gpa, j, gpa,
 			       jiffies_till_next_fqs,
@@ -1262,9 +1307,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	rcu_dump_cpu_stacks(rsp);
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
-		ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
-				     3 * rcu_jiffies_till_stall_check() + 3;
+	if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+		WRITE_ONCE(rsp->jiffies_stall,
+			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
@@ -1307,20 +1352,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
 	 * and rsp->gp_start suffice to forestall false positives.
 	 */
-	gpnum = ACCESS_ONCE(rsp->gpnum);
+	gpnum = READ_ONCE(rsp->gpnum);
 	smp_rmb(); /* Pick up ->gpnum first... */
-	js = ACCESS_ONCE(rsp->jiffies_stall);
+	js = READ_ONCE(rsp->jiffies_stall);
 	smp_rmb(); /* ...then ->jiffies_stall before the rest... */
-	gps = ACCESS_ONCE(rsp->gp_start);
+	gps = READ_ONCE(rsp->gp_start);
 	smp_rmb(); /* ...and finally ->gp_start before ->completed. */
-	completed = ACCESS_ONCE(rsp->completed);
+	completed = READ_ONCE(rsp->completed);
 	if (ULONG_CMP_GE(completed, gpnum) ||
 	    ULONG_CMP_LT(j, js) ||
 	    ULONG_CMP_GE(gps, js))
 		return; /* No stall or GP completed since entering function. */
 	rnp = rdp->mynode;
 	if (rcu_gp_in_progress(rsp) &&
-	    (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+	    (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
 
 		/* We haven't checked in, so go dump stack. */
 		print_cpu_stall(rsp);
@@ -1347,7 +1392,7 @@ void rcu_cpu_stall_reset(void)
 	struct rcu_state *rsp;
 
 	for_each_rcu_flavor(rsp)
-		ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+		WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2);
 }
 
 /*
@@ -1457,7 +1502,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 	 * doing some extra useless work.
 	 */
 	if (rnp->gpnum != rnp->completed ||
-	    ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+	    READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
 		rnp->need_future_gp[c & 0x1]++;
 		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
 		goto out;
@@ -1542,7 +1587,7 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
 static void rcu_gp_kthread_wake(struct rcu_state *rsp)
 {
 	if (current == rsp->gp_kthread ||
-	    !ACCESS_ONCE(rsp->gp_flags) ||
+	    !READ_ONCE(rsp->gp_flags) ||
 	    !rsp->gp_kthread)
 		return;
 	wake_up(&rsp->gp_wq);
@@ -1677,7 +1722,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 
 	/* Handle the ends of any preceding grace periods first. */
 	if (rdp->completed == rnp->completed &&
-	    !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+	    !unlikely(READ_ONCE(rdp->gpwrap))) {
 
 		/* No grace period end, so just accelerate recent callbacks. */
 		ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1692,7 +1737,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
 	}
 
-	if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+	if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) {
 		/*
 		 * If the current grace period is waiting for this CPU,
 		 * set up to detect a quiescent state, otherwise don't
@@ -1704,7 +1749,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
 		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
-		ACCESS_ONCE(rdp->gpwrap) = false;
+		WRITE_ONCE(rdp->gpwrap, false);
 	}
 	return ret;
 }
@@ -1717,9 +1762,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	local_irq_save(flags);
 	rnp = rdp->mynode;
-	if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-	     rdp->completed == ACCESS_ONCE(rnp->completed) &&
-	     !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
+	if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
+	     rdp->completed == READ_ONCE(rnp->completed) &&
+	     !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
 	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
 		local_irq_restore(flags);
 		return;
@@ -1731,6 +1776,13 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 		rcu_gp_kthread_wake(rsp);
 }
 
+static void rcu_gp_slow(struct rcu_state *rsp, int delay)
+{
+	if (delay > 0 && /* Tested first: delay is part of the divisor. */
+	    !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+		schedule_timeout_uninterruptible(delay); /* Periodic stall. */
+}
+
 /*
  * Initialize a new grace period.  Return 0 if no grace period required.
  */
@@ -1740,15 +1792,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	raw_spin_lock_irq(&rnp->lock);
 	smp_mb__after_unlock_lock();
-	if (!ACCESS_ONCE(rsp->gp_flags)) {
+	if (!READ_ONCE(rsp->gp_flags)) {
 		/* Spurious wakeup, tell caller to go back to sleep.  */
 		raw_spin_unlock_irq(&rnp->lock);
 		return 0;
 	}
-	ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+	WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
 
 	if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
 		/*
@@ -1773,6 +1825,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	 * will handle subsequent offline CPUs.
 	 */
 	rcu_for_each_leaf_node(rsp, rnp) {
+		rcu_gp_slow(rsp, gp_preinit_delay);
 		raw_spin_lock_irq(&rnp->lock);
 		smp_mb__after_unlock_lock();
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1829,14 +1882,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	 * process finishes, because this kthread handles both.
 	 */
 	rcu_for_each_node_breadth_first(rsp, rnp) {
+		rcu_gp_slow(rsp, gp_init_delay);
 		raw_spin_lock_irq(&rnp->lock);
 		smp_mb__after_unlock_lock();
 		rdp = this_cpu_ptr(rsp->rda);
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
-		ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
+		WRITE_ONCE(rnp->gpnum, rsp->gpnum);
 		if (WARN_ON_ONCE(rnp->completed != rsp->completed))
-			ACCESS_ONCE(rnp->completed) = rsp->completed;
+			WRITE_ONCE(rnp->completed, rsp->completed);
 		if (rnp == rdp->mynode)
 			(void)__note_gp_changes(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);
@@ -1845,10 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched_rcu_qs();
-		ACCESS_ONCE(rsp->gp_activity) = jiffies;
-		if (gp_init_delay > 0 &&
-		    !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
-			schedule_timeout_uninterruptible(gp_init_delay);
+		WRITE_ONCE(rsp->gp_activity, jiffies);
 	}
 
 	return 1;
@@ -1864,7 +1915,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 	unsigned long maxj;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	rsp->n_force_qs++;
 	if (fqs_state == RCU_SAVE_DYNTICK) {
 		/* Collect dyntick-idle snapshots. */
@@ -1882,11 +1933,11 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
 	}
 	/* Clear flag to prevent immediate re-entry. */
-	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 		raw_spin_lock_irq(&rnp->lock);
 		smp_mb__after_unlock_lock();
-		ACCESS_ONCE(rsp->gp_flags) =
-			ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
+		WRITE_ONCE(rsp->gp_flags,
+			   READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
 		raw_spin_unlock_irq(&rnp->lock);
 	}
 	return fqs_state;
@@ -1903,7 +1954,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	raw_spin_lock_irq(&rnp->lock);
 	smp_mb__after_unlock_lock();
 	gp_duration = jiffies - rsp->gp_start;
@@ -1934,7 +1985,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		smp_mb__after_unlock_lock();
 		WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
 		WARN_ON_ONCE(rnp->qsmask);
-		ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+		WRITE_ONCE(rnp->completed, rsp->gpnum);
 		rdp = this_cpu_ptr(rsp->rda);
 		if (rnp == rdp->mynode)
 			needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
@@ -1942,7 +1993,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		nocb += rcu_future_gp_cleanup(rsp, rnp);
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched_rcu_qs();
-		ACCESS_ONCE(rsp->gp_activity) = jiffies;
+		WRITE_ONCE(rsp->gp_activity, jiffies);
+		rcu_gp_slow(rsp, gp_cleanup_delay);
 	}
 	rnp = rcu_get_root(rsp);
 	raw_spin_lock_irq(&rnp->lock);
@@ -1950,16 +2002,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	rcu_nocb_gp_set(rnp, nocb);
 
 	/* Declare grace period done. */
-	ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+	WRITE_ONCE(rsp->completed, rsp->gpnum);
 	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
 	rsp->fqs_state = RCU_GP_IDLE;
 	rdp = this_cpu_ptr(rsp->rda);
 	/* Advance CBs to reduce false positives below. */
 	needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
 	if (needgp || cpu_needs_another_gp(rsp, rdp)) {
-		ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+		WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
 		trace_rcu_grace_period(rsp->name,
-				       ACCESS_ONCE(rsp->gpnum),
+				       READ_ONCE(rsp->gpnum),
 				       TPS("newreq"));
 	}
 	raw_spin_unlock_irq(&rnp->lock);
@@ -1983,20 +2035,20 @@ static int __noreturn rcu_gp_kthread(void *arg)
 		/* Handle grace-period start. */
 		for (;;) {
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwait"));
 			rsp->gp_state = RCU_GP_WAIT_GPS;
 			wait_event_interruptible(rsp->gp_wq,
-						 ACCESS_ONCE(rsp->gp_flags) &
+						 READ_ONCE(rsp->gp_flags) &
 						 RCU_GP_FLAG_INIT);
 			/* Locking provides needed memory barrier. */
 			if (rcu_gp_init(rsp))
 				break;
 			cond_resched_rcu_qs();
-			ACCESS_ONCE(rsp->gp_activity) = jiffies;
+			WRITE_ONCE(rsp->gp_activity, jiffies);
 			WARN_ON(signal_pending(current));
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwaitsig"));
 		}
 
@@ -2012,39 +2064,39 @@ static int __noreturn rcu_gp_kthread(void *arg)
 			if (!ret)
 				rsp->jiffies_force_qs = jiffies + j;
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("fqswait"));
 			rsp->gp_state = RCU_GP_WAIT_FQS;
 			ret = wait_event_interruptible_timeout(rsp->gp_wq,
-					((gf = ACCESS_ONCE(rsp->gp_flags)) &
+					((gf = READ_ONCE(rsp->gp_flags)) &
 					 RCU_GP_FLAG_FQS) ||
-					(!ACCESS_ONCE(rnp->qsmask) &&
+					(!READ_ONCE(rnp->qsmask) &&
 					 !rcu_preempt_blocked_readers_cgp(rnp)),
 					j);
 			/* Locking provides needed memory barriers. */
 			/* If grace period done, leave loop. */
-			if (!ACCESS_ONCE(rnp->qsmask) &&
+			if (!READ_ONCE(rnp->qsmask) &&
 			    !rcu_preempt_blocked_readers_cgp(rnp))
 				break;
 			/* If time for quiescent-state forcing, do it. */
 			if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
 			    (gf & RCU_GP_FLAG_FQS)) {
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsstart"));
 				fqs_state = rcu_gp_fqs(rsp, fqs_state);
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsend"));
 				cond_resched_rcu_qs();
-				ACCESS_ONCE(rsp->gp_activity) = jiffies;
+				WRITE_ONCE(rsp->gp_activity, jiffies);
 			} else {
 				/* Deal with stray signal. */
 				cond_resched_rcu_qs();
-				ACCESS_ONCE(rsp->gp_activity) = jiffies;
+				WRITE_ONCE(rsp->gp_activity, jiffies);
 				WARN_ON(signal_pending(current));
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqswaitsig"));
 			}
 			j = jiffies_till_next_fqs;
@@ -2086,8 +2138,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 		 */
 		return false;
 	}
-	ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
-	trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+	WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
+	trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
 			       TPS("newreq"));
 
 	/*
@@ -2137,6 +2189,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
 	rcu_gp_kthread_wake(rsp);
 }
@@ -2334,8 +2387,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 	rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Send the specified CPU's RCU callbacks to the orphanage.  The
  * specified CPU must be offline, and the caller must hold the
@@ -2346,7 +2397,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 			  struct rcu_node *rnp, struct rcu_data *rdp)
 {
 	/* No-CBs CPUs do not have orphanable callbacks. */
-	if (rcu_is_nocb_cpu(rdp->cpu))
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
 		return;
 
 	/*
@@ -2359,7 +2410,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 		rsp->qlen += rdp->qlen;
 		rdp->n_cbs_orphaned += rdp->qlen;
 		rdp->qlen_lazy = 0;
-		ACCESS_ONCE(rdp->qlen) = 0;
+		WRITE_ONCE(rdp->qlen, 0);
 	}
 
 	/*
@@ -2405,7 +2456,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
 	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
 
 	/* No-CBs CPUs are handled specially. */
-	if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+	    rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
 		return;
 
 	/* Do the accounting first. */
@@ -2452,6 +2504,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 	RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
 	RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
 
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return;
+
 	RCU_TRACE(mask = rdp->grpmask);
 	trace_rcu_grace_period(rsp->name,
 			       rnp->gpnum + 1 - !!(rnp->qsmask & mask),
@@ -2480,7 +2535,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	long mask;
 	struct rcu_node *rnp = rnp_leaf;
 
-	if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+	    rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
 		return;
 	for (;;) {
 		mask = rnp->grpmask;
@@ -2511,6 +2567,9 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return;
+
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2532,6 +2591,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return;
+
 	/* Adjust any no-longer-needed kthreads. */
 	rcu_boost_kthread_setaffinity(rnp, -1);
 
@@ -2546,26 +2608,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 		  cpu, rdp->qlen, rdp->nxtlist);
 }
 
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
-{
-}
-
-static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
-}
-
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
 /*
  * Invoke any RCU callbacks that have made it to the end of their grace
  * period.  Thottle as specified by rdp->blimit.
@@ -2580,7 +2622,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	/* If no callbacks are ready, just return. */
 	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
 		trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
-		trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+		trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
 				    need_resched(), is_idle_task(current),
 				    rcu_is_callbacks_kthread());
 		return;
@@ -2636,7 +2678,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 	smp_mb(); /* List handling before counting for rcu_barrier(). */
 	rdp->qlen_lazy -= count_lazy;
-	ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+	WRITE_ONCE(rdp->qlen, rdp->qlen - count);
 	rdp->n_cbs_invoked += count;
 
 	/* Reinstate batch limit if we have worked down the excess. */
@@ -2730,10 +2772,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		mask = 0;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		smp_mb__after_unlock_lock();
-		if (!rcu_gp_in_progress(rsp)) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
-			return;
-		}
 		if (rnp->qsmask == 0) {
 			if (rcu_state_p == &rcu_sched_state ||
 			    rsp != rcu_state_p ||
@@ -2763,8 +2801,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		bit = 1;
 		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
 			if ((rnp->qsmask & bit) != 0) {
-				if ((rnp->qsmaskinit & bit) == 0)
-					*isidle = false; /* Pending hotplug. */
 				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
 					mask |= bit;
 			}
@@ -2793,7 +2829,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
 	/* Funnel through hierarchy to reduce memory contention. */
 	rnp = __this_cpu_read(rsp->rda->mynode);
 	for (; rnp != NULL; rnp = rnp->parent) {
-		ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+		ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
 		      !raw_spin_trylock(&rnp->fqslock);
 		if (rnp_old != NULL)
 			raw_spin_unlock(&rnp_old->fqslock);
@@ -2809,13 +2845,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
 	raw_spin_lock_irqsave(&rnp_old->lock, flags);
 	smp_mb__after_unlock_lock();
 	raw_spin_unlock(&rnp_old->fqslock);
-	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 		rsp->n_force_qs_lh++;
 		raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 		return;  /* Someone beat us to it. */
 	}
-	ACCESS_ONCE(rsp->gp_flags) =
-		ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
+	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 	rcu_gp_kthread_wake(rsp);
 }
@@ -2881,7 +2916,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+	if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
 		return;
 	if (likely(!rsp->boost)) {
 		rcu_do_batch(rsp, rdp);
@@ -2972,7 +3007,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
 	if (debug_rcu_head_queue(head)) {
 		/* Probable double call_rcu(), so leak the callback. */
-		ACCESS_ONCE(head->func) = rcu_leak_callback;
+		WRITE_ONCE(head->func, rcu_leak_callback);
 		WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
 		return;
 	}
@@ -3011,7 +3046,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 		if (!likely(rdp->nxtlist))
 			init_default_callback_list(rdp);
 	}
-	ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
+	WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
 	if (lazy)
 		rdp->qlen_lazy++;
 	else
@@ -3287,7 +3322,7 @@ void synchronize_sched_expedited(void)
 	if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
 			 (ulong)atomic_long_read(&rsp->expedited_done) +
 			 ULONG_MAX / 8)) {
-		synchronize_sched();
+		wait_rcu_gp(call_rcu_sched);
 		atomic_long_inc(&rsp->expedited_wrap);
 		return;
 	}
@@ -3450,14 +3485,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* Has another RCU grace period completed?  */
-	if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+	if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
 		rdp->n_rp_gp_completed++;
 		return 1;
 	}
 
 	/* Has a new RCU grace period started? */
-	if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
-	    unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
+	if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
+	    unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
 		rdp->n_rp_gp_started++;
 		return 1;
 	}
@@ -3493,7 +3528,7 @@ static int rcu_pending(void)
  * non-NULL, store an indication of whether all callbacks are lazy.
  * (If there are no callbacks, all of them are deemed to be lazy.)
  */
-static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
+static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
 {
 	bool al = true;
 	bool hc = false;
@@ -3564,7 +3599,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 {
 	int cpu;
 	struct rcu_data *rdp;
-	unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+	unsigned long snap = READ_ONCE(rsp->n_barrier_done);
 	unsigned long snap_done;
 
 	_rcu_barrier_trace(rsp, "Begin", -1, snap);
@@ -3606,10 +3641,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
 
 	/*
 	 * Increment ->n_barrier_done to avoid duplicate work.  Use
-	 * ACCESS_ONCE() to prevent the compiler from speculating
+	 * WRITE_ONCE() to prevent the compiler from speculating
 	 * the increment to precede the early-exit check.
 	 */
-	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+	WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
 	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
 	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
 	smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3645,7 +3680,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 				__call_rcu(&rdp->barrier_head,
 					   rcu_barrier_callback, rsp, cpu, 0);
 			}
-		} else if (ACCESS_ONCE(rdp->qlen)) {
+		} else if (READ_ONCE(rdp->qlen)) {
 			_rcu_barrier_trace(rsp, "OnlineQ", cpu,
 					   rsp->n_barrier_done);
 			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3665,7 +3700,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 
 	/* Increment ->n_barrier_done to prevent duplicate work. */
 	smp_mb(); /* Keep increment after above mechanism. */
-	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+	WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
 	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
 	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
 	smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3780,7 +3815,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
 	rdp->completed = rnp->completed;
 	rdp->passed_quiesce = false;
-	rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+	rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
 	rdp->qs_pending = false;
 	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3924,16 +3959,16 @@ void rcu_scheduler_starting(void)
 
 /*
  * Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
  */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
 	int i;
 
-	if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+	if (rcu_fanout_exact) {
 		rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
 		for (i = rcu_num_lvls - 2; i >= 0; i--)
-			rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+			rsp->levelspread[i] = RCU_FANOUT;
 	} else {
 		int ccur;
 		int cprv;
@@ -3971,9 +4006,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 
 	BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
 
-	/* Silence gcc 4.8 warning about array index out of range. */
-	if (rcu_num_lvls > RCU_NUM_LVLS)
-		panic("rcu_init_one: rcu_num_lvls overflow");
+	/* Silence gcc 4.8 false positive about array index out of range. */
+	if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
+		panic("rcu_init_one: rcu_num_lvls out of range");
 
 	/* Initialize the level-tracking arrays. */
 
@@ -4059,7 +4094,7 @@ static void __init rcu_init_geometry(void)
 		jiffies_till_next_fqs = d;
 
 	/* If the compile-time values are accurate, just leave. */
-	if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+	if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
 	    nr_cpu_ids == NR_CPUS)
 		return;
 	pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
@@ -4073,7 +4108,7 @@ static void __init rcu_init_geometry(void)
 	rcu_capacity[0] = 1;
 	rcu_capacity[1] = rcu_fanout_leaf;
 	for (i = 2; i <= MAX_RCU_LVLS; i++)
-		rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
 
 	/*
 	 * The boot-time rcu_fanout_leaf parameter is only permitted
@@ -4083,7 +4118,7 @@ static void __init rcu_init_geometry(void)
 	 * the configured number of CPUs.  Complain and fall back to the
 	 * compile-time values if these limits are exceeded.
 	 */
-	if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+	if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
 	    rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
 	    n > rcu_capacity[MAX_RCU_LVLS]) {
 		WARN_ON(1);
@@ -4109,6 +4144,28 @@ static void __init rcu_init_geometry(void)
 	rcu_num_nodes -= n;
 }
 
+/*
+ * Dump the rcu_node tree of the rcu_state referenced by rsp, breaking the
+ * line at each ->level change.  Each node prints as "grplo:grphi ^grpnum".
+ */
+static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
+{
+	int level = 0; /* ->level of the row currently being printed. */
+	struct rcu_node *rnp;
+
+	pr_info("rcu_node tree layout dump\n");
+	pr_info(" "); /* Open the first row; nodes are appended below. */
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		if (rnp->level != level) { /* New level: start a new row. */
+			pr_cont("\n");
+			pr_info(" ");
+			level = rnp->level;
+		}
+		pr_cont("%d:%d ^%d  ", rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	pr_cont("\n"); /* Terminate the final row. */
+}
+
 void __init rcu_init(void)
 {
 	int cpu;
@@ -4119,6 +4176,8 @@ void __init rcu_init(void)
 	rcu_init_geometry();
 	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
 	rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+	if (dump_tree)
+		rcu_dump_rcu_node_tree(&rcu_sched_state);
 	__rcu_init_preempt();
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a69d3dab2..4adb7ca0b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -35,11 +35,33 @@
  * In practice, this did work well going from three levels to four.
  * Of course, your mileage may vary.
  */
+
 #define MAX_RCU_LVLS 4
-#define RCU_FANOUT_1	      (CONFIG_RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2	      (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_3	      (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_4	      (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#ifdef CONFIG_RCU_FANOUT
+#define RCU_FANOUT CONFIG_RCU_FANOUT
+#else /* #ifdef CONFIG_RCU_FANOUT */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT 64
+# else
+# define RCU_FANOUT 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT */
+
+#ifdef CONFIG_RCU_FANOUT_LEAF
+#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
+#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT_LEAF 64
+# else
+# define RCU_FANOUT_LEAF 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
+
+#define RCU_FANOUT_1	      (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2	      (RCU_FANOUT_1 * RCU_FANOUT)
+#define RCU_FANOUT_3	      (RCU_FANOUT_2 * RCU_FANOUT)
+#define RCU_FANOUT_4	      (RCU_FANOUT_3 * RCU_FANOUT)
 
 #if NR_CPUS <= RCU_FANOUT_1
 #  define RCU_NUM_LVLS	      1
@@ -170,7 +192,6 @@ struct rcu_node {
 				/*  if there is no such task.  If there */
 				/*  is no current expedited grace period, */
 				/*  then there can cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
 	struct list_head *boost_tasks;
 				/* Pointer to first task that needs to be */
 				/*  priority boosted, or NULL if no priority */
@@ -208,7 +229,6 @@ struct rcu_node {
 	unsigned long n_balk_nos;
 				/* Refused to boost: not sure why, though. */
 				/*  This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
 #ifdef CONFIG_RCU_NOCB_CPU
 	wait_queue_head_t nocb_gp_wq[2];
 				/* Place for rcu_nocb_kthread() to wait GP. */
@@ -519,14 +539,11 @@ extern struct list_head rcu_struct_flavors;
  * RCU implementation internal declarations:
  */
 extern struct rcu_state rcu_sched_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
 
 extern struct rcu_state rcu_bh_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 #ifdef CONFIG_PREEMPT_RCU
 extern struct rcu_state rcu_preempt_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 #ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8c0ec0f5a..013485fb2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -43,7 +43,17 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
 DEFINE_PER_CPU(char, rcu_cpu_has_work);
 
-#endif /* #ifdef CONFIG_RCU_BOOST */
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
+ * all uses are in dead code.  Provide a definition to keep the compiler
+ * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
+ * This probably needs to be excluded from -rt builds.
+ */
+#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
 
 #ifdef CONFIG_RCU_NOCB_CPU
 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -60,11 +70,11 @@ static void __init rcu_bootup_announce_oddness(void)
 {
 	if (IS_ENABLED(CONFIG_RCU_TRACE))
 		pr_info("\tRCU debugfs-based tracing is enabled.\n");
-	if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
-	    (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+	if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
+	    (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
 		pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
-		       CONFIG_RCU_FANOUT);
-	if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+		       RCU_FANOUT);
+	if (rcu_fanout_exact)
 		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
 	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
 		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
@@ -76,10 +86,10 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tAdditional per-CPU info printed with stalls.\n");
 	if (NUM_RCU_LVL_4 != 0)
 		pr_info("\tFour-level hierarchy is enabled.\n");
-	if (CONFIG_RCU_FANOUT_LEAF != 16)
+	if (RCU_FANOUT_LEAF != 16)
 		pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
-			CONFIG_RCU_FANOUT_LEAF);
-	if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+			RCU_FANOUT_LEAF);
+	if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
 		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
 	if (nr_cpu_ids != NR_CPUS)
 		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
@@ -90,7 +100,8 @@ static void __init rcu_bootup_announce_oddness(void)
 #ifdef CONFIG_PREEMPT_RCU
 
 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state_p = &rcu_preempt_state;
+static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
 
 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -116,11 +127,11 @@ static void __init rcu_bootup_announce(void)
  */
 static void rcu_preempt_qs(void)
 {
-	if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+	if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
 		trace_rcu_grace_period(TPS("rcu_preempt"),
-				       __this_cpu_read(rcu_preempt_data.gpnum),
+				       __this_cpu_read(rcu_data_p->gpnum),
 				       TPS("cpuqs"));
-		__this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+		__this_cpu_write(rcu_data_p->passed_quiesce, 1);
 		barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
 		current->rcu_read_unlock_special.b.need_qs = false;
 	}
@@ -150,7 +161,7 @@ static void rcu_preempt_note_context_switch(void)
 	    !t->rcu_read_unlock_special.b.blocked) {
 
 		/* Possibly blocking in an RCU read-side critical section. */
-		rdp = this_cpu_ptr(rcu_preempt_state.rda);
+		rdp = this_cpu_ptr(rcu_state_p->rda);
 		rnp = rdp->mynode;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		smp_mb__after_unlock_lock();
@@ -180,10 +191,9 @@ static void rcu_preempt_note_context_switch(void)
 		if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
 			list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
 			rnp->gp_tasks = &t->rcu_node_entry;
-#ifdef CONFIG_RCU_BOOST
-			if (rnp->boost_tasks != NULL)
+			if (IS_ENABLED(CONFIG_RCU_BOOST) &&
+			    rnp->boost_tasks != NULL)
 				rnp->boost_tasks = rnp->gp_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
 		} else {
 			list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
 			if (rnp->qsmask & rdp->grpmask)
@@ -263,9 +273,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 	bool empty_exp_now;
 	unsigned long flags;
 	struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
 	bool drop_boost_mutex = false;
-#endif /* #ifdef CONFIG_RCU_BOOST */
 	struct rcu_node *rnp;
 	union rcu_special special;
 
@@ -307,9 +315,11 @@ void rcu_read_unlock_special(struct task_struct *t)
 		t->rcu_read_unlock_special.b.blocked = false;
 
 		/*
-		 * Remove this task from the list it blocked on.  The
-		 * task can migrate while we acquire the lock, but at
-		 * most one time.  So at most two passes through loop.
+		 * Remove this task from the list it blocked on.  The task
+		 * now remains queued on the rcu_node corresponding to
+		 * the CPU it first blocked on, so the first attempt to
+		 * acquire the task's rcu_node's ->lock will succeed.
+		 * Keep the loop and add a WARN_ON_ONCE() out of sheer paranoia.
 		 */
 		for (;;) {
 			rnp = t->rcu_blocked_node;
@@ -317,6 +327,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 			smp_mb__after_unlock_lock();
 			if (rnp == t->rcu_blocked_node)
 				break;
+			WARN_ON_ONCE(1);
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		}
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
@@ -331,12 +342,12 @@ void rcu_read_unlock_special(struct task_struct *t)
 			rnp->gp_tasks = np;
 		if (&t->rcu_node_entry == rnp->exp_tasks)
 			rnp->exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
-		if (&t->rcu_node_entry == rnp->boost_tasks)
-			rnp->boost_tasks = np;
-		/* Snapshot ->boost_mtx ownership with rcu_node lock held. */
-		drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
-#endif /* #ifdef CONFIG_RCU_BOOST */
+		if (IS_ENABLED(CONFIG_RCU_BOOST)) {
+			if (&t->rcu_node_entry == rnp->boost_tasks)
+				rnp->boost_tasks = np;
+			/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
+			drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+		}
 
 		/*
 		 * If this was the last task on the current list, and if
@@ -353,24 +364,21 @@ void rcu_read_unlock_special(struct task_struct *t)
 							 rnp->grplo,
 							 rnp->grphi,
 							 !!rnp->gp_tasks);
-			rcu_report_unblock_qs_rnp(&rcu_preempt_state,
-						  rnp, flags);
+			rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
 		} else {
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		}
 
-#ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
-		if (drop_boost_mutex)
+		if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
 			rt_mutex_unlock(&rnp->boost_mtx);
-#endif /* #ifdef CONFIG_RCU_BOOST */
 
 		/*
 		 * If this was the last task on the expedited lists,
 		 * then we need to report up the rcu_node hierarchy.
 		 */
 		if (!empty_exp && empty_exp_now)
-			rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+			rcu_report_exp_rnp(rcu_state_p, rnp, true);
 	} else {
 		local_irq_restore(flags);
 	}
@@ -390,7 +398,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
-	t = list_entry(rnp->gp_tasks,
+	t = list_entry(rnp->gp_tasks->prev,
 		       struct task_struct, rcu_node_entry);
 	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
 		sched_show_task(t);
@@ -447,7 +455,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
 	if (!rcu_preempt_blocked_readers_cgp(rnp))
 		return 0;
 	rcu_print_task_stall_begin(rnp);
-	t = list_entry(rnp->gp_tasks,
+	t = list_entry(rnp->gp_tasks->prev,
 		       struct task_struct, rcu_node_entry);
 	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
 		pr_cont(" P%d", t->pid);
@@ -491,8 +499,8 @@ static void rcu_preempt_check_callbacks(void)
 		return;
 	}
 	if (t->rcu_read_lock_nesting > 0 &&
-	    __this_cpu_read(rcu_preempt_data.qs_pending) &&
-	    !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+	    __this_cpu_read(rcu_data_p->qs_pending) &&
+	    !__this_cpu_read(rcu_data_p->passed_quiesce))
 		t->rcu_read_unlock_special.b.need_qs = true;
 }
 
@@ -500,7 +508,7 @@ static void rcu_preempt_check_callbacks(void)
 
 static void rcu_preempt_do_callbacks(void)
 {
-	rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+	rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
 }
 
 #endif /* #ifdef CONFIG_RCU_BOOST */
@@ -510,7 +518,7 @@ static void rcu_preempt_do_callbacks(void)
  */
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
-	__call_rcu(head, func, &rcu_preempt_state, -1, 0);
+	__call_rcu(head, func, rcu_state_p, -1, 0);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
@@ -570,7 +578,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
 	return !rcu_preempted_readers_exp(rnp) &&
-	       ACCESS_ONCE(rnp->expmask) == 0;
+	       READ_ONCE(rnp->expmask) == 0;
 }
 
 /*
@@ -711,12 +719,12 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
 void synchronize_rcu_expedited(void)
 {
 	struct rcu_node *rnp;
-	struct rcu_state *rsp = &rcu_preempt_state;
+	struct rcu_state *rsp = rcu_state_p;
 	unsigned long snap;
 	int trycount = 0;
 
 	smp_mb(); /* Caller's modifications seen first by other CPUs. */
-	snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+	snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
 	smp_mb(); /* Above access cannot bleed into critical section. */
 
 	/*
@@ -740,7 +748,7 @@ void synchronize_rcu_expedited(void)
 	 */
 	while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
 		if (ULONG_CMP_LT(snap,
-		    ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+		    READ_ONCE(sync_rcu_preempt_exp_count))) {
 			put_online_cpus();
 			goto mb_ret; /* Others did our work for us. */
 		}
@@ -752,7 +760,7 @@ void synchronize_rcu_expedited(void)
 			return;
 		}
 	}
-	if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+	if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
 		put_online_cpus();
 		goto unlock_mb_ret; /* Others did our work for us. */
 	}
@@ -780,8 +788,7 @@ void synchronize_rcu_expedited(void)
 
 	/* Clean up and exit. */
 	smp_mb(); /* ensure expedited GP seen before counter increment. */
-	ACCESS_ONCE(sync_rcu_preempt_exp_count) =
-					sync_rcu_preempt_exp_count + 1;
+	WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
 unlock_mb_ret:
 	mutex_unlock(&sync_rcu_preempt_exp_mutex);
 mb_ret:
@@ -799,7 +806,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
  */
 void rcu_barrier(void)
 {
-	_rcu_barrier(&rcu_preempt_state);
+	_rcu_barrier(rcu_state_p);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
@@ -808,7 +815,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
  */
 static void __init __rcu_init_preempt(void)
 {
-	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+	rcu_init_one(rcu_state_p, rcu_data_p);
 }
 
 /*
@@ -831,7 +838,8 @@ void exit_rcu(void)
 
 #else /* #ifdef CONFIG_PREEMPT_RCU */
 
-static struct rcu_state *rcu_state_p = &rcu_sched_state;
+static struct rcu_state *const rcu_state_p = &rcu_sched_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
 
 /*
  * Tell them what RCU they are running.
@@ -994,8 +1002,8 @@ static int rcu_boost(struct rcu_node *rnp)
 	struct task_struct *t;
 	struct list_head *tb;
 
-	if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
-	    ACCESS_ONCE(rnp->boost_tasks) == NULL)
+	if (READ_ONCE(rnp->exp_tasks) == NULL &&
+	    READ_ONCE(rnp->boost_tasks) == NULL)
 		return 0;  /* Nothing left to boost. */
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1048,8 +1056,8 @@ static int rcu_boost(struct rcu_node *rnp)
 	rt_mutex_lock(&rnp->boost_mtx);
 	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
 
-	return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
-	       ACCESS_ONCE(rnp->boost_tasks) != NULL;
+	return READ_ONCE(rnp->exp_tasks) != NULL ||
+	       READ_ONCE(rnp->boost_tasks) != NULL;
 }
 
 /*
@@ -1173,7 +1181,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	struct sched_param sp;
 	struct task_struct *t;
 
-	if (&rcu_preempt_state != rsp)
+	if (rcu_state_p != rsp)
 		return 0;
 
 	if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
@@ -1367,13 +1375,12 @@ static void rcu_prepare_kthreads(int cpu)
  * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
  * any flavor of RCU.
  */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
-	*delta_jiffies = ULONG_MAX;
-	return rcu_cpu_has_callbacks(NULL);
+	*nextevt = KTIME_MAX;
+	return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
+	       ? 0 : rcu_cpu_has_callbacks(NULL);
 }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1432,8 +1439,6 @@ module_param(rcu_idle_gp_delay, int, 0644);
 static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
 module_param(rcu_idle_lazy_gp_delay, int, 0644);
 
-extern int tick_nohz_active;
-
 /*
  * Try to advance callbacks for all flavors of RCU on the current CPU, but
  * only if it has been awhile since the last time we did so.  Afterwards,
@@ -1462,7 +1467,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
 		 * callbacks not yet ready to invoke.
 		 */
 		if ((rdp->completed != rnp->completed ||
-		     unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
+		     unlikely(READ_ONCE(rdp->gpwrap))) &&
 		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
 			note_gp_changes(rsp, rdp);
 
@@ -1480,17 +1485,22 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
  *
  * The caller must have disabled interrupts.
  */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *dj)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+	unsigned long dj;
+
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
+		*nextevt = KTIME_MAX;
+		return 0;
+	}
 
 	/* Snapshot to detect later posting of non-lazy callback. */
 	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
 
 	/* If no callbacks, RCU doesn't need the CPU. */
 	if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
-		*dj = ULONG_MAX;
+		*nextevt = KTIME_MAX;
 		return 0;
 	}
 
@@ -1504,14 +1514,14 @@ int rcu_needs_cpu(unsigned long *dj)
 
 	/* Request timer delay depending on laziness, and round. */
 	if (!rdtp->all_lazy) {
-		*dj = round_up(rcu_idle_gp_delay + jiffies,
+		dj = round_up(rcu_idle_gp_delay + jiffies,
 			       rcu_idle_gp_delay) - jiffies;
 	} else {
-		*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+		dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
 	}
+	*nextevt = basemono + dj * TICK_NSEC;
 	return 0;
 }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Prepare a CPU for idle from an RCU perspective.  The first major task
@@ -1525,7 +1535,6 @@ int rcu_needs_cpu(unsigned long *dj)
  */
 static void rcu_prepare_for_idle(void)
 {
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
 	bool needwake;
 	struct rcu_data *rdp;
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -1533,8 +1542,11 @@ static void rcu_prepare_for_idle(void)
 	struct rcu_state *rsp;
 	int tne;
 
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+		return;
+
 	/* Handle nohz enablement switches conservatively. */
-	tne = ACCESS_ONCE(tick_nohz_active);
+	tne = READ_ONCE(tick_nohz_active);
 	if (tne != rdtp->tick_nohz_enabled_snap) {
 		if (rcu_cpu_has_callbacks(NULL))
 			invoke_rcu_core(); /* force nohz to see update. */
@@ -1580,7 +1592,6 @@ static void rcu_prepare_for_idle(void)
 		if (needwake)
 			rcu_gp_kthread_wake(rsp);
 	}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -1590,12 +1601,11 @@ static void rcu_prepare_for_idle(void)
  */
 static void rcu_cleanup_after_idle(void)
 {
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-	if (rcu_is_nocb_cpu(smp_processor_id()))
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+	    rcu_is_nocb_cpu(smp_processor_id()))
 		return;
 	if (rcu_try_advance_all_cbs())
 		invoke_rcu_core();
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -1760,7 +1770,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 	       atomic_read(&rdtp->dynticks) & 0xfff,
 	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
-	       ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
+	       READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
 	       fast_no_hz);
 }
 
@@ -1898,11 +1908,11 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 {
 	struct rcu_data *rdp_leader = rdp->nocb_leader;
 
-	if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+	if (!READ_ONCE(rdp_leader->nocb_kthread))
 		return;
-	if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+	if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 		/* Prior smp_mb__after_atomic() orders against prior enqueue. */
-		ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
 		wake_up(&rdp_leader->nocb_wq);
 	}
 }
@@ -1934,14 +1944,14 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 	ret = atomic_long_read(&rdp->nocb_q_count);
 
 #ifdef CONFIG_PROVE_RCU
-	rhp = ACCESS_ONCE(rdp->nocb_head);
+	rhp = READ_ONCE(rdp->nocb_head);
 	if (!rhp)
-		rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+		rhp = READ_ONCE(rdp->nocb_gp_head);
 	if (!rhp)
-		rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+		rhp = READ_ONCE(rdp->nocb_follower_head);
 
 	/* Having no rcuo kthread but CBs after scheduler starts is bad! */
-	if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+	if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
 	    rcu_scheduler_fully_active) {
 		/* RCU callback enqueued before CPU first came online??? */
 		pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
@@ -1975,12 +1985,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 	atomic_long_add(rhcount, &rdp->nocb_q_count);
 	/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
 	old_rhpp = xchg(&rdp->nocb_tail, rhtp);
-	ACCESS_ONCE(*old_rhpp) = rhp;
+	WRITE_ONCE(*old_rhpp, rhp);
 	atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
 	smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
 
 	/* If we are not being polled and there is a kthread, awaken it ... */
-	t = ACCESS_ONCE(rdp->nocb_kthread);
+	t = READ_ONCE(rdp->nocb_kthread);
 	if (rcu_nocb_poll || !t) {
 		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 				    TPS("WakeNotPoll"));
@@ -2118,7 +2128,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	for (;;) {
 		wait_event_interruptible(
 			rnp->nocb_gp_wq[c & 0x1],
-			(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+			(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
 		if (likely(d))
 			break;
 		WARN_ON(signal_pending(current));
@@ -2145,7 +2155,7 @@ wait_again:
 	if (!rcu_nocb_poll) {
 		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
 		wait_event_interruptible(my_rdp->nocb_wq,
-				!ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+				!READ_ONCE(my_rdp->nocb_leader_sleep));
 		/* Memory barrier handled by smp_mb() calls below and repoll. */
 	} else if (firsttime) {
 		firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2159,12 +2169,12 @@ wait_again:
 	 */
 	gotcbs = false;
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-		rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+		rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
 		if (!rdp->nocb_gp_head)
 			continue;  /* No CBs here, try next follower. */
 
 		/* Move callbacks to wait-for-GP list, which is empty. */
-		ACCESS_ONCE(rdp->nocb_head) = NULL;
+		WRITE_ONCE(rdp->nocb_head, NULL);
 		rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
 		gotcbs = true;
 	}
@@ -2184,7 +2194,7 @@ wait_again:
 		my_rdp->nocb_leader_sleep = true;
 		smp_mb();  /* Ensure _sleep true before scan. */
 		for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
-			if (ACCESS_ONCE(rdp->nocb_head)) {
+			if (READ_ONCE(rdp->nocb_head)) {
 				/* Found CB, so short-circuit next wait. */
 				my_rdp->nocb_leader_sleep = false;
 				break;
@@ -2205,7 +2215,7 @@ wait_again:
 
 	/* Each pass through the following loop wakes a follower, if needed. */
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-		if (ACCESS_ONCE(rdp->nocb_head))
+		if (READ_ONCE(rdp->nocb_head))
 			my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
 		if (!rdp->nocb_gp_head)
 			continue; /* No CBs, so no need to wake follower. */
@@ -2241,7 +2251,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    "FollowerSleep");
 			wait_event_interruptible(rdp->nocb_wq,
-						 ACCESS_ONCE(rdp->nocb_follower_head));
+						 READ_ONCE(rdp->nocb_follower_head));
 		} else if (firsttime) {
 			/* Don't drown trace log with "Poll"! */
 			firsttime = false;
@@ -2282,10 +2292,10 @@ static int rcu_nocb_kthread(void *arg)
 			nocb_follower_wait(rdp);
 
 		/* Pull the ready-to-invoke callbacks onto local list. */
-		list = ACCESS_ONCE(rdp->nocb_follower_head);
+		list = READ_ONCE(rdp->nocb_follower_head);
 		BUG_ON(!list);
 		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
-		ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+		WRITE_ONCE(rdp->nocb_follower_head, NULL);
 		tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
 
 		/* Each pass through the following loop invokes a callback. */
@@ -2324,7 +2334,7 @@ static int rcu_nocb_kthread(void *arg)
 /* Is a deferred wakeup of rcu_nocb_kthread() required? */
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
 {
-	return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+	return READ_ONCE(rdp->nocb_defer_wakeup);
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread(). */
@@ -2334,8 +2344,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 
 	if (!rcu_nocb_need_deferred_wakeup(rdp))
 		return;
-	ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
-	ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
+	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
 	wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
 	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
 }
@@ -2448,7 +2458,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
 	t = kthread_run(rcu_nocb_kthread, rdp_spawn,
 			"rcuo%c/%d", rsp->abbr, cpu);
 	BUG_ON(IS_ERR(t));
-	ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+	WRITE_ONCE(rdp_spawn->nocb_kthread, t);
 }
 
 /*
@@ -2663,7 +2673,7 @@ static void rcu_sysidle_enter(int irq)
 
 	/* Record start of fully idle period. */
 	j = jiffies;
-	ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+	WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
 	smp_mb__before_atomic();
 	atomic_inc(&rdtp->dynticks_idle);
 	smp_mb__after_atomic();
@@ -2681,7 +2691,7 @@ static void rcu_sysidle_enter(int irq)
  */
 void rcu_sysidle_force_exit(void)
 {
-	int oldstate = ACCESS_ONCE(full_sysidle_state);
+	int oldstate = READ_ONCE(full_sysidle_state);
 	int newoldstate;
 
 	/*
@@ -2794,7 +2804,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
 	smp_mb(); /* Read counters before timestamps. */
 
 	/* Pick up timestamps. */
-	j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+	j = READ_ONCE(rdtp->dynticks_idle_jiffies);
 	/* If this CPU entered idle more recently, update maxj timestamp. */
 	if (ULONG_CMP_LT(*maxj, j))
 		*maxj = j;
@@ -2831,11 +2841,11 @@ static unsigned long rcu_sysidle_delay(void)
 static void rcu_sysidle(unsigned long j)
 {
 	/* Check the current state. */
-	switch (ACCESS_ONCE(full_sysidle_state)) {
+	switch (READ_ONCE(full_sysidle_state)) {
 	case RCU_SYSIDLE_NOT:
 
 		/* First time all are idle, so note a short idle period. */
-		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
 		break;
 
 	case RCU_SYSIDLE_SHORT:
@@ -2873,7 +2883,7 @@ static void rcu_sysidle_cancel(void)
 {
 	smp_mb();
 	if (full_sysidle_state > RCU_SYSIDLE_SHORT)
-		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
 }
 
 /*
@@ -2925,7 +2935,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
 	smp_mb();  /* grace period precedes setting inuse. */
 
 	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
-	ACCESS_ONCE(rshp->inuse) = 0;
+	WRITE_ONCE(rshp->inuse, 0);
 }
 
 /*
@@ -2936,7 +2946,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
 bool rcu_sys_is_idle(void)
 {
 	static struct rcu_sysidle_head rsh;
-	int rss = ACCESS_ONCE(full_sysidle_state);
+	int rss = READ_ONCE(full_sysidle_state);
 
 	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
 		return false;
@@ -2964,7 +2974,7 @@ bool rcu_sys_is_idle(void)
 			}
 			rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
 			oldrss = rss;
-			rss = ACCESS_ONCE(full_sysidle_state);
+			rss = READ_ONCE(full_sysidle_state);
 		}
 	}
 
@@ -3048,10 +3058,10 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_cpu(smp_processor_id()) &&
 	    (!rcu_gp_in_progress(rsp) ||
-	     ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
-		return 1;
+	     ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
+		return true;
 #endif /* #ifdef CONFIG_NO_HZ_FULL */
-	return 0;
+	return false;
 }
 
 /*
@@ -3077,7 +3087,7 @@ static void rcu_bind_gp_kthread(void)
 static void rcu_dynticks_task_enter(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-	ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
 }
 
@@ -3085,6 +3095,6 @@ static void rcu_dynticks_task_enter(void)
 static void rcu_dynticks_task_exit(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-	ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
 }
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index f92361efd..3ea7ffc7d 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -277,7 +277,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 	seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
 		   rsp->n_force_qs, rsp->n_force_qs_ngp,
 		   rsp->n_force_qs - rsp->n_force_qs_ngp,
-		   ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+		   READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
 	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
 		if (rnp->level != level) {
 			seq_puts(m, "\n");
@@ -323,8 +323,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
 	struct rcu_node *rnp = &rsp->node[0];
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	completed = ACCESS_ONCE(rsp->completed);
-	gpnum = ACCESS_ONCE(rsp->gpnum);
+	completed = READ_ONCE(rsp->completed);
+	gpnum = READ_ONCE(rsp->gpnum);
 	if (completed == gpnum)
 		gpage = 0;
 	else
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1f133350d..afaecb7a7 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -150,14 +150,14 @@ void __rcu_read_unlock(void)
 		barrier();  /* critical section before exit code. */
 		t->rcu_read_lock_nesting = INT_MIN;
 		barrier();  /* assign before ->rcu_read_unlock_special load */
-		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
+		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
 			rcu_read_unlock_special(t);
 		barrier();  /* ->rcu_read_unlock_special load before assign */
 		t->rcu_read_lock_nesting = 0;
 	}
 #ifdef CONFIG_PROVE_LOCKING
 	{
-		int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+		int rrln = READ_ONCE(t->rcu_read_lock_nesting);
 
 		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
 	}
@@ -389,17 +389,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644);
 
 int rcu_jiffies_till_stall_check(void)
 {
-	int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
 
 	/*
 	 * Limit check must be consistent with the Kconfig limits
 	 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
 	 */
 	if (till_stall_check < 3) {
-		ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+		WRITE_ONCE(rcu_cpu_stall_timeout, 3);
 		till_stall_check = 3;
 	} else if (till_stall_check > 300) {
-		ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+		WRITE_ONCE(rcu_cpu_stall_timeout, 300);
 		till_stall_check = 300;
 	}
 	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
@@ -550,12 +550,12 @@ static void check_holdout_task(struct task_struct *t,
 {
 	int cpu;
 
-	if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
-	    t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
-	    !ACCESS_ONCE(t->on_rq) ||
+	if (!READ_ONCE(t->rcu_tasks_holdout) ||
+	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+	    !READ_ONCE(t->on_rq) ||
 	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
 	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
-		ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+		WRITE_ONCE(t->rcu_tasks_holdout, false);
 		list_del_init(&t->rcu_tasks_holdout_list);
 		put_task_struct(t);
 		return;
@@ -639,11 +639,11 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 		 */
 		rcu_read_lock();
 		for_each_process_thread(g, t) {
-			if (t != current && ACCESS_ONCE(t->on_rq) &&
+			if (t != current && READ_ONCE(t->on_rq) &&
 			    !is_idle_task(t)) {
 				get_task_struct(t);
-				t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
-				ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+				t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+				WRITE_ONCE(t->rcu_tasks_holdout, true);
 				list_add(&t->rcu_tasks_holdout_list,
 					 &rcu_tasks_holdouts);
 			}
@@ -672,7 +672,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 			struct task_struct *t1;
 
 			schedule_timeout_interruptible(HZ);
-			rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+			rtst = READ_ONCE(rcu_task_stall_timeout);
 			needreport = rtst > 0 &&
 				     time_after(jiffies, lastreport + rtst);
 			if (needreport)
@@ -728,7 +728,7 @@ static void rcu_spawn_tasks_kthread(void)
 	static struct task_struct *rcu_tasks_kthread_ptr;
 	struct task_struct *t;
 
-	if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+	if (READ_ONCE(rcu_tasks_kthread_ptr)) {
 		smp_mb(); /* Ensure caller sees full kthread. */
 		return;
 	}
@@ -740,7 +740,7 @@ static void rcu_spawn_tasks_kthread(void)
 	t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
 	BUG_ON(IS_ERR(t));
 	smp_mb(); /* Ensure others see full kthread. */
-	ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+	WRITE_ONCE(rcu_tasks_kthread_ptr, t);
 	mutex_unlock(&rcu_tasks_kthread_mutex);
 }
 
diff --git a/kernel/relay.c b/kernel/relay.c
index e9dbaeb8f..0b4570cfa 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -81,10 +81,7 @@ static struct page **relay_alloc_page_array(unsigned int n_pages)
  */
 static void relay_free_page_array(struct page **array)
 {
-	if (is_vmalloc_addr(array))
-		vfree(array);
-	else
-		kfree(array);
+	kvfree(array);
 }
 
 /**
diff --git a/kernel/resource.c b/kernel/resource.c
index 90552aab5..fed052a1b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -504,13 +504,13 @@ int region_is_ram(resource_size_t start, unsigned long size)
 {
 	struct resource *p;
 	resource_size_t end = start + size - 1;
-	int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	const char *name = "System RAM";
 	int ret = -1;
 
 	read_lock(&resource_lock);
 	for (p = iomem_resource.child; p ; p = p->sibling) {
-		if (end < p->start)
+		if (p->end < start)
 			continue;
 
 		if (p->start <= start && end <= p->end) {
@@ -521,7 +521,7 @@ int region_is_ram(resource_size_t start, unsigned long size)
 				ret = 1;
 			break;
 		}
-		if (p->end < start)
+		if (end < p->start)
 			break;	/* not found */
 	}
 	read_unlock(&resource_lock);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54b88a1c0..67687973c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,16 +11,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-ifdef CONFIG_SCHED_BFS
-obj-y += bfs.o clock.o
-else
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-$(CONFIG_SMP) += cpudeadline.o
+obj-y += wait.o completion.o idle.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
-endif
-obj-y += wait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd6..750ed601d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
 #include "sched.h"
 
 #include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 
 	p->signal->autogroup = autogroup_kref_get(ag);
 
-	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+	if (!READ_ONCE(sysctl_sched_autogroup_enabled))
 		goto out;
 
 	for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
 #endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142..890c95f25 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
 {
-	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+	int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
 
 	if (enabled && task_wants_autogroup(p, tg))
 		return p->signal->autogroup->tg;
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c
deleted file mode 100644
index 5366182bd..000000000
--- a/kernel/sched/bfs.c
+++ /dev/null
@@ -1,7420 +0,0 @@
-/*
- *  kernel/sched/bfs.c, was kernel/sched.c
- *
- *  Kernel scheduler and related syscalls
- *
- *  Copyright (C) 1991-2002  Linus Torvalds
- *
- *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
- *		make semaphores SMP safe
- *  1998-11-19	Implemented schedule_timeout() and related stuff
- *		by Andrea Arcangeli
- *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
- *		hybrid priority-list and round-robin design with
- *		an array-switch method of distributing timeslices
- *		and per-CPU runqueues.  Cleanups and useful suggestions
- *		by Davide Libenzi, preemptible kernel bits by Robert Love.
- *  2003-09-03	Interactivity tuning by Con Kolivas.
- *  2004-04-02	Scheduler domains code by Nick Piggin
- *  2007-04-15  Work begun on replacing all interactivity tuning with a
- *              fair scheduling design by Con Kolivas.
- *  2007-05-05  Load balancing (smp-nice) and other improvements
- *              by Peter Williams
- *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
- *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
- *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
- *              Thomas Gleixner, Mike Kravetz
- *  now		Brainfuck deadline scheduling policy by Con Kolivas deletes
- *              a whole lot of those previous things.
- */
-
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/init.h>
-#include <asm/uaccess.h>
-#include <linux/highmem.h>
-#include <asm/mmu_context.h>
-#include <linux/interrupt.h>
-#include <linux/capability.h>
-#include <linux/completion.h>
-#include <linux/kernel_stat.h>
-#include <linux/debug_locks.h>
-#include <linux/perf_event.h>
-#include <linux/security.h>
-#include <linux/notifier.h>
-#include <linux/profile.h>
-#include <linux/freezer.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/delay.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/timer.h>
-#include <linux/rcupdate.h>
-#include <linux/cpu.h>
-#include <linux/cpuset.h>
-#include <linux/cpumask.h>
-#include <linux/percpu.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/syscalls.h>
-#include <linux/sched/sysctl.h>
-#include <linux/times.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kprobes.h>
-#include <linux/delayacct.h>
-#include <linux/log2.h>
-#include <linux/bootmem.h>
-#include <linux/ftrace.h>
-#include <linux/slab.h>
-#include <linux/init_task.h>
-#include <linux/binfmts.h>
-#include <linux/context_tracking.h>
-#include <linux/sched/prio.h>
-
-#include <asm/irq_regs.h>
-#include <asm/switch_to.h>
-#include <asm/tlb.h>
-#include <asm/unistd.h>
-#include <asm/mutex.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-
-#include "cpupri.h"
-#include "../workqueue_internal.h"
-#include "../smpboot.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-
-#include "bfs_sched.h"
-
-#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
-#define rt_task(p)		rt_prio((p)->prio)
-#define rt_queue(rq)		rt_prio((rq)->rq_prio)
-#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
-#define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
-					(policy) == SCHED_RR)
-#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
-
-#define is_idle_policy(policy)	((policy) == SCHED_IDLEPRIO)
-#define idleprio_task(p)	unlikely(is_idle_policy((p)->policy))
-#define task_running_idle(p)	unlikely((p)->prio == IDLE_PRIO)
-#define idle_queue(rq)		(unlikely(is_idle_policy((rq)->rq_policy)))
-
-#define is_iso_policy(policy)	((policy) == SCHED_ISO)
-#define iso_task(p)		unlikely(is_iso_policy((p)->policy))
-#define iso_queue(rq)		unlikely(is_iso_policy((rq)->rq_policy))
-#define task_running_iso(p)	unlikely((p)->prio == ISO_PRIO)
-#define rq_running_iso(rq)	((rq)->rq_prio == ISO_PRIO)
-
-#define rq_idle(rq)		((rq)->rq_prio == PRIO_LIMIT)
-
-#define ISO_PERIOD		((5 * HZ * grq.noc) + 1)
-
-#define SCHED_PRIO(p)		((p) + MAX_RT_PRIO)
-#define STOP_PRIO		(MAX_RT_PRIO - 1)
-
-/*
- * Some helpers for converting to/from various scales. Use shifts to get
- * approximate multiples of ten for less overhead.
- */
-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
-#define JIFFY_NS		(1000000000 / HZ)
-#define HALF_JIFFY_NS		(1000000000 / HZ / 2)
-#define HALF_JIFFY_US		(1000000 / HZ / 2)
-#define MS_TO_NS(TIME)		((TIME) << 20)
-#define MS_TO_US(TIME)		((TIME) << 10)
-#define NS_TO_MS(TIME)		((TIME) >> 20)
-#define NS_TO_US(TIME)		((TIME) >> 10)
-
-#define RESCHED_US	(100) /* Reschedule if less than this many μs left */
-
-void print_scheduler_version(void)
-{
-	printk(KERN_INFO "BFS CPU scheduler v0.464 by Con Kolivas.\n");
-}
-
-/*
- * This is the time all tasks within the same priority round robin.
- * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
- * Tunable via /proc interface.
- */
-#ifdef CONFIG_PCK_INTERACTIVE
-int rr_interval __read_mostly = 3;
-#else
-int rr_interval __read_mostly = 6;
-#endif
-
-/*
- * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
- * are allowed to run five seconds as real time tasks. This is the total over
- * all online cpus.
- */
-#ifdef CONFIG_PCK_INTERACTIVE
-int sched_iso_cpu __read_mostly = 25;
-#else
-int sched_iso_cpu __read_mostly = 70;
-#endif
-
-/*
- * The relative length of deadline for each priority(nice) level.
- */
-static int prio_ratios[NICE_WIDTH] __read_mostly;
-
-/*
- * The quota handed out to tasks of all priority levels when refilling their
- * time_slice.
- */
-static inline int timeslice(void)
-{
-	return MS_TO_US(rr_interval);
-}
-
-/*
- * The global runqueue data that all CPUs work off. Data is protected either
- * by the global grq lock, or the discrete lock that precedes the data in this
- * struct.
- */
-struct global_rq {
-	raw_spinlock_t lock;
-	unsigned long nr_running;
-	unsigned long nr_uninterruptible;
-	unsigned long long nr_switches;
-	struct list_head queue[PRIO_LIMIT];
-	DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
-	unsigned long qnr; /* queued not running */
-#ifdef CONFIG_SMP
-	cpumask_t cpu_idle_map;
-	bool idle_cpus;
-#endif
-	int noc; /* num_online_cpus stored and updated when it changes */
-	u64 niffies; /* Nanosecond jiffies */
-	unsigned long last_jiffy; /* Last jiffy we updated niffies */
-
-	raw_spinlock_t iso_lock;
-	int iso_ticks;
-	bool iso_refractory;
-};
-
-#ifdef CONFIG_SMP
-/*
- * We add the notion of a root-domain which will be used to define per-domain
- * variables. Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset. Whenever a new
- * exclusive cpuset is created, we also create and attach a new root-domain
- * object.
- *
- */
-struct root_domain {
-	atomic_t refcount;
-	atomic_t rto_count;
-	struct rcu_head rcu;
-	cpumask_var_t span;
-	cpumask_var_t online;
-
-	/*
-	 * The "RT overload" flag: it gets set if a CPU has more than
-	 * one runnable RT task.
-	 */
-	cpumask_var_t rto_mask;
-	struct cpupri cpupri;
-};
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-static struct root_domain def_root_domain;
-
-#endif /* CONFIG_SMP */
-
-/* There can be only one */
-static struct global_rq grq;
-
-static DEFINE_MUTEX(sched_hotcpu_mutex);
-
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#ifdef CONFIG_SMP
-struct rq *cpu_rq(int cpu)
-{
-	return &per_cpu(runqueues, (cpu));
-}
-#define task_rq(p)		cpu_rq(task_cpu(p))
-#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
-/*
- * sched_domains_mutex serialises calls to init_sched_domains,
- * detach_destroy_domains and partition_sched_domains.
- */
-static DEFINE_MUTEX(sched_domains_mutex);
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-static struct root_domain def_root_domain;
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
-       return 0*SD_ASYM_PACKING;
-}
-#else
-struct rq *uprq;
-#endif /* CONFIG_SMP */
-
-static inline void update_rq_clock(struct rq *rq);
-
-/*
- * Sanity check should sched_clock return bogus values. We make sure it does
- * not appear to go backwards, and use jiffies to determine the maximum and
- * minimum it could possibly have increased, and round down to the nearest
- * jiffy when it falls outside this.
- */
-static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
-{
-	unsigned long min_diff, max_diff;
-
-	if (jiff_diff > 1)
-		min_diff = JIFFIES_TO_NS(jiff_diff - 1);
-	else
-		min_diff = 1;
-	/*  Round up to the nearest tick for maximum */
-	max_diff = JIFFIES_TO_NS(jiff_diff + 1);
-
-	if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
-		*niff_diff = min_diff;
-}
-
-#ifdef CONFIG_SMP
-static inline int cpu_of(struct rq *rq)
-{
-	return rq->cpu;
-}
-
-/*
- * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
- * clock is updated with the grq.lock held, it is an opportunity to update the
- * niffies value. Any CPU can update it by adding how much its clock has
- * increased since it last updated niffies, minus any added niffies by other
- * CPUs.
- */
-static inline void update_clocks(struct rq *rq)
-{
-	s64 ndiff;
-	long jdiff;
-
-	update_rq_clock(rq);
-	ndiff = rq->clock - rq->old_clock;
-	/* old_clock is only updated when we are updating niffies */
-	rq->old_clock = rq->clock;
-	ndiff -= grq.niffies - rq->last_niffy;
-	jdiff = jiffies - grq.last_jiffy;
-	niffy_diff(&ndiff, jdiff);
-	grq.last_jiffy += jdiff;
-	grq.niffies += ndiff;
-	rq->last_niffy = grq.niffies;
-}
-#else /* CONFIG_SMP */
-static inline int cpu_of(struct rq *rq)
-{
-	return 0;
-}
-
-static inline void update_clocks(struct rq *rq)
-{
-	s64 ndiff;
-	long jdiff;
-
-	update_rq_clock(rq);
-	ndiff = rq->clock - rq->old_clock;
-	rq->old_clock = rq->clock;
-	jdiff = jiffies - grq.last_jiffy;
-	niffy_diff(&ndiff, jdiff);
-	grq.last_jiffy += jdiff;
-	grq.niffies += ndiff;
-}
-#endif
-
-#include "stats.h"
-
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next)	do { } while (0)
-#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev)	do { } while (0)
-#endif
-#ifndef finish_arch_post_lock_switch
-# define finish_arch_post_lock_switch()	do { } while (0)
-#endif
-
-/*
- * All common locking functions performed on grq.lock. rq->clock is local to
- * the CPU accessing it so it can be modified just with interrupts disabled
- * when we're not updating niffies.
- * Looking up task_rq must be done under grq.lock to be safe.
- */
-static void update_rq_clock_task(struct rq *rq, s64 delta);
-
-static inline void update_rq_clock(struct rq *rq)
-{
-	s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
-
-	if (unlikely(delta < 0))
-		return;
-	rq->clock += delta;
-	update_rq_clock_task(rq, delta);
-}
-
-static inline bool task_running(struct task_struct *p)
-{
-	return p->on_cpu;
-}
-
-static inline void grq_lock(void)
-	__acquires(grq.lock)
-{
-	raw_spin_lock(&grq.lock);
-}
-
-static inline void grq_unlock(void)
-	__releases(grq.lock)
-{
-	raw_spin_unlock(&grq.lock);
-}
-
-static inline void grq_lock_irq(void)
-	__acquires(grq.lock)
-{
-	raw_spin_lock_irq(&grq.lock);
-}
-
-static inline void time_lock_grq(struct rq *rq)
-	__acquires(grq.lock)
-{
-	grq_lock();
-	update_clocks(rq);
-}
-
-static inline void grq_unlock_irq(void)
-	__releases(grq.lock)
-{
-	raw_spin_unlock_irq(&grq.lock);
-}
-
-static inline void grq_lock_irqsave(unsigned long *flags)
-	__acquires(grq.lock)
-{
-	raw_spin_lock_irqsave(&grq.lock, *flags);
-}
-
-static inline void grq_unlock_irqrestore(unsigned long *flags)
-	__releases(grq.lock)
-{
-	raw_spin_unlock_irqrestore(&grq.lock, *flags);
-}
-
-static inline struct rq
-*task_grq_lock(struct task_struct *p, unsigned long *flags)
-	__acquires(grq.lock)
-{
-	grq_lock_irqsave(flags);
-	return task_rq(p);
-}
-
-static inline struct rq
-*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
-	__acquires(grq.lock)
-{
-	struct rq *rq = task_grq_lock(p, flags);
-	update_clocks(rq);
-	return rq;
-}
-
-static inline struct rq *task_grq_lock_irq(struct task_struct *p)
-	__acquires(grq.lock)
-{
-	grq_lock_irq();
-	return task_rq(p);
-}
-
-static inline void time_task_grq_lock_irq(struct task_struct *p)
-	__acquires(grq.lock)
-{
-	struct rq *rq = task_grq_lock_irq(p);
-	update_clocks(rq);
-}
-
-static inline void task_grq_unlock_irq(void)
-	__releases(grq.lock)
-{
-	grq_unlock_irq();
-}
-
-static inline void task_grq_unlock(unsigned long *flags)
-	__releases(grq.lock)
-{
-	grq_unlock_irqrestore(flags);
-}
-
-/**
- * grunqueue_is_locked
- *
- * Returns true if the global runqueue is locked.
- * This interface allows printk to be called with the runqueue lock
- * held and know whether or not it is OK to wake up the klogd.
- */
-bool grunqueue_is_locked(void)
-{
-	return raw_spin_is_locked(&grq.lock);
-}
-
-void grq_unlock_wait(void)
-	__releases(grq.lock)
-{
-	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
-	raw_spin_unlock_wait(&grq.lock);
-}
-
-static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
-	__acquires(grq.lock)
-{
-	local_irq_save(*flags);
-	time_lock_grq(rq);
-}
-
-static inline struct rq *__task_grq_lock(struct task_struct *p)
-	__acquires(grq.lock)
-{
-	grq_lock();
-	return task_rq(p);
-}
-
-static inline void __task_grq_unlock(void)
-	__releases(grq.lock)
-{
-	grq_unlock();
-}
-
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
-	/* this is a valid case when another task releases the spinlock */
-	grq.lock.owner = current;
-#endif
-	/*
-	 * If we are tracking spinlock dependencies then we have to
-	 * fix up the runqueue lock - which gets 'carried over' from
-	 * prev into current:
-	 */
-	spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
-
-	grq_unlock_irq();
-}
-
-static inline bool deadline_before(u64 deadline, u64 time)
-{
-	return (deadline < time);
-}
-
-static inline bool deadline_after(u64 deadline, u64 time)
-{
-	return (deadline > time);
-}
-
-/*
- * A task that is queued but not running will be on the grq run list.
- * A task that is not running or queued will not be on the grq run list.
- * A task that is currently running will have ->on_cpu set but not on the
- * grq run list.
- */
-static inline bool task_queued(struct task_struct *p)
-{
-	return (!list_empty(&p->run_list));
-}
-
-/*
- * Removing from the global runqueue. Enter with grq locked.
- */
-static void dequeue_task(struct task_struct *p)
-{
-	list_del_init(&p->run_list);
-	if (list_empty(grq.queue + p->prio))
-		__clear_bit(p->prio, grq.prio_bitmap);
-	sched_info_dequeued(task_rq(p), p);
-}
-
-/*
- * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
- * an idle task, we ensure none of the following conditions are met.
- */
-static bool idleprio_suitable(struct task_struct *p)
-{
-	return (!freezing(p) && !signal_pending(p) &&
-		!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
-}
-
-/*
- * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
- * that the iso_refractory flag is not set.
- */
-static bool isoprio_suitable(void)
-{
-	return !grq.iso_refractory;
-}
-
-/*
- * Adding to the global runqueue. Enter with grq locked.
- */
-static void enqueue_task(struct task_struct *p, struct rq *rq)
-{
-	if (!rt_task(p)) {
-		/* Check it hasn't gotten rt from PI */
-		if ((idleprio_task(p) && idleprio_suitable(p)) ||
-		   (iso_task(p) && isoprio_suitable()))
-			p->prio = p->normal_prio;
-		else
-			p->prio = NORMAL_PRIO;
-	}
-	__set_bit(p->prio, grq.prio_bitmap);
-	list_add_tail(&p->run_list, grq.queue + p->prio);
-	sched_info_queued(rq, p);
-}
-
-static inline void requeue_task(struct task_struct *p)
-{
-	sched_info_queued(task_rq(p), p);
-}
-
-/*
- * Returns the relative length of deadline all compared to the shortest
- * deadline which is that of nice -20.
- */
-static inline int task_prio_ratio(struct task_struct *p)
-{
-	return prio_ratios[TASK_USER_PRIO(p)];
-}
-
-/*
- * task_timeslice - all tasks of all priorities get the exact same timeslice
- * length. CPU distribution is handled by giving different deadlines to
- * tasks of different priorities. Use 128 as the base value for fast shifts.
- */
-static inline int task_timeslice(struct task_struct *p)
-{
-	return (rr_interval * task_prio_ratio(p) / 128);
-}
-
-static void resched_task(struct task_struct *p);
-
-static inline void resched_curr(struct rq *rq)
-{
-	resched_task(rq->curr);
-}
-
-/*
- * qnr is the "queued but not running" count which is the total number of
- * tasks on the global runqueue list waiting for cpu time but not actually
- * currently running on a cpu.
- */
-static inline void inc_qnr(void)
-{
-	grq.qnr++;
-}
-
-static inline void dec_qnr(void)
-{
-	grq.qnr--;
-}
-
-static inline int queued_notrunning(void)
-{
-	return grq.qnr;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
- * allow easy lookup of whether any suitable idle CPUs are available.
- * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
- * idle_cpus variable than to do a full bitmask check when we are busy.
- */
-static inline void set_cpuidle_map(int cpu)
-{
-	if (likely(cpu_online(cpu))) {
-		cpumask_set_cpu(cpu, &grq.cpu_idle_map);
-		grq.idle_cpus = true;
-	}
-}
-
-static inline void clear_cpuidle_map(int cpu)
-{
-	cpumask_clear_cpu(cpu, &grq.cpu_idle_map);
-	if (cpumask_empty(&grq.cpu_idle_map))
-		grq.idle_cpus = false;
-}
-
-static bool suitable_idle_cpus(struct task_struct *p)
-{
-	if (!grq.idle_cpus)
-		return false;
-	return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
-}
-
-#define CPUIDLE_DIFF_THREAD	(1)
-#define CPUIDLE_DIFF_CORE	(2)
-#define CPUIDLE_CACHE_BUSY	(4)
-#define CPUIDLE_DIFF_CPU	(8)
-#define CPUIDLE_THREAD_BUSY	(16)
-#define CPUIDLE_THROTTLED	(32)
-#define CPUIDLE_DIFF_NODE	(64)
-
-static inline bool scaling_rq(struct rq *rq);
-
-/*
- * The best idle CPU is chosen according to the CPUIDLE ranking above where the
- * lowest value would give the most suitable CPU to schedule p onto next. The
- * order works out to be the following:
- *
- * Same core, idle or busy cache, idle or busy threads
- * Other core, same cache, idle or busy cache, idle threads.
- * Same node, other CPU, idle cache, idle threads.
- * Same node, other CPU, busy cache, idle threads.
- * Other core, same cache, busy threads.
- * Same node, other CPU, busy threads.
- * Other node, other CPU, idle cache, idle threads.
- * Other node, other CPU, busy cache, idle threads.
- * Other node, other CPU, busy threads.
- */
-static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
-{
-	int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED |
-		CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY |
-		CPUIDLE_DIFF_CORE | CPUIDLE_DIFF_THREAD;
-	int cpu_tmp;
-
-	if (cpumask_test_cpu(best_cpu, tmpmask))
-		goto out;
-
-	for_each_cpu(cpu_tmp, tmpmask) {
-		int ranking, locality;
-		struct rq *tmp_rq;
-
-		ranking = 0;
-		tmp_rq = cpu_rq(cpu_tmp);
-
-		locality = rq->cpu_locality[cpu_tmp];
-#ifdef CONFIG_NUMA
-		if (locality > 3)
-			ranking |= CPUIDLE_DIFF_NODE;
-		else
-#endif
-		if (locality > 2)
-			ranking |= CPUIDLE_DIFF_CPU;
-#ifdef CONFIG_SCHED_MC
-		else if (locality == 2)
-			ranking |= CPUIDLE_DIFF_CORE;
-		if (!(tmp_rq->cache_idle(cpu_tmp)))
-			ranking |= CPUIDLE_CACHE_BUSY;
-#endif
-#ifdef CONFIG_SCHED_SMT
-		if (locality == 1)
-			ranking |= CPUIDLE_DIFF_THREAD;
-		if (!(tmp_rq->siblings_idle(cpu_tmp)))
-			ranking |= CPUIDLE_THREAD_BUSY;
-#endif
-		if (scaling_rq(tmp_rq))
-			ranking |= CPUIDLE_THROTTLED;
-
-		if (ranking < best_ranking) {
-			best_cpu = cpu_tmp;
-			best_ranking = ranking;
-		}
-	}
-out:
-	return best_cpu;
-}
-
-static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
-{
-	best_cpu = best_mask_cpu(best_cpu, rq, tmpmask);
-	resched_curr(cpu_rq(best_cpu));
-}
-
-bool cpus_share_cache(int this_cpu, int that_cpu)
-{
-	struct rq *this_rq = cpu_rq(this_cpu);
-
-	return (this_rq->cpu_locality[that_cpu] < 3);
-}
-
-#ifdef CONFIG_SCHED_SMT
-#ifdef CONFIG_SMT_NICE
-static const cpumask_t *thread_cpumask(int cpu);
-
-/* Find the best real time priority running on any SMT siblings of cpu and if
- * none are running, the static priority of the best deadline task running.
- * The lookups to the other runqueues is done lockless as the occasional wrong
- * value would be harmless. */
-static int best_smt_bias(int cpu)
-{
-	int other_cpu, best_bias = 0;
-
-	for_each_cpu(other_cpu, thread_cpumask(cpu)) {
-		struct rq *rq;
-
-		if (other_cpu == cpu)
-			continue;
-		rq = cpu_rq(other_cpu);
-		if (rq_idle(rq))
-			continue;
-		if (!rq->online)
-			continue;
-		if (!rq->rq_mm)
-			continue;
-		if (likely(rq->rq_smt_bias > best_bias))
-			best_bias = rq->rq_smt_bias;
-	}
-	return best_bias;
-}
-
-static int task_prio_bias(struct task_struct *p)
-{
-	if (rt_task(p))
-		return 1 << 30;
-	else if (task_running_iso(p))
-		return 1 << 29;
-	else if (task_running_idle(p))
-		return 0;
-	return MAX_PRIO - p->static_prio;
-}
-
-/* We've already decided p can run on CPU, now test if it shouldn't for SMT
- * nice reasons. */
-static bool smt_should_schedule(struct task_struct *p, int cpu)
-{
-	int best_bias, task_bias;
-
-	/* Kernel threads always run */
-	if (unlikely(!p->mm))
-		return true;
-	if (rt_task(p))
-		return true;
-	if (!idleprio_suitable(p))
-		return true;
-	best_bias = best_smt_bias(cpu);
-	/* The smt siblings are all idle or running IDLEPRIO */
-	if (best_bias < 1)
-		return true;
-	task_bias = task_prio_bias(p);
-	if (task_bias < 1)
-		return false;
-	if (task_bias >= best_bias)
-		return true;
-	/* Dither 25% cpu of normal tasks regardless of nice difference */
-	if (best_bias % 4 == 1)
-		return true;
-	/* Sorry, you lose */
-	return false;
-}
-#endif
-#endif
-
-static bool resched_best_idle(struct task_struct *p)
-{
-	cpumask_t tmpmask;
-	int best_cpu;
-
-	cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
-	best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask);
-#ifdef CONFIG_SMT_NICE
-	if (!smt_should_schedule(p, best_cpu))
-		return false;
-#endif
-	resched_curr(cpu_rq(best_cpu));
-	return true;
-}
-
-static inline void resched_suitable_idle(struct task_struct *p)
-{
-	if (suitable_idle_cpus(p))
-		resched_best_idle(p);
-}
-/*
- * Flags to tell us whether this CPU is running a CPU frequency governor that
- * has slowed its speed or not. No locking required as the very rare wrongly
- * read value would be harmless.
- */
-void cpu_scaling(int cpu)
-{
-	cpu_rq(cpu)->scaling = true;
-}
-
-void cpu_nonscaling(int cpu)
-{
-	cpu_rq(cpu)->scaling = false;
-}
-
-static inline bool scaling_rq(struct rq *rq)
-{
-	return rq->scaling;
-}
-
-static inline int locality_diff(struct task_struct *p, struct rq *rq)
-{
-	return rq->cpu_locality[task_cpu(p)];
-}
-#else /* CONFIG_SMP */
-static inline void set_cpuidle_map(int cpu)
-{
-}
-
-static inline void clear_cpuidle_map(int cpu)
-{
-}
-
-static inline bool suitable_idle_cpus(struct task_struct *p)
-{
-	return uprq->curr == uprq->idle;
-}
-
-static inline void resched_suitable_idle(struct task_struct *p)
-{
-}
-
-void cpu_scaling(int __unused)
-{
-}
-
-void cpu_nonscaling(int __unused)
-{
-}
-
-/*
- * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
- * always returns 0.
- */
-static inline bool scaling_rq(struct rq *rq)
-{
-	return false;
-}
-
-static inline int locality_diff(struct task_struct *p, struct rq *rq)
-{
-	return 0;
-}
-#endif /* CONFIG_SMP */
-EXPORT_SYMBOL_GPL(cpu_scaling);
-EXPORT_SYMBOL_GPL(cpu_nonscaling);
-
-static inline int normal_prio(struct task_struct *p)
-{
-	if (has_rt_policy(p))
-		return MAX_RT_PRIO - 1 - p->rt_priority;
-	if (idleprio_task(p))
-		return IDLE_PRIO;
-	if (iso_task(p))
-		return ISO_PRIO;
-	return NORMAL_PRIO;
-}
-
-/*
- * Calculate the current priority, i.e. the priority
- * taken into account by the scheduler. This value might
- * be boosted by RT tasks as it will be RT if the task got
- * RT-boosted. If not then it returns p->normal_prio.
- */
-static int effective_prio(struct task_struct *p)
-{
-	p->normal_prio = normal_prio(p);
-	/*
-	 * If we are RT tasks or we were boosted to RT priority,
-	 * keep the priority unchanged. Otherwise, update priority
-	 * to the normal priority:
-	 */
-	if (!rt_prio(p->prio))
-		return p->normal_prio;
-	return p->prio;
-}
-
-/*
- * activate_task - move a task to the runqueue. Enter with grq locked.
- */
-static void activate_task(struct task_struct *p, struct rq *rq)
-{
-	update_clocks(rq);
-
-	/*
-	 * Sleep time is in units of nanosecs, so shift by 20 to get a
-	 * milliseconds-range estimation of the amount of time that the task
-	 * spent sleeping:
-	 */
-	if (unlikely(prof_on == SLEEP_PROFILING)) {
-		if (p->state == TASK_UNINTERRUPTIBLE)
-			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
-				     (rq->clock_task - p->last_ran) >> 20);
-	}
-
-	p->prio = effective_prio(p);
-	if (task_contributes_to_load(p))
-		grq.nr_uninterruptible--;
-	enqueue_task(p, rq);
-	rq->soft_affined++;
-	p->on_rq = 1;
-	grq.nr_running++;
-	inc_qnr();
-}
-
-static inline void clear_sticky(struct task_struct *p);
-
-/*
- * deactivate_task - If it's running, it's not on the grq and we can just
- * decrement the nr_running. Enter with grq locked.
- */
-static inline void deactivate_task(struct task_struct *p, struct rq *rq)
-{
-	if (task_contributes_to_load(p))
-		grq.nr_uninterruptible++;
-	rq->soft_affined--;
-	p->on_rq = 0;
-	grq.nr_running--;
-	clear_sticky(p);
-}
-
-#ifdef CONFIG_SMP
-void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-#ifdef CONFIG_LOCKDEP
-	/*
-	 * The caller should hold grq lock.
-	 */
-	WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
-#endif
-	if (task_cpu(p) == cpu)
-		return;
-	trace_sched_migrate_task(p, cpu);
-	perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
-
-	/*
-	 * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
-	 * successfully executed on another CPU. We must ensure that updates of
-	 * per-task data have been completed by this moment.
-	 */
-	smp_wmb();
-	if (p->on_rq) {
-		task_rq(p)->soft_affined--;
-		cpu_rq(cpu)->soft_affined++;
-	}
-	task_thread_info(p)->cpu = cpu;
-}
-
-static inline void clear_sticky(struct task_struct *p)
-{
-	p->sticky = false;
-}
-
-static inline bool task_sticky(struct task_struct *p)
-{
-	return p->sticky;
-}
-
-/* Reschedule the best idle CPU that is not this one. */
-static void
-resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p)
-{
-	cpumask_t tmpmask;
-
-	cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
-	cpumask_clear_cpu(cpu, &tmpmask);
-	if (cpumask_empty(&tmpmask))
-		return;
-	resched_best_mask(cpu, rq, &tmpmask);
-}
-
-/*
- * We set the sticky flag on a task that is descheduled involuntarily meaning
- * it is awaiting further CPU time. If the last sticky task is still sticky
- * but unlucky enough to not be the next task scheduled, we unstick it and try
- * to find it an idle CPU. Realtime tasks do not stick to minimise their
- * latency at all times.
- */
-static inline void
-swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
-{
-	if (rq->sticky_task) {
-		if (rq->sticky_task == p) {
-			p->sticky = true;
-			return;
-		}
-		if (task_sticky(rq->sticky_task)) {
-			clear_sticky(rq->sticky_task);
-			resched_closest_idle(rq, cpu, rq->sticky_task);
-		}
-	}
-	if (!rt_task(p)) {
-		p->sticky = true;
-		rq->sticky_task = p;
-	} else {
-		resched_closest_idle(rq, cpu, p);
-		rq->sticky_task = NULL;
-	}
-}
-
-static inline void unstick_task(struct rq *rq, struct task_struct *p)
-{
-	rq->sticky_task = NULL;
-	clear_sticky(p);
-}
-#else
-static inline void clear_sticky(struct task_struct *p)
-{
-}
-
-static inline bool task_sticky(struct task_struct *p)
-{
-	return false;
-}
-
-static inline void
-swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
-{
-}
-
-static inline void unstick_task(struct rq *rq, struct task_struct *p)
-{
-}
-#endif
-
-/*
- * Move a task off the global queue and take it to a cpu for it will
- * become the running task.
- */
-static inline void take_task(int cpu, struct task_struct *p)
-{
-	set_task_cpu(p, cpu);
-	dequeue_task(p);
-	clear_sticky(p);
-	dec_qnr();
-}
-
-/*
- * Returns a descheduling task to the grq runqueue unless it is being
- * deactivated.
- */
-static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate)
-{
-	if (deactivate)
-		deactivate_task(p, rq);
-	else {
-		inc_qnr();
-		enqueue_task(p, rq);
-	}
-}
-
-/* Enter with grq lock held. We know p is on the local cpu */
-static inline void __set_tsk_resched(struct task_struct *p)
-{
-	set_tsk_need_resched(p);
-	set_preempt_need_resched();
-}
-
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-void resched_task(struct task_struct *p)
-{
-	int cpu;
-
-	lockdep_assert_held(&grq.lock);
-
-	if (test_tsk_need_resched(p))
-		return;
-
-	set_tsk_need_resched(p);
-
-	cpu = task_cpu(p);
-	if (cpu == smp_processor_id()) {
-		set_preempt_need_resched();
-		return;
-	}
-
-	smp_send_reschedule(cpu);
-}
-
-/**
- * task_curr - is this task currently executing on a CPU?
- * @p: the task in question.
- *
- * Return: 1 if the task is currently executing. 0 otherwise.
- */
-inline int task_curr(const struct task_struct *p)
-{
-	return cpu_curr(task_cpu(p)) == p;
-}
-
-#ifdef CONFIG_SMP
-struct migration_req {
-	struct task_struct *task;
-	int dest_cpu;
-};
-
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change.  If it changes, i.e. @p might have woken up,
- * then return zero.  When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count).  If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
-{
-	unsigned long flags;
-	bool running, on_rq;
-	unsigned long ncsw;
-	struct rq *rq;
-
-	for (;;) {
-		rq = task_rq(p);
-
-		/*
-		 * If the task is actively running on another CPU
-		 * still, just relax and busy-wait without holding
-		 * any locks.
-		 *
-		 * NOTE! Since we don't hold any locks, it's not
-		 * even sure that "rq" stays as the right runqueue!
-		 * But we don't care, since this will return false
-		 * if the runqueue has changed and p is actually now
-		 * running somewhere else!
-		 */
-		while (task_running(p) && p == rq->curr) {
-			if (match_state && unlikely(p->state != match_state))
-				return 0;
-			cpu_relax();
-		}
-
-		/*
-		 * Ok, time to look more closely! We need the grq
-		 * lock now, to be *sure*. If we're wrong, we'll
-		 * just go back and repeat.
-		 */
-		rq = task_grq_lock(p, &flags);
-		trace_sched_wait_task(p);
-		running = task_running(p);
-		on_rq = p->on_rq;
-		ncsw = 0;
-		if (!match_state || p->state == match_state)
-			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_grq_unlock(&flags);
-
-		/*
-		 * If it changed from the expected state, bail out now.
-		 */
-		if (unlikely(!ncsw))
-			break;
-
-		/*
-		 * Was it really running after all now that we
-		 * checked with the proper locks actually held?
-		 *
-		 * Oops. Go back and try again..
-		 */
-		if (unlikely(running)) {
-			cpu_relax();
-			continue;
-		}
-
-		/*
-		 * It's not enough that it's not actively running,
-		 * it must be off the runqueue _entirely_, and not
-		 * preempted!
-		 *
-		 * So if it was still runnable (but just not actively
-		 * running right now), it's preempted, and we should
-		 * yield - it could be a while.
-		 */
-		if (unlikely(on_rq)) {
-			ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
-
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
-			continue;
-		}
-
-		/*
-		 * Ahh, all good. It wasn't running, and it wasn't
-		 * runnable, which means that it will never become
-		 * running in the future either. We're all done!
-		 */
-		break;
-	}
-
-	return ncsw;
-}
-
-/***
- * kick_process - kick a running thread to enter/exit the kernel
- * @p: the to-be-kicked thread
- *
- * Cause a process which is running on another CPU to enter
- * kernel-mode, without any delay. (to get signals handled.)
- *
- * NOTE: this function doesn't have to take the runqueue lock,
- * because all it wants to ensure is that the remote task enters
- * the kernel. If the IPI races and the task has been migrated
- * to another CPU then no harm is done and the purpose has been
- * achieved as well.
- */
-void kick_process(struct task_struct *p)
-{
-	int cpu;
-
-	preempt_disable();
-	cpu = task_cpu(p);
-	if ((cpu != smp_processor_id()) && task_curr(p))
-		smp_send_reschedule(cpu);
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(kick_process);
-#endif
-
-/*
- * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
- * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
- * between themselves, they cooperatively multitask. An idle rq scores as
- * prio PRIO_LIMIT so it is always preempted.
- */
-static inline bool
-can_preempt(struct task_struct *p, int prio, u64 deadline)
-{
-	/* Better static priority RT task or better policy preemption */
-	if (p->prio < prio)
-		return true;
-	if (p->prio > prio)
-		return false;
-	/* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
-	if (!deadline_before(p->deadline, deadline))
-		return false;
-	return true;
-}
-
-#ifdef CONFIG_SMP
-#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Check to see if there is a task that is affined only to offline CPUs but
- * still wants runtime. This happens to kernel threads during suspend/halt and
- * disabling of CPUs.
- */
-static inline bool online_cpus(struct task_struct *p)
-{
-	return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed)));
-}
-#else /* CONFIG_HOTPLUG_CPU */
-/* All available CPUs are always online without hotplug. */
-static inline bool online_cpus(struct task_struct *p)
-{
-	return true;
-}
-#endif
-
-/*
- * Check to see if p can run on cpu, and if not, whether there are any online
- * CPUs it can run on instead.
- */
-static inline bool needs_other_cpu(struct task_struct *p, int cpu)
-{
-	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed)))
-		return true;
-	return false;
-}
-
-/*
- * When all else is equal, still prefer this_rq.
- */
-static void try_preempt(struct task_struct *p, struct rq *this_rq)
-{
-	struct rq *highest_prio_rq = NULL;
-	int cpu, highest_prio;
-	u64 latest_deadline;
-	cpumask_t tmp;
-
-	/*
-	 * We clear the sticky flag here because for a task to have called
-	 * try_preempt with the sticky flag enabled means some complicated
-	 * re-scheduling has occurred and we should ignore the sticky flag.
-	 */
-	clear_sticky(p);
-
-	if (suitable_idle_cpus(p) && resched_best_idle(p))
-		return;
-
-	/* IDLEPRIO tasks never preempt anything but idle */
-	if (p->policy == SCHED_IDLEPRIO)
-		return;
-
-	if (likely(online_cpus(p)))
-		cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
-	else
-		return;
-
-	highest_prio = latest_deadline = 0;
-
-	for_each_cpu(cpu, &tmp) {
-		struct rq *rq;
-		int rq_prio;
-
-		rq = cpu_rq(cpu);
-		rq_prio = rq->rq_prio;
-		if (rq_prio < highest_prio)
-			continue;
-
-		if (rq_prio > highest_prio ||
-		    deadline_after(rq->rq_deadline, latest_deadline)) {
-			latest_deadline = rq->rq_deadline;
-			highest_prio = rq_prio;
-			highest_prio_rq = rq;
-		}
-	}
-
-	if (likely(highest_prio_rq)) {
-#ifdef CONFIG_SMT_NICE
-		cpu = cpu_of(highest_prio_rq);
-		if (!smt_should_schedule(p, cpu))
-			return;
-#endif
-		if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))
-			resched_curr(highest_prio_rq);
-	}
-}
-#else /* CONFIG_SMP */
-static inline bool needs_other_cpu(struct task_struct *p, int cpu)
-{
-	return false;
-}
-
-static void try_preempt(struct task_struct *p, struct rq *this_rq)
-{
-	if (p->policy == SCHED_IDLEPRIO)
-		return;
-	if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
-		resched_curr(uprq);
-}
-#endif /* CONFIG_SMP */
-
-static void
-ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
-{
-#ifdef CONFIG_SCHEDSTATS
-	struct rq *rq = this_rq();
-
-#ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id();
-
-	if (cpu == this_cpu)
-		schedstat_inc(rq, ttwu_local);
-	else {
-		struct sched_domain *sd;
-
-		rcu_read_lock();
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
-		}
-		rcu_read_unlock();
-	}
-
-#endif /* CONFIG_SMP */
-
-	schedstat_inc(rq, ttwu_count);
-#endif /* CONFIG_SCHEDSTATS */
-}
-
-void wake_up_if_idle(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	rcu_read_lock();
-
-	if (!is_idle_task(rcu_dereference(rq->curr)))
-		goto out;
-
-	grq_lock_irqsave(&flags);
-	if (likely(is_idle_task(rq->curr)))
-		smp_send_reschedule(cpu);
-	/* Else cpu is not in idle, do nothing here */
-	grq_unlock_irqrestore(&flags);
-
-out:
-	rcu_read_unlock();
-}
-
-#ifdef CONFIG_SMP
-void scheduler_ipi(void)
-{
-	/*
-	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-	 * TIF_NEED_RESCHED remotely (for the first time) will also send
-	 * this IPI.
-	 */
-	preempt_fold_need_resched();
-}
-#endif
-
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-				 bool is_sync)
-{
-	activate_task(p, rq);
-
-	/*
-	 * Sync wakeups (i.e. those types of wakeups where the waker
-	 * has indicated that it will leave the CPU in short order)
-	 * don't trigger a preemption if there are no idle cpus,
-	 * instead waiting for current to deschedule.
-	 */
-	if (!is_sync || suitable_idle_cpus(p))
-		try_preempt(p, rq);
-}
-
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
-					bool success)
-{
-	trace_sched_wakeup(p, success);
-	p->state = TASK_RUNNING;
-
-	/*
-	 * if a worker is waking up, notify workqueue. Note that on BFS, we
-	 * don't really know what cpu it will be, so we fake it for
-	 * wq_worker_waking_up :/
-	 */
-	if ((p->flags & PF_WQ_WORKER) && success)
-		wq_worker_waking_up(p, cpu_of(rq));
-}
-
-/*
- * wake flags
- */
-#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */
-#define WF_FORK		0x02		/* child wakeup after fork */
-#define WF_MIGRATED	0x4		/* internal use, task got migrated */
-
-/***
- * try_to_wake_up - wake up a thread
- * @p: the thread to be awakened
- * @state: the mask of task states that can be woken
- * @wake_flags: wake modifier flags (WF_*)
- *
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
- *
- * Return: %true if @p was woken up, %false if it was already running.
- * or @state didn't match @p's state.
- */
-static bool try_to_wake_up(struct task_struct *p, unsigned int state,
-			  int wake_flags)
-{
-	bool success = false;
-	unsigned long flags;
-	struct rq *rq;
-	int cpu;
-
-	get_cpu();
-
-	/*
-	 * If we are going to wake up a thread waiting for CONDITION we
-	 * need to ensure that CONDITION=1 done by the caller can not be
-	 * reordered with p->state check below. This pairs with mb() in
-	 * set_current_state() the waiting thread does.
-	 */
-	smp_mb__before_spinlock();
-
-	/*
-	 * No need to do time_lock_grq as we only need to update the rq clock
-	 * if we activate the task
-	 */
-	rq = task_grq_lock(p, &flags);
-	cpu = task_cpu(p);
-
-	/* state is a volatile long, どうして、分からない */
-	if (!((unsigned int)p->state & state))
-		goto out_unlock;
-
-	if (task_queued(p) || task_running(p))
-		goto out_running;
-
-	ttwu_activate(p, rq, wake_flags & WF_SYNC);
-	success = true;
-
-out_running:
-	ttwu_post_activation(p, rq, success);
-out_unlock:
-	task_grq_unlock(&flags);
-
-	ttwu_stat(p, cpu, wake_flags);
-
-	put_cpu();
-
-	return success;
-}
-
-/**
- * try_to_wake_up_local - try to wake up a local task with grq lock held
- * @p: the thread to be awakened
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that grq is locked and, @p is not the current task.
- * grq stays locked over invocation.
- */
-static void try_to_wake_up_local(struct task_struct *p)
-{
-	struct rq *rq = task_rq(p);
-	bool success = false;
-
-	lockdep_assert_held(&grq.lock);
-
-	if (!(p->state & TASK_NORMAL))
-		return;
-
-	if (!task_queued(p)) {
-		if (likely(!task_running(p))) {
-			schedstat_inc(rq, ttwu_count);
-			schedstat_inc(rq, ttwu_local);
-		}
-		ttwu_activate(p, rq, false);
-		ttwu_stat(p, smp_processor_id(), 0);
-		success = true;
-	}
-	ttwu_post_activation(p, rq, success);
-}
-
-/**
- * wake_up_process - Wake up a specific process
- * @p: The process to be woken up.
- *
- * Attempt to wake up the nominated process and move it to the set of runnable
- * processes.
- *
- * Return: 1 if the process was woken up, 0 if it was already running.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-int wake_up_process(struct task_struct *p)
-{
-	WARN_ON(task_is_stopped_or_traced(p));
-	return try_to_wake_up(p, TASK_NORMAL, 0);
-}
-EXPORT_SYMBOL(wake_up_process);
-
-int wake_up_state(struct task_struct *p, unsigned int state)
-{
-	return try_to_wake_up(p, state, 0);
-}
-
-static void time_slice_expired(struct task_struct *p);
-
-/*
- * Perform scheduler related setup for a newly forked process p.
- * p is forked by current.
- */
-int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
-{
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-	INIT_HLIST_HEAD(&p->preempt_notifiers);
-#endif
-	/*
-	 * The process state is set to the same value of the process executing
-	 * do_fork() code. That is running. This guarantees that nobody will
-	 * actually run it, and a signal or other external event cannot wake
-	 * it up and insert it on the runqueue either.
-	 */
-
-	/* Should be reset in fork.c but done here for ease of bfs patching */
-	p->on_rq =
-	p->utime =
-	p->stime =
-	p->utimescaled =
-	p->stimescaled =
-	p->sched_time =
-	p->stime_pc =
-	p->utime_pc = 0;
-
-	/*
-	 * Revert to default priority/policy on fork if requested.
-	 */
-	if (unlikely(p->sched_reset_on_fork)) {
-		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
-			p->policy = SCHED_NORMAL;
-			p->normal_prio = normal_prio(p);
-		}
-
-		if (PRIO_TO_NICE(p->static_prio) < 0) {
-			p->static_prio = NICE_TO_PRIO(0);
-			p->normal_prio = p->static_prio;
-		}
-
-		/*
-		 * We don't need the reset flag anymore after the fork. It has
-		 * fulfilled its duty:
-		 */
-		p->sched_reset_on_fork = 0;
-	}
-
-	INIT_LIST_HEAD(&p->run_list);
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
-	if (unlikely(sched_info_on()))
-		memset(&p->sched_info, 0, sizeof(p->sched_info));
-#endif
-	p->on_cpu = false;
-	clear_sticky(p);
-	init_task_preempt_count(p);
-	return 0;
-}
-
-/*
- * wake_up_new_task - wake up a newly created task for the first time.
- *
- * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created context, then puts the task
- * on the runqueue and wakes it.
- */
-void wake_up_new_task(struct task_struct *p)
-{
-	struct task_struct *parent;
-	unsigned long flags;
-	struct rq *rq;
-
-	parent = p->parent;
-	rq = task_grq_lock(p, &flags);
-
-	/*
-	 * Reinit new task deadline as its creator deadline could have changed
-	 * since call to dup_task_struct().
-	 */
-	p->deadline = rq->rq_deadline;
-
-	/*
-	 * If the task is a new process, current and parent are the same. If
-	 * the task is a new thread in the thread group, it will have much more
-	 * in common with current than with the parent.
-	 */
-	set_task_cpu(p, task_cpu(rq->curr));
-
-	/*
-	 * Make sure we do not leak PI boosting priority to the child.
-	 */
-	p->prio = rq->curr->normal_prio;
-
-	activate_task(p, rq);
-	trace_sched_wakeup_new(p, 1);
-	if (unlikely(p->policy == SCHED_FIFO))
-		goto after_ts_init;
-
-	/*
-	 * Share the timeslice between parent and child, thus the
-	 * total amount of pending timeslices in the system doesn't change,
-	 * resulting in more scheduling fairness. If it's negative, it won't
-	 * matter since that's the same as being 0. current's time_slice is
-	 * actually in rq_time_slice when it's running, as is its last_ran
-	 * value. rq->rq_deadline is only modified within schedule() so it
-	 * is always equal to current->deadline.
-	 */
-	p->last_ran = rq->rq_last_ran;
-	if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
-		rq->rq_time_slice /= 2;
-		p->time_slice = rq->rq_time_slice;
-after_ts_init:
-		if (rq->curr == parent && !suitable_idle_cpus(p)) {
-			/*
-			 * The VM isn't cloned, so we're in a good position to
-			 * do child-runs-first in anticipation of an exec. This
-			 * usually avoids a lot of COW overhead.
-			 */
-			__set_tsk_resched(parent);
-		} else
-			try_preempt(p, rq);
-	} else {
-		if (rq->curr == parent) {
-			/*
-			* Forking task has run out of timeslice. Reschedule it and
-			* start its child with a new time slice and deadline. The
-			* child will end up running first because its deadline will
-			* be slightly earlier.
-			*/
-			rq->rq_time_slice = 0;
-			__set_tsk_resched(parent);
-		}
-		time_slice_expired(p);
-	}
-	task_grq_unlock(&flags);
-}
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-
-/**
- * preempt_notifier_register - tell me when current is being preempted & rescheduled
- * @notifier: notifier struct to register
- */
-void preempt_notifier_register(struct preempt_notifier *notifier)
-{
-	hlist_add_head(&notifier->link, &current->preempt_notifiers);
-}
-EXPORT_SYMBOL_GPL(preempt_notifier_register);
-
-/**
- * preempt_notifier_unregister - no longer interested in preemption notifications
- * @notifier: notifier struct to unregister
- *
- * This is safe to call from within a preemption notifier.
- */
-void preempt_notifier_unregister(struct preempt_notifier *notifier)
-{
-	hlist_del(&notifier->link);
-}
-EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
-{
-	struct preempt_notifier *notifier;
-
-	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
-		notifier->ops->sched_in(notifier, raw_smp_processor_id());
-}
-
-static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-				 struct task_struct *next)
-{
-	struct preempt_notifier *notifier;
-
-	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
-		notifier->ops->sched_out(notifier, next);
-}
-
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
-
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
-{
-}
-
-static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-				 struct task_struct *next)
-{
-}
-
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
-
-/**
- * prepare_task_switch - prepare to switch tasks
- * @rq: the runqueue preparing to switch
- * @next: the task we are going to switch to.
- *
- * This is called with the rq lock held and interrupts off. It must
- * be paired with a subsequent finish_task_switch after the context
- * switch.
- *
- * prepare_task_switch sets up locking and calls architecture specific
- * hooks.
- */
-static inline void
-prepare_task_switch(struct rq *rq, struct task_struct *prev,
-		    struct task_struct *next)
-{
-	sched_info_switch(rq, prev, next);
-	perf_event_task_sched_out(prev, next);
-	fire_sched_out_preempt_notifiers(prev, next);
-	prepare_lock_switch(rq, next);
-	prepare_arch_switch(next);
-	trace_sched_switch(prev, next);
-}
-
-/**
- * finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
- * @prev: the thread we just switched away from.
- *
- * finish_task_switch must be called after the context switch, paired
- * with a prepare_task_switch call before the context switch.
- * finish_task_switch will reconcile locking set up by prepare_task_switch,
- * and do any other architecture-specific cleanup actions.
- *
- * Note that we may have delayed dropping an mm in context_switch(). If
- * so, we finish that here outside of the runqueue lock.  (Doing it
- * with the lock held can cause deadlocks; see schedule() for
- * details.)
- *
- * The context switch have flipped the stack from under us and restored the
- * local variables which were saved when this task called schedule() in the
- * past. prev == current is still correct but we need to recalculate this_rq
- * because prev may have moved to another CPU.
- */
-static struct rq *finish_task_switch(struct task_struct *prev)
-	__releases(grq.lock)
-{
-	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
-	long prev_state;
-
-	rq->prev_mm = NULL;
-
-	/*
-	 * A task struct has one reference for the use as "current".
-	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
-	 * schedule one last time. The schedule call will never return, and
-	 * the scheduled task must drop that reference.
-	 * The test for TASK_DEAD must occur while the runqueue locks are
-	 * still held, otherwise prev could be scheduled on another cpu, die
-	 * there before we look at prev->state, and then the reference would
-	 * be dropped twice.
-	 *		Manfred Spraul <manfred@colorfullife.com>
-	 */
-	prev_state = prev->state;
-	vtime_task_switch(prev);
-	finish_arch_switch(prev);
-	perf_event_task_sched_in(prev, current);
-	finish_lock_switch(rq, prev);
-	finish_arch_post_lock_switch();
-
-	fire_sched_in_preempt_notifiers(current);
-	if (mm)
-		mmdrop(mm);
-	if (unlikely(prev_state == TASK_DEAD)) {
-		/*
-		 * Remove function-return probe instances associated with this
-		 * task and put them back on the free list.
-		 */
-		kprobe_flush_task(prev);
-		put_task_struct(prev);
-	}
-	return rq;
-}
-
-/**
- * schedule_tail - first thing a freshly forked thread must call.
- * @prev: the thread we just switched away from.
- */
-asmlinkage __visible void schedule_tail(struct task_struct *prev)
-	__releases(grq.lock)
-{
-	struct rq *rq;
-
-	/* finish_task_switch() drops rq->lock and enables preemption */
-	preempt_disable();
-	rq = finish_task_switch(prev);
-	preempt_enable();
-
-	if (current->set_child_tid)
-		put_user(task_pid_vnr(current), current->set_child_tid);
-}
-
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next)
-{
-	struct mm_struct *mm, *oldmm;
-
-	prepare_task_switch(rq, prev, next);
-
-	mm = next->mm;
-	oldmm = prev->active_mm;
-	/*
-	 * For paravirt, this is coupled with an exit in switch_to to
-	 * combine the page table reload and the switch backend into
-	 * one hypercall.
-	 */
-	arch_start_context_switch(prev);
-
-	if (!mm) {
-		next->active_mm = oldmm;
-		atomic_inc(&oldmm->mm_count);
-		enter_lazy_tlb(oldmm, next);
-	} else
-		switch_mm(oldmm, mm, next);
-
-	if (!prev->mm) {
-		prev->active_mm = NULL;
-		rq->prev_mm = oldmm;
-	}
-	/*
-	 * Since the runqueue lock will be released by the next
-	 * task (which is an invalid locking op but in the case
-	 * of the scheduler it's an obvious special-case), so we
-	 * do an early lockdep release here:
-	 */
-	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
-
-	/* Here we just switch the register state and the stack. */
-	context_tracking_task_switch(prev, next);
-	switch_to(prev, next, prev);
-
-	barrier();
-
-	return finish_task_switch(prev);
-}
-
-/*
- * nr_running, nr_uninterruptible and nr_context_switches:
- *
- * externally visible scheduler statistics: current number of runnable
- * threads, total number of context switches performed since bootup. All are
- * measured without grabbing the grq lock but the occasional inaccurate result
- * doesn't matter so long as it's positive.
- */
-unsigned long nr_running(void)
-{
-	long nr = grq.nr_running;
-
-	if (unlikely(nr < 0))
-		nr = 0;
-	return (unsigned long)nr;
-}
-
-static unsigned long nr_uninterruptible(void)
-{
-	long nu = grq.nr_uninterruptible;
-
-	if (unlikely(nu < 0))
-		nu = 0;
-	return nu;
-}
-
-/*
- * Check if only the current task is running on the cpu.
- */
-bool single_task_running(void)
-{
-	if (cpu_rq(smp_processor_id())->soft_affined == 1)
-		return true;
-	else
-		return false;
-}
-EXPORT_SYMBOL(single_task_running);
-
-unsigned long long nr_context_switches(void)
-{
-	long long ns = grq.nr_switches;
-
-	/* This is of course impossible */
-	if (unlikely(ns < 0))
-		ns = 1;
-	return (unsigned long long)ns;
-}
-
-unsigned long nr_iowait(void)
-{
-	unsigned long i, sum = 0;
-
-	for_each_possible_cpu(i)
-		sum += atomic_read(&cpu_rq(i)->nr_iowait);
-
-	return sum;
-}
-
-unsigned long nr_iowait_cpu(int cpu)
-{
-	struct rq *this = cpu_rq(cpu);
-	return atomic_read(&this->nr_iowait);
-}
-
-unsigned long nr_active(void)
-{
-	return nr_running() + nr_uninterruptible();
-}
-
-/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we
- * base it on the number of running or queued tasks with their ->rq pointer
- * set to this cpu as being the CPU they're more likely to run on. */
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
-{
-	struct rq *this = this_rq();
-
-	*nr_waiters = atomic_read(&this->nr_iowait);
-	*load = this->soft_affined;
-}
-
-/* Variables and functions for calc_load */
-static unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
-
-/**
- * get_avenrun - get the load average array
- * @loads:	pointer to dest load array
- * @offset:	offset to add
- * @shift:	shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-	loads[0] = (avenrun[0] + offset) << shift;
-	loads[1] = (avenrun[1] + offset) << shift;
-	loads[2] = (avenrun[2] + offset) << shift;
-}
-
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	return load >> FSHIFT;
-}
-
-/*
- * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
- */
-void calc_global_load(unsigned long ticks)
-{
-	long active;
-
-	if (time_before(jiffies, calc_load_update))
-		return;
-	active = nr_active() * FIXED_1;
-
-	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
-	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
-	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
-	calc_load_update = jiffies + LOAD_FREQ;
-}
-
-DEFINE_PER_CPU(struct kernel_stat, kstat);
-DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
-
-EXPORT_PER_CPU_SYMBOL(kstat);
-EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
-	sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
-	sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
-	__this_cpu_inc(irq_time_seq.sequence);
-	smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
-	smp_wmb();
-	__this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-	u64 irq_time;
-	unsigned seq;
-
-	do {
-		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
-		irq_time = per_cpu(cpu_softirq_time, cpu) +
-			   per_cpu(cpu_hardirq_time, cpu);
-	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
-	return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void irqtime_account_irq(struct task_struct *curr)
-{
-	unsigned long flags;
-	s64 delta;
-	int cpu;
-
-	if (!sched_clock_irqtime)
-		return;
-
-	local_irq_save(flags);
-
-	cpu = smp_processor_id();
-	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-	__this_cpu_add(irq_start_time, delta);
-
-	irq_time_write_begin();
-	/*
-	 * We do not account for softirq time from ksoftirqd here.
-	 * We want to continue accounting softirq time to ksoftirqd thread
-	 * in that case, so as not to confuse scheduler with a special task
-	 * that do not consume any time, but still wants to run.
-	 */
-	if (hardirq_count())
-		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-		__this_cpu_add(cpu_softirq_time, delta);
-
-	irq_time_write_end();
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
-	if (unlikely(steal > NSEC_PER_SEC))
-		return div_u64(steal, TICK_NSEC);
-
-	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-
-	/*
-	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
-	 * this case when a previous update_rq_clock() happened inside a
-	 * {soft,}irq region.
-	 *
-	 * When this happens, we stop ->clock_task and only update the
-	 * prev_irq_time stamp to account for the part that fit, so that a next
-	 * update will consume the rest. This ensures ->clock_task is
-	 * monotonic.
-	 *
-	 * It does however cause some slight miss-attribution of {soft,}irq
-	 * time, a more accurate solution would be to update the irq_time using
-	 * the current rq->clock timestamp, except that would require using
-	 * atomic ops.
-	 */
-	if (irq_delta > delta)
-		irq_delta = delta;
-
-	rq->prev_irq_time += irq_delta;
-	delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-	if (static_key_false((&paravirt_steal_rq_enabled))) {
-		s64 steal = paravirt_steal_clock(cpu_of(rq));
-
-		steal -= rq->prev_steal_time_rq;
-
-		if (unlikely(steal > delta))
-			steal = delta;
-
-		rq->prev_steal_time_rq += steal;
-
-		delta -= steal;
-	}
-#endif
-
-	rq->clock_task += delta;
-}
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
-#endif
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static void irqtime_account_hi_si(void)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	u64 latest_ns;
-
-	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
-	if (latest_ns > cpustat[CPUTIME_IRQ])
-		cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
-
-	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
-	if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
-		cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime	(0)
-
-static inline void irqtime_account_hi_si(void)
-{
-}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
-	if (static_key_false(&paravirt_steal_enabled)) {
-		u64 steal;
-		cputime_t steal_ct;
-
-		steal = paravirt_steal_clock(smp_processor_id());
-		steal -= this_rq()->prev_steal_time;
-
-		/*
-		 * cputime_t may be less precise than nsecs (eg: if it's
-		 * based on jiffies). Lets cast the result to cputime
-		 * granularity and account the rest on the next rounds.
-		 */
-		steal_ct = nsecs_to_cputime(steal);
-		this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
-
-		account_steal_time(steal_ct);
-		return steal_ct;
-	}
-#endif
-	return false;
-}
-
-/*
- * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
- * tasks (sum on group iteration) belonging to @tsk's group.
- */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
-	struct signal_struct *sig = tsk->signal;
-	cputime_t utime, stime;
-	struct task_struct *t;
-	unsigned int seq, nextseq;
-	unsigned long flags;
-
-	rcu_read_lock();
-	/* Attempt a lockless read on the first round. */
-	nextseq = 0;
-	do {
-		seq = nextseq;
-		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
-		times->utime = sig->utime;
-		times->stime = sig->stime;
-		times->sum_exec_runtime = sig->sum_sched_runtime;
-
-		for_each_thread(tsk, t) {
-			task_cputime(t, &utime, &stime);
-			times->utime += utime;
-			times->stime += stime;
-			times->sum_exec_runtime += task_sched_runtime(t);
-		}
-		/* If lockless access failed, take the lock. */
-		nextseq = 1;
-	} while (need_seqretry(&sig->stats_lock, seq));
-	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
-	rcu_read_unlock();
-}
-
-/*
- * On each tick, see what percentage of that tick was attributed to each
- * component and add the percentage to the _pc values. Once a _pc value has
- * accumulated one tick's worth, account for that. This means the total
- * percentage of load components will always be 128 (pseudo 100) per tick.
- */
-static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-	if (atomic_read(&rq->nr_iowait) > 0) {
-		rq->iowait_pc += pc;
-		if (rq->iowait_pc >= 128) {
-			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128;
-			rq->iowait_pc %= 128;
-		}
-	} else {
-		rq->idle_pc += pc;
-		if (rq->idle_pc >= 128) {
-			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128;
-			rq->idle_pc %= 128;
-		}
-	}
-	acct_update_integrals(idle);
-}
-
-static void
-pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
-	       unsigned long pc, unsigned long ns)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-
-	p->stime_pc += pc;
-	if (p->stime_pc >= 128) {
-		int jiffs = p->stime_pc / 128;
-
-		p->stime_pc %= 128;
-		p->stime += (__force u64)cputime_one_jiffy * jiffs;
-		p->stimescaled += one_jiffy_scaled * jiffs;
-		account_group_system_time(p, cputime_one_jiffy * jiffs);
-	}
-	p->sched_time += ns;
-	account_group_exec_runtime(p, ns);
-
-	if (hardirq_count() - hardirq_offset) {
-		rq->irq_pc += pc;
-		if (rq->irq_pc >= 128) {
-			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128;
-			rq->irq_pc %= 128;
-		}
-	} else if (in_serving_softirq()) {
-		rq->softirq_pc += pc;
-		if (rq->softirq_pc >= 128) {
-			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
-			rq->softirq_pc %= 128;
-		}
-	} else {
-		rq->system_pc += pc;
-		if (rq->system_pc >= 128) {
-			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128;
-			rq->system_pc %= 128;
-		}
-	}
-	acct_update_integrals(p);
-}
-
-static void pc_user_time(struct rq *rq, struct task_struct *p,
-			 unsigned long pc, unsigned long ns)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-
-	p->utime_pc += pc;
-	if (p->utime_pc >= 128) {
-		int jiffs = p->utime_pc / 128;
-
-		p->utime_pc %= 128;
-		p->utime += (__force u64)cputime_one_jiffy * jiffs;
-		p->utimescaled += one_jiffy_scaled * jiffs;
-		account_group_user_time(p, cputime_one_jiffy * jiffs);
-	}
-	p->sched_time += ns;
-	account_group_exec_runtime(p, ns);
-
-	if (this_cpu_ksoftirqd() == p) {
-		/*
-		 * ksoftirqd time do not get accounted in cpu_softirq_time.
-		 * So, we have to handle it separately here.
-		 */
-		rq->softirq_pc += pc;
-		if (rq->softirq_pc >= 128) {
-			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
-			rq->softirq_pc %= 128;
-		}
-	}
-
-	if (task_nice(p) > 0 || idleprio_task(p)) {
-		rq->nice_pc += pc;
-		if (rq->nice_pc >= 128) {
-			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128;
-			rq->nice_pc %= 128;
-		}
-	} else {
-		rq->user_pc += pc;
-		if (rq->user_pc >= 128) {
-			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128;
-			rq->user_pc %= 128;
-		}
-	}
-	acct_update_integrals(p);
-}
-
-/*
- * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
- * shifts instead of 100
- */
-#define NS_TO_PC(NS)	(NS * 128 / JIFFY_NS)
-
-/*
- * This is called on clock ticks.
- * Bank in p->sched_time the ns elapsed since the last tick or switch.
- * CPU scheduler quota accounting is also performed here in microseconds.
- */
-static void
-update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
-{
-	long account_ns = rq->clock_task - rq->rq_last_ran;
-	struct task_struct *idle = rq->idle;
-	unsigned long account_pc;
-
-	if (unlikely(account_ns < 0) || steal_account_process_tick())
-		goto ts_account;
-
-	account_pc = NS_TO_PC(account_ns);
-
-	/* Accurate tick timekeeping */
-	if (user_mode(get_irq_regs()))
-		pc_user_time(rq, p, account_pc, account_ns);
-	else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
-		pc_system_time(rq, p, HARDIRQ_OFFSET,
-			       account_pc, account_ns);
-	else
-		pc_idle_time(rq, idle, account_pc);
-
-	if (sched_clock_irqtime)
-		irqtime_account_hi_si();
-
-ts_account:
-	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
-	if (rq->rq_policy != SCHED_FIFO && p != idle) {
-		s64 time_diff = rq->clock - rq->timekeep_clock;
-
-		niffy_diff(&time_diff, 1);
-		rq->rq_time_slice -= NS_TO_US(time_diff);
-	}
-
-	rq->rq_last_ran = rq->clock_task;
-	rq->timekeep_clock = rq->clock;
-}
-
-/*
- * This is called on context switches.
- * Bank in p->sched_time the ns elapsed since the last tick or switch.
- * CPU scheduler quota accounting is also performed here in microseconds.
- */
-static void
-update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
-{
-	long account_ns = rq->clock_task - rq->rq_last_ran;
-	struct task_struct *idle = rq->idle;
-	unsigned long account_pc;
-
-	if (unlikely(account_ns < 0))
-		goto ts_account;
-
-	account_pc = NS_TO_PC(account_ns);
-
-	/* Accurate subtick timekeeping */
-	if (p != idle) {
-		pc_user_time(rq, p, account_pc, account_ns);
-	}
-	else
-		pc_idle_time(rq, idle, account_pc);
-
-ts_account:
-	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
-	if (rq->rq_policy != SCHED_FIFO && p != idle) {
-		s64 time_diff = rq->clock - rq->timekeep_clock;
-
-		niffy_diff(&time_diff, 1);
-		rq->rq_time_slice -= NS_TO_US(time_diff);
-	}
-
-	rq->rq_last_ran = rq->clock_task;
-	rq->timekeep_clock = rq->clock;
-}
-
-/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_grq_lock() held.
- */
-static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
-	u64 ns = 0;
-
-	/*
-	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would
-	 * project cycles that may never be accounted to this
-	 * thread, breaking clock_gettime().
-	 */
-	if (p == rq->curr && p->on_rq) {
-		update_clocks(rq);
-		ns = rq->clock_task - rq->rq_last_ran;
-		if (unlikely((s64)ns < 0))
-			ns = 0;
-	}
-
-	return ns;
-}
-
-/*
- * Return accounted runtime for the task.
- * Return separately the current's pending runtime that have not been
- * accounted yet.
- *
- */
-unsigned long long task_sched_runtime(struct task_struct *p)
-{
-	unsigned long flags;
-	struct rq *rq;
-	u64 ns;
-
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
-	/*
-	 * 64-bit doesn't need locks to atomically read a 64bit value.
-	 * So we have a optimization chance when the task's delta_exec is 0.
-	 * Reading ->on_cpu is racy, but this is ok.
-	 *
-	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
-	 * If we race with it entering cpu, unaccounted time is 0. This is
-	 * indistinguishable from the read occurring a few cycles earlier.
-	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
-	 * been accounted, so we're correct here as well.
-	 */
-	if (!p->on_cpu || !p->on_rq)
-		return tsk_seruntime(p);
-#endif
-
-	rq = task_grq_lock(p, &flags);
-	ns = p->sched_time + do_task_delta_exec(p, rq);
-	task_grq_unlock(&flags);
-
-	return ns;
-}
-
-/* Compatibility crap */
-void account_user_time(struct task_struct *p, cputime_t cputime,
-		       cputime_t cputime_scaled)
-{
-}
-
-void account_idle_time(cputime_t cputime)
-{
-}
-
-void update_cpu_load_nohz(void)
-{
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-void calc_load_enter_idle(void)
-{
-}
-
-void calc_load_exit_idle(void)
-{
-}
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
-			       cputime_t cputime_scaled)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-	/* Add guest time to process. */
-	p->utime += (__force u64)cputime;
-	p->utimescaled += (__force u64)cputime_scaled;
-	account_group_user_time(p, cputime);
-	p->gtime += (__force u64)cputime;
-
-	/* Add guest time to cpustat. */
-	if (task_nice(p) > 0) {
-		cpustat[CPUTIME_NICE] += (__force u64)cputime;
-		cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
-	} else {
-		cpustat[CPUTIME_USER] += (__force u64)cputime;
-		cpustat[CPUTIME_GUEST] += (__force u64)cputime;
-	}
-}
-
-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
-			cputime_t cputime_scaled, cputime64_t *target_cputime64)
-{
-	/* Add system time to process. */
-	p->stime += (__force u64)cputime;
-	p->stimescaled += (__force u64)cputime_scaled;
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	*target_cputime64 += (__force u64)cputime;
-
-	/* Account for system time used */
-	acct_update_integrals(p);
-}
-
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * This is for guest only now.
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
-			 cputime_t cputime, cputime_t cputime_scaled)
-{
-
-	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
-		account_guest_time(p, cputime, cputime_scaled);
-}
-
-/*
- * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-	cpustat[CPUTIME_STEAL] += (__force u64)cputime;
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-static void account_idle_times(cputime_t cputime)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	struct rq *rq = this_rq();
-
-	if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
-	else
-		cpustat[CPUTIME_IDLE] += (__force u64)cputime;
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-
-void account_process_tick(struct task_struct *p, int user_tick)
-{
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
-	account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-	account_idle_times(jiffies_to_cputime(ticks));
-}
-#endif
-
-static inline void grq_iso_lock(void)
-	__acquires(grq.iso_lock)
-{
-	raw_spin_lock(&grq.iso_lock);
-}
-
-static inline void grq_iso_unlock(void)
-	__releases(grq.iso_lock)
-{
-	raw_spin_unlock(&grq.iso_lock);
-}
-
-/*
- * Functions to test for when SCHED_ISO tasks have used their allocated
- * quota as real time scheduling and convert them back to SCHED_NORMAL.
- * Where possible, the data is tested lockless, to avoid grabbing iso_lock
- * because the occasional inaccurate result won't matter. However the
- * tick data is only ever modified under lock. iso_refractory is only simply
- * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
- */
-static bool set_iso_refractory(void)
-{
-	grq.iso_refractory = true;
-	return grq.iso_refractory;
-}
-
-static bool clear_iso_refractory(void)
-{
-	grq.iso_refractory = false;
-	return grq.iso_refractory;
-}
-
-/*
- * Test if SCHED_ISO tasks have run longer than their alloted period as RT
- * tasks and set the refractory flag if necessary. There is 10% hysteresis
- * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
- * slow division.
- */
-static bool test_ret_isorefractory(struct rq *rq)
-{
-	if (likely(!grq.iso_refractory)) {
-		if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
-			return set_iso_refractory();
-	} else {
-		if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
-			return clear_iso_refractory();
-	}
-	return grq.iso_refractory;
-}
-
-static void iso_tick(void)
-{
-	grq_iso_lock();
-	grq.iso_ticks += 100;
-	grq_iso_unlock();
-}
-
-/* No SCHED_ISO task was running so decrease rq->iso_ticks */
-static inline void no_iso_tick(void)
-{
-	if (grq.iso_ticks) {
-		grq_iso_lock();
-		grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
-		if (unlikely(grq.iso_refractory && grq.iso_ticks <
-		    ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
-			clear_iso_refractory();
-		grq_iso_unlock();
-	}
-}
-
-/* This manages tasks that have run out of timeslice during a scheduler_tick */
-static void task_running_tick(struct rq *rq)
-{
-	struct task_struct *p;
-
-	/*
-	 * If a SCHED_ISO task is running we increment the iso_ticks. In
-	 * order to prevent SCHED_ISO tasks from causing starvation in the
-	 * presence of true RT tasks we account those as iso_ticks as well.
-	 */
-	if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
-		if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
-			iso_tick();
-	} else
-		no_iso_tick();
-
-	if (iso_queue(rq)) {
-		if (unlikely(test_ret_isorefractory(rq))) {
-			if (rq_running_iso(rq)) {
-				/*
-				 * SCHED_ISO task is running as RT and limit
-				 * has been hit. Force it to reschedule as
-				 * SCHED_NORMAL by zeroing its time_slice
-				 */
-				rq->rq_time_slice = 0;
-			}
-		}
-	}
-
-	/* SCHED_FIFO tasks never run out of timeslice. */
-	if (rq->rq_policy == SCHED_FIFO)
-		return;
-	/*
-	 * Tasks that were scheduled in the first half of a tick are not
-	 * allowed to run into the 2nd half of the next tick if they will
-	 * run out of time slice in the interim. Otherwise, if they have
-	 * less than RESCHED_US μs of time slice left they will be rescheduled.
-	 */
-	if (rq->dither) {
-		if (rq->rq_time_slice > HALF_JIFFY_US)
-			return;
-		else
-			rq->rq_time_slice = 0;
-	} else if (rq->rq_time_slice >= RESCHED_US)
-			return;
-
-	/* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
-	p = rq->curr;
-
-	grq_lock();
-	requeue_task(p);
-	__set_tsk_resched(p);
-	grq_unlock();
-}
-
-/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled. The data modified is all
- * local to struct rq so we don't need to grab grq lock.
- */
-void scheduler_tick(void)
-{
-	int cpu __maybe_unused = smp_processor_id();
-	struct rq *rq = cpu_rq(cpu);
-
-	sched_clock_tick();
-	/* grq lock not grabbed, so only update rq clock */
-	update_rq_clock(rq);
-	update_cpu_clock_tick(rq, rq->curr);
-	if (!rq_idle(rq))
-		task_running_tick(rq);
-	else
-		no_iso_tick();
-	rq->last_tick = rq->clock;
-	perf_event_task_tick();
-}
-
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
-	if (in_lock_functions(addr)) {
-		addr = CALLER_ADDR2;
-		if (in_lock_functions(addr))
-			addr = CALLER_ADDR3;
-	}
-	return addr;
-}
-
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-				defined(CONFIG_PREEMPT_TRACER))
-void preempt_count_add(int val)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
-	/*
-	 * Underflow?
-	 */
-	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
-		return;
-#endif
-	__preempt_count_add(val);
-#ifdef CONFIG_DEBUG_PREEMPT
-	/*
-	 * Spinlock count overflowing soon?
-	 */
-	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
-				PREEMPT_MASK - 10);
-#endif
-	if (preempt_count() == val) {
-		unsigned long ip = get_parent_ip(CALLER_ADDR1);
-#ifdef CONFIG_DEBUG_PREEMPT
-		current->preempt_disable_ip = ip;
-#endif
-		trace_preempt_off(CALLER_ADDR0, ip);
-	}
-}
-EXPORT_SYMBOL(preempt_count_add);
-NOKPROBE_SYMBOL(preempt_count_add);
-
-void preempt_count_sub(int val)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
-	/*
-	 * Underflow?
-	 */
-	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
-		return;
-	/*
-	 * Is the spinlock portion underflowing?
-	 */
-	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
-			!(preempt_count() & PREEMPT_MASK)))
-		return;
-#endif
-
-	if (preempt_count() == val)
-		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
-	__preempt_count_sub(val);
-}
-EXPORT_SYMBOL(preempt_count_sub);
-NOKPROBE_SYMBOL(preempt_count_sub);
-#endif
-
-/*
- * Deadline is "now" in niffies + (offset by priority). Setting the deadline
- * is the key to everything. It distributes cpu fairly amongst tasks of the
- * same nice value, it proportions cpu according to nice level, it means the
- * task that last woke up the longest ago has the earliest deadline, thus
- * ensuring that interactive tasks get low latency on wake up. The CPU
- * proportion works out to the square of the virtual deadline difference, so
- * this equation will give nice 19 3% CPU compared to nice 0.
- */
-static inline u64 prio_deadline_diff(int user_prio)
-{
-	return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
-}
-
-static inline u64 task_deadline_diff(struct task_struct *p)
-{
-	return prio_deadline_diff(TASK_USER_PRIO(p));
-}
-
-static inline u64 static_deadline_diff(int static_prio)
-{
-	return prio_deadline_diff(USER_PRIO(static_prio));
-}
-
-static inline int longest_deadline_diff(void)
-{
-	return prio_deadline_diff(39);
-}
-
-static inline int ms_longest_deadline_diff(void)
-{
-	return NS_TO_MS(longest_deadline_diff());
-}
-
-/*
- * The time_slice is only refilled when it is empty and that is when we set a
- * new deadline.
- */
-static void time_slice_expired(struct task_struct *p)
-{
-	p->time_slice = timeslice();
-	p->deadline = grq.niffies + task_deadline_diff(p);
-#ifdef CONFIG_SMT_NICE
-	if (!p->mm)
-		p->smt_bias = 0;
-	else if (rt_task(p))
-		p->smt_bias = 1 << 30;
-	else if (task_running_iso(p))
-		p->smt_bias = 1 << 29;
-	else if (idleprio_task(p)) {
-		if (task_running_idle(p))
-			p->smt_bias = 0;
-		else
-			p->smt_bias = 1;
-	} else if (--p->smt_bias < 1)
-		p->smt_bias = MAX_PRIO - p->static_prio;
-#endif
-}
-
-/*
- * Timeslices below RESCHED_US are considered as good as expired as there's no
- * point rescheduling when there's so little time left. SCHED_BATCH tasks
- * have been flagged be not latency sensitive and likely to be fully CPU
- * bound so every time they're rescheduled they have their time_slice
- * refilled, but get a new later deadline to have little effect on
- * SCHED_NORMAL tasks.
-
- */
-static inline void check_deadline(struct task_struct *p)
-{
-	if (p->time_slice < RESCHED_US || batch_task(p))
-		time_slice_expired(p);
-}
-
-#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
-
-/*
- * Scheduler queue bitmap specific find next bit.
- */
-static inline unsigned long
-next_sched_bit(const unsigned long *addr, unsigned long offset)
-{
-	const unsigned long *p;
-	unsigned long result;
-	unsigned long size;
-	unsigned long tmp;
-
-	size = PRIO_LIMIT;
-	if (offset >= size)
-		return size;
-
-	p = addr + BITOP_WORD(offset);
-	result = offset & ~(BITS_PER_LONG-1);
-	size -= result;
-	offset %= BITS_PER_LONG;
-	if (offset) {
-		tmp = *(p++);
-		tmp &= (~0UL << offset);
-		if (size < BITS_PER_LONG)
-			goto found_first;
-		if (tmp)
-			goto found_middle;
-		size -= BITS_PER_LONG;
-		result += BITS_PER_LONG;
-	}
-	while (size & ~(BITS_PER_LONG-1)) {
-		if ((tmp = *(p++)))
-			goto found_middle;
-		result += BITS_PER_LONG;
-		size -= BITS_PER_LONG;
-	}
-	if (!size)
-		return result;
-	tmp = *p;
-
-found_first:
-	tmp &= (~0UL >> (BITS_PER_LONG - size));
-	if (tmp == 0UL)		/* Are any bits set? */
-		return result + size;	/* Nope. */
-found_middle:
-	return result + __ffs(tmp);
-}
-
-/*
- * O(n) lookup of all tasks in the global runqueue. The real brainfuck
- * of lock contention and O(n). It's not really O(n) as only the queued,
- * but not running tasks are scanned, and is O(n) queued in the worst case
- * scenario only because the right task can be found before scanning all of
- * them.
- * Tasks are selected in this order:
- * Real time tasks are selected purely by their static priority and in the
- * order they were queued, so the lowest value idx, and the first queued task
- * of that priority value is chosen.
- * If no real time tasks are found, the SCHED_ISO priority is checked, and
- * all SCHED_ISO tasks have the same priority value, so they're selected by
- * the earliest deadline value.
- * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
- * earliest deadline.
- * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
- * selected by the earliest deadline.
- */
-static inline struct
-task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
-{
-	struct task_struct *edt = NULL;
-	unsigned long idx = -1;
-
-	do {
-		struct list_head *queue;
-		struct task_struct *p;
-		u64 earliest_deadline;
-
-		idx = next_sched_bit(grq.prio_bitmap, ++idx);
-		if (idx >= PRIO_LIMIT)
-			return idle;
-		queue = grq.queue + idx;
-
-		if (idx < MAX_RT_PRIO) {
-			/* We found an rt task */
-			list_for_each_entry(p, queue, run_list) {
-				/* Make sure cpu affinity is ok */
-				if (needs_other_cpu(p, cpu))
-					continue;
-				edt = p;
-				goto out_take;
-			}
-			/*
-			 * None of the RT tasks at this priority can run on
-			 * this cpu
-			 */
-			continue;
-		}
-
-		/*
-		 * No rt tasks. Find the earliest deadline task. Now we're in
-		 * O(n) territory.
-		 */
-		earliest_deadline = ~0ULL;
-		list_for_each_entry(p, queue, run_list) {
-			u64 dl;
-
-			/* Make sure cpu affinity is ok */
-			if (needs_other_cpu(p, cpu))
-				continue;
-
-#ifdef CONFIG_SMT_NICE
-			if (!smt_should_schedule(p, cpu))
-				continue;
-#endif
-			/*
-			 * Soft affinity happens here by not scheduling a task
-			 * with its sticky flag set that ran on a different CPU
-			 * last when the CPU is scaling, or by greatly biasing
-			 * against its deadline when not, based on cpu cache
-			 * locality.
-			 */
-			if (task_sticky(p) && task_rq(p) != rq) {
-				if (scaling_rq(rq))
-					continue;
-				dl = p->deadline << locality_diff(p, rq);
-			} else
-				dl = p->deadline;
-
-			if (deadline_before(dl, earliest_deadline)) {
-				earliest_deadline = dl;
-				edt = p;
-			}
-		}
-	} while (!edt);
-
-out_take:
-	take_task(cpu, edt);
-	return edt;
-}
-
-
-/*
- * Print scheduling while atomic bug:
- */
-static noinline void __schedule_bug(struct task_struct *prev)
-{
-	if (oops_in_progress)
-		return;
-
-	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
-		prev->comm, prev->pid, preempt_count());
-
-	debug_show_held_locks(prev);
-	print_modules();
-	if (irqs_disabled())
-		print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
-	if (in_atomic_preempt_off()) {
-		pr_err("Preemption disabled at:");
-		print_ip_sym(current->preempt_disable_ip);
-		pr_cont("\n");
-	}
-#endif
-	dump_stack();
-	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
-}
-
-/*
- * Various schedule()-time debugging checks and statistics:
- */
-static inline void schedule_debug(struct task_struct *prev)
-{
-#ifdef CONFIG_SCHED_STACK_END_CHECK
-	BUG_ON(unlikely(task_stack_end_corrupted(prev)));
-#endif
-	/*
-	 * Test if we are atomic. Since do_exit() needs to call into
-	 * schedule() atomically, we ignore that path. Otherwise whine
-	 * if we are scheduling when we should not.
-	 */
-	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
-		__schedule_bug(prev);
-	rcu_sleep_check();
-
-	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
-
-	schedstat_inc(this_rq(), sched_count);
-}
-
-/*
- * The currently running task's information is all stored in rq local data
- * which is only modified by the local CPU, thereby allowing the data to be
- * changed without grabbing the grq lock.
- */
-static inline void set_rq_task(struct rq *rq, struct task_struct *p)
-{
-	rq->rq_time_slice = p->time_slice;
-	rq->rq_deadline = p->deadline;
-	rq->rq_last_ran = p->last_ran = rq->clock_task;
-	rq->rq_policy = p->policy;
-	rq->rq_prio = p->prio;
-#ifdef CONFIG_SMT_NICE
-	rq->rq_mm = p->mm;
-	rq->rq_smt_bias = p->smt_bias;
-#endif
-	if (p != rq->idle)
-		rq->rq_running = true;
-	else
-		rq->rq_running = false;
-}
-
-static void reset_rq_task(struct rq *rq, struct task_struct *p)
-{
-	rq->rq_policy = p->policy;
-	rq->rq_prio = p->prio;
-#ifdef CONFIG_SMT_NICE
-	rq->rq_smt_bias = p->smt_bias;
-#endif
-}
-
-#ifdef CONFIG_SMT_NICE
-/* Iterate over smt siblings when we've scheduled a process on cpu and decide
- * whether they should continue running or be descheduled. */
-static void check_smt_siblings(int cpu)
-{
-	int other_cpu;
-
-	for_each_cpu(other_cpu, thread_cpumask(cpu)) {
-		struct task_struct *p;
-		struct rq *rq;
-
-		if (other_cpu == cpu)
-			continue;
-		rq = cpu_rq(other_cpu);
-		if (rq_idle(rq))
-			continue;
-		if (!rq->online)
-			continue;
-		p = rq->curr;
-		if (!smt_should_schedule(p, cpu)) {
-			set_tsk_need_resched(p);
-			smp_send_reschedule(other_cpu);
-		}
-	}
-}
-
-static void wake_smt_siblings(int cpu)
-{
-	int other_cpu;
-
-	if (!queued_notrunning())
-		return;
-
-	for_each_cpu(other_cpu, thread_cpumask(cpu)) {
-		struct rq *rq;
-
-		if (other_cpu == cpu)
-			continue;
-		rq = cpu_rq(other_cpu);
-		if (rq_idle(rq)) {
-			struct task_struct *p = rq->curr;
-
-			set_tsk_need_resched(p);
-			smp_send_reschedule(other_cpu);
-		}
-	}
-}
-#else
-static void check_smt_siblings(int __maybe_unused cpu) {}
-static void wake_smt_siblings(int __maybe_unused cpu) {}
-#endif
-
-/*
- * schedule() is the main scheduler function.
- *
- * The main means of driving the scheduler and thus entering this function are:
- *
- *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
- *
- *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
- *      paths. For example, see arch/x86/entry_64.S.
- *
- *      To drive preemption between tasks, the scheduler sets the flag in timer
- *      interrupt handler scheduler_tick().
- *
- *   3. Wakeups don't really cause entry into schedule(). They add a
- *      task to the run-queue and that's it.
- *
- *      Now, if the new task added to the run-queue preempts the current
- *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
- *      called on the nearest possible occasion:
- *
- *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
- *
- *         - in syscall or exception context, at the next outmost
- *           preempt_enable(). (this might be as soon as the wake_up()'s
- *           spin_unlock()!)
- *
- *         - in IRQ context, return from interrupt-handler to
- *           preemptible context
- *
- *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
- *         then at the next:
- *
- *          - cond_resched() call
- *          - explicit schedule() call
- *          - return from syscall or exception to user-space
- *          - return from interrupt-handler to user-space
- *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
- */
-static void __sched __schedule(void)
-{
-	struct task_struct *prev, *next, *idle;
-	unsigned long *switch_count;
-	bool deactivate;
-	struct rq *rq;
-	int cpu;
-
-need_resched:
-	deactivate = false;
-	preempt_disable();
-	cpu = smp_processor_id();
-	rq = cpu_rq(cpu);
-	rcu_note_context_switch();
-	prev = rq->curr;
-
-	schedule_debug(prev);
-
-	/*
-	 * Make sure that signal_pending_state()->signal_pending() below
-	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
-	 * done by the caller to avoid the race with signal_wake_up().
-	 */
-	smp_mb__before_spinlock();
-	grq_lock_irq();
-
-	switch_count = &prev->nivcsw;
-	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-		if (unlikely(signal_pending_state(prev->state, prev))) {
-			prev->state = TASK_RUNNING;
-		} else {
-			deactivate = true;
-			prev->on_rq = 0;
-
-			/*
-			 * If a worker is going to sleep, notify and
-			 * ask workqueue whether it wants to wake up a
-			 * task to maintain concurrency.  If so, wake
-			 * up the task.
-			 */
-			if (prev->flags & PF_WQ_WORKER) {
-				struct task_struct *to_wakeup;
-
-				to_wakeup = wq_worker_sleeping(prev, cpu);
-				if (to_wakeup) {
-					/* This shouldn't happen, but does */
-					if (unlikely(to_wakeup == prev))
-						deactivate = false;
-					else
-						try_to_wake_up_local(to_wakeup);
-				}
-			}
-		}
-		switch_count = &prev->nvcsw;
-	}
-
-	/*
-	 * If we are going to sleep and we have plugged IO queued, make
-	 * sure to submit it to avoid deadlocks.
-	 */
-	if (unlikely(deactivate && blk_needs_flush_plug(prev))) {
-		grq_unlock_irq();
-		preempt_enable_no_resched();
-		blk_schedule_flush_plug(prev);
-		goto need_resched;
-	}
-
-	update_clocks(rq);
-	update_cpu_clock_switch(rq, prev);
-	if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
-		rq->dither = false;
-	else
-		rq->dither = true;
-
-	clear_tsk_need_resched(prev);
-	clear_preempt_need_resched();
-
-	idle = rq->idle;
-	if (idle != prev) {
-		/* Update all the information stored on struct rq */
-		prev->time_slice = rq->rq_time_slice;
-		prev->deadline = rq->rq_deadline;
-		check_deadline(prev);
-		prev->last_ran = rq->clock_task;
-
-		/* Task changed affinity off this CPU */
-		if (likely(!needs_other_cpu(prev, cpu))) {
-			if (!deactivate) {
-				if (!queued_notrunning()) {
-					/*
-					 * We now know prev is the only thing that is
-					 * awaiting CPU so we can bypass rechecking for
-					 * the earliest deadline task and just run it
-					 * again.
-					 */
-					set_rq_task(rq, prev);
-					check_smt_siblings(cpu);
-					grq_unlock_irq();
-					goto rerun_prev_unlocked;
-				} else
-					swap_sticky(rq, cpu, prev);
-			}
-		}
-		return_task(prev, rq, deactivate);
-	}
-
-	if (unlikely(!queued_notrunning())) {
-		/*
-		 * This CPU is now truly idle as opposed to when idle is
-		 * scheduled as a high priority task in its own right.
-		 */
-		next = idle;
-		schedstat_inc(rq, sched_goidle);
-		set_cpuidle_map(cpu);
-	} else {
-		next = earliest_deadline_task(rq, cpu, idle);
-		if (likely(next->prio != PRIO_LIMIT))
-			clear_cpuidle_map(cpu);
-		else
-			set_cpuidle_map(cpu);
-	}
-
-	if (likely(prev != next)) {
-		/*
-		 * Don't reschedule an idle task or deactivated tasks
-		 */
-		if (prev != idle && !deactivate)
-			resched_suitable_idle(prev);
-		/*
-		 * Don't stick tasks when a real time task is going to run as
-		 * they may literally get stuck.
-		 */
-		if (rt_task(next))
-			unstick_task(rq, prev);
-		set_rq_task(rq, next);
-		if (next != idle)
-			check_smt_siblings(cpu);
-		else
-			wake_smt_siblings(cpu);
-		grq.nr_switches++;
-		prev->on_cpu = false;
-		next->on_cpu = true;
-		rq->curr = next;
-		++*switch_count;
-
-		rq = context_switch(rq, prev, next); /* unlocks the grq */
-		cpu = cpu_of(rq);
-		idle = rq->idle;
-	} else {
-		check_smt_siblings(cpu);
-		grq_unlock_irq();
-	}
-
-rerun_prev_unlocked:
-	sched_preempt_enable_no_resched();
-}
-
-asmlinkage __visible void __sched schedule(void)
-{
-	do {
-		__schedule();
-	} while (need_resched());
-}
-
-EXPORT_SYMBOL(schedule);
-
-#ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage __visible void __sched schedule_user(void)
-{
-	/*
-	 * If we come here after a random call to set_need_resched(),
-	 * or we have been woken up remotely but the IPI has not yet arrived,
-	 * we haven't yet exited the RCU idle mode. Do it here manually until
-	 * we find a better solution.
-	 *
-	 * NB: There are buggy callers of this function.  Ideally we
-	 * should warn if prev_state != IN_USER, but that will trigger
-	 * too frequently to make sense yet.
-	 */
-	enum ctx_state prev_state = exception_enter();
-	schedule();
-	exception_exit(prev_state);
-}
-#endif
-
-/**
- * schedule_preempt_disabled - called with preemption disabled
- *
- * Returns with preemption disabled. Note: preempt_count must be 1
- */
-void __sched schedule_preempt_disabled(void)
-{
-	sched_preempt_enable_no_resched();
-	schedule();
-	preempt_disable();
-}
-
-static void __sched notrace preempt_schedule_common(void)
-{
-	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
-		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
-	} while (need_resched());
-}
-
-#ifdef CONFIG_PREEMPT
-/*
- * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable. Kernel preemptions off return from interrupt
- * occur there and call schedule directly.
- */
-asmlinkage __visible void __sched notrace preempt_schedule(void)
-{
-	/*
-	 * If there is a non-zero preempt_count or interrupts are disabled,
-	 * we do not want to preempt the current task. Just return..
-	 */
-	if (likely(!preemptible()))
-		return;
-
-	preempt_schedule_common();
-}
-NOKPROBE_SYMBOL(preempt_schedule);
-EXPORT_SYMBOL(preempt_schedule);
-
-#ifdef CONFIG_CONTEXT_TRACKING
-/**
- * preempt_schedule_context - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- * infrastructure itself. But as tracing can happen in areas coming
- * from userspace or just about to enter userspace, a preempt enable
- * can occur before user_exit() is called. This will cause the scheduler
- * to be called when the system is still in usermode.
- *
- * To prevent this, the preempt_enable_notrace will use this function
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
-{
-	enum ctx_state prev_ctx;
-
-	if (likely(!preemptible()))
-		return;
-
-	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
-		/*
-		 * Needs preempt disabled in case user_exit() is traced
-		 * and the tracer calls preempt_enable_notrace() causing
-		 * an infinite recursion.
-		 */
-		prev_ctx = exception_enter();
-		__schedule();
-		exception_exit(prev_ctx);
-
-		__preempt_count_sub(PREEMPT_ACTIVE);
-		barrier();
-	} while (need_resched());
-}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
-
-#endif /* CONFIG_PREEMPT */
-
-/*
- * this is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
- */
-asmlinkage __visible void __sched preempt_schedule_irq(void)
-{
-	enum ctx_state prev_state;
-
-	/* Catch callers which need to be fixed */
-	BUG_ON(preempt_count() || !irqs_disabled());
-
-	prev_state = exception_enter();
-
-	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
-		local_irq_enable();
-		schedule();
-		local_irq_disable();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
-	} while (need_resched());
-
-	exception_exit(prev_state);
-}
-
-int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
-			  void *key)
-{
-	return try_to_wake_up(curr->private, mode, wake_flags);
-}
-EXPORT_SYMBOL(default_wake_function);
-
-#ifdef CONFIG_RT_MUTEXES
-
-/*
- * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
- *
- * This function changes the 'effective' priority of a task. It does
- * not touch ->normal_prio like __setscheduler().
- *
- * Used by the rt_mutex code to implement priority inheritance
- * logic. Call site only calls if the priority of the task changed.
- */
-void rt_mutex_setprio(struct task_struct *p, int prio)
-{
-	unsigned long flags;
-	int queued, oldprio;
-	struct rq *rq;
-
-	BUG_ON(prio < 0 || prio > MAX_PRIO);
-
-	rq = task_grq_lock(p, &flags);
-
-	/*
-	 * Idle task boosting is a nono in general. There is one
-	 * exception, when PREEMPT_RT and NOHZ is active:
-	 *
-	 * The idle task calls get_next_timer_interrupt() and holds
-	 * the timer wheel base->lock on the CPU and another CPU wants
-	 * to access the timer (probably to cancel it). We can safely
-	 * ignore the boosting request, as the idle CPU runs this code
-	 * with interrupts disabled and will complete the lock
-	 * protected section without being interrupted. So there is no
-	 * real need to boost.
-	 */
-	if (unlikely(p == rq->idle)) {
-		WARN_ON(p != rq->curr);
-		WARN_ON(p->pi_blocked_on);
-		goto out_unlock;
-	}
-
-	trace_sched_pi_setprio(p, prio);
-	oldprio = p->prio;
-	queued = task_queued(p);
-	if (queued)
-		dequeue_task(p);
-	p->prio = prio;
-	if (task_running(p) && prio > oldprio)
-		resched_task(p);
-	if (queued) {
-		enqueue_task(p, rq);
-		try_preempt(p, rq);
-	}
-
-out_unlock:
-	task_grq_unlock(&flags);
-}
-
-#endif
-
-/*
- * Adjust the deadline for when the priority is to change, before it's
- * changed.
- */
-static inline void adjust_deadline(struct task_struct *p, int new_prio)
-{
-	p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
-}
-
-void set_user_nice(struct task_struct *p, long nice)
-{
-	int queued, new_static, old_static;
-	unsigned long flags;
-	struct rq *rq;
-
-	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
-		return;
-	new_static = NICE_TO_PRIO(nice);
-	/*
-	 * We have to be careful, if called from sys_setpriority(),
-	 * the task might be in the middle of scheduling on another CPU.
-	 */
-	rq = time_task_grq_lock(p, &flags);
-	/*
-	 * The RT priorities are set via sched_setscheduler(), but we still
-	 * allow the 'normal' nice value to be set - but as expected
-	 * it wont have any effect on scheduling until the task is
-	 * not SCHED_NORMAL/SCHED_BATCH:
-	 */
-	if (has_rt_policy(p)) {
-		p->static_prio = new_static;
-		goto out_unlock;
-	}
-	queued = task_queued(p);
-	if (queued)
-		dequeue_task(p);
-
-	adjust_deadline(p, new_static);
-	old_static = p->static_prio;
-	p->static_prio = new_static;
-	p->prio = effective_prio(p);
-
-	if (queued) {
-		enqueue_task(p, rq);
-		if (new_static < old_static)
-			try_preempt(p, rq);
-	} else if (task_running(p)) {
-		reset_rq_task(rq, p);
-		if (old_static < new_static)
-			resched_task(p);
-	}
-out_unlock:
-	task_grq_unlock(&flags);
-}
-EXPORT_SYMBOL(set_user_nice);
-
-/*
- * can_nice - check if a task can reduce its nice value
- * @p: task
- * @nice: nice value
- */
-int can_nice(const struct task_struct *p, const int nice)
-{
-	/* convert nice value [19,-20] to rlimit style value [1,40] */
-	int nice_rlim = nice_to_rlimit(nice);
-
-	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
-		capable(CAP_SYS_NICE));
-}
-
-#ifdef __ARCH_WANT_SYS_NICE
-
-/*
- * sys_nice - change the priority of the current process.
- * @increment: priority increment
- *
- * sys_setpriority is a more generic, but much slower function that
- * does similar things.
- */
-SYSCALL_DEFINE1(nice, int, increment)
-{
-	long nice, retval;
-
-	/*
-	 * Setpriority might change our priority at the same moment.
-	 * We don't have to worry. Conceptually one call occurs first
-	 * and we have a single winner.
-	 */
-
-	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
-	nice = task_nice(current) + increment;
-
-	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
-	if (increment < 0 && !can_nice(current, nice))
-		return -EPERM;
-
-	retval = security_task_setnice(current, nice);
-	if (retval)
-		return retval;
-
-	set_user_nice(current, nice);
-	return 0;
-}
-
-#endif
-
-/**
- * task_prio - return the priority value of a given task.
- * @p: the task in question.
- *
- * Return: The priority value as seen by users in /proc.
- * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
- * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
- */
-int task_prio(const struct task_struct *p)
-{
-	int delta, prio = p->prio - MAX_RT_PRIO;
-
-	/* rt tasks and iso tasks */
-	if (prio <= 0)
-		goto out;
-
-	/* Convert to ms to avoid overflows */
-	delta = NS_TO_MS(p->deadline - grq.niffies);
-	delta = delta * 40 / ms_longest_deadline_diff();
-	if (delta > 0 && delta <= 80)
-		prio += delta;
-	if (idleprio_task(p))
-		prio += 40;
-out:
-	return prio;
-}
-
-/**
- * idle_cpu - is a given cpu idle currently?
- * @cpu: the processor in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int idle_cpu(int cpu)
-{
-	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
-}
-
-/**
- * idle_task - return the idle task for a given cpu.
- * @cpu: the processor in question.
- *
- * Return: The idle task for the cpu @cpu.
- */
-struct task_struct *idle_task(int cpu)
-{
-	return cpu_rq(cpu)->idle;
-}
-
-/**
- * find_process_by_pid - find a process with a matching PID value.
- * @pid: the pid in question.
- *
- * The task of @pid, if found. %NULL otherwise.
- */
-static inline struct task_struct *find_process_by_pid(pid_t pid)
-{
-	return pid ? find_task_by_vpid(pid) : current;
-}
-
-/* Actually do priority change: must hold grq lock. */
-static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
-			   int prio, bool keep_boost)
-{
-	int oldrtprio, oldprio;
-
-	p->policy = policy;
-	oldrtprio = p->rt_priority;
-	p->rt_priority = prio;
-	p->normal_prio = normal_prio(p);
-	oldprio = p->prio;
-	/*
-	 * Keep a potential priority boosting if called from
-	 * sched_setscheduler().
-	 */
-	if (keep_boost)
-		p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
-	else
-		p->prio = p->normal_prio;
-	if (task_running(p)) {
-		reset_rq_task(rq, p);
-		/* Resched only if we might now be preempted */
-		if (p->prio > oldprio || p->rt_priority > oldrtprio)
-			resched_task(p);
-	}
-}
-
-/*
- * check the target process has a UID that matches the current process's
- */
-static bool check_same_owner(struct task_struct *p)
-{
-	const struct cred *cred = current_cred(), *pcred;
-	bool match;
-
-	rcu_read_lock();
-	pcred = __task_cred(p);
-	match = (uid_eq(cred->euid, pcred->euid) ||
-		 uid_eq(cred->euid, pcred->uid));
-	rcu_read_unlock();
-	return match;
-}
-
-static int __sched_setscheduler(struct task_struct *p, int policy,
-				const struct sched_param *param, bool user)
-{
-	struct sched_param zero_param = { .sched_priority = 0 };
-	int queued, retval, oldpolicy = -1;
-	unsigned long flags, rlim_rtprio = 0;
-	int reset_on_fork;
-	struct rq *rq;
-
-	/* may grab non-irq protected spin_locks */
-	BUG_ON(in_interrupt());
-
-	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
-		unsigned long lflags;
-
-		if (!lock_task_sighand(p, &lflags))
-			return -ESRCH;
-		rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
-		unlock_task_sighand(p, &lflags);
-		if (rlim_rtprio)
-			goto recheck;
-		/*
-		 * If the caller requested an RT policy without having the
-		 * necessary rights, we downgrade the policy to SCHED_ISO.
-		 * We also set the parameter to zero to pass the checks.
-		 */
-		policy = SCHED_ISO;
-		param = &zero_param;
-	}
-recheck:
-	/* double check policy once rq lock held */
-	if (policy < 0) {
-		reset_on_fork = p->sched_reset_on_fork;
-		policy = oldpolicy = p->policy;
-	} else {
-		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
-		policy &= ~SCHED_RESET_ON_FORK;
-
-		if (!SCHED_RANGE(policy))
-			return -EINVAL;
-	}
-
-	/*
-	 * Valid priorities for SCHED_FIFO and SCHED_RR are
-	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
-	 * SCHED_BATCH is 0.
-	 */
-	if (param->sched_priority < 0 ||
-	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
-	    (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
-		return -EINVAL;
-	if (is_rt_policy(policy) != (param->sched_priority != 0))
-		return -EINVAL;
-
-	/*
-	 * Allow unprivileged RT tasks to decrease priority:
-	 */
-	if (user && !capable(CAP_SYS_NICE)) {
-		if (is_rt_policy(policy)) {
-			unsigned long rlim_rtprio =
-					task_rlimit(p, RLIMIT_RTPRIO);
-
-			/* can't set/change the rt policy */
-			if (policy != p->policy && !rlim_rtprio)
-				return -EPERM;
-
-			/* can't increase priority */
-			if (param->sched_priority > p->rt_priority &&
-			    param->sched_priority > rlim_rtprio)
-				return -EPERM;
-		} else {
-			switch (p->policy) {
-				/*
-				 * Can only downgrade policies but not back to
-				 * SCHED_NORMAL
-				 */
-				case SCHED_ISO:
-					if (policy == SCHED_ISO)
-						goto out;
-					if (policy == SCHED_NORMAL)
-						return -EPERM;
-					break;
-				case SCHED_BATCH:
-					if (policy == SCHED_BATCH)
-						goto out;
-					if (policy != SCHED_IDLEPRIO)
-						return -EPERM;
-					break;
-				case SCHED_IDLEPRIO:
-					if (policy == SCHED_IDLEPRIO)
-						goto out;
-					return -EPERM;
-				default:
-					break;
-			}
-		}
-
-		/* can't change other user's priorities */
-		if (!check_same_owner(p))
-			return -EPERM;
-
-		/* Normal users shall not reset the sched_reset_on_fork flag */
-		if (p->sched_reset_on_fork && !reset_on_fork)
-			return -EPERM;
-	}
-
-	if (user) {
-		retval = security_task_setscheduler(p);
-		if (retval)
-			return retval;
-	}
-
-	/*
-	 * make sure no PI-waiters arrive (or leave) while we are
-	 * changing the priority of the task:
-	 */
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	/*
-	 * To be able to change p->policy safely, the grunqueue lock must be
-	 * held.
-	 */
-	rq = __task_grq_lock(p);
-
-	/*
-	 * Changing the policy of the stop threads its a very bad idea
-	 */
-	if (p == rq->stop) {
-		__task_grq_unlock();
-		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-		return -EINVAL;
-	}
-
-	/*
-	 * If not changing anything there's no need to proceed further:
-	 */
-	if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
-			param->sched_priority == p->rt_priority))) {
-
-		__task_grq_unlock();
-		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-		return 0;
-	}
-
-	/* recheck policy now with rq lock held */
-	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
-		policy = oldpolicy = -1;
-		__task_grq_unlock();
-		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-		goto recheck;
-	}
-	update_clocks(rq);
-	p->sched_reset_on_fork = reset_on_fork;
-
-	queued = task_queued(p);
-	if (queued)
-		dequeue_task(p);
-	__setscheduler(p, rq, policy, param->sched_priority, true);
-	if (queued) {
-		enqueue_task(p, rq);
-		try_preempt(p, rq);
-	}
-	__task_grq_unlock();
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-	rt_mutex_adjust_pi(p);
-out:
-	return 0;
-}
-
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
-		       const struct sched_param *param)
-{
-	return __sched_setscheduler(p, policy, param, true);
-}
-
-EXPORT_SYMBOL_GPL(sched_setscheduler);
-
-int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
-{
-	const struct sched_param param = { .sched_priority = attr->sched_priority };
-	int policy = attr->sched_policy;
-
-	return __sched_setscheduler(p, policy, &param, true);
-}
-EXPORT_SYMBOL_GPL(sched_setattr);
-
-/**
- * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Just like sched_setscheduler, only don't bother checking if the
- * current context has permission.  For example, this is needed in
- * stop_machine(): we create temporary high priority worker threads,
- * but our caller might not have that capability.
- *
- * Return: 0 on success. An error code otherwise.
- */
-int sched_setscheduler_nocheck(struct task_struct *p, int policy,
-			       const struct sched_param *param)
-{
-	return __sched_setscheduler(p, policy, param, false);
-}
-
-static int
-do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
-{
-	struct sched_param lparam;
-	struct task_struct *p;
-	int retval;
-
-	if (!param || pid < 0)
-		return -EINVAL;
-	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
-		return -EFAULT;
-
-	rcu_read_lock();
-	retval = -ESRCH;
-	p = find_process_by_pid(pid);
-	if (p != NULL)
-		retval = sched_setscheduler(p, policy, &lparam);
-	rcu_read_unlock();
-
-	return retval;
-}
-
-/*
- * Mimics kernel/events/core.c perf_copy_attr().
- */
-static int sched_copy_attr(struct sched_attr __user *uattr,
-			   struct sched_attr *attr)
-{
-	u32 size;
-	int ret;
-
-	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
-		return -EFAULT;
-
-	/*
-	 * zero the full structure, so that a short copy will be nice.
-	 */
-	memset(attr, 0, sizeof(*attr));
-
-	ret = get_user(size, &uattr->size);
-	if (ret)
-		return ret;
-
-	if (size > PAGE_SIZE)	/* silly large */
-		goto err_size;
-
-	if (!size)		/* abi compat */
-		size = SCHED_ATTR_SIZE_VER0;
-
-	if (size < SCHED_ATTR_SIZE_VER0)
-		goto err_size;
-
-	/*
-	 * If we're handed a bigger struct than we know of,
-	 * ensure all the unknown bits are 0 - i.e. new
-	 * user-space does not rely on any kernel feature
-	 * extensions we dont know about yet.
-	 */
-	if (size > sizeof(*attr)) {
-		unsigned char __user *addr;
-		unsigned char __user *end;
-		unsigned char val;
-
-		addr = (void __user *)uattr + sizeof(*attr);
-		end  = (void __user *)uattr + size;
-
-		for (; addr < end; addr++) {
-			ret = get_user(val, addr);
-			if (ret)
-				return ret;
-			if (val)
-				goto err_size;
-		}
-		size = sizeof(*attr);
-	}
-
-	ret = copy_from_user(attr, uattr, size);
-	if (ret)
-		return -EFAULT;
-
-	/*
-	 * XXX: do we want to be lenient like existing syscalls; or do we want
-	 * to be strict and return an error on out-of-bounds values?
-	 */
-	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
-
-	/* sched/core.c uses zero here but we already know ret is zero */
-	return 0;
-
-err_size:
-	put_user(sizeof(*attr), &uattr->size);
-	return -E2BIG;
-}
-
-/**
- * sys_sched_setscheduler - set/change the scheduler policy and RT priority
- * @pid: the pid in question.
- * @policy: new policy.
- *
- * Return: 0 on success. An error code otherwise.
- * @param: structure containing the new RT priority.
- */
-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
-				       struct sched_param __user *param)
-{
-	/* negative values for policy are not valid */
-	if (policy < 0)
-		return -EINVAL;
-
-	return do_sched_setscheduler(pid, policy, param);
-}
-
-/*
- * sched_setparam() passes in -1 for its policy, to let the functions
- * it calls know not to change it.
- */
-#define SETPARAM_POLICY	-1
-
-/**
- * sys_sched_setparam - set/change the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
-	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
-}
-
-/**
- * sys_sched_setattr - same as above, but with extended sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- */
-SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
-			       unsigned int, flags)
-{
-	struct sched_attr attr;
-	struct task_struct *p;
-	int retval;
-
-	if (!uattr || pid < 0 || flags)
-		return -EINVAL;
-
-	retval = sched_copy_attr(uattr, &attr);
-	if (retval)
-		return retval;
-
-	if ((int)attr.sched_policy < 0)
-		return -EINVAL;
-
-	rcu_read_lock();
-	retval = -ESRCH;
-	p = find_process_by_pid(pid);
-	if (p != NULL)
-		retval = sched_setattr(p, &attr);
-	rcu_read_unlock();
-
-	return retval;
-}
-
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- *
- * Return: On success, the policy of the thread. Otherwise, a negative error
- * code.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-{
-	struct task_struct *p;
-	int retval = -EINVAL;
-
-	if (pid < 0)
-		goto out_nounlock;
-
-	retval = -ESRCH;
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	if (p) {
-		retval = security_task_getscheduler(p);
-		if (!retval)
-			retval = p->policy;
-	}
-	rcu_read_unlock();
-
-out_nounlock:
-	return retval;
-}
-
-/**
- * sys_sched_getscheduler - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- *
- * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
- * code.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
-	struct sched_param lp = { .sched_priority = 0 };
-	struct task_struct *p;
-	int retval = -EINVAL;
-
-	if (!param || pid < 0)
-		goto out_nounlock;
-
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	retval = -ESRCH;
-	if (!p)
-		goto out_unlock;
-
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
-
-	if (has_rt_policy(p))
-		lp.sched_priority = p->rt_priority;
-	rcu_read_unlock();
-
-	/*
-	 * This one might sleep, we cannot do it with a spinlock held ...
-	 */
-	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
-out_nounlock:
-	return retval;
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
-}
-
-static int sched_read_attr(struct sched_attr __user *uattr,
-			   struct sched_attr *attr,
-			   unsigned int usize)
-{
-	int ret;
-
-	if (!access_ok(VERIFY_WRITE, uattr, usize))
-		return -EFAULT;
-
-	/*
-	 * If we're handed a smaller struct than we know of,
-	 * ensure all the unknown bits are 0 - i.e. old
-	 * user-space does not get uncomplete information.
-	 */
-	if (usize < sizeof(*attr)) {
-		unsigned char *addr;
-		unsigned char *end;
-
-		addr = (void *)attr + usize;
-		end  = (void *)attr + sizeof(*attr);
-
-		for (; addr < end; addr++) {
-			if (*addr)
-				return -EFBIG;
-		}
-
-		attr->size = usize;
-	}
-
-	ret = copy_to_user(uattr, attr, attr->size);
-	if (ret)
-		return -EFAULT;
-
-	/* sched/core.c uses zero here but we already know ret is zero */
-	return ret;
-}
-
-/**
- * sys_sched_getattr - similar to sched_getparam, but with sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
-		unsigned int, size, unsigned int, flags)
-{
-	struct sched_attr attr = {
-		.size = sizeof(struct sched_attr),
-	};
-	struct task_struct *p;
-	int retval;
-
-	if (!uattr || pid < 0 || size > PAGE_SIZE ||
-	    size < SCHED_ATTR_SIZE_VER0 || flags)
-		return -EINVAL;
-
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	retval = -ESRCH;
-	if (!p)
-		goto out_unlock;
-
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
-
-	attr.sched_policy = p->policy;
-	if (rt_task(p))
-		attr.sched_priority = p->rt_priority;
-	else
-		attr.sched_nice = task_nice(p);
-
-	rcu_read_unlock();
-
-	retval = sched_read_attr(uattr, &attr, size);
-	return retval;
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
-}
-
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
-{
-	cpumask_var_t cpus_allowed, new_mask;
-	struct task_struct *p;
-	int retval;
-
-	get_online_cpus();
-	rcu_read_lock();
-
-	p = find_process_by_pid(pid);
-	if (!p) {
-		rcu_read_unlock();
-		put_online_cpus();
-		return -ESRCH;
-	}
-
-	/* Prevent p going away */
-	get_task_struct(p);
-	rcu_read_unlock();
-
-	if (p->flags & PF_NO_SETAFFINITY) {
-		retval = -EINVAL;
-		goto out_put_task;
-	}
-	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
-		retval = -ENOMEM;
-		goto out_put_task;
-	}
-	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
-		retval = -ENOMEM;
-		goto out_free_cpus_allowed;
-	}
-	retval = -EPERM;
-	if (!check_same_owner(p)) {
-		rcu_read_lock();
-		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
-			rcu_read_unlock();
-			goto out_unlock;
-		}
-		rcu_read_unlock();
-	}
-
-	retval = security_task_setscheduler(p);
-	if (retval)
-		goto out_unlock;
-
-	cpuset_cpus_allowed(p, cpus_allowed);
-	cpumask_and(new_mask, in_mask, cpus_allowed);
-again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
-
-	if (!retval) {
-		cpuset_cpus_allowed(p, cpus_allowed);
-		if (!cpumask_subset(new_mask, cpus_allowed)) {
-			/*
-			 * We must have raced with a concurrent cpuset
-			 * update. Just reset the cpus_allowed to the
-			 * cpuset's cpus_allowed
-			 */
-			cpumask_copy(new_mask, cpus_allowed);
-			goto again;
-		}
-	}
-out_unlock:
-	free_cpumask_var(new_mask);
-out_free_cpus_allowed:
-	free_cpumask_var(cpus_allowed);
-out_put_task:
-	put_task_struct(p);
-	put_online_cpus();
-	return retval;
-}
-
-static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-			     cpumask_t *new_mask)
-{
-	if (len < sizeof(cpumask_t)) {
-		memset(new_mask, 0, sizeof(cpumask_t));
-	} else if (len > sizeof(cpumask_t)) {
-		len = sizeof(cpumask_t);
-	}
-	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
-}
-
-
-/**
- * sys_sched_setaffinity - set the cpu affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new cpu mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
-		unsigned long __user *, user_mask_ptr)
-{
-	cpumask_var_t new_mask;
-	int retval;
-
-	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
-		return -ENOMEM;
-
-	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
-	if (retval == 0)
-		retval = sched_setaffinity(pid, new_mask);
-	free_cpumask_var(new_mask);
-	return retval;
-}
-
-long sched_getaffinity(pid_t pid, cpumask_t *mask)
-{
-	struct task_struct *p;
-	unsigned long flags;
-	int retval;
-
-	get_online_cpus();
-	rcu_read_lock();
-
-	retval = -ESRCH;
-	p = find_process_by_pid(pid);
-	if (!p)
-		goto out_unlock;
-
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
-
-	grq_lock_irqsave(&flags);
-	cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask);
-	grq_unlock_irqrestore(&flags);
-
-out_unlock:
-	rcu_read_unlock();
-	put_online_cpus();
-
-	return retval;
-}
-
-/**
- * sys_sched_getaffinity - get the cpu affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current cpu mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
-		unsigned long __user *, user_mask_ptr)
-{
-	int ret;
-	cpumask_var_t mask;
-
-	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
-		return -EINVAL;
-	if (len & (sizeof(unsigned long)-1))
-		return -EINVAL;
-
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-
-	ret = sched_getaffinity(pid, mask);
-	if (ret == 0) {
-		size_t retlen = min_t(size_t, len, cpumask_size());
-
-		if (copy_to_user(user_mask_ptr, mask, retlen))
-			ret = -EFAULT;
-		else
-			ret = retlen;
-	}
-	free_cpumask_var(mask);
-
-	return ret;
-}
-
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. It does this by
- * scheduling away the current task. If it still has the earliest deadline
- * it will be scheduled again as the next task.
- *
- * Return: 0.
- */
-SYSCALL_DEFINE0(sched_yield)
-{
-	struct task_struct *p;
-
-	p = current;
-	grq_lock_irq();
-	schedstat_inc(task_rq(p), yld_count);
-	requeue_task(p);
-
-	/*
-	 * Since we are going to call schedule() anyway, there's
-	 * no need to preempt or enable interrupts:
-	 */
-	__release(grq.lock);
-	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
-	do_raw_spin_unlock(&grq.lock);
-	sched_preempt_enable_no_resched();
-
-	schedule();
-
-	return 0;
-}
-
-int __sched _cond_resched(void)
-{
-	if (should_resched()) {
-		preempt_schedule_common();
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(_cond_resched);
-
-/*
- * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-int __cond_resched_lock(spinlock_t *lock)
-{
-	int resched = should_resched();
-	int ret = 0;
-
-	lockdep_assert_held(lock);
-
-	if (spin_needbreak(lock) || resched) {
-		spin_unlock(lock);
-		if (resched)
-			preempt_schedule_common();
-		else
-			cpu_relax();
-		ret = 1;
-		spin_lock(lock);
-	}
-	return ret;
-}
-EXPORT_SYMBOL(__cond_resched_lock);
-
-int __sched __cond_resched_softirq(void)
-{
-	BUG_ON(!in_softirq());
-
-	if (should_resched()) {
-		local_bh_enable();
-		preempt_schedule_common();
-		local_bh_disable();
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
-
-/**
- * yield - yield the current processor to other threads.
- *
- * Do not ever use this function, there's a 99% chance you're doing it wrong.
- *
- * The scheduler is at all times free to pick the calling task as the most
- * eligible task to run, if removing the yield() call from your code breaks
- * it, its already broken.
- *
- * Typical broken usage is:
- *
- * while (!event)
- * 	yield();
- *
- * where one assumes that yield() will let 'the other' process run that will
- * make event true. If the current task is a SCHED_FIFO task that will never
- * happen. Never use yield() as a progress guarantee!!
- *
- * If you want to use yield() to wait for something, use wait_event().
- * If you want to use yield() to be 'nice' for others, use cond_resched().
- * If you still want to use yield(), do not!
- */
-void __sched yield(void)
-{
-	set_current_state(TASK_RUNNING);
-	sys_sched_yield();
-}
-EXPORT_SYMBOL(yield);
-
-/**
- * yield_to - yield the current processor to another thread in
- * your thread group, or accelerate that thread toward the
- * processor it's on.
- * @p: target task
- * @preempt: whether task preemption is allowed or not
- *
- * It's the caller's job to ensure that the target task struct
- * can't go away on us before we can do any checks.
- *
- * Return:
- *	true (>0) if we indeed boosted the target task.
- *	false (0) if we failed to boost the target.
- *	-ESRCH if there's no task to yield to.
- */
-int __sched yield_to(struct task_struct *p, bool preempt)
-{
-	struct rq *rq, *p_rq;
-	unsigned long flags;
-	int yielded = 0;
-
-	rq = this_rq();
-	grq_lock_irqsave(&flags);
-	if (task_running(p) || p->state) {
-		yielded = -ESRCH;
-		goto out_unlock;
-	}
-
-	p_rq = task_rq(p);
-	yielded = 1;
-	if (p->deadline > rq->rq_deadline)
-		p->deadline = rq->rq_deadline;
-	p->time_slice += rq->rq_time_slice;
-	rq->rq_time_slice = 0;
-	if (p->time_slice > timeslice())
-		p->time_slice = timeslice();
-	if (preempt && rq != p_rq)
-		resched_curr(p_rq);
-out_unlock:
-	grq_unlock_irqrestore(&flags);
-
-	if (yielded > 0)
-		schedule();
-	return yielded;
-}
-EXPORT_SYMBOL_GPL(yield_to);
-
-/*
- * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
- * that process accounting knows that this is a task in IO wait state.
- *
- * But don't do that if it is a deliberate, throttling IO wait (this task
- * has set its backing_dev_info: the queue against which it should throttle)
- */
-
-long __sched io_schedule_timeout(long timeout)
-{
-	int old_iowait = current->in_iowait;
-	struct rq *rq;
-	long ret;
-
-	current->in_iowait = 1;
-	blk_schedule_flush_plug(current);
-
-	delayacct_blkio_start();
-	rq = raw_rq();
-	atomic_inc(&rq->nr_iowait);
-	ret = schedule_timeout(timeout);
-	current->in_iowait = old_iowait;
-	atomic_dec(&rq->nr_iowait);
-	delayacct_blkio_end();
-
-	return ret;
-}
-EXPORT_SYMBOL(io_schedule_timeout);
-
-/**
- * sys_sched_get_priority_max - return maximum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the maximum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
-{
-	int ret = -EINVAL;
-
-	switch (policy) {
-	case SCHED_FIFO:
-	case SCHED_RR:
-		ret = MAX_USER_RT_PRIO-1;
-		break;
-	case SCHED_NORMAL:
-	case SCHED_BATCH:
-	case SCHED_ISO:
-	case SCHED_IDLEPRIO:
-		ret = 0;
-		break;
-	}
-	return ret;
-}
-
-/**
- * sys_sched_get_priority_min - return minimum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the minimum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
-{
-	int ret = -EINVAL;
-
-	switch (policy) {
-	case SCHED_FIFO:
-	case SCHED_RR:
-		ret = 1;
-		break;
-	case SCHED_NORMAL:
-	case SCHED_BATCH:
-	case SCHED_ISO:
-	case SCHED_IDLEPRIO:
-		ret = 0;
-		break;
-	}
-	return ret;
-}
-
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
-		struct timespec __user *, interval)
-{
-	struct task_struct *p;
-	unsigned int time_slice;
-	unsigned long flags;
-	int retval;
-	struct timespec t;
-
-	if (pid < 0)
-		return -EINVAL;
-
-	retval = -ESRCH;
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	if (!p)
-		goto out_unlock;
-
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
-
-	grq_lock_irqsave(&flags);
-	time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
-	grq_unlock_irqrestore(&flags);
-
-	rcu_read_unlock();
-	t = ns_to_timespec(time_slice);
-	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
-	return retval;
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
-}
-
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
-void sched_show_task(struct task_struct *p)
-{
-	unsigned long free = 0;
-	int ppid;
-	unsigned long state = p->state;
-
-	if (state)
-		state = __ffs(state) + 1;
-	printk(KERN_INFO "%-15.15s %c", p->comm,
-		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT " running  ");
-	else
-		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT "  running task    ");
-	else
-		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
-#ifdef CONFIG_DEBUG_STACK_USAGE
-	free = stack_not_used(p);
-#endif
-	ppid = 0;
-	rcu_read_lock();
-	if (pid_alive(p))
-		ppid = task_pid_nr(rcu_dereference(p->real_parent));
-	rcu_read_unlock();
-	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-		task_pid_nr(p), ppid,
-		(unsigned long)task_thread_info(p)->flags);
-
-	print_worker_info(KERN_INFO, p);
-	show_stack(p, NULL);
-}
-
-void show_state_filter(unsigned long state_filter)
-{
-	struct task_struct *g, *p;
-
-#if BITS_PER_LONG == 32
-	printk(KERN_INFO
-		"  task                PC stack   pid father\n");
-#else
-	printk(KERN_INFO
-		"  task                        PC stack   pid father\n");
-#endif
-	rcu_read_lock();
-	for_each_process_thread(g, p) {
-		/*
-		 * reset the NMI-timeout, listing all files on a slow
-		 * console might take a lot of time:
-		 */
-		touch_nmi_watchdog();
-		if (!state_filter || (p->state & state_filter))
-			sched_show_task(p);
-	}
-
-	touch_all_softlockup_watchdogs();
-
-	rcu_read_unlock();
-	/*
-	 * Only show locks if all tasks are dumped:
-	 */
-	if (!state_filter)
-		debug_show_all_locks();
-}
-
-void dump_cpu_task(int cpu)
-{
-	pr_info("Task dump for CPU %d:\n", cpu);
-	sched_show_task(cpu_curr(cpu));
-}
-
-#ifdef CONFIG_SMP
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-	cpumask_copy(tsk_cpus_allowed(p), new_mask);
-}
-#endif
-
-/**
- * init_idle - set up an idle thread for a given CPU
- * @idle: task in question
- * @cpu: cpu the idle task belongs to
- *
- * NOTE: this function does not set the idle thread's NEED_RESCHED
- * flag, to make booting more robust.
- */
-void init_idle(struct task_struct *idle, int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	time_grq_lock(rq, &flags);
-	idle->last_ran = rq->clock_task;
-	idle->state = TASK_RUNNING;
-	/* Setting prio to illegal value shouldn't matter when never queued */
-	idle->prio = PRIO_LIMIT;
-#ifdef CONFIG_SMT_NICE
-	idle->smt_bias = 0;
-#endif
-	set_rq_task(rq, idle);
-	do_set_cpus_allowed(idle, get_cpu_mask(cpu));
-	/* Silence PROVE_RCU */
-	rcu_read_lock();
-	set_task_cpu(idle, cpu);
-	rcu_read_unlock();
-	rq->curr = rq->idle = idle;
-	idle->on_cpu = 1;
-	grq_unlock_irqrestore(&flags);
-
-	/* Set the preempt count _outside_ the spinlocks! */
-	init_idle_preempt_count(idle, cpu);
-
-	ftrace_graph_init_idle_task(idle, cpu);
-#if defined(CONFIG_SMP)
-	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
-}
-
-int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
-			      const struct cpumask __maybe_unused *trial)
-{
-	return 1;
-}
-
-int task_can_attach(struct task_struct *p,
-		    const struct cpumask *cs_cpus_allowed)
-{
-	int ret = 0;
-
-	/*
-	 * Kthreads which disallow setaffinity shouldn't be moved
-	 * to a new cpuset; we don't want to change their cpu
-	 * affinity and isolating such threads by their set of
-	 * allowed nodes is unnecessary.  Thus, cpusets are not
-	 * applicable for such threads.  This prevents checking for
-	 * success of set_cpus_allowed_ptr() on all attached tasks
-	 * before cpus_allowed may be changed.
-	 */
-	if (p->flags & PF_NO_SETAFFINITY)
-		ret = -EINVAL;
-
-	return ret;
-}
-
-void resched_cpu(int cpu)
-{
-	unsigned long flags;
-
-	grq_lock_irqsave(&flags);
-	resched_task(cpu_curr(cpu));
-	grq_unlock_irqrestore(&flags);
-}
-
-#ifdef CONFIG_SMP
-#ifdef CONFIG_NO_HZ_COMMON
-void nohz_balance_enter_idle(int cpu)
-{
-}
-
-void select_nohz_load_balancer(int stop_tick)
-{
-}
-
-void set_cpu_sd_state_idle(void) {}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu:	The cpu whose lowest level of sched domain is to
- *		be returned.
- * @flag:	The flag to check for the lowest sched_domain
- *		for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
-	struct sched_domain *sd;
-
-	for_each_domain(cpu, sd)
-		if (sd && (sd->flags & flag))
-			break;
-
-	return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu:	The cpu whose domains we're iterating over.
- * @sd:		variable holding the value of the power_savings_sd
- *		for cpu.
- * @flag:	The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
-	for (sd = lowest_flag_domain(cpu, flag); \
-		(sd && (sd->flags & flag)); sd = sd->parent)
-
-#endif /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-
-/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu.  This is good for power-savings.
- *
- * We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be uptodate wrt jiffies etc).
- */
-int get_nohz_timer_target(int pinned)
-{
-	int cpu = smp_processor_id();
-	int i;
-	struct sched_domain *sd;
-
-	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
-		return cpu;
-
-	rcu_read_lock();
-	for_each_domain(cpu, sd) {
-		for_each_cpu(i, sched_domain_span(sd)) {
-			if (!idle_cpu(i)) {
-				cpu = i;
-				goto unlock;
-			}
-		}
-	}
-unlock:
-	rcu_read_unlock();
-	return cpu;
-}
-
-/*
- * When add_timer_on() enqueues a timer into the timer wheel of an
- * idle CPU then this timer might expire before the next timer event
- * which is scheduled to wake up that CPU. In case of a completely
- * idle system the next event might even be infinite time into the
- * future. wake_up_idle_cpu() ensures that the CPU is woken up and
- * leaves the inner idle loop so the newly added timer is taken into
- * account when the CPU goes back to idle and evaluates the timer
- * wheel for the next timer event.
- */
-void wake_up_idle_cpu(int cpu)
-{
-	if (cpu == smp_processor_id())
-		return;
-
-	set_tsk_need_resched(cpu_rq(cpu)->idle);
-	smp_send_reschedule(cpu);
-}
-
-void wake_up_nohz_cpu(int cpu)
-{
-	wake_up_idle_cpu(cpu);
-}
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-	bool running_wrong = false;
-	bool queued = false;
-	unsigned long flags;
-	struct rq *rq;
-	int ret = 0;
-
-	rq = task_grq_lock(p, &flags);
-
-	if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
-		goto out;
-
-	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	queued = task_queued(p);
-
-	do_set_cpus_allowed(p, new_mask);
-
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
-
-	if (task_running(p)) {
-		/* Task is running on the wrong cpu now, reschedule it. */
-		if (rq == this_rq()) {
-			set_tsk_need_resched(p);
-			running_wrong = true;
-		} else
-			resched_task(p);
-	} else
-		set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));
-
-out:
-	if (queued)
-		try_preempt(p, rq);
-	task_grq_unlock(&flags);
-
-	if (running_wrong)
-		preempt_schedule_common();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-#ifdef CONFIG_HOTPLUG_CPU
-extern struct task_struct *cpu_stopper_task;
-/* Run through task list and find tasks affined to the dead cpu, then remove
- * that cpu from the list, enable cpu0 and set the zerobound flag. */
-static void bind_zero(int src_cpu)
-{
-	struct task_struct *p, *t, *stopper;
-	int bound = 0;
-
-	if (src_cpu == 0)
-		return;
-
-	stopper = per_cpu(cpu_stopper_task, src_cpu);
-	do_each_thread(t, p) {
-		if (p != stopper && cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
-			cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p));
-			cpumask_set_cpu(0, tsk_cpus_allowed(p));
-			p->zerobound = true;
-			bound++;
-		}
-		clear_sticky(p);
-	} while_each_thread(t, p);
-
-	if (bound) {
-		printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n",
-		       bound, src_cpu);
-	}
-}
-
-/* Find processes with the zerobound flag and reenable their affinity for the
- * CPU coming alive. */
-static void unbind_zero(int src_cpu)
-{
-	int unbound = 0, zerobound = 0;
-	struct task_struct *p, *t;
-
-	if (src_cpu == 0)
-		return;
-
-	do_each_thread(t, p) {
-		if (!p->mm)
-			p->zerobound = false;
-		if (p->zerobound) {
-			unbound++;
-			cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p));
-			/* Once every CPU affinity has been re-enabled, remove
-			 * the zerobound flag */
-			if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) {
-				p->zerobound = false;
-				zerobound++;
-			}
-		}
-	} while_each_thread(t, p);
-
-	if (unbound) {
-		printk(KERN_INFO "Added affinity for %d processes to cpu %d\n",
-		       unbound, src_cpu);
-	}
-	if (zerobound) {
-		printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n",
-		       zerobound);
-	}
-}
-
-/*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
- */
-void idle_task_exit(void)
-{
-	struct mm_struct *mm = current->active_mm;
-
-	BUG_ON(cpu_online(smp_processor_id()));
-
-	if (mm != &init_mm) {
-		switch_mm(mm, &init_mm, current);
-		finish_arch_post_lock_switch();
-	}
-	mmdrop(mm);
-}
-#else /* CONFIG_HOTPLUG_CPU */
-static void unbind_zero(int src_cpu) {}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-void sched_set_stop_task(int cpu, struct task_struct *stop)
-{
-	struct sched_param stop_param = { .sched_priority = STOP_PRIO };
-	struct sched_param start_param = { .sched_priority = 0 };
-	struct task_struct *old_stop = cpu_rq(cpu)->stop;
-
-	if (stop) {
-		/*
-		 * Make it appear like a SCHED_FIFO task, its something
-		 * userspace knows about and won't get confused about.
-		 *
-		 * Also, it will make PI more or less work without too
-		 * much confusion -- but then, stop work should not
-		 * rely on PI working anyway.
-		 */
-		sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
-	}
-
-	cpu_rq(cpu)->stop = stop;
-
-	if (old_stop) {
-		/*
-		 * Reset it back to a normal scheduling policy so that
-		 * it can die in pieces.
-		 */
-		sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
-	}
-}
-
-
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
-	{
-		.procname	= "sched_domain",
-		.mode		= 0555,
-	},
-	{}
-};
-
-static struct ctl_table sd_ctl_root[] = {
-	{
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= sd_ctl_dir,
-	},
-	{}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
-	struct ctl_table *entry =
-		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
-	return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
-	struct ctl_table *entry;
-
-	/*
-	 * In the intermediate directories, both the child directory and
-	 * procname are dynamically allocated and could fail but the mode
-	 * will always be set. In the lowest directory the names are
-	 * static strings and all have proc handlers.
-	 */
-	for (entry = *tablep; entry->mode; entry++) {
-		if (entry->child)
-			sd_free_ctl_entry(&entry->child);
-		if (entry->proc_handler == NULL)
-			kfree(entry->procname);
-	}
-
-	kfree(*tablep);
-	*tablep = NULL;
-}
-
-static void
-set_table_entry(struct ctl_table *entry,
-		const char *procname, void *data, int maxlen,
-		mode_t mode, proc_handler *proc_handler)
-{
-	entry->procname = procname;
-	entry->data = data;
-	entry->maxlen = maxlen;
-	entry->mode = mode;
-	entry->proc_handler = proc_handler;
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
-	struct ctl_table *table = sd_alloc_ctl_entry(14);
-
-	if (table == NULL)
-		return NULL;
-
-	set_table_entry(&table[0], "min_interval", &sd->min_interval,
-		sizeof(long), 0644, proc_doulongvec_minmax);
-	set_table_entry(&table[1], "max_interval", &sd->max_interval,
-		sizeof(long), 0644, proc_doulongvec_minmax);
-	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[9], "cache_nice_tries",
-		&sd->cache_nice_tries,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[10], "flags", &sd->flags,
-		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[11], "max_newidle_lb_cost",
-		&sd->max_newidle_lb_cost,
-		sizeof(long), 0644, proc_doulongvec_minmax);
-	set_table_entry(&table[12], "name", sd->name,
-		CORENAME_MAX_SIZE, 0444, proc_dostring);
-	/* &table[13] is terminator */
-
-	return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
-	struct ctl_table *entry, *table;
-	struct sched_domain *sd;
-	int domain_num = 0, i;
-	char buf[32];
-
-	for_each_domain(cpu, sd)
-		domain_num++;
-	entry = table = sd_alloc_ctl_entry(domain_num + 1);
-	if (table == NULL)
-		return NULL;
-
-	i = 0;
-	for_each_domain(cpu, sd) {
-		snprintf(buf, 32, "domain%d", i);
-		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0555;
-		entry->child = sd_alloc_ctl_domain_table(sd);
-		entry++;
-		i++;
-	}
-	return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
-	int i, cpu_num = num_possible_cpus();
-	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
-	char buf[32];
-
-	WARN_ON(sd_ctl_dir[0].child);
-	sd_ctl_dir[0].child = entry;
-
-	if (entry == NULL)
-		return;
-
-	for_each_possible_cpu(i) {
-		snprintf(buf, 32, "cpu%d", i);
-		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0555;
-		entry->child = sd_alloc_ctl_cpu_table(i);
-		entry++;
-	}
-
-	WARN_ON(sd_sysctl_header);
-	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
-	if (sd_sysctl_header)
-		unregister_sysctl_table(sd_sysctl_header);
-	sd_sysctl_header = NULL;
-	if (sd_ctl_dir[0].child)
-		sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif
-
-static void set_rq_online(struct rq *rq)
-{
-	if (!rq->online) {
-		cpumask_set_cpu(cpu_of(rq), rq->rd->online);
-		rq->online = true;
-	}
-}
-
-static void set_rq_offline(struct rq *rq)
-{
-	if (rq->online) {
-		cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
-		rq->online = false;
-	}
-}
-
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- */
-static int
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-	int cpu = (long)hcpu;
-	unsigned long flags;
-	struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-	struct task_struct *idle = rq->idle;
-#endif
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_STARTING:
-		return NOTIFY_OK;
-	case CPU_UP_PREPARE:
-		break;
-
-	case CPU_ONLINE:
-		/* Update our root-domain */
-		grq_lock_irqsave(&flags);
-		if (rq->rd) {
-			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
-			set_rq_online(rq);
-		}
-		unbind_zero(cpu);
-		grq.noc = num_online_cpus();
-		grq_unlock_irqrestore(&flags);
-		break;
-
-#ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DEAD:
-		grq_lock_irq();
-		set_rq_task(rq, idle);
-		update_clocks(rq);
-		grq_unlock_irq();
-		break;
-
-	case CPU_DYING:
-		/* Update our root-domain */
-		grq_lock_irqsave(&flags);
-		if (rq->rd) {
-			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-			set_rq_offline(rq);
-		}
-		bind_zero(cpu);
-		grq.noc = num_online_cpus();
-		grq_unlock_irqrestore(&flags);
-		break;
-#endif
-	}
-	return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_counter subsystem, though.
- */
-static struct notifier_block  migration_notifier = {
-	.notifier_call = migration_call,
-	.priority = CPU_PRI_MIGRATION,
-};
-
-static int sched_cpu_active(struct notifier_block *nfb,
-				      unsigned long action, void *hcpu)
-{
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_FAILED:
-		set_cpu_active((long)hcpu, true);
-		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static int sched_cpu_inactive(struct notifier_block *nfb,
-					unsigned long action, void *hcpu)
-{
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_PREPARE:
-		set_cpu_active((long)hcpu, false);
-		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-int __init migration_init(void)
-{
-	void *cpu = (void *)(long)smp_processor_id();
-	int err;
-
-	/* Initialise migration for the boot CPU */
-	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
-	BUG_ON(err == NOTIFY_BAD);
-	migration_call(&migration_notifier, CPU_ONLINE, cpu);
-	register_cpu_notifier(&migration_notifier);
-
-	/* Register cpu active notifiers */
-	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
-	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
-	return 0;
-}
-early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
-
-static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
-
-#ifdef CONFIG_SCHED_DEBUG
-
-static __read_mostly int sched_debug_enabled;
-
-static int __init sched_debug_setup(char *str)
-{
-	sched_debug_enabled = 1;
-
-	return 0;
-}
-early_param("sched_debug", sched_debug_setup);
-
-static inline bool sched_debug(void)
-{
-	return sched_debug_enabled;
-}
-
-static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-				  struct cpumask *groupmask)
-{
-	cpumask_clear(groupmask);
-
-	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
-
-	if (!(sd->flags & SD_LOAD_BALANCE)) {
-		printk("does not load-balance\n");
-		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-					" has parent");
-		return -1;
-	}
-
-	printk(KERN_CONT "span %*pbl level %s\n",
-	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
-
-	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-		printk(KERN_ERR "ERROR: domain->span does not contain "
-				"CPU%d\n", cpu);
-	}
-
-	printk(KERN_CONT "\n");
-
-	if (!cpumask_equal(sched_domain_span(sd), groupmask))
-		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
-
-	if (sd->parent &&
-	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
-		printk(KERN_ERR "ERROR: parent span is not a superset "
-			"of domain->span\n");
-	return 0;
-}
-
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
-{
-	int level = 0;
-
-	if (!sched_debug_enabled)
-		return;
-
-	if (!sd) {
-		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
-		return;
-	}
-
-	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
-
-	for (;;) {
-		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
-			break;
-		level++;
-		sd = sd->parent;
-		if (!sd)
-			break;
-	}
-}
-#else /* !CONFIG_SCHED_DEBUG */
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
-	return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
-
-static int sd_degenerate(struct sched_domain *sd)
-{
-	if (cpumask_weight(sched_domain_span(sd)) == 1)
-		return 1;
-
-	/* Following flags don't use groups */
-	if (sd->flags & (SD_WAKE_AFFINE))
-		return 0;
-
-	return 1;
-}
-
-static int
-sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
-{
-	unsigned long cflags = sd->flags, pflags = parent->flags;
-
-	if (sd_degenerate(parent))
-		return 1;
-
-	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
-		return 0;
-
-	if (~cflags & pflags)
-		return 0;
-
-	return 1;
-}
-
-static void free_rootdomain(struct rcu_head *rcu)
-{
-	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
-
-	cpupri_cleanup(&rd->cpupri);
-	free_cpumask_var(rd->rto_mask);
-	free_cpumask_var(rd->online);
-	free_cpumask_var(rd->span);
-	kfree(rd);
-}
-
-static void rq_attach_root(struct rq *rq, struct root_domain *rd)
-{
-	struct root_domain *old_rd = NULL;
-	unsigned long flags;
-
-	grq_lock_irqsave(&flags);
-
-	if (rq->rd) {
-		old_rd = rq->rd;
-
-		if (cpumask_test_cpu(rq->cpu, old_rd->online))
-			set_rq_offline(rq);
-
-		cpumask_clear_cpu(rq->cpu, old_rd->span);
-
-		/*
-		 * If we dont want to free the old_rd yet then
-		 * set old_rd to NULL to skip the freeing later
-		 * in this function:
-		 */
-		if (!atomic_dec_and_test(&old_rd->refcount))
-			old_rd = NULL;
-	}
-
-	atomic_inc(&rd->refcount);
-	rq->rd = rd;
-
-	cpumask_set_cpu(rq->cpu, rd->span);
-	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
-		set_rq_online(rq);
-
-	grq_unlock_irqrestore(&flags);
-
-	if (old_rd)
-		call_rcu_sched(&old_rd->rcu, free_rootdomain);
-}
-
-static int init_rootdomain(struct root_domain *rd)
-{
-	memset(rd, 0, sizeof(*rd));
-
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
-		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
-		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
-		goto free_online;
-
-	if (cpupri_init(&rd->cpupri) != 0)
-		goto free_rto_mask;
-	return 0;
-
-free_rto_mask:
-	free_cpumask_var(rd->rto_mask);
-free_online:
-	free_cpumask_var(rd->online);
-free_span:
-	free_cpumask_var(rd->span);
-out:
-	return -ENOMEM;
-}
-
-static void init_defrootdomain(void)
-{
-	init_rootdomain(&def_root_domain);
-
-	atomic_set(&def_root_domain.refcount, 1);
-}
-
-static struct root_domain *alloc_rootdomain(void)
-{
-	struct root_domain *rd;
-
-	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
-	if (!rd)
-		return NULL;
-
-	if (init_rootdomain(rd) != 0) {
-		kfree(rd);
-		return NULL;
-	}
-
-	return rd;
-}
-
-static void free_sched_domain(struct rcu_head *rcu)
-{
-	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
-	kfree(sd);
-}
-
-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
-{
-	call_rcu(&sd->rcu, free_sched_domain);
-}
-
-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
-{
-	for (; sd; sd = sd->parent)
-		destroy_sched_domain(sd, cpu);
-}
-
-/*
- * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
- * hold the hotplug lock.
- */
-static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	struct sched_domain *tmp;
-
-	/* Remove the sched domains which do not contribute to scheduling. */
-	for (tmp = sd; tmp; ) {
-		struct sched_domain *parent = tmp->parent;
-		if (!parent)
-			break;
-
-		if (sd_parent_degenerate(tmp, parent)) {
-			tmp->parent = parent->parent;
-			if (parent->parent)
-				parent->parent->child = tmp;
-			/*
-			 * Transfer SD_PREFER_SIBLING down in case of a
-			 * degenerate parent; the spans match for this
-			 * so the property transfers.
-			 */
-			if (parent->flags & SD_PREFER_SIBLING)
-				tmp->flags |= SD_PREFER_SIBLING;
-			destroy_sched_domain(parent, cpu);
-		} else
-			tmp = tmp->parent;
-	}
-
-	if (sd && sd_degenerate(sd)) {
-		tmp = sd;
-		sd = sd->parent;
-		destroy_sched_domain(tmp, cpu);
-		if (sd)
-			sd->child = NULL;
-	}
-
-	sched_domain_debug(sd, cpu);
-
-	rq_attach_root(rq, rd);
-	tmp = rq->sd;
-	rcu_assign_pointer(rq->sd, sd);
-	destroy_sched_domains(tmp, cpu);
-}
-
-/* cpus with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
-/* Setup the mask of cpus configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
-	cpulist_parse(str, cpu_isolated_map);
-	return 1;
-}
-
-__setup("isolcpus=", isolated_cpu_setup);
-
-struct s_data {
-	struct sched_domain ** __percpu sd;
-	struct root_domain	*rd;
-};
-
-enum s_alloc {
-	sa_rootdomain,
-	sa_sd,
-	sa_sd_storage,
-	sa_none,
-};
-
-/*
- * Initializers for schedule domains
- * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
- */
-
-static int default_relax_domain_level = -1;
-int sched_domain_level_max;
-
-static int __init setup_relax_domain_level(char *str)
-{
-	if (kstrtoint(str, 0, &default_relax_domain_level))
-		pr_warn("Unable to set relax_domain_level\n");
-
-	return 1;
-}
-__setup("relax_domain_level=", setup_relax_domain_level);
-
-static void set_domain_attribute(struct sched_domain *sd,
-				 struct sched_domain_attr *attr)
-{
-	int request;
-
-	if (!attr || attr->relax_domain_level < 0) {
-		if (default_relax_domain_level < 0)
-			return;
-		else
-			request = default_relax_domain_level;
-	} else
-		request = attr->relax_domain_level;
-	if (request < sd->level) {
-		/* turn off idle balance on this domain */
-		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-	} else {
-		/* turn on idle balance on this domain */
-		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-	}
-}
-
-static void __sdt_free(const struct cpumask *cpu_map);
-static int __sdt_alloc(const struct cpumask *cpu_map);
-
-static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
-				 const struct cpumask *cpu_map)
-{
-	switch (what) {
-	case sa_rootdomain:
-		if (!atomic_read(&d->rd->refcount))
-			free_rootdomain(&d->rd->rcu); /* fall through */
-	case sa_sd:
-		free_percpu(d->sd); /* fall through */
-	case sa_sd_storage:
-		__sdt_free(cpu_map); /* fall through */
-	case sa_none:
-		break;
-	}
-}
-
-static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
-						   const struct cpumask *cpu_map)
-{
-	memset(d, 0, sizeof(*d));
-
-	if (__sdt_alloc(cpu_map))
-		return sa_sd_storage;
-	d->sd = alloc_percpu(struct sched_domain *);
-	if (!d->sd)
-		return sa_sd_storage;
-	d->rd = alloc_rootdomain();
-	if (!d->rd)
-		return sa_sd;
-	return sa_rootdomain;
-}
-
-/*
- * NULL the sd_data elements we've used to build the sched_domain
- * structure so that the subsequent __free_domain_allocs()
- * will not free the data we're using.
- */
-static void claim_allocations(int cpu, struct sched_domain *sd)
-{
-	struct sd_data *sdd = sd->private;
-
-	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
-	*per_cpu_ptr(sdd->sd, cpu) = NULL;
-}
-
-#ifdef CONFIG_NUMA
-static int sched_domains_numa_levels;
-static int *sched_domains_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
-static int sched_domains_curr_level;
-#endif
-
-/*
- * SD_flags allowed in topology descriptions.
- *
- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
- *
- * Odd one out:
- * SD_ASYM_PACKING        - describes SMT quirks
- */
-#define TOPOLOGY_SD_FLAGS		\
-	(SD_SHARE_CPUCAPACITY |		\
-	 SD_SHARE_PKG_RESOURCES |	\
-	 SD_NUMA |			\
-	 SD_ASYM_PACKING |		\
-	 SD_SHARE_POWERDOMAIN)
-
-static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
-{
-	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-	int sd_weight, sd_flags = 0;
-
-#ifdef CONFIG_NUMA
-	/*
-	 * Ugly hack to pass state to sd_numa_mask()...
-	 */
-	sched_domains_curr_level = tl->numa_level;
-#endif
-
-	sd_weight = cpumask_weight(tl->mask(cpu));
-
-	if (tl->sd_flags)
-		sd_flags = (*tl->sd_flags)();
-	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
-			"wrong sd_flags in topology description\n"))
-		sd_flags &= ~TOPOLOGY_SD_FLAGS;
-
-	*sd = (struct sched_domain){
-		.min_interval		= sd_weight,
-		.max_interval		= 2*sd_weight,
-		.busy_factor		= 32,
-		.imbalance_pct		= 125,
-
-		.cache_nice_tries	= 0,
-		.busy_idx		= 0,
-		.idle_idx		= 0,
-		.newidle_idx		= 0,
-		.wake_idx		= 0,
-		.forkexec_idx		= 0,
-
-		.flags			= 1*SD_LOAD_BALANCE
-					| 1*SD_BALANCE_NEWIDLE
-					| 1*SD_BALANCE_EXEC
-					| 1*SD_BALANCE_FORK
-					| 0*SD_BALANCE_WAKE
-					| 1*SD_WAKE_AFFINE
-					| 0*SD_SHARE_CPUCAPACITY
-					| 0*SD_SHARE_PKG_RESOURCES
-					| 0*SD_SERIALIZE
-					| 0*SD_PREFER_SIBLING
-					| 0*SD_NUMA
-					| sd_flags
-					,
-
-		.last_balance		= jiffies,
-		.balance_interval	= sd_weight,
-		.smt_gain		= 0,
-		.max_newidle_lb_cost	= 0,
-		.next_decay_max_lb_cost	= jiffies,
-#ifdef CONFIG_SCHED_DEBUG
-		.name			= tl->name,
-#endif
-	};
-
-	/*
-	 * Convert topological properties into behaviour.
-	 */
-
-	if (sd->flags & SD_SHARE_CPUCAPACITY) {
-		sd->flags |= SD_PREFER_SIBLING;
-		sd->imbalance_pct = 110;
-		sd->smt_gain = 1178; /* ~15% */
-
-	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-		sd->imbalance_pct = 117;
-		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
-
-#ifdef CONFIG_NUMA
-	} else if (sd->flags & SD_NUMA) {
-		sd->cache_nice_tries = 2;
-		sd->busy_idx = 3;
-		sd->idle_idx = 2;
-
-		sd->flags |= SD_SERIALIZE;
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
-			sd->flags &= ~(SD_BALANCE_EXEC |
-				       SD_BALANCE_FORK |
-				       SD_WAKE_AFFINE);
-		}
-
-#endif
-	} else {
-		sd->flags |= SD_PREFER_SIBLING;
-		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
-		sd->idle_idx = 1;
-	}
-
-	sd->private = &tl->data;
-
-	return sd;
-}
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-#endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
-
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl)			\
-	for (tl = sched_domain_topology; tl->mask; tl++)
-
-void set_sched_topology(struct sched_domain_topology_level *tl)
-{
-	sched_domain_topology = tl;
-}
-
-#ifdef CONFIG_NUMA
-
-static const struct cpumask *sd_numa_mask(int cpu)
-{
-	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
-}
-
-static void sched_numa_warn(const char *str)
-{
-	static int done = false;
-	int i,j;
-
-	if (done)
-		return;
-
-	done = true;
-
-	printk(KERN_WARNING "ERROR: %s\n\n", str);
-
-	for (i = 0; i < nr_node_ids; i++) {
-		printk(KERN_WARNING "  ");
-		for (j = 0; j < nr_node_ids; j++)
-			printk(KERN_CONT "%02d ", node_distance(i,j));
-		printk(KERN_CONT "\n");
-	}
-	printk(KERN_WARNING "\n");
-}
-
-static bool find_numa_distance(int distance)
-{
-	int i;
-
-	if (distance == node_distance(0, 0))
-		return true;
-
-	for (i = 0; i < sched_domains_numa_levels; i++) {
-		if (sched_domains_numa_distance[i] == distance)
-			return true;
-	}
-
-	return false;
-}
-
-static void sched_init_numa(void)
-{
-	int next_distance, curr_distance = node_distance(0, 0);
-	struct sched_domain_topology_level *tl;
-	int level = 0;
-	int i, j, k;
-
-	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
-	if (!sched_domains_numa_distance)
-		return;
-
-	/*
-	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
-	 * unique distances in the node_distance() table.
-	 *
-	 * Assumes node_distance(0,j) includes all distances in
-	 * node_distance(i,j) in order to avoid cubic time.
-	 */
-	next_distance = curr_distance;
-	for (i = 0; i < nr_node_ids; i++) {
-		for (j = 0; j < nr_node_ids; j++) {
-			for (k = 0; k < nr_node_ids; k++) {
-				int distance = node_distance(i, k);
-
-				if (distance > curr_distance &&
-				    (distance < next_distance ||
-				     next_distance == curr_distance))
-					next_distance = distance;
-
-				/*
-				 * While not a strong assumption it would be nice to know
-				 * about cases where if node A is connected to B, B is not
-				 * equally connected to A.
-				 */
-				if (sched_debug() && node_distance(k, i) != distance)
-					sched_numa_warn("Node-distance not symmetric");
-
-				if (sched_debug() && i && !find_numa_distance(distance))
-					sched_numa_warn("Node-0 not representative");
-			}
-			if (next_distance != curr_distance) {
-				sched_domains_numa_distance[level++] = next_distance;
-				sched_domains_numa_levels = level;
-				curr_distance = next_distance;
-			} else break;
-		}
-
-		/*
-		 * In case of sched_debug() we verify the above assumption.
-		 */
-		if (!sched_debug())
-			break;
-	}
-	/*
-	 * 'level' contains the number of unique distances, excluding the
-	 * identity distance node_distance(i,i).
-	 *
-	 * The sched_domains_numa_distance[] array includes the actual distance
-	 * numbers.
-	 */
-
-	/*
-	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
-	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
-	 * the array will contain less then 'level' members. This could be
-	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
-	 * in other functions.
-	 *
-	 * We reset it to 'level' at the end of this function.
-	 */
-	sched_domains_numa_levels = 0;
-
-	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
-	if (!sched_domains_numa_masks)
-		return;
-
-	/*
-	 * Now for each level, construct a mask per node which contains all
-	 * cpus of nodes that are that many hops away from us.
-	 */
-	for (i = 0; i < level; i++) {
-		sched_domains_numa_masks[i] =
-			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
-		if (!sched_domains_numa_masks[i])
-			return;
-
-		for (j = 0; j < nr_node_ids; j++) {
-			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
-			if (!mask)
-				return;
-
-			sched_domains_numa_masks[i][j] = mask;
-
-			for (k = 0; k < nr_node_ids; k++) {
-				if (node_distance(j, k) > sched_domains_numa_distance[i])
-					continue;
-
-				cpumask_or(mask, mask, cpumask_of_node(k));
-			}
-		}
-	}
-
-	/* Compute default topology size */
-	for (i = 0; sched_domain_topology[i].mask; i++);
-
-	tl = kzalloc((i + level + 1) *
-			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
-	if (!tl)
-		return;
-
-	/*
-	 * Copy the default topology bits..
-	 */
-	for (i = 0; sched_domain_topology[i].mask; i++)
-		tl[i] = sched_domain_topology[i];
-
-	/*
-	 * .. and append 'j' levels of NUMA goodness.
-	 */
-	for (j = 0; j < level; i++, j++) {
-		tl[i] = (struct sched_domain_topology_level){
-			.mask = sd_numa_mask,
-			.sd_flags = cpu_numa_flags,
-			.flags = SDTL_OVERLAP,
-			.numa_level = j,
-			SD_INIT_NAME(NUMA)
-		};
-	}
-
-	sched_domain_topology = tl;
-
-	sched_domains_numa_levels = level;
-}
-
-static void sched_domains_numa_masks_set(int cpu)
-{
-	int i, j;
-	int node = cpu_to_node(cpu);
-
-	for (i = 0; i < sched_domains_numa_levels; i++) {
-		for (j = 0; j < nr_node_ids; j++) {
-			if (node_distance(j, node) <= sched_domains_numa_distance[i])
-				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
-		}
-	}
-}
-
-static void sched_domains_numa_masks_clear(int cpu)
-{
-	int i, j;
-	for (i = 0; i < sched_domains_numa_levels; i++) {
-		for (j = 0; j < nr_node_ids; j++)
-			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
-	}
-}
-
-/*
- * Update sched_domains_numa_masks[level][node] array when new cpus
- * are onlined.
- */
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
-					   unsigned long action,
-					   void *hcpu)
-{
-	int cpu = (long)hcpu;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_ONLINE:
-		sched_domains_numa_masks_set(cpu);
-		break;
-
-	case CPU_DEAD:
-		sched_domains_numa_masks_clear(cpu);
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
-	return NOTIFY_OK;
-}
-#else
-static inline void sched_init_numa(void)
-{
-}
-
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
-					   unsigned long action,
-					   void *hcpu)
-{
-	return 0;
-}
-#endif /* CONFIG_NUMA */
-
-static int __sdt_alloc(const struct cpumask *cpu_map)
-{
-	struct sched_domain_topology_level *tl;
-	int j;
-
-	for_each_sd_topology(tl) {
-		struct sd_data *sdd = &tl->data;
-
-		sdd->sd = alloc_percpu(struct sched_domain *);
-		if (!sdd->sd)
-			return -ENOMEM;
-
-		for_each_cpu(j, cpu_map) {
-			struct sched_domain *sd;
-
-			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
-					GFP_KERNEL, cpu_to_node(j));
-			if (!sd)
-				return -ENOMEM;
-
-			*per_cpu_ptr(sdd->sd, j) = sd;
-		}
-	}
-
-	return 0;
-}
-
-static void __sdt_free(const struct cpumask *cpu_map)
-{
-	struct sched_domain_topology_level *tl;
-	int j;
-
-	for_each_sd_topology(tl) {
-		struct sd_data *sdd = &tl->data;
-
-		for_each_cpu(j, cpu_map) {
-			struct sched_domain *sd;
-
-			if (sdd->sd) {
-				sd = *per_cpu_ptr(sdd->sd, j);
-				kfree(*per_cpu_ptr(sdd->sd, j));
-			}
-		}
-		free_percpu(sdd->sd);
-		sdd->sd = NULL;
-	}
-}
-
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
-		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
-		struct sched_domain *child, int cpu)
-{
-	struct sched_domain *sd = sd_init(tl, cpu);
-	if (!sd)
-		return child;
-
-	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
-	if (child) {
-		sd->level = child->level + 1;
-		sched_domain_level_max = max(sched_domain_level_max, sd->level);
-		child->parent = sd;
-		sd->child = child;
-
-		if (!cpumask_subset(sched_domain_span(child),
-				    sched_domain_span(sd))) {
-			pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
-			pr_err("     the %s domain not a subset of the %s domain\n",
-					child->name, sd->name);
-#endif
-			/* Fixup, ensure @sd has at least @child cpus. */
-			cpumask_or(sched_domain_span(sd),
-				   sched_domain_span(sd),
-				   sched_domain_span(child));
-		}
-
-	}
-	set_domain_attribute(sd, attr);
-
-	return sd;
-}
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int build_sched_domains(const struct cpumask *cpu_map,
-			       struct sched_domain_attr *attr)
-{
-	enum s_alloc alloc_state;
-	struct sched_domain *sd;
-	struct s_data d;
-	int i, ret = -ENOMEM;
-
-	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
-	if (alloc_state != sa_rootdomain)
-		goto error;
-
-	/* Set up domains for cpus specified by the cpu_map. */
-	for_each_cpu(i, cpu_map) {
-		struct sched_domain_topology_level *tl;
-
-		sd = NULL;
-		for_each_sd_topology(tl) {
-			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
-			if (tl == sched_domain_topology)
-				*per_cpu_ptr(d.sd, i) = sd;
-			if (tl->flags & SDTL_OVERLAP)
-				sd->flags |= SD_OVERLAP;
-			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
-				break;
-		}
-	}
-
-	/* Calculate CPU capacity for physical packages and nodes */
-	for (i = nr_cpumask_bits-1; i >= 0; i--) {
-		if (!cpumask_test_cpu(i, cpu_map))
-			continue;
-
-		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-			claim_allocations(i, sd);
-		}
-	}
-
-	/* Attach the domains */
-	rcu_read_lock();
-	for_each_cpu(i, cpu_map) {
-		sd = *per_cpu_ptr(d.sd, i);
-		cpu_attach_domain(sd, d.rd, i);
-	}
-	rcu_read_unlock();
-
-	ret = 0;
-error:
-	__free_domain_allocs(&d, alloc_state, cpu_map);
-	return ret;
-}
-
-static cpumask_var_t *doms_cur;	/* current sched domains */
-static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;
-				/* attribues of custom domains in 'doms_cur' */
-
-/*
- * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask) fails, then fallback to a single sched domain,
- * as determined by the single cpumask fallback_doms.
- */
-static cpumask_var_t fallback_doms;
-
-/*
- * arch_update_cpu_topology lets virtualized architectures update the
- * cpu core maps. It is supposed to return 1 if the topology changed
- * or 0 if it stayed the same.
- */
-int __weak arch_update_cpu_topology(void)
-{
-	return 0;
-}
-
-cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
-{
-	int i;
-	cpumask_var_t *doms;
-
-	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
-	if (!doms)
-		return NULL;
-	for (i = 0; i < ndoms; i++) {
-		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
-			free_sched_domains(doms, i);
-			return NULL;
-		}
-	}
-	return doms;
-}
-
-void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
-{
-	unsigned int i;
-	for (i = 0; i < ndoms; i++)
-		free_cpumask_var(doms[i]);
-	kfree(doms);
-}
-
-/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
-static int init_sched_domains(const struct cpumask *cpu_map)
-{
-	int err;
-
-	arch_update_cpu_topology();
-	ndoms_cur = 1;
-	doms_cur = alloc_sched_domains(ndoms_cur);
-	if (!doms_cur)
-		doms_cur = &fallback_doms;
-	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
-	err = build_sched_domains(doms_cur[0], NULL);
-	register_sched_domain_sysctl();
-
-	return err;
-}
-
-/*
- * Detach sched domains from a group of cpus specified in cpu_map
- * These cpus will now be attached to the NULL domain
- */
-static void detach_destroy_domains(const struct cpumask *cpu_map)
-{
-	int i;
-
-	rcu_read_lock();
-	for_each_cpu(i, cpu_map)
-		cpu_attach_domain(NULL, &def_root_domain, i);
-	rcu_read_unlock();
-}
-
-/* handle null as "default" */
-static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
-			struct sched_domain_attr *new, int idx_new)
-{
-	struct sched_domain_attr tmp;
-
-	/* fast path */
-	if (!new && !cur)
-		return 1;
-
-	tmp = SD_ATTR_INIT;
-	return !memcmp(cur ? (cur + idx_cur) : &tmp,
-			new ? (new + idx_new) : &tmp,
-			sizeof(struct sched_domain_attr));
-}
-
-/*
- * Partition sched domains as specified by the 'ndoms_new'
- * cpumasks in the array doms_new[] of cpumasks. This compares
- * doms_new[] to the current sched domain partitioning, doms_cur[].
- * It destroys each deleted domain and builds each new domain.
- *
- * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
- * The masks don't intersect (don't overlap.) We should setup one
- * sched domain for each mask. CPUs not in any of the cpumasks will
- * not be load balanced. If the same cpumask appears both in the
- * current 'doms_cur' domains and in the new 'doms_new', we can leave
- * it as it is.
- *
- * The passed in 'doms_new' should be allocated using
- * alloc_sched_domains.  This routine takes ownership of it and will
- * free_sched_domains it when done with it. If the caller failed the
- * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
- *
- * If doms_new == NULL it will be replaced with cpu_online_mask.
- * ndoms_new == 0 is a special case for destroying existing domains,
- * and it will not create the default domain.
- *
- * Call with hotplug lock held
- */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			     struct sched_domain_attr *dattr_new)
-{
-	int i, j, n;
-	int new_topology;
-
-	mutex_lock(&sched_domains_mutex);
-
-	/* always unregister in case we don't destroy any domains */
-	unregister_sched_domain_sysctl();
-
-	/* Let architecture update cpu core mappings. */
-	new_topology = arch_update_cpu_topology();
-
-	n = doms_new ? ndoms_new : 0;
-
-	/* Destroy deleted domains */
-	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(doms_cur[i], doms_new[j])
-			    && dattrs_equal(dattr_cur, i, dattr_new, j))
-				goto match1;
-		}
-		/* no match - a current sched domain not in new doms_new[] */
-		detach_destroy_domains(doms_cur[i]);
-match1:
-		;
-	}
-
-	n = ndoms_cur;
-	if (doms_new == NULL) {
-		n = 0;
-		doms_new = &fallback_doms;
-		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
-		WARN_ON_ONCE(dattr_new);
-	}
-
-	/* Build new domains */
-	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(doms_new[i], doms_cur[j])
-			    && dattrs_equal(dattr_new, i, dattr_cur, j))
-				goto match2;
-		}
-		/* no match - add a new doms_new */
-		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
-match2:
-		;
-	}
-
-	/* Remember the new sched domains */
-	if (doms_cur != &fallback_doms)
-		free_sched_domains(doms_cur, ndoms_cur);
-	kfree(dattr_cur);	/* kfree(NULL) is safe */
-	doms_cur = doms_new;
-	dattr_cur = dattr_new;
-	ndoms_cur = ndoms_new;
-
-	register_sched_domain_sysctl();
-
-	mutex_unlock(&sched_domains_mutex);
-}
-
-static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
-
-/*
- * Update cpusets according to cpu_active mask.  If cpusets are
- * disabled, cpuset_update_active_cpus() becomes a simple wrapper
- * around partition_sched_domains().
- *
- * If we come here as part of a suspend/resume, don't touch cpusets because we
- * want to restore it back to its original state upon resume anyway.
- */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
-			     void *hcpu)
-{
-	switch (action) {
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_FAILED_FROZEN:
-
-		/*
-		 * num_cpus_frozen tracks how many CPUs are involved in suspend
-		 * resume sequence. As long as this is not the last online
-		 * operation in the resume sequence, just build a single sched
-		 * domain, ignoring cpusets.
-		 */
-		num_cpus_frozen--;
-		if (likely(num_cpus_frozen)) {
-			partition_sched_domains(1, NULL, NULL);
-			break;
-		}
-
-		/*
-		 * This is the last CPU online operation. So fall through and
-		 * restore the original sched domains by considering the
-		 * cpuset configurations.
-		 */
-
-	case CPU_ONLINE:
-		cpuset_update_active_cpus(true);
-		break;
-	default:
-		return NOTIFY_DONE;
-	}
-	return NOTIFY_OK;
-}
-
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
-			       void *hcpu)
-{
-	switch (action) {
-	case CPU_DOWN_PREPARE:
-		cpuset_update_active_cpus(false);
-		break;
-	case CPU_DOWN_PREPARE_FROZEN:
-		num_cpus_frozen++;
-		partition_sched_domains(1, NULL, NULL);
-		break;
-	default:
-		return NOTIFY_DONE;
-	}
-	return NOTIFY_OK;
-}
-
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
-/*
- * Cheaper version of the below functions in case support for SMT and MC is
- * compiled in but CPUs have no siblings.
- */
-static bool sole_cpu_idle(int cpu)
-{
-	return rq_idle(cpu_rq(cpu));
-}
-#endif
-#ifdef CONFIG_SCHED_SMT
-static const cpumask_t *thread_cpumask(int cpu)
-{
-	return topology_thread_cpumask(cpu);
-}
-/* All this CPU's SMT siblings are idle */
-static bool siblings_cpu_idle(int cpu)
-{
-	return cpumask_subset(thread_cpumask(cpu), &grq.cpu_idle_map);
-}
-#endif
-#ifdef CONFIG_SCHED_MC
-static const cpumask_t *core_cpumask(int cpu)
-{
-	return topology_core_cpumask(cpu);
-}
-/* All this CPU's shared cache siblings are idle */
-static bool cache_cpu_idle(int cpu)
-{
-	return cpumask_subset(core_cpumask(cpu), &grq.cpu_idle_map);
-}
-#endif
-
-enum sched_domain_level {
-	SD_LV_NONE = 0,
-	SD_LV_SIBLING,
-	SD_LV_MC,
-	SD_LV_BOOK,
-	SD_LV_CPU,
-	SD_LV_NODE,
-	SD_LV_ALLNODES,
-	SD_LV_MAX
-};
-
-void __init sched_init_smp(void)
-{
-	struct sched_domain *sd;
-	int cpu, other_cpu;
-
-	cpumask_var_t non_isolated_cpus;
-
-	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-
-	sched_init_numa();
-
-	/*
-	 * There's no userspace yet to cause hotplug operations; hence all the
-	 * cpu masks are stable and all blatant races in the below code cannot
-	 * happen.
-	 */
-	mutex_lock(&sched_domains_mutex);
-	init_sched_domains(cpu_active_mask);
-	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-	if (cpumask_empty(non_isolated_cpus))
-		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
-	mutex_unlock(&sched_domains_mutex);
-
-	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
-	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
-	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
-	/* Move init over to a non-isolated CPU */
-	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
-		BUG();
-	free_cpumask_var(non_isolated_cpus);
-
-	grq_lock_irq();
-	/*
-	 * Set up the relative cache distance of each online cpu from each
-	 * other in a simple array for quick lookup. Locality is determined
-	 * by the closest sched_domain that CPUs are separated by. CPUs with
-	 * shared cache in SMT and MC are treated as local. Separate CPUs
-	 * (within the same package or physically) within the same node are
-	 * treated as not local. CPUs not even in the same domain (different
-	 * nodes) are treated as very distant.
-	 */
-	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
-
-		/* First check if this cpu is in the same node */
-		for_each_domain(cpu, sd) {
-			if (sd->level > SD_LV_NODE)
-				continue;
-			/* Set locality to local node if not already found lower */
-			for_each_cpu(other_cpu, sched_domain_span(sd)) {
-				if (rq->cpu_locality[other_cpu] > 3)
-					rq->cpu_locality[other_cpu] = 3;
-			}
-		}
-
-		/*
-		 * Each runqueue has its own function in case it doesn't have
-		 * siblings of its own allowing mixed topologies.
-		 */
-#ifdef CONFIG_SCHED_MC
-		for_each_cpu(other_cpu, core_cpumask(cpu)) {
-			if (rq->cpu_locality[other_cpu] > 2)
-				rq->cpu_locality[other_cpu] = 2;
-		}
-		if (cpumask_weight(core_cpumask(cpu)) > 1)
-			rq->cache_idle = cache_cpu_idle;
-#endif
-#ifdef CONFIG_SCHED_SMT
-		for_each_cpu(other_cpu, thread_cpumask(cpu))
-			rq->cpu_locality[other_cpu] = 1;
-		if (cpumask_weight(thread_cpumask(cpu)) > 1)
-			rq->siblings_idle = siblings_cpu_idle;
-#endif
-	}
-	grq_unlock_irq();
-
-	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
-		for_each_online_cpu(other_cpu) {
-			if (other_cpu <= cpu)
-				continue;
-			printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
-		}
-	}
-}
-#else
-void __init sched_init_smp(void)
-{
-}
-#endif /* CONFIG_SMP */
-
-unsigned int sysctl_timer_migration = 1;
-
-int in_sched_functions(unsigned long addr)
-{
-	return in_lock_functions(addr) ||
-		(addr >= (unsigned long)__sched_text_start
-		&& addr < (unsigned long)__sched_text_end);
-}
-
-void __init sched_init(void)
-{
-#ifdef CONFIG_SMP
-	int cpu_ids;
-#endif
-	int i;
-	struct rq *rq;
-
-	prio_ratios[0] = 128;
-	for (i = 1 ; i < NICE_WIDTH ; i++)
-		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
-
-	raw_spin_lock_init(&grq.lock);
-	grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
-	grq.niffies = 0;
-	grq.last_jiffy = jiffies;
-	raw_spin_lock_init(&grq.iso_lock);
-	grq.iso_ticks = 0;
-	grq.iso_refractory = false;
-	grq.noc = 1;
-#ifdef CONFIG_SMP
-	init_defrootdomain();
-	grq.qnr = grq.idle_cpus = 0;
-	cpumask_clear(&grq.cpu_idle_map);
-#else
-	uprq = &per_cpu(runqueues, 0);
-#endif
-	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-		rq->grq_lock = &grq.lock;
-		rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
-			      rq->iowait_pc = rq->idle_pc = 0;
-		rq->dither = false;
-#ifdef CONFIG_SMP
-		rq->sticky_task = NULL;
-		rq->last_niffy = 0;
-		rq->sd = NULL;
-		rq->rd = NULL;
-		rq->online = false;
-		rq->cpu = i;
-		rq_attach_root(rq, &def_root_domain);
-#endif
-		atomic_set(&rq->nr_iowait, 0);
-	}
-
-#ifdef CONFIG_SMP
-	cpu_ids = i;
-	/*
-	 * Set the base locality for cpu cache distance calculation to
-	 * "distant" (3). Make sure the distance from a CPU to itself is 0.
-	 */
-	for_each_possible_cpu(i) {
-		int j;
-
-		rq = cpu_rq(i);
-#ifdef CONFIG_SCHED_SMT
-		rq->siblings_idle = sole_cpu_idle;
-#endif
-#ifdef CONFIG_SCHED_MC
-		rq->cache_idle = sole_cpu_idle;
-#endif
-		rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC);
-		for_each_possible_cpu(j) {
-			if (i == j)
-				rq->cpu_locality[j] = 0;
-			else
-				rq->cpu_locality[j] = 4;
-		}
-	}
-#endif
-
-	for (i = 0; i < PRIO_LIMIT; i++)
-		INIT_LIST_HEAD(grq.queue + i);
-	/* delimiter for bitsearch */
-	__set_bit(PRIO_LIMIT, grq.prio_bitmap);
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
-	/*
-	 * The boot idle thread does lazy MMU switching as well:
-	 */
-	atomic_inc(&init_mm.mm_count);
-	enter_lazy_tlb(&init_mm, current);
-
-	/*
-	 * Make us the idle thread. Technically, schedule() should not be
-	 * called from this thread, however somewhere below it might be,
-	 * but because we are the idle thread, we just pick up running again
-	 * when this runqueue becomes "idle".
-	 */
-	init_idle(current, smp_processor_id());
-
-#ifdef CONFIG_SMP
-	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
-	/* May be allocated at isolcpus cmdline parse time */
-	if (cpu_isolated_map == NULL)
-		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
-	idle_thread_set_boot_cpu();
-#endif /* SMP */
-}
-
-#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline int preempt_count_equals(int preempt_offset)
-{
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
-
-	return (nested == preempt_offset);
-}
-
-void __might_sleep(const char *file, int line, int preempt_offset)
-{
-	/*
-	 * Blocking primitives will set (and therefore destroy) current->state,
-	 * since we will exit with TASK_RUNNING make sure we enter with it,
-	 * otherwise we will destroy state.
-	 */
-	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
-			"do not call blocking ops when !TASK_RUNNING; "
-			"state=%lx set at [<%p>] %pS\n",
-			current->state,
-			(void *)current->task_state_change,
-			(void *)current->task_state_change);
-
-	___might_sleep(file, line, preempt_offset);
-}
-EXPORT_SYMBOL(__might_sleep);
-
-void ___might_sleep(const char *file, int line, int preempt_offset)
-{
-	static unsigned long prev_jiffy;	/* ratelimiting */
-
-	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
-	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
-	     !is_idle_task(current)) ||
-	    system_state != SYSTEM_RUNNING || oops_in_progress)
-		return;
-	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-		return;
-	prev_jiffy = jiffies;
-
-	printk(KERN_ERR
-		"BUG: sleeping function called from invalid context at %s:%d\n",
-			file, line);
-	printk(KERN_ERR
-		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
-			in_atomic(), irqs_disabled(),
-			current->pid, current->comm);
-
-	if (task_stack_end_corrupted(current))
-		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
-
-	debug_show_held_locks(current);
-	if (irqs_disabled())
-		print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
-	if (!preempt_count_equals(preempt_offset)) {
-		pr_err("Preemption disabled at:");
-		print_ip_sym(current->preempt_disable_ip);
-		pr_cont("\n");
-	}
-#endif
-	dump_stack();
-}
-EXPORT_SYMBOL(___might_sleep);
-#endif
-
-#ifdef CONFIG_MAGIC_SYSRQ
-void normalize_rt_tasks(void)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	struct rq *rq;
-	int queued;
-
-	read_lock(&tasklist_lock);
-	for_each_process_thread(g, p) {
-		if (!rt_task(p) && !iso_task(p))
-			continue;
-
-		rq = task_grq_lock(p, &flags);
-		queued = task_queued(p);
-		if (queued)
-			dequeue_task(p);
-		__setscheduler(p, rq, SCHED_NORMAL, 0, false);
-		if (queued) {
-			enqueue_task(p, rq);
-			try_preempt(p, rq);
-		}
-
-		task_grq_unlock(&flags);
-	}
-	read_unlock(&tasklist_lock);
-}
-#endif /* CONFIG_MAGIC_SYSRQ */
-
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
-/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
- *
- * They can only be called when the whole system has been
- * stopped - every CPU needs to be quiescent, and no scheduling
- * activity can take place. Using them for anything else would
- * be a serious bug, and as a result, they aren't even visible
- * under any other configuration.
- */
-
-/**
- * curr_task - return the current task for a given cpu.
- * @cpu: the processor in question.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- *
- * Return: The current task for @cpu.
- */
-struct task_struct *curr_task(int cpu)
-{
-	return cpu_curr(cpu);
-}
-
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * set_curr_task - set the current task for a given cpu.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack.  It allows the architecture to switch the
- * notion of the current task on a cpu in a non-blocking manner.  This function
- * must be called with all CPU's synchronised, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void set_curr_task(int cpu, struct task_struct *p)
-{
-	cpu_curr(cpu) = p;
-}
-
-#endif
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-	*ut = p->utime;
-	*st = p->stime;
-}
-
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-	struct task_cputime cputime;
-
-	thread_group_cputime(p, &cputime);
-
-	*ut = cputime.utime;
-	*st = cputime.stime;
-}
-
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	vtime_account_system(tsk);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
-	if (is_idle_task(prev))
-		vtime_account_idle(prev);
-	else
-		vtime_account_system(prev);
-
-	vtime_account_user(prev);
-	arch_vtime_task_switch(prev);
-}
-#endif
-
-#else
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
-{
-	u64 scaled;
-
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime) {
-			u64 tmp = rtime; rtime = stime; stime = tmp;
-		}
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
-	}
-
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return (__force cputime_t) scaled;
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
- */
-static void cputime_adjust(struct task_cputime *curr,
-			   struct cputime *prev,
-			   cputime_t *ut, cputime_t *st)
-{
-	cputime_t rtime, stime, utime, total;
-
-	stime = curr->stime;
-	total = stime + curr->utime;
-
-	/*
-	 * Tick based cputime accounting depend on random scheduling
-	 * timeslices of a task to be interrupted or not by the timer.
-	 * Depending on these circumstances, the number of these interrupts
-	 * may be over or under-optimistic, matching the real user and system
-	 * cputime with a variable precision.
-	 *
-	 * Fix this by scaling these tick based values against the total
-	 * runtime accounted by the CFS scheduler.
-	 */
-	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
-
-	/*
-	 * Update userspace visible utime/stime values only if actual execution
-	 * time is bigger than already exported. Note that can happen, that we
-	 * provided bigger values due to scaling inaccuracy on big numbers.
-	 */
-	if (prev->stime + prev->utime >= rtime)
-		goto out;
-
-	if (total) {
-		stime = scale_stime((__force u64)stime,
-				    (__force u64)rtime, (__force u64)total);
-		utime = rtime - stime;
-	} else {
-		stime = rtime;
-		utime = 0;
-	}
-
-	/*
-	 * If the tick based count grows faster than the scheduler one,
-	 * the result of the scaling may go backward.
-	 * Let's enforce monotonicity.
-	 */
-	prev->stime = max(prev->stime, stime);
-	prev->utime = max(prev->utime, utime);
-
-out:
-	*ut = prev->utime;
-	*st = prev->stime;
-}
-
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-	struct task_cputime cputime = {
-		.sum_exec_runtime = tsk_seruntime(p),
-	};
-
-	task_cputime(p, &cputime.utime, &cputime.stime);
-	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
-}
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-	struct task_cputime cputime;
-
-	thread_group_cputime(p, &cputime);
-	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
-}
-#endif
-
-void init_idle_bootup_task(struct task_struct *idle)
-{}
-
-#ifdef CONFIG_SCHED_DEBUG
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
-{}
-
-void proc_sched_set_task(struct task_struct *p)
-{}
-#endif
-
-#ifdef CONFIG_SMP
-#define SCHED_LOAD_SHIFT	(10)
-#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
-
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-	return SCHED_LOAD_SCALE;
-}
-
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-	unsigned long weight = cpumask_weight(sched_domain_span(sd));
-	unsigned long smt_gain = sd->smt_gain;
-
-	smt_gain /= weight;
-
-	return smt_gain;
-}
-#endif
diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h
deleted file mode 100644
index 876969fff..000000000
--- a/kernel/sched/bfs_sched.h
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <linux/sched.h>
-#include <linux/cpuidle.h>
-
-#ifndef BFS_SCHED_H
-#define BFS_SCHED_H
-
-/*
- * This is the main, per-CPU runqueue data structure.
- * This data should only be modified by the local cpu.
- */
-struct rq {
-	struct task_struct *curr, *idle, *stop;
-	struct mm_struct *prev_mm;
-
-	/* Pointer to grq spinlock */
-	raw_spinlock_t *grq_lock;
-
-	/* Stored data about rq->curr to work outside grq lock */
-	u64 rq_deadline;
-	unsigned int rq_policy;
-	int rq_time_slice;
-	u64 rq_last_ran;
-	int rq_prio;
-	bool rq_running; /* There is a task running */
-	int soft_affined; /* Running or queued tasks with this set as their rq */
-#ifdef CONFIG_SMT_NICE
-	struct mm_struct *rq_mm;
-	int rq_smt_bias; /* Policy/nice level bias across smt siblings */
-#endif
-	/* Accurate timekeeping data */
-	u64 timekeep_clock;
-	unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
-		iowait_pc, idle_pc;
-	atomic_t nr_iowait;
-
-#ifdef CONFIG_SMP
-	int cpu;		/* cpu of this runqueue */
-	bool online;
-	bool scaling; /* This CPU is managed by a scaling CPU freq governor */
-	struct task_struct *sticky_task;
-
-	struct root_domain *rd;
-	struct sched_domain *sd;
-	int *cpu_locality; /* CPU relative cache distance */
-#ifdef CONFIG_SCHED_SMT
-	bool (*siblings_idle)(int cpu);
-	/* See if all smt siblings are idle */
-#endif /* CONFIG_SCHED_SMT */
-#ifdef CONFIG_SCHED_MC
-	bool (*cache_idle)(int cpu);
-	/* See if all cache siblings are idle */
-#endif /* CONFIG_SCHED_MC */
-	u64 last_niffy; /* Last time this RQ updated grq.niffies */
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-	u64 prev_irq_time;
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-#ifdef CONFIG_PARAVIRT
-	u64 prev_steal_time;
-#endif /* CONFIG_PARAVIRT */
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-	u64 prev_steal_time_rq;
-#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
-
-	u64 clock, old_clock, last_tick;
-	u64 clock_task;
-	bool dither;
-
-#ifdef CONFIG_SCHEDSTATS
-
-	/* latency stats */
-	struct sched_info rq_sched_info;
-	unsigned long long rq_cpu_time;
-	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
-
-	/* sys_sched_yield() stats */
-	unsigned int yld_count;
-
-	/* schedule() stats */
-	unsigned int sched_switch;
-	unsigned int sched_count;
-	unsigned int sched_goidle;
-
-	/* try_to_wake_up() stats */
-	unsigned int ttwu_count;
-	unsigned int ttwu_local;
-#endif /* CONFIG_SCHEDSTATS */
-#ifdef CONFIG_CPU_IDLE
-	/* Must be inspected within a rcu lock section */
-	struct cpuidle_state *idle_state;
-#endif
-};
-
-#ifdef CONFIG_SMP
-struct rq *cpu_rq(int cpu);
-#endif
-
-#ifndef CONFIG_SMP
-extern struct rq *uprq;
-#define cpu_rq(cpu)	(uprq)
-#define this_rq()	(uprq)
-#define raw_rq()	(uprq)
-#define task_rq(p)	(uprq)
-#define cpu_curr(cpu)	((uprq)->curr)
-#else /* CONFIG_SMP */
-DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#define this_rq()		this_cpu_ptr(&runqueues)
-#define raw_rq()		raw_cpu_ptr(&runqueues)
-#endif /* CONFIG_SMP */
-
-static inline u64 __rq_clock_broken(struct rq *rq)
-{
-	return ACCESS_ONCE(rq->clock);
-}
-
-static inline u64 rq_clock(struct rq *rq)
-{
-	lockdep_assert_held(rq->grq_lock);
-	return rq->clock;
-}
-
-static inline u64 rq_clock_task(struct rq *rq)
-{
-	lockdep_assert_held(rq->grq_lock);
-	return rq->clock_task;
-}
-
-#define rcu_dereference_check_sched_domain(p) \
-	rcu_dereference_check((p), \
-			      lockdep_is_held(&sched_domains_mutex))
-
-/*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
- *
- * The domain tree of any CPU may only be accessed from within
- * preempt-disabled sections.
- */
-#define for_each_domain(cpu, __sd) \
-	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
-
-static inline void sched_ttwu_pending(void) { }
-
-static inline int task_on_rq_queued(struct task_struct *p)
-{
-	return p->on_rq;
-}
-
-#ifdef CONFIG_CPU_IDLE
-static inline void idle_set_state(struct rq *rq,
-				  struct cpuidle_state *idle_state)
-{
-	rq->idle_state = idle_state;
-}
-
-static inline struct cpuidle_state *idle_get_state(struct rq *rq)
-{
-	WARN_ON(!rcu_read_lock_held());
-	return rq->idle_state;
-}
-#else
-static inline void idle_set_state(struct rq *rq,
-				  struct cpuidle_state *idle_state)
-{
-}
-
-static inline struct cpuidle_state *idle_get_state(struct rq *rq)
-{
-	return NULL;
-}
-#endif
-#endif /* BFS_SCHED_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291..78b4bad10 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,26 +90,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
-	unsigned long delta;
-	ktime_t soft, hard, now;
-
-	for (;;) {
-		if (hrtimer_active(period_timer))
-			break;
-
-		now = hrtimer_cb_get_time(period_timer);
-		hrtimer_forward(period_timer, now, period);
-
-		soft = hrtimer_get_softexpires(period_timer);
-		hard = hrtimer_get_expires(period_timer);
-		delta = ktime_to_ns(ktime_sub(hard, soft));
-		__hrtimer_start_range_ns(period_timer, soft, delta,
-					 HRTIMER_MODE_ABS_PINNED, 0);
-	}
-}
-
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 #ifdef CONFIG_SMP
 
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
 {
 	struct hrtimer *timer = &rq->hrtick_timer;
-	ktime_t time = hrtimer_get_softexpires(timer);
 
-	return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 /*
@@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay)
 	 * doesn't make sense. Rely on vruntime for fairness.
 	 */
 	delay = max_t(u64, delay, 10000LL);
-	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-			HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 static inline void init_hrtick(void)
@@ -511,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
 static bool set_nr_if_polling(struct task_struct *p)
 {
 	struct thread_info *ti = task_thread_info(p);
-	typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 
 	for (;;) {
 		if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 #endif
 
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	struct wake_q_node *node = &task->wake_q;
+
+	/*
+	 * Atomically grab the task, if ->wake_q is !nil already it means
+	 * it's already queued (either by us or someone else) and will get the
+	 * wakeup due to that.
+	 *
+	 * This cmpxchg() implies a full barrier, which pairs with the write
+	 * barrier implied by the wakeup in wake_up_q().
+	 */
+	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+		return;
+
+	get_task_struct(task);
+
+	/*
+	 * The head is context local, there can be no concurrency.
+	 */
+	*head->lastp = node;
+	head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+	struct wake_q_node *node = head->first;
+
+	while (node != WAKE_Q_TAIL) {
+		struct task_struct *task;
+
+		task = container_of(node, struct task_struct, wake_q);
+		BUG_ON(!task);
+		/* task can safely be re-inserted now */
+		node = node->next;
+		task->wake_q.next = NULL;
+
+		/*
+		 * wake_up_process() implies a wmb() to pair with the queueing
+		 * in wake_q_add() so as not to miss wakeups.
+		 */
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+}
+
 /*
  * resched_curr - mark rq's current task 'to be rescheduled now'.
  *
@@ -593,13 +618,12 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
 {
-	int cpu = smp_processor_id();
-	int i;
+	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+	if (!idle_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
@@ -976,7 +1000,11 @@ inline int task_curr(const struct task_struct *p)
 }
 
 /*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * This means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
@@ -985,7 +1013,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
-		/* Possble rq->lock 'hole'.  */
+
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1017,6 +1045,177 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+	lockdep_assert_held(&rq->lock);
+
+	dequeue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, new_cpu);
+	raw_spin_unlock(&rq->lock);
+
+	rq = cpu_rq(new_cpu);
+
+	raw_spin_lock(&rq->lock);
+	BUG_ON(task_cpu(p) != new_cpu);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	enqueue_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+
+	return rq;
+}
+
+struct migration_arg {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+{
+	if (unlikely(!cpu_active(dest_cpu)))
+		return rq;
+
+	/* Affinity changed (again). */
+	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+		return rq;
+
+	rq = move_queued_task(rq, p, dest_cpu);
+
+	return rq;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+	struct migration_arg *arg = data;
+	struct task_struct *p = arg->task;
+	struct rq *rq = this_rq();
+
+	/*
+	 * The original target cpu might have gone down and we might
+	 * be on another cpu but it doesn't matter.
+	 */
+	local_irq_disable();
+	/*
+	 * We need to explicitly wake pending tasks before running
+	 * __migrate_task() such that we will not miss enforcing cpus_allowed
+	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+	 */
+	sched_ttwu_pending();
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/*
+	 * If task_rq(p) != rq, it cannot be migrated here, because we're
+	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+	 * we're holding p->pi_lock.
+	 */
+	if (task_rq(p) == rq && task_on_rq_queued(p))
+		rq = __migrate_task(rq, p, arg->dest_cpu);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+
+	local_irq_enable();
+	return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, new_mask);
+
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	unsigned long flags;
+	struct rq *rq;
+	unsigned int dest_cpu;
+	int ret = 0;
+
+	rq = task_rq_lock(p, &flags);
+
+	if (cpumask_equal(&p->cpus_allowed, new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+		struct migration_arg arg = { p, dest_cpu };
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, p, &flags);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		tlb_migrate_finish(p->mm);
+		return 0;
+	} else if (task_on_rq_queued(p)) {
+		/*
+		 * OK, since we're going to drop the lock immediately
+		 * afterwards anyway.
+		 */
+		lockdep_unpin_lock(&rq->lock);
+		rq = move_queued_task(rq, p, dest_cpu);
+		lockdep_pin_lock(&rq->lock);
+	}
+out:
+	task_rq_unlock(rq, p, &flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -1049,7 +1248,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+		perf_event_task_migrate(p);
 	}
 
 	__set_task_cpu(p, new_cpu);
@@ -1157,13 +1356,6 @@ out:
 	return ret;
 }
 
-struct migration_arg {
-	struct task_struct *task;
-	int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -1296,9 +1488,7 @@ void kick_process(struct task_struct *p)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
 
-#ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
@@ -1378,6 +1568,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
@@ -1403,7 +1595,7 @@ static void update_avg(u64 *avg, u64 sample)
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -1466,8 +1658,15 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * Our task @p is fully woken up and running; so it's safe to
+		 * drop the rq->lock, hereafter rq is only used for statistics.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		p->sched_class->task_woken(rq, p);
+		lockdep_pin_lock(&rq->lock);
+	}
 
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -1486,6 +1685,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
+	lockdep_assert_held(&rq->lock);
+
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
@@ -1530,6 +1731,7 @@ void sched_ttwu_pending(void)
 		return;
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
+	lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1537,6 +1739,7 @@ void sched_ttwu_pending(void)
 		ttwu_do_activate(rq, p, 0);
 	}
 
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -1633,7 +1836,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 #endif
 
 	raw_spin_lock(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 	ttwu_do_activate(rq, p, 0);
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1728,9 +1933,17 @@ static void try_to_wake_up_local(struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet picked a replacement task.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&p->pi_lock);
 		raw_spin_lock(&rq->lock);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (!(p->state & TASK_NORMAL))
@@ -1951,7 +2164,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	set_task_cpu(p, cpu);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
@@ -2105,12 +2318,29 @@ void wake_up_new_task(struct task_struct *p)
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+	static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+	static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
+	if (!static_key_false(&preempt_notifier_key))
+		WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,7 +2349,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
  * preempt_notifier_unregister - no longer interested in preemption notifications
  * @notifier: notifier struct to unregister
  *
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
  */
 void preempt_notifier_unregister(struct preempt_notifier *notifier)
 {
@@ -2127,7 +2357,7 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier)
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 	struct preempt_notifier *notifier;
 
@@ -2135,9 +2365,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
 }
 
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	if (static_key_false(&preempt_notifier_key))
+		__fire_sched_in_preempt_notifiers(curr);
+}
+
 static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-				 struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				   struct task_struct *next)
 {
 	struct preempt_notifier *notifier;
 
@@ -2145,13 +2381,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 		notifier->ops->sched_out(notifier, next);
 }
 
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	if (static_key_false(&preempt_notifier_key))
+		__fire_sched_out_preempt_notifiers(curr, next);
+}
+
 #else /* !CONFIG_PREEMPT_NOTIFIERS */
 
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 }
 
-static void
+static inline void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
@@ -2252,23 +2496,35 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 #ifdef CONFIG_SMP
 
 /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
 {
-	if (rq->post_schedule) {
-		unsigned long flags;
+	struct callback_head *head, *next;
+	void (*func)(struct rq *rq);
+	unsigned long flags;
 
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->curr->sched_class->post_schedule)
-			rq->curr->sched_class->post_schedule(rq);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	head = rq->balance_callback;
+	rq->balance_callback = NULL;
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
 
-		rq->post_schedule = 0;
+		func(rq);
 	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+	if (unlikely(rq->balance_callback))
+		__balance_callback(rq);
 }
 
 #else
 
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
 {
 }
 
@@ -2286,7 +2542,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	/* finish_task_switch() drops rq->lock and enables preemtion */
 	preempt_disable();
 	rq = finish_task_switch(prev);
-	post_schedule(rq);
+	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -2330,9 +2586,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * of the scheduler it's an obvious special-case), so we
 	 * do an early lockdep release here:
 	 */
+	lockdep_unpin_lock(&rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
-	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();
@@ -2397,9 +2653,9 @@ unsigned long nr_iowait_cpu(int cpu)
 
 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
-	struct rq *this = this_rq();
-	*nr_waiters = atomic_read(&this->nr_iowait);
-	*load = this->cpu_load[0];
+	struct rq *rq = this_rq();
+	*nr_waiters = atomic_read(&rq->nr_iowait);
+	*load = rq->load.weight;
 }
 
 #ifdef CONFIG_SMP
@@ -2497,6 +2753,7 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_cpu_load_active(rq);
+	calc_global_load_tick(rq);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
@@ -2525,7 +2782,7 @@ void scheduler_tick(void)
 u64 scheduler_tick_max_deferment(void)
 {
 	struct rq *rq = this_rq();
-	unsigned long next, now = ACCESS_ONCE(jiffies);
+	unsigned long next, now = READ_ONCE(jiffies);
 
 	next = rq->last_sched_tick + HZ;
 
@@ -2726,9 +2983,7 @@ again:
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
  *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
  */
 static void __sched __schedule(void)
 {
@@ -2737,7 +2992,6 @@ static void __sched __schedule(void)
 	struct rq *rq;
 	int cpu;
 
-	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch();
@@ -2755,6 +3009,7 @@ static void __sched __schedule(void)
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
@@ -2797,12 +3052,12 @@ static void __sched __schedule(void)
 
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
 		cpu = cpu_of(rq);
-	} else
+	} else {
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock_irq(&rq->lock);
+	}
 
-	post_schedule(rq);
-
-	sched_preempt_enable_no_resched();
+	balance_callback(rq);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +3078,9 @@ asmlinkage __visible void __sched schedule(void)
 
 	sched_submit_work(tsk);
 	do {
+		preempt_disable();
 		__schedule();
+		sched_preempt_enable_no_resched();
 	} while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
@@ -2862,15 +3119,14 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
+		preempt_active_exit();
 
 		/*
 		 * Check again in case we missed a preemption opportunity
 		 * between schedule and now.
 		 */
-		barrier();
 	} while (need_resched());
 }
 
@@ -2894,9 +3150,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 
-#ifdef CONFIG_CONTEXT_TRACKING
 /**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
  *
  * The tracing infrastructure uses preempt_enable_notrace to prevent
  * recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +3164,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 {
 	enum ctx_state prev_ctx;
 
@@ -2917,7 +3172,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 		return;
 
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		/*
+		 * Use raw __preempt_count() ops that don't call functions.
+		 * We can't call functions before disabling preemption, because
+		 * disabling it is what disarms preemption tracing recursions.
+		 */
+		__preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+		barrier();
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
@@ -2927,12 +3188,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 		__schedule();
 		exception_exit(prev_ctx);
 
-		__preempt_count_sub(PREEMPT_ACTIVE);
 		barrier();
+		__preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 	} while (need_resched());
 }
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
 #endif /* CONFIG_PREEMPT */
 
@@ -2952,17 +3212,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	prev_state = exception_enter();
 
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		local_irq_enable();
 		__schedule();
 		local_irq_disable();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
+		preempt_active_exit();
 	} while (need_resched());
 
 	exception_exit(prev_state);
@@ -3040,7 +3294,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			p->dl.dl_throttled = 0;
 			enqueue_flag = ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
@@ -3068,7 +3321,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
+	preempt_disable(); /* avoid rq from going away on us */
 	__task_rq_unlock(rq);
+
+	balance_callback(rq);
+	preempt_enable();
 }
 #endif
 
@@ -3406,7 +3663,7 @@ static bool dl_param_changed(struct task_struct *p,
 
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
-				bool user)
+				bool user, bool pi)
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3592,18 +3849,20 @@ change:
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 
-	/*
-	 * Take priority boosted tasks into account. If the new
-	 * effective priority is unchanged, we just store the new
-	 * normal parameters and do not touch the scheduler class and
-	 * the runqueue. This will be done when the task deboost
-	 * itself.
-	 */
-	new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-	if (new_effective_prio == oldprio) {
-		__setscheduler_params(p, attr);
-		task_rq_unlock(rq, p, &flags);
-		return 0;
+	if (pi) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		if (new_effective_prio == oldprio) {
+			__setscheduler_params(p, attr);
+			task_rq_unlock(rq, p, &flags);
+			return 0;
+		}
 	}
 
 	queued = task_on_rq_queued(p);
@@ -3614,7 +3873,7 @@ change:
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, attr, true);
+	__setscheduler(rq, p, attr, pi);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3627,9 +3886,17 @@ change:
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
+	preempt_disable(); /* avoid rq from going away on us */
 	task_rq_unlock(rq, p, &flags);
 
-	rt_mutex_adjust_pi(p);
+	if (pi)
+		rt_mutex_adjust_pi(p);
+
+	/*
+	 * Run balance callbacks after we've adjusted the PI chain.
+	 */
+	balance_callback(rq);
+	preempt_enable();
 
 	return 0;
 }
@@ -3650,7 +3917,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
 		attr.sched_policy = policy;
 	}
 
-	return __sched_setscheduler(p, &attr, check);
+	return __sched_setscheduler(p, &attr, check, true);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3671,7 +3938,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-	return __sched_setscheduler(p, attr, true);
+	return __sched_setscheduler(p, attr, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
@@ -4719,149 +4986,6 @@ out:
 }
 
 #ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
-	struct rq *rq = task_rq(p);
-
-	lockdep_assert_held(&rq->lock);
-
-	dequeue_task(rq, p, 0);
-	p->on_rq = TASK_ON_RQ_MIGRATING;
-	set_task_cpu(p, new_cpu);
-	raw_spin_unlock(&rq->lock);
-
-	rq = cpu_rq(new_cpu);
-
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-	enqueue_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-
-	return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- *    stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- *    off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- *    it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- *    is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-	unsigned long flags;
-	struct rq *rq;
-	unsigned int dest_cpu;
-	int ret = 0;
-
-	rq = task_rq_lock(p, &flags);
-
-	if (cpumask_equal(&p->cpus_allowed, new_mask))
-		goto out;
-
-	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	do_set_cpus_allowed(p, new_mask);
-
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
-
-	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		tlb_migrate_finish(p->mm);
-		return 0;
-	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-out:
-	task_rq_unlock(rq, p, &flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-	struct rq *rq;
-	int ret = 0;
-
-	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
-
-	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
-
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
-}
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
@@ -4909,35 +5033,9 @@ void sched_setnuma(struct task_struct *p, int nid)
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
-	struct migration_arg *arg = data;
-
-	/*
-	 * The original target cpu might have gone down and we might
-	 * be on another cpu but it doesn't matter.
-	 */
-	local_irq_disable();
-	/*
-	 * We need to explicitly wake pending tasks before running
-	 * __migrate_task() such that we will not miss enforcing cpus_allowed
-	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
-	 */
-	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
-	local_irq_enable();
-	return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
@@ -4993,9 +5091,9 @@ static struct task_struct fake_task = {
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
 {
-	struct rq *rq = cpu_rq(dead_cpu);
+	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
 	int dest_cpu;
 
@@ -5017,7 +5115,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 	 */
 	update_rq_clock(rq);
 
-	for ( ; ; ) {
+	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
 		 * remaining thread.
@@ -5025,22 +5123,29 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
+		/*
+		 * Ensure rq->lock covers the entire task selection
+		 * until the migration.
+		 */
+		lockdep_pin_lock(&rq->lock);
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
 		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_cpu, next);
-		raw_spin_unlock(&rq->lock);
-
-		__migrate_task(next, dead_cpu, dest_cpu);
-
-		raw_spin_lock(&rq->lock);
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+
+		lockdep_unpin_lock(&rq->lock);
+		rq = __migrate_task(rq, next, dest_cpu);
+		if (rq != dead_rq) {
+			raw_spin_unlock(&rq->lock);
+			rq = dead_rq;
+			raw_spin_lock(&rq->lock);
+		}
 	}
 
 	rq->stop = stop;
 }
-
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5219,7 +5324,7 @@ static void register_sched_domain_sysctl(void)
 static void unregister_sched_domain_sysctl(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
 
 static void set_rq_online(struct rq *rq)
 {
@@ -5288,7 +5393,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
-		migrate_tasks(cpu);
+		migrate_tasks(rq);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
@@ -5314,7 +5419,7 @@ static struct notifier_block migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
 {
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
@@ -5366,9 +5471,6 @@ static int __init migration_init(void)
 	return 0;
 }
 early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
 
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
@@ -6594,7 +6696,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;
 
-		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sd)
 				return -ENOMEM;
@@ -7032,6 +7134,9 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	/* nohz_full won't take effect without isolating the cpus. */
+	tick_nohz_full_add_cpus_to(cpu_isolated_map);
+
 	sched_init_numa();
 
 	/*
@@ -7068,8 +7173,6 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
-const_debug unsigned int sysctl_timer_migration = 1;
-
 int in_sched_functions(unsigned long addr)
 {
 	return in_lock_functions(addr) ||
@@ -7199,7 +7302,7 @@ void __init sched_init(void)
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-		rq->post_schedule = 0;
+		rq->balance_callback = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
@@ -7329,32 +7432,12 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
 {
-	const struct sched_class *prev_class = p->sched_class;
+	struct task_struct *g, *p;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
-	int old_prio = p->prio;
-	int queued;
-
-	queued = task_on_rq_queued(p);
-	if (queued)
-		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, &attr, false);
-	if (queued) {
-		enqueue_task(rq, p, 0);
-		resched_curr(rq);
-	}
-
-	check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	struct rq *rq;
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
@@ -7381,9 +7464,7 @@ void normalize_rt_tasks(void)
 			continue;
 		}
 
-		rq = task_rq_lock(p, &flags);
-		normalize_task(rq, p);
-		task_rq_unlock(rq, p, &flags);
+		__sched_setscheduler(p, &attr, false, false);
 	}
 	read_unlock(&tasklist_lock);
 }
@@ -7734,11 +7815,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
 	return rt_runtime_us;
 }
 
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
 {
 	u64 rt_runtime, rt_period;
 
-	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+	rt_period = rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@ -8105,10 +8186,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
 	__refill_cfs_bandwidth_runtime(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
-	if (runtime_enabled && cfs_b->timer_active) {
-		/* force a reprogram */
-		__start_cfs_bandwidth(cfs_b, true);
-	}
+	if (runtime_enabled)
+		start_cfs_bandwidth(cfs_b);
 	raw_spin_unlock_irq(&cfs_b->lock);
 
 	for_each_online_cpu(i) {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee6..f5a64ffad 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
 {
 	cputime_t old;
 
-	while (new > (old = ACCESS_ONCE(*counter)))
+	while (new > (old = READ_ONCE(*counter)))
 		cmpxchg_cputime(counter, old, new);
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e9514508..0a17af356 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,14 +213,28 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 	return dl_task(prev);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+
+static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
+
+/* Register push_dl_tasks() as a balance callback iff @rq has pushable tasks. */
+static inline void queue_push_tasks(struct rq *rq)
+{
+	if (has_pushable_dl_tasks(rq))
+		queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu),
+				       push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
 {
-	rq->post_schedule = has_pushable_dl_tasks(rq);
+	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
 	bool fallback = false;
@@ -254,14 +268,19 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 		double_lock_balance(rq, later_rq);
 	}
 
+	/*
+	 * By now the task is replenished and enqueued; migrate it.
+	 */
 	deactivate_task(rq, p, 0);
 	set_task_cpu(p, later_rq->cpu);
-	activate_task(later_rq, p, ENQUEUE_REPLENISH);
+	activate_task(later_rq, p, 0);
 
 	if (!fallback)
 		resched_curr(later_rq);
 
-	double_unlock_balance(rq, later_rq);
+	double_unlock_balance(later_rq, rq);
+
+	return later_rq;
 }
 
 #else
@@ -291,12 +310,15 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 	return false;
 }
 
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
 {
-	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
+{
+}
+
+static inline void queue_pull_task(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -498,24 +520,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct sched_dl_entity *dl_se = &p->dl;
+	struct hrtimer *timer = &dl_se->dl_timer;
+	struct rq *rq = task_rq(p);
 	ktime_t now, act;
-	ktime_t soft, hard;
-	unsigned long range;
 	s64 delta;
 
-	if (boosted)
-		return 0;
+	lockdep_assert_held(&rq->lock);
+
 	/*
 	 * We want the timer to fire at the deadline, but considering
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
 	 */
 	act = ns_to_ktime(dl_se->deadline);
-	now = hrtimer_cb_get_time(&dl_se->dl_timer);
+	now = hrtimer_cb_get_time(timer);
 	delta = ktime_to_ns(now) - rq_clock(rq);
 	act = ktime_add_ns(act, delta);
 
@@ -527,15 +548,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
 	if (ktime_us_delta(act, now) < 0)
 		return 0;
 
-	hrtimer_set_expires(&dl_se->dl_timer, act);
-
-	soft = hrtimer_get_softexpires(&dl_se->dl_timer);
-	hard = hrtimer_get_expires(&dl_se->dl_timer);
-	range = ktime_to_ns(ktime_sub(hard, soft));
-	__hrtimer_start_range_ns(&dl_se->dl_timer, soft,
-				 range, HRTIMER_MODE_ABS, 0);
+	/*
+	 * !enqueued will guarantee another callback; even if one is already in
+	 * progress. This ensures a balanced {get,put}_task_struct().
+	 *
+	 * The race against __run_timer() clearing the enqueued state is
+	 * harmless because we're holding task_rq()->lock, therefore the timer
+	 * expiring after we've done the check will wait on its task_rq_lock()
+	 * and observe our state.
+	 */
+	if (!hrtimer_is_queued(timer)) {
+		get_task_struct(p);
+		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+	}
 
-	return hrtimer_active(&dl_se->dl_timer);
+	return 1;
 }
 
 /*
@@ -563,35 +590,40 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	rq = task_rq_lock(p, &flags);
 
 	/*
-	 * We need to take care of several possible races here:
-	 *
-	 *   - the task might have changed its scheduling policy
-	 *     to something different than SCHED_DEADLINE
-	 *   - the task might have changed its reservation parameters
-	 *     (through sched_setattr())
-	 *   - the task might have been boosted by someone else and
-	 *     might be in the boosting/deboosting path
+	 * The task might have changed its scheduling policy to something
+	 * different than SCHED_DEADLINE (through switched_from_dl()).
+	 */
+	if (!dl_task(p)) {
+		__dl_clear_params(p);
+		goto unlock;
+	}
+
+	/*
+	 * This is possible if switched_from_dl() raced against a running
+	 * callback that took the above !dl_task() path and we've since then
+	 * switched back into SCHED_DEADLINE.
 	 *
-	 * In all this cases we bail out, as the task is already
-	 * in the runqueue or is going to be enqueued back anyway.
+	 * There's nothing to do except drop our task reference.
 	 */
-	if (!dl_task(p) || dl_se->dl_new ||
-	    dl_se->dl_boosted || !dl_se->dl_throttled)
+	if (dl_se->dl_new)
 		goto unlock;
 
-	sched_clock_tick();
-	update_rq_clock(rq);
+	/*
+	 * The task might have been boosted by someone else and might be in the
+	 * boosting/deboosting path; it's not throttled.
+	 */
+	if (dl_se->dl_boosted)
+		goto unlock;
 
-#ifdef CONFIG_SMP
 	/*
-	 * If we find that the rq the task was on is no longer
-	 * available, we need to select a new rq.
+	 * Spurious timer due to start_dl_timer() race; or we already received
+	 * a replenishment from rt_mutex_setprio().
 	 */
-	if (unlikely(!rq->online)) {
-		dl_task_offline_migration(rq, p);
+	if (!dl_se->dl_throttled)
 		goto unlock;
-	}
-#endif
+
+	sched_clock_tick();
+	update_rq_clock(rq);
 
 	/*
 	 * If the throttle happened during sched-out; like:
@@ -617,17 +649,38 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		check_preempt_curr_dl(rq, p, 0);
 	else
 		resched_curr(rq);
+
 #ifdef CONFIG_SMP
 	/*
-	 * Queueing this task back might have overloaded rq,
-	 * check if we need to kick someone away.
+	 * Perform balancing operations here; after the replenishments.  We
+	 * cannot drop rq->lock before this, otherwise the assertion in
+	 * start_dl_timer() about not missing updates is not true.
+	 *
+	 * If we find that the rq the task was on is no longer available, we
+	 * need to select a new rq.
+	 *
+	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
+	 */
+	if (unlikely(!rq->online))
+		rq = dl_task_offline_migration(rq, p);
+
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
 	 */
 	if (has_pushable_dl_tasks(rq))
 		push_dl_task(rq);
 #endif
+
 unlock:
 	task_rq_unlock(rq, p, &flags);
 
+	/*
+	 * This can free the task_struct, including this hrtimer, do not touch
+	 * anything related to that after this.
+	 */
+	put_task_struct(p);
+
 	return HRTIMER_NORESTART;
 }
 
@@ -640,7 +693,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
 }
 
 static
-int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
 {
 	return (dl_se->runtime <= 0);
 }
@@ -684,10 +737,10 @@ static void update_curr_dl(struct rq *rq)
 	sched_rt_avg_update(rq, delta_exec);
 
 	dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
-	if (dl_runtime_exceeded(rq, dl_se)) {
+	if (dl_runtime_exceeded(dl_se)) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
@@ -995,7 +1048,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
 	 * If we are dealing with a -deadline task, we must
@@ -1012,7 +1065,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 	    (p->nr_cpus_allowed > 1)) {
 		int target = find_later_rq(p);
 
-		if (target != -1)
+		if (target != -1 &&
+				dl_time_before(p->dl.deadline,
+					cpu_rq(target)->dl.earliest_dl.curr))
 			cpu = target;
 	}
 	rcu_read_unlock();
@@ -1042,8 +1097,6 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	resched_curr(rq);
 }
 
-static int pull_dl_task(struct rq *this_rq);
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -1100,7 +1153,15 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 	dl_rq = &rq->dl;
 
 	if (need_pull_dl_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_dl_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a stop task can slip in, in which case we need to
@@ -1134,7 +1195,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -1171,7 +1232,6 @@ static void task_fork_dl(struct task_struct *p)
 
 static void task_dead_dl(struct task_struct *p)
 {
-	struct hrtimer *timer = &p->dl.dl_timer;
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 	/*
@@ -1181,8 +1241,6 @@ static void task_dead_dl(struct task_struct *p)
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
-
-	hrtimer_cancel(timer);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1230,6 +1288,32 @@ next_node:
 	return NULL;
 }
 
+/*
+ * Walk the pushable -deadline tasks of @rq in tree order, starting from
+ * rq->dl.pushable_dl_tasks_leftmost and advancing with rb_next(), and return
+ * the first task for which pick_dl_task(rq, p, cpu) succeeds.
+ *
+ * Returns NULL when @rq has no pushable tasks or none of them is accepted
+ * for @cpu.
+ */
+static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
+{
+	struct rb_node *node;
+	struct task_struct *p;
+
+	if (!has_pushable_dl_tasks(rq))
+		return NULL;
+
+	for (node = rq->dl.pushable_dl_tasks_leftmost; node;
+	     node = rb_next(node)) {
+		p = rb_entry(node, struct task_struct, pushable_dl_tasks);
+		if (pick_dl_task(rq, p, cpu))
+			return p;
+	}
+
+	return NULL;
+}
+
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
 
 static int find_later_rq(struct task_struct *task)
@@ -1333,6 +1417,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 
 		later_rq = cpu_rq(cpu);
 
+		if (!dl_time_before(task->dl.deadline,
+					later_rq->dl.earliest_dl.curr)) {
+			/*
+			 * Target rq has tasks of equal or earlier deadline,
+			 * retrying does not release any lock and is unlikely
+			 * to yield a different result.
+			 */
+			later_rq = NULL;
+			break;
+		}
+
 		/* Retry if something changed. */
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
@@ -1473,15 +1568,16 @@ static void push_dl_tasks(struct rq *rq)
 		;
 }
 
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
 	struct task_struct *p;
+	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
 
 	if (likely(!dl_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1514,7 +1610,7 @@ static int pull_dl_task(struct rq *this_rq)
 		if (src_rq->dl.dl_nr_running <= 1)
 			goto skip;
 
-		p = pick_next_earliest_dl_task(src_rq, this_cpu);
+		p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
 
 		/*
 		 * We found a task to be pulled if:
@@ -1536,7 +1632,7 @@ static int pull_dl_task(struct rq *this_rq)
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -1549,12 +1645,8 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
-}
-
-static void post_schedule_dl(struct rq *rq)
-{
-	push_dl_tasks(rq);
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -1659,7 +1751,7 @@ static void rq_offline_dl(struct rq *rq)
 	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
 
-void init_sched_dl_class(void)
+void __init init_sched_dl_class(void)
 {
 	unsigned int i;
 
@@ -1670,37 +1762,16 @@ void init_sched_dl_class(void)
 
 #endif /* CONFIG_SMP */
 
-/*
- *  Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
-	struct hrtimer *dl_timer = &p->dl.dl_timer;
-
-	/* Nobody will change task's class if pi_lock is held */
-	lockdep_assert_held(&p->pi_lock);
-
-	if (hrtimer_active(dl_timer)) {
-		int ret = hrtimer_try_to_cancel(dl_timer);
-
-		if (unlikely(ret == -1)) {
-			/*
-			 * Note, p may migrate OR new deadline tasks
-			 * may appear in rq when we are unlocking it.
-			 * A caller of us must be fine with that.
-			 */
-			raw_spin_unlock(&rq->lock);
-			hrtimer_cancel(dl_timer);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-}
-
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
-	/* XXX we should retain the bw until 0-lag */
-	cancel_dl_timer(rq, p);
-	__dl_clear_params(p);
+	/*
+	 * Start the deadline timer; if we switch back to dl before this we'll
+	 * continue consuming our current CBS slice. If we stay outside of
+	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
+	 * task.
+	 */
+	if (!start_dl_timer(p))
+		__dl_clear_params(p);
 
 	/*
 	 * Since this might be the only -deadline task on the rq,
@@ -1710,8 +1781,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	if (pull_dl_task(rq))
-		resched_curr(rq);
+	queue_pull_task(rq);
 }
 
 /*
@@ -1720,21 +1790,16 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
-			push_dl_task(rq) && rq != task_rq(p))
-			/* Only reschedule if pushing failed */
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched) {
-			if (dl_task(rq->curr))
-				check_preempt_curr_dl(rq, p, 0);
-			else
-				resched_curr(rq);
-		}
+		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (dl_task(rq->curr))
+			check_preempt_curr_dl(rq, p, 0);
+		else
+			resched_curr(rq);
+#endif
 	}
 }
 
@@ -1754,15 +1819,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 		 * or lowering its prio, so...
 		 */
 		if (!rq->dl.overloaded)
-			pull_dl_task(rq);
+			queue_pull_task(rq);
 
 		/*
 		 * If we now have a earlier deadline task than p,
 		 * then reschedule, provided p is still on this
 		 * runqueue.
 		 */
-		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
-		    rq->curr == p)
+		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
 			resched_curr(rq);
 #else
 		/*
@@ -1792,7 +1856,6 @@ const struct sched_class dl_sched_class = {
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
-	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a245c1fc6..4222ec50a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,15 +132,17 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		p->prio);
 #ifdef CONFIG_SCHEDSTATS
 	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-		SPLIT_NS(p->se.vruntime),
+		SPLIT_NS(p->se.statistics.wait_sum),
 		SPLIT_NS(p->se.sum_exec_runtime),
 		SPLIT_NS(p->se.statistics.sum_sleep_runtime));
 #else
-	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
-		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+		0LL, 0L,
+		SPLIT_NS(p->se.sum_exec_runtime),
+		0LL, 0L);
 #endif
 #ifdef CONFIG_NUMA_BALANCING
-	SEQ_printf(m, " %d", task_node(p));
+	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
 #endif
 #ifdef CONFIG_CGROUP_SCHED
 	SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -156,7 +158,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 	SEQ_printf(m,
 	"\nrunnable tasks:\n"
 	"            task   PID         tree-key  switches  prio"
-	"     exec-runtime         sum-exec        sum-sleep\n"
+	"     wait-time             sum-exec        sum-sleep\n"
 	"------------------------------------------------------"
 	"----------------------------------------------------\n");
 
@@ -230,8 +232,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
-	SEQ_printf(m, "  .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
-			cfs_rq->tg->cfs_bandwidth.timer_active);
 	SEQ_printf(m, "  .%-30s: %d\n", "throttled",
 			cfs_rq->throttled);
 	SEQ_printf(m, "  .%-30s: %d\n", "throttle_count",
@@ -517,11 +517,21 @@ __initcall(init_sched_debug_procfs);
 	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
 
 
+#ifdef CONFIG_NUMA_BALANCING
+void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+		unsigned long tpf, unsigned long gsf, unsigned long gpf)
+{
+	SEQ_printf(m, "numa_faults node=%d ", node);
+	SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
+	SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
+}
+#endif
+
+
 static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 {
 #ifdef CONFIG_NUMA_BALANCING
 	struct mempolicy *pol;
-	int node, i;
 
 	if (p->mm)
 		P(mm->numa_scan_seq);
@@ -533,26 +543,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 	mpol_get(pol);
 	task_unlock(p);
 
-	SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
-
-	for_each_online_node(node) {
-		for (i = 0; i < 2; i++) {
-			unsigned long nr_faults = -1;
-			int cpu_current, home_node;
-
-			if (p->numa_faults)
-				nr_faults = p->numa_faults[2*node + i];
-
-			cpu_current = !i ? (task_node(p) == node) :
-				(pol && node_isset(node, pol->v.nodes));
-
-			home_node = (p->numa_preferred_nid == node);
-
-			SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
-				i, node, cpu_current, home_node, nr_faults);
-		}
-	}
-
+	P(numa_pages_migrated);
+	P(numa_preferred_nid);
+	P(total_numa_faults);
+	SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
+			task_node(p), task_numa_group_id(p));
+	show_numa_stats(p, m);
 	mpol_put(pol);
 #endif
 }
@@ -582,6 +578,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	nr_switches = p->nvcsw + p->nivcsw;
 
 #ifdef CONFIG_SCHEDSTATS
+	PN(se.statistics.sum_sleep_runtime);
 	PN(se.statistics.wait_start);
 	PN(se.statistics.sleep_start);
 	PN(se.statistics.block_start);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 936664319..134314406 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -166,9 +166,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
  *
  * This idea comes from the SD scheduler of Con Kolivas:
  */
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
 {
-	unsigned int cpus = min_t(int, num_online_cpus(), 8);
+	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 	unsigned int factor;
 
 	switch (sysctl_sched_tunable_scaling) {
@@ -601,7 +601,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 		loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	int factor = get_update_sysctl_factor();
+	unsigned int factor = get_update_sysctl_factor();
 
 	if (ret || !write)
 		return ret;
@@ -859,7 +859,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
-	unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 	unsigned int scan, floor;
 	unsigned int windows = 1;
 
@@ -1223,11 +1223,9 @@ static void task_numa_assign(struct task_numa_env *env,
 static bool load_too_imbalanced(long src_load, long dst_load,
 				struct task_numa_env *env)
 {
+	long imb, old_imb;
+	long orig_src_load, orig_dst_load;
 	long src_capacity, dst_capacity;
-	long orig_src_load;
-	long load_a, load_b;
-	long moved_load;
-	long imb;
 
 	/*
 	 * The load is corrected for the CPU capacity available on each node.
@@ -1240,39 +1238,30 @@ static bool load_too_imbalanced(long src_load, long dst_load,
 	dst_capacity = env->dst_stats.compute_capacity;
 
 	/* We care about the slope of the imbalance, not the direction. */
-	load_a = dst_load;
-	load_b = src_load;
-	if (load_a < load_b)
-		swap(load_a, load_b);
+	if (dst_load < src_load)
+		swap(dst_load, src_load);
 
 	/* Is the difference below the threshold? */
-	imb = load_a * src_capacity * 100 -
-		load_b * dst_capacity * env->imbalance_pct;
+	imb = dst_load * src_capacity * 100 -
+	      src_load * dst_capacity * env->imbalance_pct;
 	if (imb <= 0)
 		return false;
 
 	/*
 	 * The imbalance is above the allowed threshold.
-	 * Allow a move that brings us closer to a balanced situation,
-	 * without moving things past the point of balance.
+	 * Compare it with the old imbalance.
 	 */
 	orig_src_load = env->src_stats.load;
+	orig_dst_load = env->dst_stats.load;
 
-	/*
-	 * In a task swap, there will be one load moving from src to dst,
-	 * and another moving back. This is the net sum of both moves.
-	 * A simple task move will always have a positive value.
-	 * Allow the move if it brings the system closer to a balanced
-	 * situation, without crossing over the balance point.
-	 */
-	moved_load = orig_src_load - src_load;
+	if (orig_dst_load < orig_src_load)
+		swap(orig_dst_load, orig_src_load);
 
-	if (moved_load > 0)
-		/* Moving src -> dst. Did we overshoot balance? */
-		return src_load * dst_capacity < dst_load * src_capacity;
-	else
-		/* Moving dst -> src. Did we overshoot balance? */
-		return dst_load * src_capacity < src_load * dst_capacity;
+	old_imb = orig_dst_load * src_capacity * 100 -
+		  orig_src_load * dst_capacity * env->imbalance_pct;
+
+	/* Would this change make things worse? */
+	return (imb > old_imb);
 }
 
 /*
@@ -1434,6 +1423,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 	}
 }
 
+/* Only move tasks to a NUMA node less busy than the current node. */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+	struct numa_stats *src = &env->src_stats;
+	struct numa_stats *dst = &env->dst_stats;
+
+	if (src->has_free_capacity && !dst->has_free_capacity)
+		return false;
+
+	/*
+	 * Only consider a task move if the source has a higher load
+	 * than the destination, corrected for CPU capacity on each node.
+	 *
+	 *      src->load                dst->load
+	 * --------------------- vs ---------------------
+	 * src->compute_capacity    dst->compute_capacity
+	 */
+	if (src->load * dst->compute_capacity >
+	    dst->load * src->compute_capacity)
+		return true;
+
+	return false;
+}
+
 static int task_numa_migrate(struct task_struct *p)
 {
 	struct task_numa_env env = {
@@ -1488,7 +1501,8 @@ static int task_numa_migrate(struct task_struct *p)
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 
 	/* Try to find a spot on the preferred nid. */
-	task_numa_find_cpu(&env, taskimp, groupimp);
+	if (numa_has_capacity(&env))
+		task_numa_find_cpu(&env, taskimp, groupimp);
 
 	/*
 	 * Look at other nodes in these cases:
@@ -1519,7 +1533,8 @@ static int task_numa_migrate(struct task_struct *p)
 			env.dist = dist;
 			env.dst_nid = nid;
 			update_numa_stats(&env.dst_stats, env.dst_nid);
-			task_numa_find_cpu(&env, taskimp, groupimp);
+			if (numa_has_capacity(&env))
+				task_numa_find_cpu(&env, taskimp, groupimp);
 		}
 	}
 
@@ -1819,7 +1834,12 @@ static void task_numa_placement(struct task_struct *p)
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
-	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+	/*
+	 * The p->mm->numa_scan_seq field gets updated without
+	 * exclusive access. Use READ_ONCE() here to ensure
+	 * that the field is read in a single access:
+	 */
+	seq = READ_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
@@ -1963,7 +1983,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	}
 
 	rcu_read_lock();
-	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+	tsk = READ_ONCE(cpu_rq(cpu)->curr);
 
 	if (!cpupid_match_pid(tsk, cpupid))
 		goto no_join;
@@ -2132,7 +2152,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 static void reset_ptenuma_scan(struct task_struct *p)
 {
-	ACCESS_ONCE(p->mm->numa_scan_seq)++;
+	/*
+	 * We only did a read acquisition of the mmap sem, so
+	 * p->mm->numa_scan_seq is written to without exclusive access
+	 * and the update is not guaranteed to be atomic. That's not
+	 * much of an issue though, since this is just used for
+	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+	 * expensive, to avoid any form of compiler optimizations:
+	 */
+	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 	p->mm->numa_scan_offset = 0;
 }
 
@@ -3501,16 +3529,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
 	else {
-		/*
-		 * If the bandwidth pool has become inactive, then at least one
-		 * period must have elapsed since the last consumption.
-		 * Refresh the global state and ensure bandwidth timer becomes
-		 * active.
-		 */
-		if (!cfs_b->timer_active) {
-			__refill_cfs_bandwidth_runtime(cfs_b);
-			__start_cfs_bandwidth(cfs_b, false);
-		}
+		start_cfs_bandwidth(cfs_b);
 
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
@@ -3659,6 +3678,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, dequeue = 1;
+	bool empty;
 
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
@@ -3688,13 +3708,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
+	empty = list_empty(&cfs_b->throttled_cfs_rq);
+
 	/*
 	 * Add to the _head_ of the list, so that an already-started
 	 * distribute_cfs_runtime will not see us
 	 */
 	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-	if (!cfs_b->timer_active)
-		__start_cfs_bandwidth(cfs_b, false);
+
+	/*
+	 * If we're the first throttled task, make sure the bandwidth
+	 * timer is running.
+	 */
+	if (empty)
+		start_cfs_bandwidth(cfs_b);
+
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -3809,13 +3837,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	if (cfs_b->idle && !throttled)
 		goto out_deactivate;
 
-	/*
-	 * if we have relooped after returning idle once, we need to update our
-	 * status as actually running, so that other cpus doing
-	 * __start_cfs_bandwidth will stop trying to cancel us.
-	 */
-	cfs_b->timer_active = 1;
-
 	__refill_cfs_bandwidth_runtime(cfs_b);
 
 	if (!throttled) {
@@ -3860,7 +3881,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	return 0;
 
 out_deactivate:
-	cfs_b->timer_active = 0;
 	return 1;
 }
 
@@ -3875,7 +3895,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
  * Are we near the end of the current quota period?
  *
  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
- * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * hrtimer base being cleared by hrtimer_start. In the case of
  * migrate_hrtimers, base is never cleared, so we are fine.
  */
 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
@@ -3903,8 +3923,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
 	if (runtime_refresh_within(cfs_b, min_left))
 		return;
 
-	start_bandwidth_timer(&cfs_b->slack_timer,
-				ns_to_ktime(cfs_bandwidth_slack_period));
+	hrtimer_start(&cfs_b->slack_timer,
+			ns_to_ktime(cfs_bandwidth_slack_period),
+			HRTIMER_MODE_REL);
 }
 
 /* we know any runtime found here is valid as update_curr() precedes return */
@@ -4024,6 +4045,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, slack_timer);
+
 	do_sched_cfs_slack_timer(cfs_b);
 
 	return HRTIMER_NORESTART;
@@ -4033,20 +4055,19 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, period_timer);
-	ktime_t now;
 	int overrun;
 	int idle = 0;
 
 	raw_spin_lock(&cfs_b->lock);
 	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+		overrun = hrtimer_forward_now(timer, cfs_b->period);
 		if (!overrun)
 			break;
 
 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
 	}
+	if (idle)
+		cfs_b->period_active = 0;
 	raw_spin_unlock(&cfs_b->lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -4060,7 +4081,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period = ns_to_ktime(default_cfs_period());
 
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@ -4072,28 +4093,15 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-	/*
-	 * The timer may be active because we're trying to set a new bandwidth
-	 * period or because we're racing with the tear-down path
-	 * (timer_active==0 becomes visible before the hrtimer call-back
-	 * terminates).  In either case we ensure that it's re-programmed
-	 */
-	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
-	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
-		/* bounce the lock to allow do_sched_cfs_period_timer to run */
-		raw_spin_unlock(&cfs_b->lock);
-		cpu_relax();
-		raw_spin_lock(&cfs_b->lock);
-		/* if someone else restarted the timer then we're done */
-		if (!force && cfs_b->timer_active)
-			return;
-	}
+	lockdep_assert_held(&cfs_b->lock);
 
-	cfs_b->timer_active = 1;
-	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+	if (!cfs_b->period_active) {
+		cfs_b->period_active = 1;
+		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+	}
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4348,6 +4356,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' array crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT		7
+static const unsigned char
+		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+					{0, 0, 0, 0, 0, 0, 0, 0},
+					{64, 32, 8, 0, 0, 0, 0, 0},
+					{96, 72, 40, 12, 1, 0, 0},
+					{112, 98, 75, 43, 15, 1, 0},
+					{120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int j = 0;
+
+	if (!missed_updates)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	while (missed_updates) {
+		if (missed_updates % 2)
+			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+		missed_updates >>= 1;
+		j++;
+	}
+	return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
+{
+	int i, scale;
+
+	this_rq->nr_load_updates++;
+
+	/* Update our load: */
+	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+		unsigned long old_load, new_load;
+
+		/* scale is effectively 1 << i now, and >> i divides by scale */
+
+		old_load = this_rq->cpu_load[i];
+		old_load = decay_load_missed(old_load, pending_updates - 1, i);
+		new_load = this_load;
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale - 1;
+
+		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+	}
+
+	sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long pending_updates;
+
+	/*
+	 * bail if there's load or we're actually up-to-date.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *this_rq = this_rq();
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long pending_updates;
+
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&this_rq->lock);
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * We were idle, this means load 0, the current load might be
+		 * !0 due to remote wakeups and the sort.
+		 */
+		__update_cpu_load(this_rq, 0, pending_updates);
+	}
+	raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	/*
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, load, 1);
+}
+
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
@@ -4400,7 +4591,7 @@ static unsigned long capacity_orig_of(int cpu)
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
 	unsigned long load_avg = rq->cfs.runnable_load_avg;
 
 	if (nr_running)
@@ -5151,18 +5342,21 @@ again:
 		 * entity, update_curr() will update its vruntime, otherwise
 		 * forget we've ever seen it.
 		 */
-		if (curr && curr->on_rq)
-			update_curr(cfs_rq);
-		else
-			curr = NULL;
+		if (curr) {
+			if (curr->on_rq)
+				update_curr(cfs_rq);
+			else
+				curr = NULL;
 
-		/*
-		 * This call to check_cfs_rq_runtime() will do the throttle and
-		 * dequeue its entity in the parent(s). Therefore the 'simple'
-		 * nr_running test will indeed be correct.
-		 */
-		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-			goto simple;
+			/*
+			 * This call to check_cfs_rq_runtime() will do the
+			 * throttle and dequeue its entity in the parent(s).
+			 * Therefore the 'simple' nr_running test will indeed
+			 * be correct.
+			 */
+			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+				goto simple;
+		}
 
 		se = pick_next_entity(cfs_rq, curr);
 		cfs_rq = group_cfs_rq(se);
@@ -5223,7 +5417,15 @@ simple:
 	return p;
 
 idle:
+	/*
+	 * This is OK, because current is on_cpu, which avoids it being picked
+	 * for load-balance and preemption/IRQs are still disabled avoiding
+	 * further scheduler activity on it and we're being very careful to
+	 * re-start the picking loop.
+	 */
+	lockdep_unpin_lock(&rq->lock);
 	new_tasks = idle_balance(rq);
+	lockdep_pin_lock(&rq->lock);
 	/*
 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 	 * possible for any higher priority task to appear. In that case we
@@ -5492,10 +5694,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
+	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5509,29 +5716,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	if (numa_group) {
-		/* Task is already in the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return false;
-
-		/* Task is moving into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return true;
-
-		return group_faults(p, dst_nid) > group_faults(p, src_nid);
-	}
-
 	/* Encourage migration to the preferred node. */
 	if (dst_nid == p->numa_preferred_nid)
 		return true;
 
-	return task_faults(p, dst_nid) > task_faults(p, src_nid);
+	/* Migrating away from the preferred node is bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return false;
+
+	if (numa_group) {
+		src_faults = group_faults(p, src_nid);
+		dst_faults = group_faults(p, dst_nid);
+	} else {
+		src_faults = task_faults(p, src_nid);
+		dst_faults = task_faults(p, dst_nid);
+	}
+
+	return dst_faults > src_faults;
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
+	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5546,23 +5754,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	if (numa_group) {
-		/* Task is moving within/into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return false;
+	/* Migrating away from the preferred node is bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return true;
 
-		/* Task is moving out of the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return true;
+	/* Encourage migration to the preferred node. */
+	if (dst_nid == p->numa_preferred_nid)
+		return false;
 
-		return group_faults(p, dst_nid) < group_faults(p, src_nid);
+	if (numa_group) {
+		src_faults = group_faults(p, src_nid);
+		dst_faults = group_faults(p, dst_nid);
+	} else {
+		src_faults = task_faults(p, src_nid);
+		dst_faults = task_faults(p, dst_nid);
 	}
 
-	/* Migrating away from the preferred node is always bad. */
-	if (src_nid == p->numa_preferred_nid)
-		return true;
-
-	return task_faults(p, dst_nid) < task_faults(p, src_nid);
+	return dst_faults < src_faults;
 }
 
 #else
@@ -6062,8 +6270,8 @@ static unsigned long scale_rt_capacity(int cpu)
 	 * Since we're reading these variables without serialization make sure
 	 * we read them once before doing sanity checks on them.
 	 */
-	age_stamp = ACCESS_ONCE(rq->age_stamp);
-	avg = ACCESS_ONCE(rq->rt_avg);
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
 	delta = __rq_clock_broken(rq) - age_stamp;
 
 	if (unlikely(delta < 0))
@@ -7251,9 +7459,6 @@ static int idle_balance(struct rq *this_rq)
 		goto out;
 	}
 
-	/*
-	 * Drop the rq->lock, but keep IRQ/preempt disabled.
-	 */
 	raw_spin_unlock(&this_rq->lock);
 
 	update_blocked_averages(this_cpu);
@@ -8293,7 +8498,27 @@ void print_cfs_stats(struct seq_file *m, int cpu)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+void show_numa_stats(struct task_struct *p, struct seq_file *m)
+{
+	int node;
+	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; /* 0 if no data */
+
+	for_each_online_node(node) {	/* one print call per online node */
+		if (p->numa_faults) {
+			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
+			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
+		}
+		if (p->numa_group) {
+			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
+			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+		}
+		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
+	}
+}
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 
 __init void init_sched_fair_class(void)
 {
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 70e698d02..594275ed2 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -13,11 +13,16 @@
 
 #include <trace/events/power.h>
 
-#ifdef CONFIG_SCHED_BFS
-#include "bfs_sched.h"
-#else
 #include "sched.h"
-#endif
+
+/**
+ * sched_idle_set_state - Record idle state for the current CPU.
+ * @idle_state: State to record.
+ */
+void sched_idle_set_state(struct cpuidle_state *idle_state)
+{
+	idle_set_state(this_rq(), idle_state);
+}
 
 static int __read_mostly cpu_idle_force_poll;
 
@@ -72,6 +77,46 @@ void __weak arch_cpu_idle(void)
 }
 
 /**
+ * default_idle_call - Default CPU idle routine.
+ *
+ * To use when the cpuidle framework cannot be used.
+ */
+void default_idle_call(void)
+{
+	if (current_clr_polling_and_test())
+		local_irq_enable();
+	else
+		arch_cpu_idle();
+}
+
+static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		      int next_state)
+{
+	/* Fall back to the default arch idle method on errors. */
+	if (next_state < 0) {
+		default_idle_call();
+		return next_state;
+	}
+
+	/*
+	 * The idle task must be scheduled, it is pointless to go to idle, just
+	 * update no idle residency and return.
+	 */
+	if (current_clr_polling_and_test()) {
+		dev->last_residency = 0;
+		local_irq_enable();
+		return -EBUSY;
+	}
+
+	/*
+	 * Enter the idle state previously returned by the governor decision.
+	 * This function will block until an interrupt occurs and will take
+	 * care of re-enabling the local interrupts
+	 */
+	return cpuidle_enter(drv, dev, next_state);
+}
+
+/**
  * cpuidle_idle_call - the main idle function
  *
  * NOTE: no locks or semaphores should be used here
@@ -85,7 +130,6 @@ static void cpuidle_idle_call(void)
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
 	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
 	int next_state, entered_state;
-	bool reflect;
 
 	/*
 	 * Check if the idle task must be rescheduled. If it is the
@@ -109,8 +153,10 @@ static void cpuidle_idle_call(void)
 	 */
 	rcu_idle_enter();
 
-	if (cpuidle_not_available(drv, dev))
-		goto use_default;
+	if (cpuidle_not_available(drv, dev)) {
+		default_idle_call();
+		goto exit_idle;
+	}
 
 	/*
 	 * Suspend-to-idle ("freeze") is a system state in which all user space
@@ -128,52 +174,19 @@ static void cpuidle_idle_call(void)
 			goto exit_idle;
 		}
 
-		reflect = false;
 		next_state = cpuidle_find_deepest_state(drv, dev);
+		call_cpuidle(drv, dev, next_state);
 	} else {
-		reflect = true;
 		/*
 		 * Ask the cpuidle framework to choose a convenient idle state.
 		 */
 		next_state = cpuidle_select(drv, dev);
-	}
-	/* Fall back to the default arch idle method on errors. */
-	if (next_state < 0)
-		goto use_default;
-
-	/*
-	 * The idle task must be scheduled, it is pointless to
-	 * go to idle, just update no idle residency and get
-	 * out of this function
-	 */
-	if (current_clr_polling_and_test()) {
-		dev->last_residency = 0;
-		entered_state = next_state;
-		local_irq_enable();
-		goto exit_idle;
-	}
-
-	/* Take note of the planned idle state. */
-	idle_set_state(this_rq(), &drv->states[next_state]);
-
-	/*
-	 * Enter the idle state previously returned by the governor decision.
-	 * This function will block until an interrupt occurs and will take
-	 * care of re-enabling the local interrupts
-	 */
-	entered_state = cpuidle_enter(drv, dev, next_state);
-
-	/* The cpu is no longer idle or about to enter idle. */
-	idle_set_state(this_rq(), NULL);
-
-	if (entered_state == -EBUSY)
-		goto use_default;
-
-	/*
-	 * Give the governor an opportunity to reflect on the outcome
-	 */
-	if (reflect)
+		entered_state = call_cpuidle(drv, dev, next_state);
+		/*
+		 * Give the governor an opportunity to reflect on the outcome
+		 */
 		cpuidle_reflect(dev, entered_state);
+	}
 
 exit_idle:
 	__current_set_polling();
@@ -186,19 +199,6 @@ exit_idle:
 
 	rcu_idle_exit();
 	start_critical_timings();
-	return;
-
-use_default:
-	/*
-	 * We can't use the cpuidle framework, let's use the default
-	 * idle routine.
-	 */
-	if (current_clr_polling_and_test())
-		local_irq_enable();
-	else
-		arch_cpu_idle();
-
-	goto exit_idle;
 }
 
 DEFINE_PER_CPU(bool, cpu_dead_idle);
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe..ef7159012 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
 /*
- *  kernel/sched/proc.c
+ * kernel/sched/loadavg.c
  *
- *  Kernel load calculations, forked from sched/core.c
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. It's a silly number but people think it's important. We go through
+ * great pains to make it work on big machines and tickless kernels.
  */
 
 #include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
 	long nr_active, delta = 0;
 
 	nr_active = this_rq->nr_running;
-	nr_active += (long) this_rq->nr_uninterruptible;
+	nr_active += (long)this_rq->nr_uninterruptible;
 
 	if (nr_active != this_rq->calc_load_active) {
 		delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
 	delta = calc_load_fold_active(this_rq);
 	if (delta) {
 		int idx = calc_load_write_idx();
+
 		atomic_long_add(delta, &calc_load_idle[idx]);
 	}
 }
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
 {
 	unsigned long result = 1UL << frac_bits;
 
-	if (n) for (;;) {
-		if (n & 1) {
-			result *= x;
-			result += 1UL << (frac_bits - 1);
-			result >>= frac_bits;
+	if (n) {
+		for (;;) {
+			if (n & 1) {
+				result *= x;
+				result += 1UL << (frac_bits - 1);
+				result >>= frac_bits;
+			}
+			n >>= 1;
+			if (!n)
+				break;
+			x *= x;
+			x += 1UL << (frac_bits - 1);
+			x >>= frac_bits;
 		}
-		n >>= 1;
-		if (!n)
-			break;
-		x *= x;
-		x += 1UL << (frac_bits - 1);
-		x >>= frac_bits;
 	}
 
 	return result;
@@ -285,7 +290,6 @@ static unsigned long
 calc_load_n(unsigned long load, unsigned long exp,
 	    unsigned long active, unsigned int n)
 {
-
 	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
 }
 
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
  */
 void calc_global_load(unsigned long ticks)
 {
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
 }
 
 /*
- * Called from update_cpu_load() to periodically update this CPU's
+ * Called from scheduler_tick() to periodically update this CPU's
  * active count.
  */
-static void calc_load_account_active(struct rq *this_rq)
+void calc_global_load_tick(struct rq *this_rq)
 {
 	long delta;
 
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
 
 	this_rq->calc_load_update += LOAD_FREQ;
 }
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT		7
-static const unsigned char
-		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-					{0, 0, 0, 0, 0, 0, 0, 0},
-					{64, 32, 8, 0, 0, 0, 0, 0},
-					{96, 72, 40, 12, 1, 0, 0},
-					{112, 98, 75, 43, 15, 1, 0},
-					{120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-	int j = 0;
-
-	if (!missed_updates)
-		return load;
-
-	if (missed_updates >= degrade_zero_ticks[idx])
-		return 0;
-
-	if (idx == 1)
-		return load >> missed_updates;
-
-	while (missed_updates) {
-		if (missed_updates % 2)
-			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-		missed_updates >>= 1;
-		j++;
-	}
-	return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-			      unsigned long pending_updates)
-{
-	int i, scale;
-
-	this_rq->nr_load_updates++;
-
-	/* Update our load: */
-	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-		unsigned long old_load, new_load;
-
-		/* scale is effectively 1 << i now, and >> i divides by scale */
-
-		old_load = this_rq->cpu_load[i];
-		old_load = decay_load_missed(old_load, pending_updates - 1, i);
-		new_load = this_load;
-		/*
-		 * Round up the averaging division if load is increasing. This
-		 * prevents us from getting stuck on 9 if the load is 10, for
-		 * example.
-		 */
-		if (new_load > old_load)
-			new_load += scale - 1;
-
-		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-	}
-
-	sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-	return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-	return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long load = get_rq_runnable_load(this_rq);
-	unsigned long pending_updates;
-
-	/*
-	 * bail if there's load or we're actually up-to-date.
-	 */
-	if (load || curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
-	__update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
-	struct rq *this_rq = this_rq();
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long pending_updates;
-
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	raw_spin_lock(&this_rq->lock);
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	if (pending_updates) {
-		this_rq->last_load_update_tick = curr_jiffies;
-		/*
-		 * We were idle, this means load 0, the current load might be
-		 * !0 due to remote wakeups and the sort.
-		 */
-		__update_cpu_load(this_rq, 0, pending_updates);
-	}
-	raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
-	unsigned long load = get_rq_runnable_load(this_rq);
-	/*
-	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
-	 */
-	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, load, 1);
-
-	calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3..0d193a243 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -18,19 +18,22 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 {
 	struct rt_bandwidth *rt_b =
 		container_of(timer, struct rt_bandwidth, rt_period_timer);
-	ktime_t now;
-	int overrun;
 	int idle = 0;
+	int overrun;
 
+	raw_spin_lock(&rt_b->rt_runtime_lock);
 	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
 		if (!overrun)
 			break;
 
+		raw_spin_unlock(&rt_b->rt_runtime_lock);
 		idle = do_sched_rt_period_timer(rt_b, overrun);
+		raw_spin_lock(&rt_b->rt_runtime_lock);
 	}
+	if (idle)
+		rt_b->rt_period_active = 0;
+	raw_spin_unlock(&rt_b->rt_runtime_lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
@@ -52,11 +55,12 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
-	if (hrtimer_active(&rt_b->rt_period_timer))
-		return;
-
 	raw_spin_lock(&rt_b->rt_runtime_lock);
-	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+	if (!rt_b->rt_period_active) {
+		rt_b->rt_period_active = 1;
+		hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+	}
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
@@ -256,7 +260,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 #ifdef CONFIG_SMP
 
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
@@ -350,13 +354,23 @@ static inline int has_pushable_tasks(struct rq *rq)
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
+
+static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	/*
-	 * We detect this state here so that we can avoid taking the RQ
-	 * lock again later if there is no need to push
-	 */
-	rq->post_schedule = has_pushable_tasks(rq);
+	if (!has_pushable_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -408,12 +422,11 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 	return false;
 }
 
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
 {
-	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1323,7 +1336,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
 	 * If the current task on @p's runqueue is an RT task, then
@@ -1465,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	struct rt_rq *rt_rq = &rq->rt;
 
 	if (need_pull_rt_task(rq, prev)) {
+		/*
+		 * Unpinning rq->lock here is OK: current is on_cpu, which
+		 * avoids it being picked for load-balance; preemption/IRQs are
+		 * still disabled, avoiding further scheduler activity on it;
+		 * and we're being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_rt_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a dl or stop task can slip in, in which case we need
@@ -1493,7 +1514,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -1948,14 +1969,15 @@ static void push_irq_work_func(struct irq_work *work)
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
+	bool resched = false;
 	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1966,7 +1988,7 @@ static int pull_rt_task(struct rq *this_rq)
 #ifdef HAVE_RT_PUSH_IPI
 	if (sched_feat(RT_PUSH_IPI)) {
 		tell_cpu_to_push(this_rq);
-		return 0;
+		return;
 	}
 #endif
 
@@ -2019,7 +2041,7 @@ static int pull_rt_task(struct rq *this_rq)
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -2035,12 +2057,8 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
-}
-
-static void post_schedule_rt(struct rq *rq)
-{
-	push_rt_tasks(rq);
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -2136,8 +2154,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	if (pull_rt_task(rq))
-		resched_curr(rq);
+	queue_pull_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2158,8 +2175,6 @@ void __init init_sched_rt_class(void)
  */
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	/*
 	 * If we are already running, then there's nothing
 	 * that needs to be done. But if we are not running
@@ -2169,13 +2184,12 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
-		    /* Don't resched if we changed runqueues */
-		    push_rt_task(rq) && rq != task_rq(p))
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched && p->prio < rq->curr->prio)
+		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (p->prio < rq->curr->prio)
 			resched_curr(rq);
+#endif /* CONFIG_SMP */
 	}
 }
 
@@ -2196,14 +2210,13 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
-			pull_rt_task(rq);
+			queue_pull_task(rq);
+
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule. Note, the above pull_rt_task
-		 * can release the rq lock and p could migrate.
-		 * Only reschedule if p is still on the same runqueue.
+		 * then reschedule.
 		 */
-		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr)
 			resched_curr(rq);
 #else
 		/* For UP simply resched on drop of prio */
@@ -2314,7 +2327,6 @@ const struct sched_class rt_sched_class = {
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
-	.post_schedule		= post_schedule_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993..84d48790b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
 
+extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
 extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
 
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
@@ -131,6 +137,7 @@ struct rt_bandwidth {
 	ktime_t			rt_period;
 	u64			rt_runtime;
 	struct hrtimer		rt_period_timer;
+	unsigned int		rt_period_active;
 };
 
 void __dl_clear_params(struct task_struct *p);
@@ -215,7 +222,7 @@ struct cfs_bandwidth {
 	s64 hierarchical_quota;
 	u64 runtime_expires;
 
-	int idle, timer_active;
+	int idle, period_active;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
@@ -306,7 +313,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
 extern void free_rt_sched_group(struct task_group *tg);
@@ -617,9 +624,10 @@ struct rq {
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
 
+	struct callback_head *balance_callback;
+
 	unsigned char idle_balance;
 	/* For active balancing */
-	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;
@@ -707,7 +715,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 static inline u64 __rq_clock_broken(struct rq *rq)
 {
-	return ACCESS_ONCE(rq->clock);
+	return READ_ONCE(rq->clock);
 }
 
 static inline u64 rq_clock(struct rq *rq)
@@ -760,6 +768,21 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
 
 #ifdef CONFIG_SMP
 
+static inline void
+queue_balance_callback(struct rq *rq,
+		       struct callback_head *head,
+		       void (*func)(struct rq *rq))
+{
+	lockdep_assert_held(&rq->lock);	/* callers must hold rq->lock */
+
+	if (unlikely(head->next))	/* ->next set: presumably already queued */
+		return;
+	/* Store @func, cast to fit ->func, and push @head onto @rq's list. */
+	head->func = (void (*)(struct callback_head *))func;
+	head->next = rq->balance_callback;
+	rq->balance_callback = head;
+}
+
 extern void sched_ttwu_pending(void);
 
 #define rcu_dereference_check_sched_domain(p) \
@@ -1185,7 +1208,6 @@ struct sched_class {
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
-	void (*post_schedule) (struct rq *this_rq);
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 
@@ -1284,7 +1306,6 @@ extern void update_max_interval(void);
 extern void init_sched_dl_class(void);
 extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
-extern void init_sched_dl_class(void);
 
 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);
@@ -1298,8 +1319,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
 unsigned long to_ratio(u64 period, u64 runtime);
 
-extern void update_idle_cpu_load(struct rq *this_rq);
-
 extern void init_task_runnable_average(struct task_struct *p);
 
 static inline void add_nr_running(struct rq *rq, unsigned count)
@@ -1406,8 +1425,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
 
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -1421,8 +1438,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 
 		while (unlikely(task_on_rq_migrating(p)))
@@ -1459,8 +1478,10 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
 		 * If we observe the new cpu in task_rq_lock, the acquire will
 		 * pair with the WMB to ensure we must then also see migrating.
 		 */
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 
@@ -1472,6 +1493,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
 static inline void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1480,6 +1502,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
@@ -1666,9 +1689,22 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+#ifdef	CONFIG_SCHED_DEBUG
 extern void print_cfs_stats(struct seq_file *m, int cpu);
 extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void
+show_numa_stats(struct task_struct *p, struct seq_file *m);
+extern void
+print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+	unsigned long tpf, unsigned long gsf, unsigned long gpf);
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq);
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 7466a0bb2..87e2c9f0c 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,11 +4,7 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 
-#ifndef CONFIG_SCHED_BFS
 #include "sched.h"
-#else
-#include "bfs_sched.h"
-#endif
 
 /*
  * bump this up when changing the output format or the meaning of an existing
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339..b0fbc7632 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define schedstat_set(var, val)	do { } while (0)
 #endif
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 static inline void sched_info_reset_dequeued(struct task_struct *t)
 {
 	t->sched_info.last_queued = 0;
@@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq,
 #define sched_info_depart(rq, t)		do { } while (0)
 #define sched_info_arrive(rq, next)		do { } while (0)
 #define sched_info_switch(rq, t, next)		do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+#endif /* CONFIG_SCHED_INFO */
 
 /*
  * The following are functions that support scheduler-internal time accounting.
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
-	if (!cputimer->running)
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running))
 		return false;
 
 	/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.utime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
 }
 
 /**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.stime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
 }
 
 /**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
 }
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79..052e02672 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
 	 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
 	 * an event.
 	 */
-	set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+	smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
 
 	return timeout;
 }
@@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	 * doesn't imply write barrier and the users expects write
 	 * barrier semantics on wakeup functions.  The following
 	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-	 * and is paired with set_mb() in wait_woken().
+	 * and is paired with smp_store_mb() in wait_woken().
 	 */
 	smp_wmb(); /* C */
 	wait->flags |= WQ_FLAG_WOKEN;
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
 
 __sched int bit_wait_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
 
 __sched int bit_wait_io_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4f4402894..245df6b32 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -346,16 +346,13 @@ static inline void seccomp_sync_threads(void)
  */
 static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
-	struct seccomp_filter *filter;
-	unsigned long fp_size;
-	struct sock_filter *fp;
-	int new_len;
-	long ret;
+	struct seccomp_filter *sfilter;
+	int ret;
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
+
 	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
-	fp_size = fprog->len * sizeof(struct sock_filter);
 
 	/*
 	 * Installing a seccomp filter requires that the task has
@@ -368,60 +365,21 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 				     CAP_SYS_ADMIN) != 0)
 		return ERR_PTR(-EACCES);
 
-	fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
-	if (!fp)
-		return ERR_PTR(-ENOMEM);
-
-	/* Copy the instructions from fprog. */
-	ret = -EFAULT;
-	if (copy_from_user(fp, fprog->filter, fp_size))
-		goto free_prog;
-
-	/* Check and rewrite the fprog via the skb checker */
-	ret = bpf_check_classic(fp, fprog->len);
-	if (ret)
-		goto free_prog;
-
-	/* Check and rewrite the fprog for seccomp use */
-	ret = seccomp_check_filter(fp, fprog->len);
-	if (ret)
-		goto free_prog;
-
-	/* Convert 'sock_filter' insns to 'bpf_insn' insns */
-	ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
-	if (ret)
-		goto free_prog;
-
 	/* Allocate a new seccomp_filter */
-	ret = -ENOMEM;
-	filter = kzalloc(sizeof(struct seccomp_filter),
-			 GFP_KERNEL|__GFP_NOWARN);
-	if (!filter)
-		goto free_prog;
-
-	filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
-	if (!filter->prog)
-		goto free_filter;
-
-	ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
-	if (ret)
-		goto free_filter_prog;
-
-	kfree(fp);
-	atomic_set(&filter->usage, 1);
-	filter->prog->len = new_len;
+	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
+	if (!sfilter)
+		return ERR_PTR(-ENOMEM);
 
-	bpf_prog_select_runtime(filter->prog);
+	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
+					seccomp_check_filter);
+	if (ret < 0) {
+		kfree(sfilter);
+		return ERR_PTR(ret);
+	}
 
-	return filter;
+	atomic_set(&sfilter->usage, 1);
 
-free_filter_prog:
-	__bpf_prog_free(filter->prog);
-free_filter:
-	kfree(filter);
-free_prog:
-	kfree(fp);
-	return ERR_PTR(ret);
+	return sfilter;
 }
 
 /**
diff --git a/kernel/signal.c b/kernel/signal.c
index 0206be728..0f6bbbe77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
  * RETURNS:
  * %true if @mask is set, %false if made noop because @task was dying.
  */
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
 			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~JOBCTL_PENDING_MASK);
 
@@ -414,21 +414,16 @@ void flush_sigqueue(struct sigpending *queue)
 }
 
 /*
- * Flush all pending signals for a task.
+ * Flush all pending signals for this kthread.
  */
-void __flush_signals(struct task_struct *t)
-{
-	clear_tsk_thread_flag(t, TIF_SIGPENDING);
-	flush_sigqueue(&t->pending);
-	flush_sigqueue(&t->signal->shared_pending);
-}
-
 void flush_signals(struct task_struct *t)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&t->sighand->siglock, flags);
-	__flush_signals(t);
+	clear_tsk_thread_flag(t, TIF_SIGPENDING);
+	flush_sigqueue(&t->pending);
+	flush_sigqueue(&t->signal->shared_pending);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
@@ -2000,7 +1995,7 @@ static bool do_signal_stop(int signr)
 	struct signal_struct *sig = current->signal;
 
 	if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
-		unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+		unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
 		struct task_struct *t;
 
 		/* signr will be recorded in task->jobctl for retries */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index bdcc6c018..7c434c39f 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -173,7 +173,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 	if (tsk)
 		return 0;
 
-	td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu));
+	td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
 	if (!td)
 		return -ENOMEM;
 	td->cpu = cpu;
@@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu)
 
 	mutex_lock(&smpboot_threads_lock);
 	list_for_each_entry(cur, &hotplug_threads, list)
-		smpboot_unpark_thread(cur, cpu);
+		if (cpumask_test_cpu(cpu, cur->cpumask))
+			smpboot_unpark_thread(cur, cpu);
 	mutex_unlock(&smpboot_threads_lock);
 }
 
@@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 {
 	unsigned int cpu;
 
+	/* Unpark any threads that were voluntarily parked. */
+	for_each_cpu_not(cpu, ht->cpumask) {
+		if (cpu_online(cpu)) {
+			struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+			if (tsk)
+				kthread_unpark(tsk);
+		}
+	}
+
 	/* We need to destroy also the parked threads of offline cpus */
 	for_each_possible_cpu(cpu) {
 		struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 	unsigned int cpu;
 	int ret = 0;
 
+	if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+
 	get_online_cpus();
 	mutex_lock(&smpboot_threads_lock);
 	for_each_online_cpu(cpu) {
@@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
 	smpboot_destroy_threads(plug_thread);
 	mutex_unlock(&smpboot_threads_lock);
 	put_online_cpus();
+	free_cpumask_var(plug_thread->cpumask);
 }
 EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
 
+/**
+ * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
+ * @plug_thread:	Hotplug thread descriptor
+ * @new:		Revised mask to use
+ *
+ * The cpumask field in the smp_hotplug_thread must not be updated directly
+ * by the client, but only by calling this function, and only on a registered
+ * smp_hotplug_thread. Returns 0 on success, -ENOMEM if allocation fails.
+ */
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+					 const struct cpumask *new)
+{
+	struct cpumask *old = plug_thread->cpumask;
+	cpumask_var_t tmp;
+	unsigned int cpu;
+
+	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
+		return -ENOMEM;
+
+	get_online_cpus();
+	mutex_lock(&smpboot_threads_lock);
+
+	/* Park threads that were exclusively enabled on the old mask. */
+	cpumask_andnot(tmp, old, new);
+	for_each_cpu_and(cpu, tmp, cpu_online_mask)
+		smpboot_park_thread(plug_thread, cpu);
+
+	/* Unpark threads that are exclusively enabled on the new mask. */
+	cpumask_andnot(tmp, new, old);
+	for_each_cpu_and(cpu, tmp, cpu_online_mask)
+		smpboot_unpark_thread(plug_thread, cpu);
+
+	cpumask_copy(old, new);	/* old aliases plug_thread->cpumask */
+
+	mutex_unlock(&smpboot_threads_lock);
+	put_online_cpus();
+
+	free_cpumask_var(tmp);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
+
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
 
 /*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 263b0e1ad..fd643d8c4 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -41,8 +41,7 @@ struct cpu_stopper {
 };
 
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
-DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
-
+static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
 static bool stop_machine_initialized = false;
 
 /*
@@ -212,25 +211,6 @@ static int multi_cpu_stop(void *data)
 	return err;
 }
 
-struct irq_cpu_stop_queue_work_info {
-	int cpu1;
-	int cpu2;
-	struct cpu_stop_work *work1;
-	struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
-	struct irq_cpu_stop_queue_work_info *info = arg;
-	cpu_stop_queue_work(info->cpu1, info->work1);
-	cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
 /**
  * stop_two_cpus - stops two cpus
  * @cpu1: the cpu to stop
@@ -246,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 {
 	struct cpu_stop_done done;
 	struct cpu_stop_work work1, work2;
-	struct irq_cpu_stop_queue_work_info call_args;
 	struct multi_stop_data msdata;
 
 	preempt_disable();
@@ -263,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		.done = &done
 	};
 
-	call_args = (struct irq_cpu_stop_queue_work_info){
-		.cpu1 = cpu1,
-		.cpu2 = cpu2,
-		.work1 = &work1,
-		.work2 = &work2,
-	};
-
 	cpu_stop_init_done(&done, 2);
 	set_state(&msdata, MULTI_STOP_PREPARE);
 
@@ -286,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		return -ENOENT;
 	}
 
-	lg_local_lock(&stop_cpus_lock);
-	/*
-	 * Queuing needs to be done by the lowest numbered CPU, to ensure
-	 * that works are always queued in the same order on every CPU.
-	 * This prevents deadlocks.
-	 */
-	smp_call_function_single(min(cpu1, cpu2),
-				 &irq_cpu_stop_queue_work,
-				 &call_args, 1);
-	lg_local_unlock(&stop_cpus_lock);
+	lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+	cpu_stop_queue_work(cpu1, &work1);
+	cpu_stop_queue_work(cpu2, &work2);
+	lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
 	preempt_enable();
 
 	wait_for_completion(&done.completion);
diff --git a/kernel/sys.c b/kernel/sys.c
index a4e372b79..259fda25e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -92,10 +92,10 @@
 # define SET_TSC_CTL(a)		(-EINVAL)
 #endif
 #ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT(a)	(-EINVAL)
+# define MPX_ENABLE_MANAGEMENT()	(-EINVAL)
 #endif
 #ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT(a)	(-EINVAL)
+# define MPX_DISABLE_MANAGEMENT()	(-EINVAL)
 #endif
 #ifndef GET_FP_MODE
 # define GET_FP_MODE(a)		(-EINVAL)
@@ -1722,7 +1722,6 @@ exit_err:
 	goto exit;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
 /*
  * WARNING: we don't require any capability here so be very careful
  * in what is allowed for modification from userspace.
@@ -1818,6 +1817,7 @@ out:
 	return error;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
 static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
 {
 	struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
@@ -1902,10 +1902,41 @@ out:
 }
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
+static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
+			  unsigned long len)
+{
+	/*
+	 * The auxiliary vector is pinned to mm_struct and is never moved;
+	 * this only overwrites its first @len bytes with caller-supplied
+	 * data.  The entries are not validated, so supplying sane values is
+	 * the caller's job or userspace tools reading the vector may break.
+	 */
+	unsigned long auxv_buf[AT_VECTOR_SIZE];
+	const void __user *src = (const void __user *)addr;
+
+	BUILD_BUG_ON(sizeof(auxv_buf) != sizeof(mm->saved_auxv));
+
+	if (len > sizeof(auxv_buf))
+		return -EINVAL;
+	if (copy_from_user(auxv_buf, src, len))
+		return -EFAULT;
+
+	/* Zero the final two slots so the staging copy ends in AT_NULL */
+	auxv_buf[AT_VECTOR_SIZE - 1] = 0;
+	auxv_buf[AT_VECTOR_SIZE - 2] = 0;
+
+	task_lock(current);
+	memcpy(mm->saved_auxv, auxv_buf, len);
+	task_unlock(current);
+
+	return 0;
+}
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
 	struct mm_struct *mm = current->mm;
+	struct prctl_mm_map prctl_map;
 	struct vm_area_struct *vma;
 	int error;
 
@@ -1925,6 +1956,9 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (opt == PR_SET_MM_EXE_FILE)
 		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
+	if (opt == PR_SET_MM_AUXV)
+		return prctl_set_auxv(mm, addr, arg4);
+
 	if (addr >= TASK_SIZE || addr < mmap_min_addr)
 		return -EINVAL;
 
@@ -1933,42 +1967,64 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, addr);
 
+	prctl_map.start_code	= mm->start_code;
+	prctl_map.end_code	= mm->end_code;
+	prctl_map.start_data	= mm->start_data;
+	prctl_map.end_data	= mm->end_data;
+	prctl_map.start_brk	= mm->start_brk;
+	prctl_map.brk		= mm->brk;
+	prctl_map.start_stack	= mm->start_stack;
+	prctl_map.arg_start	= mm->arg_start;
+	prctl_map.arg_end	= mm->arg_end;
+	prctl_map.env_start	= mm->env_start;
+	prctl_map.env_end	= mm->env_end;
+	prctl_map.auxv		= NULL;
+	prctl_map.auxv_size	= 0;
+	prctl_map.exe_fd	= -1;
+
 	switch (opt) {
 	case PR_SET_MM_START_CODE:
-		mm->start_code = addr;
+		prctl_map.start_code = addr;
 		break;
 	case PR_SET_MM_END_CODE:
-		mm->end_code = addr;
+		prctl_map.end_code = addr;
 		break;
 	case PR_SET_MM_START_DATA:
-		mm->start_data = addr;
+		prctl_map.start_data = addr;
 		break;
 	case PR_SET_MM_END_DATA:
-		mm->end_data = addr;
+		prctl_map.end_data = addr;
+		break;
+	case PR_SET_MM_START_STACK:
+		prctl_map.start_stack = addr;
 		break;
-
 	case PR_SET_MM_START_BRK:
-		if (addr <= mm->end_data)
-			goto out;
-
-		if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
-				      mm->end_data, mm->start_data))
-			goto out;
-
-		mm->start_brk = addr;
+		prctl_map.start_brk = addr;
 		break;
-
 	case PR_SET_MM_BRK:
-		if (addr <= mm->end_data)
-			goto out;
-
-		if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
-				      mm->end_data, mm->start_data))
-			goto out;
-
-		mm->brk = addr;
+		prctl_map.brk = addr;
 		break;
+	case PR_SET_MM_ARG_START:
+		prctl_map.arg_start = addr;
+		break;
+	case PR_SET_MM_ARG_END:
+		prctl_map.arg_end = addr;
+		break;
+	case PR_SET_MM_ENV_START:
+		prctl_map.env_start = addr;
+		break;
+	case PR_SET_MM_ENV_END:
+		prctl_map.env_end = addr;
+		break;
+	default:
+		goto out;
+	}
+
+	error = validate_prctl_map(&prctl_map);
+	if (error)
+		goto out;
 
+	switch (opt) {
 	/*
 	 * If command line arguments and environment
 	 * are placed somewhere else on stack, we can
@@ -1985,52 +2041,20 @@ static int prctl_set_mm(int opt, unsigned long addr,
 			error = -EFAULT;
 			goto out;
 		}
-		if (opt == PR_SET_MM_START_STACK)
-			mm->start_stack = addr;
-		else if (opt == PR_SET_MM_ARG_START)
-			mm->arg_start = addr;
-		else if (opt == PR_SET_MM_ARG_END)
-			mm->arg_end = addr;
-		else if (opt == PR_SET_MM_ENV_START)
-			mm->env_start = addr;
-		else if (opt == PR_SET_MM_ENV_END)
-			mm->env_end = addr;
-		break;
-
-	/*
-	 * This doesn't move auxiliary vector itself
-	 * since it's pinned to mm_struct, but allow
-	 * to fill vector with new values. It's up
-	 * to a caller to provide sane values here
-	 * otherwise user space tools which use this
-	 * vector might be unhappy.
-	 */
-	case PR_SET_MM_AUXV: {
-		unsigned long user_auxv[AT_VECTOR_SIZE];
-
-		if (arg4 > sizeof(user_auxv))
-			goto out;
-		up_read(&mm->mmap_sem);
-
-		if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
-			return -EFAULT;
-
-		/* Make sure the last entry is always AT_NULL */
-		user_auxv[AT_VECTOR_SIZE - 2] = 0;
-		user_auxv[AT_VECTOR_SIZE - 1] = 0;
-
-		BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
-
-		task_lock(current);
-		memcpy(mm->saved_auxv, user_auxv, arg4);
-		task_unlock(current);
-
-		return 0;
-	}
-	default:
-		goto out;
 	}
 
+	mm->start_code	= prctl_map.start_code;
+	mm->end_code	= prctl_map.end_code;
+	mm->start_data	= prctl_map.start_data;
+	mm->end_data	= prctl_map.end_data;
+	mm->start_brk	= prctl_map.start_brk;
+	mm->brk		= prctl_map.brk;
+	mm->start_stack	= prctl_map.start_stack;
+	mm->arg_start	= prctl_map.arg_start;
+	mm->arg_end	= prctl_map.arg_end;
+	mm->env_start	= prctl_map.env_start;
+	mm->env_end	= prctl_map.env_end;
+
 	error = 0;
 out:
 	up_read(&mm->mmap_sem);
@@ -2230,12 +2254,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_MPX_ENABLE_MANAGEMENT:
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
-		error = MPX_ENABLE_MANAGEMENT(me);
+		error = MPX_ENABLE_MANAGEMENT();
 		break;
 	case PR_MPX_DISABLE_MANAGEMENT:
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
-		error = MPX_DISABLE_MANAGEMENT(me);
+		error = MPX_DISABLE_MANAGEMENT();
 		break;
 	case PR_SET_FP_MODE:
 		error = SET_FP_MODE(me, arg2);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7f45887fa..19b62b522 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -124,12 +124,7 @@ static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
-static int __maybe_unused one_hundred = 100;
-#ifdef CONFIG_SCHED_BFS
-extern int rr_interval;
-extern int sched_iso_cpu;
-static int __read_mostly one_thousand = 1000;
-#endif
+static int one_hundred = 100;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -264,7 +259,7 @@ static struct ctl_table sysctl_base_table[] = {
 	{ }
 };
 
-#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
+#ifdef CONFIG_SCHED_DEBUG
 static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
@@ -281,7 +276,6 @@ static int max_extfrag_threshold = 1000;
 #endif
 
 static struct ctl_table kern_table[] = {
-#ifndef CONFIG_SCHED_BFS
 	{
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
@@ -355,15 +349,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "timer_migration",
-		.data		= &sysctl_timer_migration,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
-	},
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
 	{
@@ -448,7 +433,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
-#endif /* !CONFIG_SCHED_BFS */
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
@@ -888,6 +872,13 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &one,
 	},
 	{
+		.procname	= "watchdog_cpumask",
+		.data		= &watchdog_cpumask_bits,
+		.maxlen		= NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= proc_watchdog_cpumask,
+	},
+	{
 		.procname	= "softlockup_panic",
 		.data		= &softlockup_panic,
 		.maxlen		= sizeof(int),
@@ -979,26 +970,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_SCHED_BFS
-	{
-		.procname	= "rr_interval",
-		.data		= &rr_interval,
-		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &one_thousand,
-	},
-	{
-		.procname	= "iso_cpu",
-		.data		= &sched_iso_cpu,
-		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one_hundred,
-	},
-#endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
 		.procname	= "spin_retry",
@@ -1159,6 +1130,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+	{
+		.procname	= "timer_migration",
+		.data		= &sysctl_timer_migration,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= timer_migration_handler,
+	},
+#endif
 	{ }
 };
 
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 7ceb68656..579ce1b92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -89,7 +89,7 @@ config NO_HZ_IDLE
 config NO_HZ_FULL
 	bool "Full dynticks system (tickless)"
 	# NO_HZ_COMMON dependency
-	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS
+	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
 	# We need at least one periodic CPU for timekeeping
 	depends on SMP
 	# RCU_USER_QS dependency
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 01f031241..49eca0bee 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -12,20 +12,3 @@ obj-$(CONFIG_TICK_ONESHOT)			+= tick-oneshot.o tick-sched.o
 obj-$(CONFIG_TIMER_STATS)			+= timer_stats.o
 obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o
 obj-$(CONFIG_TEST_UDELAY)			+= test_udelay.o
-
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE  $@
-      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
-	$(call if_changed,hzfile)
-
-quiet_cmd_bc  = BC      $@
-      cmd_bc  = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
-	$(call if_changed,bc)
-
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 1b001ed1e..7fbba635a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init);
  * @alarm: ptr to alarm to set
  * @start: time to run the alarm
  */
-int alarm_start(struct alarm *alarm, ktime_t start)
+void alarm_start(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 	unsigned long flags;
-	int ret;
 
 	spin_lock_irqsave(&base->lock, flags);
 	alarm->node.expires = start;
 	alarmtimer_enqueue(base, alarm);
-	ret = hrtimer_start(&alarm->timer, alarm->node.expires,
-				HRTIMER_MODE_ABS);
+	hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
 	spin_unlock_irqrestore(&base->lock, flags);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(alarm_start);
 
@@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start);
  * @alarm: ptr to alarm to set
  * @start: time relative to now to run the alarm
  */
-int alarm_start_relative(struct alarm *alarm, ktime_t start)
+void alarm_start_relative(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 
 	start = ktime_add(start, base->gettime());
-	return alarm_start(alarm, start);
+	alarm_start(alarm, start);
 }
 EXPORT_SYMBOL_GPL(alarm_start_relative);
 
@@ -495,12 +492,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
  */
 static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
-	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
-
 	if (!alarmtimer_get_rtcdev())
 		return -EINVAL;
 
-	return hrtimer_get_res(baseid, tp);
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
+	return 0;
 }
 
 /**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 637a09461..50eb107f1 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,8 +94,8 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 
-static int __clockevents_set_state(struct clock_event_device *dev,
-				   enum clock_event_state state)
+static int __clockevents_switch_state(struct clock_event_device *dev,
+				      enum clock_event_state state)
 {
 	/* Transition with legacy set_mode() callback */
 	if (dev->set_mode) {
@@ -120,19 +120,37 @@ static int __clockevents_set_state(struct clock_event_device *dev,
 		/* The clockevent device is getting replaced. Shut it down. */
 
 	case CLOCK_EVT_STATE_SHUTDOWN:
-		return dev->set_state_shutdown(dev);
+		if (dev->set_state_shutdown)
+			return dev->set_state_shutdown(dev);
+		return 0;
 
 	case CLOCK_EVT_STATE_PERIODIC:
 		/* Core internal bug */
 		if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
 			return -ENOSYS;
-		return dev->set_state_periodic(dev);
+		if (dev->set_state_periodic)
+			return dev->set_state_periodic(dev);
+		return 0;
 
 	case CLOCK_EVT_STATE_ONESHOT:
 		/* Core internal bug */
 		if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
 			return -ENOSYS;
-		return dev->set_state_oneshot(dev);
+		if (dev->set_state_oneshot)
+			return dev->set_state_oneshot(dev);
+		return 0;
+
+	case CLOCK_EVT_STATE_ONESHOT_STOPPED:
+		/* Core internal bug */
+		if (WARN_ONCE(!clockevent_state_oneshot(dev),
+			      "Current state: %d\n",
+			      clockevent_get_state(dev)))
+			return -EINVAL;
+
+		if (dev->set_state_oneshot_stopped)
+			return dev->set_state_oneshot_stopped(dev);
+		else
+			return -ENOSYS;
 
 	default:
 		return -ENOSYS;
@@ -140,26 +158,26 @@ static int __clockevents_set_state(struct clock_event_device *dev,
 }
 
 /**
- * clockevents_set_state - set the operating state of a clock event device
+ * clockevents_switch_state - set the operating state of a clock event device
  * @dev:	device to modify
  * @state:	new state
  *
  * Must be called with interrupts disabled !
  */
-void clockevents_set_state(struct clock_event_device *dev,
-			   enum clock_event_state state)
+void clockevents_switch_state(struct clock_event_device *dev,
+			      enum clock_event_state state)
 {
-	if (dev->state != state) {
-		if (__clockevents_set_state(dev, state))
+	if (clockevent_get_state(dev) != state) {
+		if (__clockevents_switch_state(dev, state))
 			return;
 
-		dev->state = state;
+		clockevent_set_state(dev, state);
 
 		/*
 		 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
 		 * on it, so fix it up and emit a warning:
 		 */
-		if (state == CLOCK_EVT_STATE_ONESHOT) {
+		if (clockevent_state_oneshot(dev)) {
 			if (unlikely(!dev->mult)) {
 				dev->mult = 1;
 				WARN_ON(1);
@@ -174,7 +192,7 @@ void clockevents_set_state(struct clock_event_device *dev,
  */
 void clockevents_shutdown(struct clock_event_device *dev)
 {
-	clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 	dev->next_event.tv64 = KTIME_MAX;
 }
 
@@ -248,7 +266,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
 		delta = dev->min_delta_ns;
 		dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-		if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+		if (clockevent_state_shutdown(dev))
 			return 0;
 
 		dev->retries++;
@@ -285,7 +303,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
 	delta = dev->min_delta_ns;
 	dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-	if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+	if (clockevent_state_shutdown(dev))
 		return 0;
 
 	dev->retries++;
@@ -317,9 +335,13 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 
 	dev->next_event = expires;
 
-	if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+	if (clockevent_state_shutdown(dev))
 		return 0;
 
+	/* We must be in ONESHOT state here */
+	WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
+		  clockevent_get_state(dev));
+
 	/* Shortcut for clockevent devices that can deal with ktime. */
 	if (dev->features & CLOCK_EVT_FEAT_KTIME)
 		return dev->set_next_ktime(expires, dev);
@@ -362,7 +384,7 @@ static int clockevents_replace(struct clock_event_device *ced)
 	struct clock_event_device *dev, *newdev = NULL;
 
 	list_for_each_entry(dev, &clockevent_devices, list) {
-		if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
+		if (dev == ced || !clockevent_state_detached(dev))
 			continue;
 
 		if (!tick_check_replacement(newdev, dev))
@@ -388,7 +410,7 @@ static int clockevents_replace(struct clock_event_device *ced)
 static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
 {
 	/* Fast track. Device is unused */
-	if (ced->state == CLOCK_EVT_STATE_DETACHED) {
+	if (clockevent_state_detached(ced)) {
 		list_del_init(&ced->list);
 		return 0;
 	}
@@ -445,7 +467,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev)
 	if (dev->set_mode) {
 		/* We shouldn't be supporting new modes now */
 		WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
-			dev->set_state_shutdown || dev->tick_resume);
+			dev->set_state_shutdown || dev->tick_resume ||
+			dev->set_state_oneshot_stopped);
 
 		BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
 		return 0;
@@ -454,18 +477,6 @@ static int clockevents_sanity_check(struct clock_event_device *dev)
 	if (dev->features & CLOCK_EVT_FEAT_DUMMY)
 		return 0;
 
-	/* New state-specific callbacks */
-	if (!dev->set_state_shutdown)
-		return -EINVAL;
-
-	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
-	    !dev->set_state_periodic)
-		return -EINVAL;
-
-	if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
-	    !dev->set_state_oneshot)
-		return -EINVAL;
-
 	return 0;
 }
 
@@ -480,7 +491,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 	BUG_ON(clockevents_sanity_check(dev));
 
 	/* Initialize state to DETACHED */
-	dev->state = CLOCK_EVT_STATE_DETACHED;
+	clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 
 	if (!dev->cpumask) {
 		WARN_ON(num_possible_cpus() > 1);
@@ -545,11 +556,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 {
 	clockevents_config(dev, freq);
 
-	if (dev->state == CLOCK_EVT_STATE_ONESHOT)
+	if (clockevent_state_oneshot(dev))
 		return clockevents_program_event(dev, dev->next_event, false);
 
-	if (dev->state == CLOCK_EVT_STATE_PERIODIC)
-		return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+	if (clockevent_state_periodic(dev))
+		return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
 
 	return 0;
 }
@@ -603,13 +614,13 @@ void clockevents_exchange_device(struct clock_event_device *old,
 	 */
 	if (old) {
 		module_put(old->owner);
-		clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
+		clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
 		list_del(&old->list);
 		list_add(&old->list, &clockevents_released);
 	}
 
 	if (new) {
-		BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
+		BUG_ON(!clockevent_state_detached(new));
 		clockevents_shutdown(new);
 	}
 }
@@ -622,7 +633,7 @@ void clockevents_suspend(void)
 	struct clock_event_device *dev;
 
 	list_for_each_entry_reverse(dev, &clockevent_devices, list)
-		if (dev->suspend)
+		if (dev->suspend && !clockevent_state_detached(dev))
 			dev->suspend(dev);
 }
 
@@ -634,7 +645,7 @@ void clockevents_resume(void)
 	struct clock_event_device *dev;
 
 	list_for_each_entry(dev, &clockevent_devices, list)
-		if (dev->resume)
+		if (dev->resume && !clockevent_state_detached(dev))
 			dev->resume(dev);
 }
 
@@ -665,7 +676,7 @@ void tick_cleanup_dead_cpu(int cpu)
 		if (cpumask_test_cpu(cpu, dev->cpumask) &&
 		    cpumask_weight(dev->cpumask) == 1 &&
 		    !tick_is_broadcast_device(dev)) {
-			BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
+			BUG_ON(!clockevent_state_detached(dev));
 			list_del(&dev->list);
 		}
 	}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 15facb1b9..841b72f72 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,6 +23,8 @@
  *   o Allow clocksource drivers to be unregistered
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/device.h>
 #include <linux/clocksource.h>
 #include <linux/init.h>
@@ -216,10 +218,11 @@ static void clocksource_watchdog(unsigned long data)
 
 		/* Check the deviation from the watchdog clocksource. */
 		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
-			pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
-			pr_warn("	'%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+			pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
+				cs->name);
+			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
 				watchdog->name, wdnow, wdlast, watchdog->mask);
-			pr_warn("	'%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+			pr_warn("                      '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
 				cs->name, csnow, cslast, cs->mask);
 			__clocksource_unstable(cs);
 			continue;
@@ -567,9 +570,8 @@ static void __clocksource_select(bool skipcur)
 		 */
 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
 			/* Override clocksource cannot be used. */
-			printk(KERN_WARNING "Override clocksource %s is not "
-			       "HRT compatible. Cannot switch while in "
-			       "HRT/NOHZ mode\n", cs->name);
+			pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
+				cs->name);
 			override_name[0] = 0;
 		} else
 			/* Override clocksource can be used. */
@@ -708,8 +710,8 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
 
 	clocksource_update_max_deferment(cs);
 
-	pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
-			cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
+	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
 }
 EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
 
@@ -1008,12 +1010,10 @@ __setup("clocksource=", boot_override_clocksource);
 static int __init boot_override_clock(char* str)
 {
 	if (!strcmp(str, "pmtmr")) {
-		printk("Warning: clock=pmtmr is deprecated. "
-			"Use clocksource=acpi_pm.\n");
+		pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
 		return boot_override_clocksource("acpi_pm");
 	}
-	printk("Warning! clock= boot option is deprecated. "
-		"Use clocksource=xyz\n");
+	pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
 	return boot_override_clocksource(str);
 }
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 93ef7190b..5c7ae4b64 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -66,33 +66,29 @@
  */
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
-
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
 			.index = HRTIMER_BASE_MONOTONIC,
 			.clockid = CLOCK_MONOTONIC,
 			.get_time = &ktime_get,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_REALTIME,
 			.clockid = CLOCK_REALTIME,
 			.get_time = &ktime_get_real,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_BOOTTIME,
 			.clockid = CLOCK_BOOTTIME,
 			.get_time = &ktime_get_boottime,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_TAI,
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
-			.resolution = KTIME_LOW_RES,
 		},
 	}
 };
@@ -109,27 +105,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 	return hrtimer_clock_to_base_table[clock_id];
 }
 
-
-/*
- * Get the coarse grained time at the softirq based on xtime and
- * wall_to_monotonic.
- */
-static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
-{
-	ktime_t xtim, mono, boot, tai;
-	ktime_t off_real, off_boot, off_tai;
-
-	mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
-	boot = ktime_add(mono, off_boot);
-	xtim = ktime_add(mono, off_real);
-	tai = ktime_add(mono, off_tai);
-
-	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
-	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
-	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
-	base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
-}
-
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
@@ -137,6 +112,18 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 #ifdef CONFIG_SMP
 
 /*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base	migration_cpu_base.clock_base[0]
+
+/*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
@@ -145,8 +132,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -156,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -190,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
 #endif
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+/* Use the nohz target CPU unless the timer is pinned or migration is off */
+static inline struct hrtimer_cpu_base *
+get_target_base(struct hrtimer_cpu_base *base, int pinned)
+{
+	if (!pinned && base->migration_enabled)
+		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+	return this_cpu_ptr(&hrtimer_bases);
+}
+#else
+/* This configuration never picks another CPU: always the local cpu base */
+static inline struct hrtimer_cpu_base *
+get_target_base(struct hrtimer_cpu_base *base, int pinned)
+{
+	return this_cpu_ptr(&hrtimer_bases);
+}
+#endif
+
 /*
  * Switch the timer base to the current CPU when possible.
  */
@@ -197,14 +202,13 @@ static inline struct hrtimer_clock_base *
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		    int pinned)
 {
+	struct hrtimer_cpu_base *new_cpu_base, *this_base;
 	struct hrtimer_clock_base *new_base;
-	struct hrtimer_cpu_base *new_cpu_base;
-	int this_cpu = smp_processor_id();
-	int cpu = get_nohz_timer_target(pinned);
 	int basenum = base->index;
 
+	this_base = this_cpu_ptr(&hrtimer_bases);
+	new_cpu_base = get_target_base(this_base, pinned);
 again:
-	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
 	new_base = &new_cpu_base->clock_base[basenum];
 
 	if (base != new_base) {
@@ -220,22 +224,24 @@ again:
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
-		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-			cpu = this_cpu;
+		if (new_cpu_base != this_base &&
+		    hrtimer_check_target(timer, new_base)) {
 			raw_spin_unlock(&new_base->cpu_base->lock);
 			raw_spin_lock(&base->cpu_base->lock);
+			new_cpu_base = this_base;
 			timer->base = base;
 			goto again;
 		}
 		timer->base = new_base;
 	} else {
-		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-			cpu = this_cpu;
+		if (new_cpu_base != this_base &&
+		    hrtimer_check_target(timer, new_base)) {
+			new_cpu_base = this_base;
 			goto again;
 		}
 	}
@@ -443,24 +449,35 @@ static inline void debug_deactivate(struct hrtimer *timer)
 }
 
 #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+					     struct hrtimer *timer)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+	cpu_base->next_timer = timer;
+#endif
+}
+
 static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 {
 	struct hrtimer_clock_base *base = cpu_base->clock_base;
 	ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
-	int i;
+	unsigned int active = cpu_base->active_bases;
 
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+	hrtimer_update_next_timer(cpu_base, NULL);
+	for (; active; base++, active >>= 1) {
 		struct timerqueue_node *next;
 		struct hrtimer *timer;
 
-		next = timerqueue_getnext(&base->active);
-		if (!next)
+		if (!(active & 0x01))
 			continue;
 
+		next = timerqueue_getnext(&base->active);
 		timer = container_of(next, struct hrtimer, node);
 		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-		if (expires.tv64 < expires_next.tv64)
+		if (expires.tv64 < expires_next.tv64) {
 			expires_next = expires;
+			hrtimer_update_next_timer(cpu_base, timer);
+		}
 	}
 	/*
 	 * clock_was_set() might have changed base->offset of any of
@@ -473,6 +490,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 }
 #endif
 
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+
+	return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+					    offs_real, offs_boot, offs_tai);
+}
+
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -480,6 +507,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
  * High resolution timer enabled ?
  */
 static int hrtimer_hres_enabled __read_mostly  = 1;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
 
 /*
  * Enable / Disable high resolution mode
@@ -508,9 +537,14 @@ static inline int hrtimer_is_hres_enabled(void)
 /*
  * Is the high resolution mode active ?
  */
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+{
+	return cpu_base->hres_active;
+}
+
 static inline int hrtimer_hres_active(void)
 {
-	return __this_cpu_read(hrtimer_bases.hres_active);
+	return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
 }
 
 /*
@@ -521,7 +555,12 @@ static inline int hrtimer_hres_active(void)
 static void
 hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 {
-	ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+	ktime_t expires_next;
+
+	if (!cpu_base->hres_active)
+		return;
+
+	expires_next = __hrtimer_get_next_event(cpu_base);
 
 	if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
 		return;
@@ -545,63 +584,53 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 	if (cpu_base->hang_detected)
 		return;
 
-	if (cpu_base->expires_next.tv64 != KTIME_MAX)
-		tick_program_event(cpu_base->expires_next, 1);
+	tick_program_event(cpu_base->expires_next, 1);
 }
 
 /*
- * Shared reprogramming for clock_realtime and clock_monotonic
- *
  * When a timer is enqueued and expires earlier than the already enqueued
  * timers, we have to check, whether it expires earlier than the timer for
  * which the clock event device was armed.
  *
- * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
- * and no expiry check happens. The timer gets enqueued into the rbtree. The
- * reprogramming and expiry check is done in the hrtimer_interrupt or in the
- * softirq.
- *
  * Called with interrupts disabled and base->cpu_base.lock held
  */
-static int hrtimer_reprogram(struct hrtimer *timer,
-			     struct hrtimer_clock_base *base)
+static void hrtimer_reprogram(struct hrtimer *timer,
+			      struct hrtimer_clock_base *base)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-	int res;
 
 	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
 
 	/*
-	 * When the callback is running, we do not reprogram the clock event
-	 * device. The timer callback is either running on a different CPU or
-	 * the callback is executed in the hrtimer_interrupt context. The
-	 * reprogramming is handled either by the softirq, which called the
-	 * callback or at the end of the hrtimer_interrupt.
+	 * If the timer is not on the current cpu, we cannot reprogram
+	 * the other cpu's clock event device.
 	 */
-	if (hrtimer_callback_running(timer))
-		return 0;
+	if (base->cpu_base != cpu_base)
+		return;
+
+	/*
+	 * If the hrtimer interrupt is running, then it will
+	 * reevaluate the clock bases and reprogram the clock event
+	 * device. The callbacks are always executed in hard interrupt
+	 * context so we don't need an extra check for a running
+	 * callback.
+	 */
+	if (cpu_base->in_hrtirq)
+		return;
 
 	/*
 	 * CLOCK_REALTIME timer might be requested with an absolute
-	 * expiry time which is less than base->offset. Nothing wrong
-	 * about that, just avoid to call into the tick code, which
-	 * has now objections against negative expiry values.
+	 * expiry time which is less than base->offset. Set it to 0.
 	 */
 	if (expires.tv64 < 0)
-		return -ETIME;
+		expires.tv64 = 0;
 
 	if (expires.tv64 >= cpu_base->expires_next.tv64)
-		return 0;
+		return;
 
-	/*
-	 * When the target cpu of the timer is currently executing
-	 * hrtimer_interrupt(), then we do not touch the clock event
-	 * device. hrtimer_interrupt() will reevaluate all clock bases
-	 * before reprogramming the device.
-	 */
-	if (cpu_base->in_hrtirq)
-		return 0;
+	/* Update the pointer to the next expiring timer */
+	cpu_base->next_timer = timer;
 
 	/*
 	 * If a hang was detected in the last timer interrupt then we
@@ -610,15 +639,14 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 	 * to make progress.
 	 */
 	if (cpu_base->hang_detected)
-		return 0;
+		return;
 
 	/*
-	 * Clockevents returns -ETIME, when the event was in the past.
+	 * Program the timer hardware. We enforce the expiry for
+	 * events which are already in the past.
 	 */
-	res = tick_program_event(expires, 0);
-	if (!IS_ERR_VALUE(res))
-		cpu_base->expires_next = expires;
-	return res;
+	cpu_base->expires_next = expires;
+	tick_program_event(expires, 1);
 }
 
 /*
@@ -630,15 +658,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 	base->hres_active = 0;
 }
 
-static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
-{
-	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
-	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
-	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-
-	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
-}
-
 /*
  * Retrigger next event is called after clock was set
  *
@@ -648,7 +667,7 @@ static void retrigger_next_event(void *arg)
 {
 	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
-	if (!hrtimer_hres_active())
+	if (!base->hres_active)
 		return;
 
 	raw_spin_lock(&base->lock);
@@ -662,29 +681,19 @@ static void retrigger_next_event(void *arg)
  */
 static int hrtimer_switch_to_hres(void)
 {
-	int i, cpu = smp_processor_id();
-	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
-	unsigned long flags;
-
-	if (base->hres_active)
-		return 1;
-
-	local_irq_save(flags);
+	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
 	if (tick_init_highres()) {
-		local_irq_restore(flags);
 		printk(KERN_WARNING "Could not switch to high resolution "
-				    "mode on CPU %d\n", cpu);
+				    "mode on CPU %d\n", base->cpu);
 		return 0;
 	}
 	base->hres_active = 1;
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
-		base->clock_base[i].resolution = KTIME_HIGH_RES;
+	hrtimer_resolution = HIGH_RES_NSEC;
 
 	tick_setup_sched_timer();
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
-	local_irq_restore(flags);
 	return 1;
 }
 
@@ -706,6 +715,7 @@ void clock_was_set_delayed(void)
 
 #else
 
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
 static inline int hrtimer_hres_active(void) { return 0; }
 static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
@@ -803,6 +813,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  *
  * Forward the timer expiry so it will expire in the future.
  * Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
  */
 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 {
@@ -814,8 +832,11 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 	if (delta.tv64 < 0)
 		return 0;
 
-	if (interval.tv64 < timer->base->resolution.tv64)
-		interval.tv64 = timer->base->resolution.tv64;
+	if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+		return 0;
+
+	if (interval.tv64 < hrtimer_resolution)
+		interval.tv64 = hrtimer_resolution;
 
 	if (unlikely(delta.tv64 >= interval.tv64)) {
 		s64 incr = ktime_to_ns(interval);
@@ -849,16 +870,11 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 {
 	debug_activate(timer);
 
-	timerqueue_add(&base->active, &timer->node);
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
-	return (&timer->node == base->active.next);
+	return timerqueue_add(&base->active, &timer->node);
 }
 
 /*
@@ -875,39 +891,38 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     struct hrtimer_clock_base *base,
 			     unsigned long newstate, int reprogram)
 {
-	struct timerqueue_node *next_timer;
-	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
-		goto out;
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+	unsigned int state = timer->state;
+
+	timer->state = newstate;
+	if (!(state & HRTIMER_STATE_ENQUEUED))
+		return;
+
+	if (!timerqueue_del(&base->active, &timer->node))
+		cpu_base->active_bases &= ~(1 << base->index);
 
-	next_timer = timerqueue_getnext(&base->active);
-	timerqueue_del(&base->active, &timer->node);
-	if (&timer->node == next_timer) {
 #ifdef CONFIG_HIGH_RES_TIMERS
-		/* Reprogram the clock event device. if enabled */
-		if (reprogram && hrtimer_hres_active()) {
-			ktime_t expires;
-
-			expires = ktime_sub(hrtimer_get_expires(timer),
-					    base->offset);
-			if (base->cpu_base->expires_next.tv64 == expires.tv64)
-				hrtimer_force_reprogram(base->cpu_base, 1);
-		}
+	/*
+	 * Note: If reprogram is false we do not update
+	 * cpu_base->next_timer. This happens when we remove the first
+	 * timer on a remote cpu. No harm as we never dereference
+	 * cpu_base->next_timer. So the worst thing that can happen is
+	 * a superfluous call to hrtimer_force_reprogram() on the
+	 * remote cpu later on if the same timer gets enqueued again.
+	 */
+	if (reprogram && timer == cpu_base->next_timer)
+		hrtimer_force_reprogram(cpu_base, 1);
 #endif
-	}
-	if (!timerqueue_getnext(&base->active))
-		base->cpu_base->active_bases &= ~(1 << base->index);
-out:
-	timer->state = newstate;
 }
 
 /*
  * remove hrtimer, called with base lock held
  */
 static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
 {
 	if (hrtimer_is_queued(timer)) {
-		unsigned long state;
+		unsigned long state = timer->state;
 		int reprogram;
 
 		/*
@@ -921,30 +936,35 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-		/*
-		 * We must preserve the CALLBACK state flag here,
-		 * otherwise we could move the timer base in
-		 * switch_hrtimer_base.
-		 */
-		state = timer->state & HRTIMER_STATE_CALLBACK;
+
+		if (!restart)
+			state = HRTIMER_STATE_INACTIVE;
+
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
 	return 0;
 }
 
-int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-		unsigned long delta_ns, const enum hrtimer_mode mode,
-		int wakeup)
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
+ * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL)
+ */
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			    unsigned long delta_ns, const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
-	int ret, leftmost;
+	int leftmost;
 
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	ret = remove_hrtimer(timer, base);
+	remove_hrtimer(timer, base, true);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -956,7 +976,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * timeouts. This will go away with the GTOD framework.
 		 */
 #ifdef CONFIG_TIME_LOW_RES
-		tim = ktime_add_safe(tim, base->resolution);
+		tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
 #endif
 	}
 
@@ -968,85 +988,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	timer_stats_hrtimer_set_start_info(timer);
 
 	leftmost = enqueue_hrtimer(timer, new_base);
-
-	if (!leftmost) {
-		unlock_hrtimer_base(timer, &flags);
-		return ret;
-	}
+	if (!leftmost)
+		goto unlock;
 
 	if (!hrtimer_is_hres_active(timer)) {
 		/*
 		 * Kick to reschedule the next tick to handle the new timer
 		 * on dynticks target.
 		 */
-		wake_up_nohz_cpu(new_base->cpu_base->cpu);
-	} else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
-			hrtimer_reprogram(timer, new_base)) {
-		/*
-		 * Only allow reprogramming if the new base is on this CPU.
-		 * (it might still be on another CPU if the timer was pending)
-		 *
-		 * XXX send_remote_softirq() ?
-		 */
-		if (wakeup) {
-			/*
-			 * We need to drop cpu_base->lock to avoid a
-			 * lock ordering issue vs. rq->lock.
-			 */
-			raw_spin_unlock(&new_base->cpu_base->lock);
-			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-			local_irq_restore(flags);
-			return ret;
-		} else {
-			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-		}
+		if (new_base->cpu_base->nohz_active)
+			wake_up_nohz_cpu(new_base->cpu_base->cpu);
+	} else {
+		hrtimer_reprogram(timer, new_base);
 	}
-
+unlock:
 	unlock_hrtimer_base(timer, &flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
-
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @delta_ns:	"slack" range for the timer
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-		unsigned long delta_ns, const enum hrtimer_mode mode)
-{
-	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
 }
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
 /**
- * hrtimer_start - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
-{
-	return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
-}
-EXPORT_SYMBOL_GPL(hrtimer_start);
-
-
-/**
  * hrtimer_try_to_cancel - try to deactivate a timer
  * @timer:	hrtimer to stop
  *
@@ -1062,10 +1022,19 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 	unsigned long flags;
 	int ret = -1;
 
+	/*
+	 * Check lockless first. If the timer is not active (neither
+	 * enqueued nor running the callback), nothing to do here.  The
+	 * base lock does not serialize against a concurrent enqueue,
+	 * so we can avoid taking it.
+	 */
+	if (!hrtimer_active(timer))
+		return 0;
+
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
-		ret = remove_hrtimer(timer, base);
+		ret = remove_hrtimer(timer, base, false);
 
 	unlock_hrtimer_base(timer, &flags);
 
@@ -1115,26 +1084,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 /**
  * hrtimer_get_next_event - get the time until next expiry event
  *
- * Returns the delta to the next expiry event or KTIME_MAX if no timer
- * is pending.
+ * Returns the next expiry time or KTIME_MAX if no timer is pending.
  */
-ktime_t hrtimer_get_next_event(void)
+u64 hrtimer_get_next_event(void)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	ktime_t mindelta = { .tv64 = KTIME_MAX };
+	u64 expires = KTIME_MAX;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 
-	if (!hrtimer_hres_active())
-		mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
-				     ktime_get());
+	if (!__hrtimer_hres_active(cpu_base))
+		expires = __hrtimer_get_next_event(cpu_base).tv64;
 
 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 
-	if (mindelta.tv64 < 0)
-		mindelta.tv64 = 0;
-	return mindelta;
+	return expires;
 }
 #endif
 
@@ -1176,37 +1141,73 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
-/**
- * hrtimer_get_res - get the timer resolution for a clock
- * @which_clock: which clock to query
- * @tp:		 pointer to timespec variable to store the resolution
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
  *
- * Store the resolution of the clock selected by @which_clock in the
- * variable pointed to by @tp.
+ * It is important for this function to not return a false negative.
  */
-int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+bool hrtimer_active(const struct hrtimer *timer)
 {
 	struct hrtimer_cpu_base *cpu_base;
-	int base = hrtimer_clockid_to_base(which_clock);
+	unsigned int seq;
 
-	cpu_base = raw_cpu_ptr(&hrtimer_bases);
-	*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
+	do {
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seq = raw_read_seqcount_begin(&cpu_base->seq);
 
-	return 0;
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			return true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return false;
 }
-EXPORT_SYMBOL_GPL(hrtimer_get_res);
+EXPORT_SYMBOL_GPL(hrtimer_active);
 
-static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ *  - queued:	the timer is queued
+ *  - callback:	the timer is being run
+ *  - post:	the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consecutive
+ * __run_hrtimer() invocations.
+ */
+
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+			  struct hrtimer_clock_base *base,
+			  struct hrtimer *timer, ktime_t *now)
 {
-	struct hrtimer_clock_base *base = timer->base;
-	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	cpu_base->running = timer;
+
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1222,58 +1223,43 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
+	 *
+	 * Note: Because we dropped the cpu_base->lock above,
+	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
+	 * for us already.
 	 */
-	if (restart != HRTIMER_NORESTART) {
-		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+	if (restart != HRTIMER_NORESTART &&
+	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
-	}
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
 
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer interrupt
- * Called with interrupts disabled
- */
-void hrtimer_interrupt(struct clock_event_device *dev)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 {
-	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	ktime_t expires_next, now, entry_time, delta;
-	int i, retries = 0;
-
-	BUG_ON(!cpu_base->hres_active);
-	cpu_base->nr_events++;
-	dev->next_event.tv64 = KTIME_MAX;
-
-	raw_spin_lock(&cpu_base->lock);
-	entry_time = now = hrtimer_update_base(cpu_base);
-retry:
-	cpu_base->in_hrtirq = 1;
-	/*
-	 * We set expires_next to KTIME_MAX here with cpu_base->lock
-	 * held to prevent that a timer is enqueued in our queue via
-	 * the migration code. This does not affect enqueueing of
-	 * timers which run their callback and need to be requeued on
-	 * this CPU.
-	 */
-	cpu_base->expires_next.tv64 = KTIME_MAX;
+	struct hrtimer_clock_base *base = cpu_base->clock_base;
+	unsigned int active = cpu_base->active_bases;
 
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		struct hrtimer_clock_base *base;
+	for (; active; base++, active >>= 1) {
 		struct timerqueue_node *node;
 		ktime_t basenow;
 
-		if (!(cpu_base->active_bases & (1 << i)))
+		if (!(active & 0x01))
 			continue;
 
-		base = cpu_base->clock_base + i;
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = timerqueue_getnext(&base->active))) {
@@ -1296,9 +1282,42 @@ retry:
 			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
 				break;
 
-			__run_hrtimer(timer, &basenow);
+			__run_hrtimer(cpu_base, base, timer, &basenow);
 		}
 	}
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t expires_next, now, entry_time, delta;
+	int retries = 0;
+
+	BUG_ON(!cpu_base->hres_active);
+	cpu_base->nr_events++;
+	dev->next_event.tv64 = KTIME_MAX;
+
+	raw_spin_lock(&cpu_base->lock);
+	entry_time = now = hrtimer_update_base(cpu_base);
+retry:
+	cpu_base->in_hrtirq = 1;
+	/*
+	 * We set expires_next to KTIME_MAX here with cpu_base->lock
+	 * held to prevent that a timer is enqueued in our queue via
+	 * the migration code. This does not affect enqueueing of
+	 * timers which run their callback and need to be requeued on
+	 * this CPU.
+	 */
+	cpu_base->expires_next.tv64 = KTIME_MAX;
+
+	__hrtimer_run_queues(cpu_base, now);
+
 	/* Reevaluate the clock bases for the next expiry */
 	expires_next = __hrtimer_get_next_event(cpu_base);
 	/*
@@ -1310,8 +1329,7 @@ retry:
 	raw_spin_unlock(&cpu_base->lock);
 
 	/* Reprogramming necessary ? */
-	if (expires_next.tv64 == KTIME_MAX ||
-	    !tick_program_event(expires_next, 0)) {
+	if (!tick_program_event(expires_next, 0)) {
 		cpu_base->hang_detected = 0;
 		return;
 	}
@@ -1344,8 +1362,8 @@ retry:
 	cpu_base->hang_detected = 1;
 	raw_spin_unlock(&cpu_base->lock);
 	delta = ktime_sub(now, entry_time);
-	if (delta.tv64 > cpu_base->max_hang_time.tv64)
-		cpu_base->max_hang_time = delta;
+	if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+		cpu_base->max_hang_time = (unsigned int) delta.tv64;
 	/*
 	 * Limit it to a sensible value as we enforce a longer
 	 * delay. Give the CPU at least 100ms to catch up.
@@ -1363,7 +1381,7 @@ retry:
  * local version of hrtimer_peek_ahead_timers() called with interrupts
  * disabled.
  */
-static void __hrtimer_peek_ahead_timers(void)
+static inline void __hrtimer_peek_ahead_timers(void)
 {
 	struct tick_device *td;
 
@@ -1375,29 +1393,6 @@ static void __hrtimer_peek_ahead_timers(void)
 		hrtimer_interrupt(td->evtdev);
 }
 
-/**
- * hrtimer_peek_ahead_timers -- run soft-expired timers now
- *
- * hrtimer_peek_ahead_timers will peek at the timer queue of
- * the current cpu and check if there are any timers for which
- * the soft expires time has passed. If any such timers exist,
- * they are run immediately and then removed from the timer queue.
- *
- */
-void hrtimer_peek_ahead_timers(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__hrtimer_peek_ahead_timers();
-	local_irq_restore(flags);
-}
-
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
-	hrtimer_peek_ahead_timers();
-}
-
 #else /* CONFIG_HIGH_RES_TIMERS */
 
 static inline void __hrtimer_peek_ahead_timers(void) { }
@@ -1405,66 +1400,32 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
 #endif	/* !CONFIG_HIGH_RES_TIMERS */
 
 /*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
+ * Called from run_local_timers in hardirq context every jiffy
  */
-void hrtimer_run_pending(void)
+void hrtimer_run_queues(void)
 {
-	if (hrtimer_hres_active())
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t now;
+
+	if (__hrtimer_hres_active(cpu_base))
 		return;
 
 	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
+	 * This _is_ ugly: We have to check periodically, whether we
+	 * can switch to highres and / or nohz mode. The clocksource
+	 * switch happens with xtime_lock held. Notification from
+	 * there only sets the check bit in the tick_oneshot code,
+	 * otherwise we might deadlock vs. xtime_lock.
 	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
 		hrtimer_switch_to_hres();
-}
-
-/*
- * Called from hardirq context every jiffy
- */
-void hrtimer_run_queues(void)
-{
-	struct timerqueue_node *node;
-	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	struct hrtimer_clock_base *base;
-	int index, gettime = 1;
-
-	if (hrtimer_hres_active())
 		return;
-
-	for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
-		base = &cpu_base->clock_base[index];
-		if (!timerqueue_getnext(&base->active))
-			continue;
-
-		if (gettime) {
-			hrtimer_get_softirq_time(cpu_base);
-			gettime = 0;
-		}
-
-		raw_spin_lock(&cpu_base->lock);
-
-		while ((node = timerqueue_getnext(&base->active))) {
-			struct hrtimer *timer;
-
-			timer = container_of(node, struct hrtimer, node);
-			if (base->softirq_time.tv64 <=
-					hrtimer_get_expires_tv64(timer))
-				break;
-
-			__run_hrtimer(timer, &base->softirq_time);
-		}
-		raw_spin_unlock(&cpu_base->lock);
 	}
+
+	raw_spin_lock(&cpu_base->lock);
+	now = hrtimer_update_base(cpu_base);
+	__hrtimer_run_queues(cpu_base, now);
+	raw_spin_unlock(&cpu_base->lock);
 }
 
 /*
@@ -1497,8 +1458,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start_expires(&t->timer, mode);
-		if (!hrtimer_active(&t->timer))
-			t->task = NULL;
 
 		if (likely(t->task))
 			freezable_schedule();
@@ -1642,11 +1601,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		debug_deactivate(timer);
 
 		/*
-		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1657,9 +1616,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 
@@ -1731,9 +1687,6 @@ void __init hrtimers_init(void)
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
-	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
 }
 
 /**
@@ -1772,8 +1725,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
 	hrtimer_init_sleeper(&t, current);
 
 	hrtimer_start_expires(&t.timer, mode);
-	if (!hrtimer_active(&t.timer))
-		t.task = NULL;
 
 	if (likely(t.task))
 		schedule();
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7a6810030..fb4d98c7f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -35,6 +35,7 @@ unsigned long			tick_nsec;
 static u64			tick_length;
 static u64			tick_length_base;
 
+#define SECS_PER_DAY		86400
 #define MAX_TICKADJ		500LL		/* usecs */
 #define MAX_TICKADJ_SCALED \
 	(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -76,6 +77,9 @@ static long			time_adjust;
 /* constant (boot-param configurable) NTP tick adjustment (upscaled)	*/
 static s64			ntp_tick_adj;
 
+/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
+static time64_t			ntp_next_leap_sec = TIME64_MAX;
+
 #ifdef CONFIG_NTP_PPS
 
 /*
@@ -349,6 +353,7 @@ void ntp_clear(void)
 	tick_length	= tick_length_base;
 	time_offset	= 0;
 
+	ntp_next_leap_sec = TIME64_MAX;
 	/* Clear PPS state variables */
 	pps_clear();
 }
@@ -359,6 +364,21 @@ u64 ntp_tick_length(void)
 	return tick_length;
 }
 
+/**
+ * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ *
+ * Provides the time of the next leapsecond against CLOCK_REALTIME in
+ * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ */
+ktime_t ntp_get_next_leap(void)
+{
+	ktime_t ret;
+
+	if ((time_state == TIME_INS) && (time_status & STA_INS))
+		return ktime_set(ntp_next_leap_sec, 0);
+	ret.tv64 = KTIME_MAX;
+	return ret;
+}
 
 /*
  * this routine handles the overflow of the microsecond field
@@ -382,15 +402,21 @@ int second_overflow(unsigned long secs)
 	 */
 	switch (time_state) {
 	case TIME_OK:
-		if (time_status & STA_INS)
+		if (time_status & STA_INS) {
 			time_state = TIME_INS;
-		else if (time_status & STA_DEL)
+			ntp_next_leap_sec = secs + SECS_PER_DAY -
+						(secs % SECS_PER_DAY);
+		} else if (time_status & STA_DEL) {
 			time_state = TIME_DEL;
+			ntp_next_leap_sec = secs + SECS_PER_DAY -
+						 ((secs+1) % SECS_PER_DAY);
+		}
 		break;
 	case TIME_INS:
-		if (!(time_status & STA_INS))
+		if (!(time_status & STA_INS)) {
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_OK;
-		else if (secs % 86400 == 0) {
+		} else if (secs % SECS_PER_DAY == 0) {
 			leap = -1;
 			time_state = TIME_OOP;
 			printk(KERN_NOTICE
@@ -398,19 +424,21 @@ int second_overflow(unsigned long secs)
 		}
 		break;
 	case TIME_DEL:
-		if (!(time_status & STA_DEL))
+		if (!(time_status & STA_DEL)) {
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_OK;
-		else if ((secs + 1) % 86400 == 0) {
+		} else if ((secs + 1) % SECS_PER_DAY == 0) {
 			leap = 1;
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_WAIT;
 			printk(KERN_NOTICE
 				"Clock: deleting leap second 23:59:59 UTC\n");
 		}
 		break;
 	case TIME_OOP:
+		ntp_next_leap_sec = TIME64_MAX;
 		time_state = TIME_WAIT;
 		break;
-
 	case TIME_WAIT:
 		if (!(time_status & (STA_INS | STA_DEL)))
 			time_state = TIME_OK;
@@ -547,6 +575,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
 	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
 		time_state = TIME_OK;
 		time_status = STA_UNSYNC;
+		ntp_next_leap_sec = TIME64_MAX;
 		/* restart PPS frequency calibration */
 		pps_reset_freq_interval();
 	}
@@ -711,6 +740,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
 	if (!(time_status & STA_NANO))
 		txc->time.tv_usec /= NSEC_PER_USEC;
 
+	/* Handle leapsec adjustments */
+	if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
+		if ((time_state == TIME_INS) && (time_status & STA_INS)) {
+			result = TIME_OOP;
+			txc->tai++;
+			txc->time.tv_sec--;
+		}
+		if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
+			result = TIME_WAIT;
+			txc->tai--;
+			txc->time.tv_sec++;
+		}
+		if ((time_state == TIME_OOP) &&
+					(ts->tv_sec == ntp_next_leap_sec)) {
+			result = TIME_WAIT;
+		}
+	}
+
 	return result;
 }
 
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index bbd102ad9..65430504c 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -5,6 +5,7 @@ extern void ntp_init(void);
 extern void ntp_clear(void);
 /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
 extern u64 ntp_tick_length(void);
+extern ktime_t ntp_get_next_leap(void);
 extern int second_overflow(unsigned long secs);
 extern int ntp_validate_timex(struct timex *);
 extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0ac829b48..892e3dae0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
+	u64 curr_cputime;
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}
 
-	if (b->stime > a->stime)
-		a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
+	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
+	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_times", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+					 struct task_cputime_atomic *atomic_times)
+{
+	times->utime = atomic64_read(&atomic_times->utime);
+	times->stime = atomic64_read(&atomic_times->stime);
+	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;
 
-	if (!cputimer->running) {
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running)) {
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
-		 * to synchronize the timer to the clock every time we start
-		 * it.
+		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+		/*
+		 * We're setting cputimer->running without a lock. Ensure
+		 * this only gets written to in one operation. We set
+		 * running after update_gt_cputime() as a small optimization,
+		 * but barriers are not required because update_gt_cputime()
+		 * can handle concurrent updates.
+		 */
+		WRITE_ONCE(cputimer->running, 1);
+	}
+	sample_cputime_atomic(times, &cputimer->cputime_atomic);
 }
 
 /*
@@ -425,7 +448,7 @@ static void cleanup_timers(struct list_head *head)
  */
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
-	add_device_randomness((const void*) &tsk_seruntime(tsk),
+	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
 						sizeof(unsigned long long));
 	cleanup_timers(tsk->cpu_timers);
 
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
 	if (!task_cputime_zero(&tsk->cputime_expires))
 		return false;
 
-	if (tsk->signal->cputimer.running)
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		return false;
 
 	return true;
@@ -847,18 +871,18 @@ static void check_thread_timers(struct task_struct *tsk,
 	tsk_expires->virt_exp = expires_to_cputime(expires);
 
 	tsk_expires->sched_exp = check_timers_list(++timers, firing,
-						   tsk_seruntime(tsk));
+						   tsk->se.sum_exec_runtime);
 
 	/*
 	 * Check for the special case thread timers.
 	 */
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
-		    tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the hard limit, we just die.
 			 * No need to calculate anything else now.
@@ -866,7 +890,7 @@ static void check_thread_timers(struct task_struct *tsk,
 			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
 			return;
 		}
-		if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the soft limit, send a SIGXCPU every second.
 			 */
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	/* Turn off cputimer->running. This is done without locking. */
+	WRITE_ONCE(cputimer->running, 0);
 }
 
 static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
 			 SIGPROF);
 	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
 			 SIGVTALRM);
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long psecs = cputime_to_secs(ptime);
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
 		cputime_t x;
 		if (psecs >= hard) {
 			/*
@@ -1103,7 +1125,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 		struct task_cputime task_sample = {
 			.utime = utime,
 			.stime = stime,
-			.sum_exec_runtime = tsk_seruntime(tsk)
+			.sum_exec_runtime = tsk->se.sum_exec_runtime
 		};
 
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}
 
 	sig = tsk->signal;
-	if (sig->cputimer.running) {
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(sig->cputimer.running)) {
 		struct task_cputime group_sample;
 
-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * If there are any active process wide timers (POSIX 1.b, itimers,
 	 * RLIMIT_CPU) cputimer must be running.
 	 */
-	if (tsk->signal->cputimer.running)
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		check_process_timers(tsk, &firing);
 
 	/*
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31ea01f42..31d11ac9f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+{
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
+	return 0;
+}
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
 static __init int init_posix_timers(void)
 {
 	struct k_clock clock_realtime = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_clock_realtime_get,
 		.clock_set	= posix_clock_realtime_set,
 		.clock_adj	= posix_clock_realtime_adj,
@@ -290,7 +297,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_ktime_get_ts,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -300,7 +307,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic_raw = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_monotonic_raw,
 	};
 	struct k_clock clock_realtime_coarse = {
@@ -312,7 +319,7 @@ static __init int init_posix_timers(void)
 		.clock_get	= posix_get_monotonic_coarse,
 	};
 	struct k_clock clock_tai = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_tai,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -322,7 +329,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_boottime = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_boottime,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 6aac4beed..3e7db49a2 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -22,6 +22,7 @@ static void bc_set_mode(enum clock_event_mode mode,
 			struct clock_event_device *bc)
 {
 	switch (mode) {
+	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		/*
 		 * Note, we cannot cancel the timer here as we might
@@ -66,9 +67,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
 	 * hrtimer_{start/cancel} functions call into tracing,
 	 * calls to these functions must be bound within RCU_NONIDLE.
 	 */
-	RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
-		!hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
-			0);
+	RCU_NONIDLE({
+			bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
+			if (bc_moved)
+				hrtimer_start(&bctimer, expires,
+					      HRTIMER_MODE_ABS_PINNED);});
 	if (bc_moved) {
 		/* Bind the "device" to the cpu */
 		bc->bound_on = smp_processor_id();
@@ -99,10 +102,13 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
 {
 	ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
 
-	if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+	switch (ce_broadcast_hrtimer.mode) {
+	case CLOCK_EVT_MODE_ONESHOT:
+		if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
+			return HRTIMER_RESTART;
+	default:
 		return HRTIMER_NORESTART;
-
-	return HRTIMER_RESTART;
+	}
 }
 
 void tick_setup_hrtimer_broadcast(void)
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7e8ca4f44..f6aae7977 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -159,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 {
 	struct clock_event_device *bc = tick_broadcast_device.evtdev;
 	unsigned long flags;
-	int ret;
+	int ret = 0;
 
 	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 
@@ -221,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 			 * If we kept the cpu in the broadcast mask,
 			 * tell the caller to leave the per cpu device
 			 * in shutdown state. The periodic interrupt
-			 * is delivered by the broadcast device.
+			 * is delivered by the broadcast device, if
+			 * the broadcast device exists and is not
+			 * hrtimer based.
 			 */
-			ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
+			if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+				ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
 			break;
 		default:
-			/* Nothing to do */
-			ret = 0;
 			break;
 		}
 	}
@@ -255,18 +256,32 @@ int tick_receive_broadcast(void)
 /*
  * Broadcast the event to the cpus, which are set in the mask (mangled).
  */
-static void tick_do_broadcast(struct cpumask *mask)
+static bool tick_do_broadcast(struct cpumask *mask)
 {
 	int cpu = smp_processor_id();
 	struct tick_device *td;
+	bool local = false;
 
 	/*
 	 * Check, if the current cpu is in the mask
 	 */
 	if (cpumask_test_cpu(cpu, mask)) {
+		struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
 		cpumask_clear_cpu(cpu, mask);
-		td = &per_cpu(tick_cpu_device, cpu);
-		td->evtdev->event_handler(td->evtdev);
+		/*
+		 * We only run the local handler, if the broadcast
+		 * device is not hrtimer based. Otherwise we run into
+		 * a hrtimer recursion.
+		 *
+		 * local timer_interrupt()
+		 *   local_handler()
+		 *     expire_hrtimers()
+		 *       bc_handler()
+		 *         local_handler()
+		 *	     expire_hrtimers()
+		 */
+		local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER);
 	}
 
 	if (!cpumask_empty(mask)) {
@@ -279,16 +294,17 @@ static void tick_do_broadcast(struct cpumask *mask)
 		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
 		td->evtdev->broadcast(mask);
 	}
+	return local;
 }
 
 /*
  * Periodic broadcast:
  * - invoke the broadcast handlers
  */
-static void tick_do_periodic_broadcast(void)
+static bool tick_do_periodic_broadcast(void)
 {
 	cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
-	tick_do_broadcast(tmpmask);
+	return tick_do_broadcast(tmpmask);
 }
 
 /*
@@ -296,34 +312,33 @@ static void tick_do_periodic_broadcast(void)
  */
 static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 {
-	ktime_t next;
+	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+	bool bc_local;
 
 	raw_spin_lock(&tick_broadcast_lock);
 
-	tick_do_periodic_broadcast();
+	/* Handle spurious interrupts gracefully */
+	if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
+		raw_spin_unlock(&tick_broadcast_lock);
+		return;
+	}
 
-	/*
-	 * The device is in periodic mode. No reprogramming necessary:
-	 */
-	if (dev->state == CLOCK_EVT_STATE_PERIODIC)
-		goto unlock;
+	bc_local = tick_do_periodic_broadcast();
 
-	/*
-	 * Setup the next period for devices, which do not have
-	 * periodic mode. We read dev->next_event first and add to it
-	 * when the event already expired. clockevents_program_event()
-	 * sets dev->next_event only when the event is really
-	 * programmed to the device.
-	 */
-	for (next = dev->next_event; ;) {
-		next = ktime_add(next, tick_period);
+	if (clockevent_state_oneshot(dev)) {
+		ktime_t next = ktime_add(dev->next_event, tick_period);
 
-		if (!clockevents_program_event(dev, next, false))
-			goto unlock;
-		tick_do_periodic_broadcast();
+		clockevents_program_event(dev, next, true);
 	}
-unlock:
 	raw_spin_unlock(&tick_broadcast_lock);
+
+	/*
+	 * We run the handler of the local cpu after dropping
+	 * tick_broadcast_lock because the handler might deadlock when
+	 * trying to switch to oneshot mode.
+	 */
+	if (bc_local)
+		td->evtdev->event_handler(td->evtdev);
 }
 
 /**
@@ -366,8 +381,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
 	case TICK_BROADCAST_ON:
 		cpumask_set_cpu(cpu, tick_broadcast_on);
 		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
-			if (tick_broadcast_device.mode ==
-			    TICKDEV_MODE_PERIODIC)
+			/*
+			 * Only shutdown the cpu local device, if:
+			 *
+			 * - the broadcast device exists
+			 * - the broadcast device is not a hrtimer based one
+			 * - the broadcast device is in periodic mode to
+			 *   avoid a hiccup during switch to oneshot mode
+			 */
+			if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
+			    tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
 				clockevents_shutdown(dev);
 		}
 		break;
@@ -386,14 +409,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
 		break;
 	}
 
-	if (cpumask_empty(tick_broadcast_mask)) {
-		if (!bc_stopped)
-			clockevents_shutdown(bc);
-	} else if (bc_stopped) {
-		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
-			tick_broadcast_start_periodic(bc);
-		else
-			tick_broadcast_setup_oneshot(bc);
+	if (bc) {
+		if (cpumask_empty(tick_broadcast_mask)) {
+			if (!bc_stopped)
+				clockevents_shutdown(bc);
+		} else if (bc_stopped) {
+			if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+				tick_broadcast_start_periodic(bc);
+			else
+				tick_broadcast_setup_oneshot(bc);
+		}
 	}
 	raw_spin_unlock(&tick_broadcast_lock);
 }
@@ -532,23 +557,19 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc,
 	irq_set_affinity(bc->irq, bc->cpumask);
 }
 
-static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
-				    ktime_t expires, int force)
+static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+				     ktime_t expires)
 {
-	int ret;
-
-	if (bc->state != CLOCK_EVT_STATE_ONESHOT)
-		clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+	if (!clockevent_state_oneshot(bc))
+		clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 
-	ret = clockevents_program_event(bc, expires, force);
-	if (!ret)
-		tick_broadcast_set_affinity(bc, cpumask_of(cpu));
-	return ret;
+	clockevents_program_event(bc, expires, 1);
+	tick_broadcast_set_affinity(bc, cpumask_of(cpu));
 }
 
 static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
-	clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 }
 
 /*
@@ -566,7 +587,7 @@ void tick_check_oneshot_broadcast_this_cpu(void)
 		 * switched over, leave the device alone.
 		 */
 		if (td->mode == TICKDEV_MODE_ONESHOT) {
-			clockevents_set_state(td->evtdev,
+			clockevents_switch_state(td->evtdev,
 					      CLOCK_EVT_STATE_ONESHOT);
 		}
 	}
@@ -580,9 +601,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 	struct tick_device *td;
 	ktime_t now, next_event;
 	int cpu, next_cpu = 0;
+	bool bc_local;
 
 	raw_spin_lock(&tick_broadcast_lock);
-again:
 	dev->next_event.tv64 = KTIME_MAX;
 	next_event.tv64 = KTIME_MAX;
 	cpumask_clear(tmpmask);
@@ -624,7 +645,7 @@ again:
 	/*
 	 * Wakeup the cpus which have an expired event.
 	 */
-	tick_do_broadcast(tmpmask);
+	bc_local = tick_do_broadcast(tmpmask);
 
 	/*
 	 * Two reasons for reprogram:
@@ -636,15 +657,15 @@ again:
 	 * - There are pending events on sleeping CPUs which were not
 	 * in the event mask
 	 */
-	if (next_event.tv64 != KTIME_MAX) {
-		/*
-		 * Rearm the broadcast device. If event expired,
-		 * repeat the above
-		 */
-		if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
-			goto again;
-	}
+	if (next_event.tv64 != KTIME_MAX)
+		tick_broadcast_set_event(dev, next_cpu, next_event);
+
 	raw_spin_unlock(&tick_broadcast_lock);
+
+	if (bc_local) {
+		td = this_cpu_ptr(&tick_cpu_device);
+		td->evtdev->event_handler(td->evtdev);
+	}
 }
 
 static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
@@ -670,77 +691,88 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
 		if (dev->next_event.tv64 < bc->next_event.tv64)
 			return;
 	}
-	clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 }
 
-/**
- * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
- * @state:	The target state (enter/exit)
- *
- * The system enters/leaves a state, where affected devices might stop
- * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
- *
- * Called with interrupts disabled, so clockevents_lock is not
- * required here because the local clock event device cannot go away
- * under us.
- */
-int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 {
 	struct clock_event_device *bc, *dev;
-	struct tick_device *td;
 	int cpu, ret = 0;
 	ktime_t now;
 
 	/*
-	 * Periodic mode does not care about the enter/exit of power
-	 * states
+	 * If there is no broadcast device, tell the caller not to go
+	 * into deep idle.
 	 */
-	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
-		return 0;
+	if (!tick_broadcast_device.evtdev)
+		return -EBUSY;
 
-	/*
-	 * We are called with preemtion disabled from the depth of the
-	 * idle code, so we can't be moved away.
-	 */
-	td = this_cpu_ptr(&tick_cpu_device);
-	dev = td->evtdev;
-
-	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
-		return 0;
+	dev = this_cpu_ptr(&tick_cpu_device)->evtdev;
 
 	raw_spin_lock(&tick_broadcast_lock);
 	bc = tick_broadcast_device.evtdev;
 	cpu = smp_processor_id();
 
 	if (state == TICK_BROADCAST_ENTER) {
+		/*
+		 * If the current CPU owns the hrtimer broadcast
+		 * mechanism, it cannot go deep idle and we do not add
+		 * the CPU to the broadcast mask. We don't have to go
+		 * through the EXIT path as the local timer is not
+		 * shutdown.
+		 */
+		ret = broadcast_needs_cpu(bc, cpu);
+		if (ret)
+			goto out;
+
+		/*
+		 * If the broadcast device is in periodic mode, we
+		 * return.
+		 */
+		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
+			/* If it is a hrtimer based broadcast, return busy */
+			if (bc->features & CLOCK_EVT_FEAT_HRTIMER)
+				ret = -EBUSY;
+			goto out;
+		}
+
 		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
 			WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
+
+			/* Conditionally shut down the local timer. */
 			broadcast_shutdown_local(bc, dev);
+
 			/*
 			 * We only reprogram the broadcast timer if we
 			 * did not mark ourself in the force mask and
 			 * if the cpu local event is earlier than the
 			 * broadcast event. If the current CPU is in
 			 * the force mask, then we are going to be
-			 * woken by the IPI right away.
+			 * woken by the IPI right away; we return
+			 * busy, so the CPU does not try to go deep
+			 * idle.
 			 */
-			if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
-			    dev->next_event.tv64 < bc->next_event.tv64)
-				tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
+			if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) {
+				ret = -EBUSY;
+			} else if (dev->next_event.tv64 < bc->next_event.tv64) {
+				tick_broadcast_set_event(bc, cpu, dev->next_event);
+				/*
+				 * In case of hrtimer broadcasts the
+				 * programming might have moved the
+				 * timer to this cpu. If yes, remove
+				 * us from the broadcast mask and
+				 * return busy.
+				 */
+				ret = broadcast_needs_cpu(bc, cpu);
+				if (ret) {
+					cpumask_clear_cpu(cpu,
+						tick_broadcast_oneshot_mask);
+				}
+			}
 		}
-		/*
-		 * If the current CPU owns the hrtimer broadcast
-		 * mechanism, it cannot go deep idle and we remove the
-		 * CPU from the broadcast mask. We don't have to go
-		 * through the EXIT path as the local timer is not
-		 * shutdown.
-		 */
-		ret = broadcast_needs_cpu(bc, cpu);
-		if (ret)
-			cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
 	} else {
 		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
-			clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+			clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 			/*
 			 * The cpu which was handling the broadcast
 			 * timer marked this cpu in the broadcast
@@ -807,7 +839,6 @@ out:
 	raw_spin_unlock(&tick_broadcast_lock);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
 
 /*
  * Reset the one shot broadcast for a cpu
@@ -842,7 +873,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 
 	/* Set it up only once ! */
 	if (bc->event_handler != tick_handle_oneshot_broadcast) {
-		int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
+		int was_periodic = clockevent_state_periodic(bc);
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 
@@ -858,10 +889,10 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 			   tick_broadcast_oneshot_mask, tmpmask);
 
 		if (was_periodic && !cpumask_empty(tmpmask)) {
-			clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+			clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 			tick_broadcast_init_next_event(tmpmask,
 						       tick_next_period);
-			tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
+			tick_broadcast_set_event(bc, cpu, tick_next_period);
 		} else
 			bc->next_event.tv64 = KTIME_MAX;
 	} else {
@@ -949,6 +980,16 @@ bool tick_broadcast_oneshot_available(void)
 	return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
 }
 
+#else
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+	if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER))
+		return -EBUSY;
+
+	return 0;
+}
 #endif
 
 void __init tick_broadcast_init(void)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 3ae6afa1e..f8bf47571 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -19,6 +19,7 @@
 #include <linux/profile.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <trace/events/power.h>
 
 #include <asm/irq_regs.h>
 
@@ -102,7 +103,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
 
 	tick_periodic(cpu);
 
-	if (dev->state != CLOCK_EVT_STATE_ONESHOT)
+#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
+	/*
+	 * The cpu might have transitioned to HIGHRES or NOHZ mode via
+	 * update_process_times() -> run_local_timers() ->
+	 * hrtimer_run_queues().
+	 */
+	if (dev->event_handler != tick_handle_periodic)
+		return;
+#endif
+
+	if (!clockevent_state_oneshot(dev))
 		return;
 	for (;;) {
 		/*
@@ -140,7 +151,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 
 	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
 	    !tick_broadcast_oneshot_active()) {
-		clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
 	} else {
 		unsigned long seq;
 		ktime_t next;
@@ -150,7 +161,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 			next = tick_next_period;
 		} while (read_seqretry(&jiffies_lock, seq));
 
-		clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 
 		for (;;) {
 			if (!clockevents_program_event(dev, next, false))
@@ -332,6 +343,28 @@ out_bc:
 	tick_install_broadcast_device(newdev);
 }
 
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state:	The target state (enter/exit)
+ *
+ * The system enters/leaves a state where affected devices might stop.
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
+ */
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+	if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP))
+		return 0;
+
+	return __tick_broadcast_oneshot_control(state);
+}
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Transfer the do_timer job away from a dying cpu.
@@ -367,7 +400,7 @@ void tick_shutdown(unsigned int cpu)
 		 * Prevent that the clock events layer tries to call
 		 * the set mode function!
 		 */
-		dev->state = CLOCK_EVT_STATE_DETACHED;
+		clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 		dev->mode = CLOCK_EVT_MODE_UNUSED;
 		clockevents_exchange_device(dev, NULL);
 		dev->event_handler = clockevents_handle_noop;
@@ -440,6 +473,7 @@ void tick_resume(void)
 	tick_resume_local();
 }
 
+#ifdef CONFIG_SUSPEND
 static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
 static unsigned int tick_freeze_depth;
 
@@ -457,10 +491,13 @@ void tick_freeze(void)
 	raw_spin_lock(&tick_freeze_lock);
 
 	tick_freeze_depth++;
-	if (tick_freeze_depth == num_online_cpus())
+	if (tick_freeze_depth == num_online_cpus()) {
+		trace_suspend_resume(TPS("timekeeping_freeze"),
+				     smp_processor_id(), true);
 		timekeeping_suspend();
-	else
+	} else {
 		tick_suspend_local();
+	}
 
 	raw_spin_unlock(&tick_freeze_lock);
 }
@@ -478,15 +515,19 @@ void tick_unfreeze(void)
 {
 	raw_spin_lock(&tick_freeze_lock);
 
-	if (tick_freeze_depth == num_online_cpus())
+	if (tick_freeze_depth == num_online_cpus()) {
 		timekeeping_resume();
-	else
+		trace_suspend_resume(TPS("timekeeping_freeze"),
+				     smp_processor_id(), false);
+	} else {
 		tick_resume_local();
+	}
 
 	tick_freeze_depth--;
 
 	raw_spin_unlock(&tick_freeze_lock);
 }
+#endif /* CONFIG_SUSPEND */
 
 /**
  * tick_init - initialize the tick control
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b64fdd805..966a5a6fd 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 	return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
 }
 
+static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev)
+{
+	return dev->state_use_accessors;
+}
+
+static inline void clockevent_set_state(struct clock_event_device *dev,
+					enum clock_event_state state)
+{
+	dev->state_use_accessors = state;
+}
+
 extern void clockevents_shutdown(struct clock_event_device *dev);
 extern void clockevents_exchange_device(struct clock_event_device *old,
 					struct clock_event_device *new);
-extern void clockevents_set_state(struct clock_event_device *dev,
-				 enum clock_event_state state);
+extern void clockevents_switch_state(struct clock_event_device *dev,
+				     enum clock_event_state state);
 extern int clockevents_program_event(struct clock_event_device *dev,
 				     ktime_t expires, bool force);
 extern void clockevents_handle_noop(struct clock_event_device *dev);
@@ -137,3 +148,19 @@ extern void tick_nohz_init(void);
 # else
 static inline void tick_nohz_init(void) { }
 #endif
+
+#ifdef CONFIG_NO_HZ_COMMON
+extern unsigned long tick_nohz_active;
+#else
+#define tick_nohz_active (0)
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void timers_update_migration(bool update_nohz);
+#else
+static inline void timers_update_migration(bool update_nohz) { }
+#endif
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
+extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 67a64b167..b51344652 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
+	if (unlikely(expires.tv64 == KTIME_MAX)) {
+		/*
+		 * We don't need the clock event device any more, stop it.
+		 */
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+		return 0;
+	}
+
+	if (unlikely(clockevent_state_oneshot_stopped(dev))) {
+		/*
+		 * We need the clock event again, configure it in ONESHOT mode
+		 * before using it.
+		 */
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	}
+
 	return clockevents_program_event(dev, expires, force);
 }
 
@@ -38,7 +54,7 @@ void tick_resume_oneshot(void)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
-	clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 	clockevents_program_event(dev, ktime_get(), true);
 }
 
@@ -50,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
 			ktime_t next_event)
 {
 	newdev->event_handler = handler;
-	clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);
 	clockevents_program_event(newdev, next_event, true);
 }
 
@@ -81,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 
 	td->mode = TICKDEV_MODE_ONESHOT;
 	dev->event_handler = handler;
-	clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 	tick_broadcast_switch_to_oneshot();
 	return 0;
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 914259128..c792429e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -399,7 +399,7 @@ void __init tick_nohz_init(void)
  * NO HZ enabled ?
  */
 static int tick_nohz_enabled __read_mostly  = 1;
-int tick_nohz_active  __read_mostly;
+unsigned long tick_nohz_active  __read_mostly;
 /*
  * Enable / Disable tickless mode
  */
@@ -565,156 +565,144 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+	hrtimer_cancel(&ts->sched_timer);
+	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+
+	/* Forward the time to expire in the future */
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+	else
+		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 					 ktime_t now, int cpu)
 {
-	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
-	ktime_t last_update, expires, ret = { .tv64 = 0 };
-	unsigned long rcu_delta_jiffies;
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
-	u64 time_delta;
-
-	time_delta = timekeeping_max_deferment();
+	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+	unsigned long seq, basejiff;
+	ktime_t	tick;
 
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
 		seq = read_seqbegin(&jiffies_lock);
-		last_update = last_jiffies_update;
-		last_jiffies = jiffies;
+		basemono = last_jiffies_update.tv64;
+		basejiff = jiffies;
 	} while (read_seqretry(&jiffies_lock, seq));
+	ts->last_jiffies = basejiff;
 
-	if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+	if (rcu_needs_cpu(basemono, &next_rcu) ||
 	    arch_needs_cpu() || irq_work_needs_cpu()) {
-		next_jiffies = last_jiffies + 1;
-		delta_jiffies = 1;
+		next_tick = basemono + TICK_NSEC;
 	} else {
-		/* Get the next timer wheel timer */
-		next_jiffies = get_next_timer_interrupt(last_jiffies);
-		delta_jiffies = next_jiffies - last_jiffies;
-		if (rcu_delta_jiffies < delta_jiffies) {
-			next_jiffies = last_jiffies + rcu_delta_jiffies;
-			delta_jiffies = rcu_delta_jiffies;
-		}
+		/*
+		 * Get the next pending timer. If high resolution
+		 * timers are enabled this only takes the timer wheel
+		 * timers into account. If high resolution timers are
+		 * disabled this also looks at the next expiring
+		 * hrtimer.
+		 */
+		next_tmr = get_next_timer_interrupt(basejiff, basemono);
+		ts->next_timer = next_tmr;
+		/* Take the next rcu event into account */
+		next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
 	}
 
 	/*
-	 * Do not stop the tick, if we are only one off (or less)
-	 * or if the cpu is required for RCU:
+	 * If the tick is due in the next period, keep it ticking or
+	 * restart it properly.
 	 */
-	if (!ts->tick_stopped && delta_jiffies <= 1)
-		goto out;
-
-	/* Schedule the tick, if we are at least one jiffie off */
-	if ((long)delta_jiffies >= 1) {
-
-		/*
-		 * If this cpu is the one which updates jiffies, then
-		 * give up the assignment and let it be taken by the
-		 * cpu which runs the tick timer next, which might be
-		 * this cpu as well. If we don't drop this here the
-		 * jiffies might be stale and do_timer() never
-		 * invoked. Keep track of the fact that it was the one
-		 * which had the do_timer() duty last. If this cpu is
-		 * the one which had the do_timer() duty last, we
-		 * limit the sleep time to the timekeeping
-		 * max_deferement value which we retrieved
-		 * above. Otherwise we can sleep as long as we want.
-		 */
-		if (cpu == tick_do_timer_cpu) {
-			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-			ts->do_timer_last = 1;
-		} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
-			time_delta = KTIME_MAX;
-			ts->do_timer_last = 0;
-		} else if (!ts->do_timer_last) {
-			time_delta = KTIME_MAX;
+	delta = next_tick - basemono;
+	if (delta <= (u64)TICK_NSEC) {
+		tick.tv64 = 0;
+		if (!ts->tick_stopped)
+			goto out;
+		if (delta == 0) {
+			/* Tick is stopped, but required now. Enforce it */
+			tick_nohz_restart(ts, now);
+			goto out;
 		}
+	}
+
+	/*
+	 * If this cpu is the one which updates jiffies, then give up
+	 * the assignment and let it be taken by the cpu which runs
+	 * the tick timer next, which might be this cpu as well. If we
+	 * don't drop this here the jiffies might be stale and
+	 * do_timer() never invoked. Keep track of the fact that it
+	 * was the one which had the do_timer() duty last. If this cpu
+	 * is the one which had the do_timer() duty last, we limit the
+	 * sleep time to the timekeeping max_deferment value.
+	 * Otherwise we can sleep as long as we want.
+	 */
+	delta = timekeeping_max_deferment();
+	if (cpu == tick_do_timer_cpu) {
+		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+		ts->do_timer_last = 1;
+	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+		delta = KTIME_MAX;
+		ts->do_timer_last = 0;
+	} else if (!ts->do_timer_last) {
+		delta = KTIME_MAX;
+	}
 
 #ifdef CONFIG_NO_HZ_FULL
-		if (!ts->inidle) {
-			time_delta = min(time_delta,
-					 scheduler_tick_max_deferment());
-		}
+	/* Limit the tick delta to the maximum scheduler deferment */
+	if (!ts->inidle)
+		delta = min(delta, scheduler_tick_max_deferment());
 #endif
 
-		/*
-		 * calculate the expiry time for the next timer wheel
-		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
-		 * that there is no timer pending or at least extremely
-		 * far into the future (12 days for HZ=1000). In this
-		 * case we set the expiry to the end of time.
-		 */
-		if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
-			/*
-			 * Calculate the time delta for the next timer event.
-			 * If the time delta exceeds the maximum time delta
-			 * permitted by the current clocksource then adjust
-			 * the time delta accordingly to ensure the
-			 * clocksource does not wrap.
-			 */
-			time_delta = min_t(u64, time_delta,
-					   tick_period.tv64 * delta_jiffies);
-		}
-
-		if (time_delta < KTIME_MAX)
-			expires = ktime_add_ns(last_update, time_delta);
-		else
-			expires.tv64 = KTIME_MAX;
-
-		/* Skip reprogram of event if its not changed */
-		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
-			goto out;
+	/* Calculate the next expiry time */
+	if (delta < (KTIME_MAX - basemono))
+		expires = basemono + delta;
+	else
+		expires = KTIME_MAX;
 
-		ret = expires;
+	expires = min_t(u64, expires, next_tick);
+	tick.tv64 = expires;
 
-		/*
-		 * nohz_stop_sched_tick can be called several times before
-		 * the nohz_restart_sched_tick is called. This happens when
-		 * interrupts arrive which do not cause a reschedule. In the
-		 * first call we save the current tick time, so we can restart
-		 * the scheduler tick in nohz_restart_sched_tick.
-		 */
-		if (!ts->tick_stopped) {
-			nohz_balance_enter_idle(cpu);
-			calc_load_enter_idle();
+	/* Skip reprogram of event if it's not changed */
+	if (ts->tick_stopped && (expires == dev->next_event.tv64))
+		goto out;
 
-			ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
-			ts->tick_stopped = 1;
-			trace_tick_stop(1, " ");
-		}
+	/*
+	 * nohz_stop_sched_tick can be called several times before
+	 * the nohz_restart_sched_tick is called. This happens when
+	 * interrupts arrive which do not cause a reschedule. In the
+	 * first call we save the current tick time, so we can restart
+	 * the scheduler tick in nohz_restart_sched_tick.
+	 */
+	if (!ts->tick_stopped) {
+		nohz_balance_enter_idle(cpu);
+		calc_load_enter_idle();
 
-		/*
-		 * If the expiration time == KTIME_MAX, then
-		 * in this case we simply stop the tick timer.
-		 */
-		 if (unlikely(expires.tv64 == KTIME_MAX)) {
-			if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-				hrtimer_cancel(&ts->sched_timer);
-			goto out;
-		}
+		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
+		ts->tick_stopped = 1;
+		trace_tick_stop(1, " ");
+	}
 
-		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-			hrtimer_start(&ts->sched_timer, expires,
-				      HRTIMER_MODE_ABS_PINNED);
-			/* Check, if the timer was already in the past */
-			if (hrtimer_active(&ts->sched_timer))
-				goto out;
-		} else if (!tick_program_event(expires, 0))
-				goto out;
-		/*
-		 * We are past the event already. So we crossed a
-		 * jiffie boundary. Update jiffies and raise the
-		 * softirq.
-		 */
-		tick_do_update_jiffies64(ktime_get());
+	/*
+	 * If the expiration time == KTIME_MAX, then we simply stop
+	 * the tick timer.
+	 */
+	if (unlikely(expires == KTIME_MAX)) {
+		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+			hrtimer_cancel(&ts->sched_timer);
+		goto out;
 	}
-	raise_softirq_irqoff(TIMER_SOFTIRQ);
+
+	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+	else
+		tick_program_event(tick, 1);
 out:
-	ts->next_jiffies = next_jiffies;
-	ts->last_jiffies = last_jiffies;
+	/* Update the estimated sleep length */
 	ts->sleep_length = ktime_sub(dev->next_event, now);
-
-	return ret;
+	return tick;
 }
 
 static void tick_nohz_full_stop_tick(struct tick_sched *ts)
@@ -876,32 +864,6 @@ ktime_t tick_nohz_get_sleep_length(void)
 	return ts->sleep_length;
 }
 
-static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
-{
-	hrtimer_cancel(&ts->sched_timer);
-	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
-
-	while (1) {
-		/* Forward the time to expire in the future */
-		hrtimer_forward(&ts->sched_timer, now, tick_period);
-
-		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-			hrtimer_start_expires(&ts->sched_timer,
-					      HRTIMER_MODE_ABS_PINNED);
-			/* Check, if the timer was already in the past */
-			if (hrtimer_active(&ts->sched_timer))
-				break;
-		} else {
-			if (!tick_program_event(
-				hrtimer_get_expires(&ts->sched_timer), 0))
-				break;
-		}
-		/* Reread time and update jiffies */
-		now = ktime_get();
-		tick_do_update_jiffies64(now);
-	}
-}
-
 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
 	/* Update jiffies first */
@@ -972,12 +934,6 @@ void tick_nohz_idle_exit(void)
 	local_irq_enable();
 }
 
-static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
-{
-	hrtimer_forward(&ts->sched_timer, now, tick_period);
-	return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
-}
-
 /*
  * The nohz low res interrupt handler
  */
@@ -996,10 +952,18 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	if (unlikely(ts->tick_stopped))
 		return;
 
-	while (tick_nohz_reprogram(ts, now)) {
-		now = ktime_get();
-		tick_do_update_jiffies64(now);
-	}
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
+{
+	if (!tick_nohz_enabled)
+		return;
+	ts->nohz_mode = mode;
+	/* One update is enough */
+	if (!test_and_set_bit(0, &tick_nohz_active))
+		timers_update_migration(true);
 }
 
 /**
@@ -1013,13 +977,8 @@ static void tick_nohz_switch_to_nohz(void)
 	if (!tick_nohz_enabled)
 		return;
 
-	local_irq_disable();
-	if (tick_switch_to_oneshot(tick_nohz_handler)) {
-		local_irq_enable();
+	if (tick_switch_to_oneshot(tick_nohz_handler))
 		return;
-	}
-	tick_nohz_active = 1;
-	ts->nohz_mode = NOHZ_MODE_LOWRES;
 
 	/*
 	 * Recycle the hrtimer in ts, so we can share the
@@ -1029,13 +988,10 @@ static void tick_nohz_switch_to_nohz(void)
 	/* Get the next period */
 	next = tick_init_jiffy_update();
 
-	for (;;) {
-		hrtimer_set_expires(&ts->sched_timer, next);
-		if (!tick_program_event(next, 0))
-			break;
-		next = ktime_add(next, tick_period);
-	}
-	local_irq_enable();
+	hrtimer_forward_now(&ts->sched_timer, tick_period);
+	hrtimer_set_expires(&ts->sched_timer, next);
+	tick_program_event(next, 1);
+	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
 
 /*
@@ -1087,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void)
 
 static inline void tick_nohz_switch_to_nohz(void) { }
 static inline void tick_nohz_irq_enter(void) { }
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
@@ -1167,22 +1124,9 @@ void tick_setup_sched_timer(void)
 		hrtimer_add_expires_ns(&ts->sched_timer, offset);
 	}
 
-	for (;;) {
-		hrtimer_forward(&ts->sched_timer, now, tick_period);
-		hrtimer_start_expires(&ts->sched_timer,
-				      HRTIMER_MODE_ABS_PINNED);
-		/* Check, if the timer was already in the past */
-		if (hrtimer_active(&ts->sched_timer))
-			break;
-		now = ktime_get();
-	}
-
-#ifdef CONFIG_NO_HZ_COMMON
-	if (tick_nohz_enabled) {
-		ts->nohz_mode = NOHZ_MODE_HIGHRES;
-		tick_nohz_active = 1;
-	}
-#endif
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+	tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
 }
 #endif /* HIGH_RES_TIMERS */
 
@@ -1227,7 +1171,7 @@ void tick_oneshot_notify(void)
  * Called cyclic from the hrtimer softirq (driven by the timer
  * softirq) allow_nohz signals, that we can switch into low-res nohz
  * mode, because high resolution timers are disabled (either compile
- * or runtime).
+ * or runtime). Called with interrupts disabled.
  */
 int tick_check_oneshot_change(int allow_nohz)
 {
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 28b5da3e1..a4a8d4e9b 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -57,7 +57,7 @@ struct tick_sched {
 	ktime_t				iowait_sleeptime;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
-	unsigned long			next_jiffies;
+	u64				next_timer;
 	ktime_t				idle_expires;
 	int				do_timer_last;
 };
@@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu);
 static inline void tick_cancel_sched_timer(int cpu) { }
 #endif
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int
+__tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	return -EBUSY;
+}
+#endif
+
 #endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2c85b7724..85d5bb1d6 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -41,7 +41,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
-#include "timeconst.h"
+#include <generated/timeconst.h>
 #include "timekeeping.h"
 
 /*
@@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
 		return error;
 
 	if (tz) {
+		/* Verify we're within the +-15 hrs range */
+		if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
+			return -EINVAL;
+
 		sys_tz = *tz;
 		update_vsyscall_tz();
 		if (firsttime) {
@@ -483,9 +487,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
 }
 EXPORT_SYMBOL(ns_to_timespec64);
 #endif
-/*
- * When we convert to jiffies then we interpret incoming values
- * the following way:
+/**
+ * msecs_to_jiffies - convert milliseconds to jiffies
+ * @m:	time in milliseconds
+ *
+ * conversion is done as follows:
  *
  * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
  *
@@ -493,66 +499,36 @@ EXPORT_SYMBOL(ns_to_timespec64);
  *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
  *
  * - all other values are converted to jiffies by either multiplying
- *   the input value by a factor or dividing it with a factor
- *
- * We must also be careful about 32-bit overflows.
+ *   the input value by a factor or dividing it with a factor and
+ *   handling any 32-bit overflows.
+ *   for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p(), allowing gcc to eliminate most of the
+ * code. __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * The _msecs_to_jiffies helpers are the HZ dependent conversion
+ * routines found in include/linux/jiffies.h.
  */
-unsigned long msecs_to_jiffies(const unsigned int m)
+unsigned long __msecs_to_jiffies(const unsigned int m)
 {
 	/*
 	 * Negative value, means infinite timeout:
 	 */
 	if ((int)m < 0)
 		return MAX_JIFFY_OFFSET;
-
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
-	/*
-	 * HZ is equal to or smaller than 1000, and 1000 is a nice
-	 * round multiple of HZ, divide with the factor between them,
-	 * but round upwards:
-	 */
-	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
-	/*
-	 * HZ is larger than 1000, and HZ is a nice round multiple of
-	 * 1000 - simply multiply with the factor between them.
-	 *
-	 * But first make sure the multiplication result cannot
-	 * overflow:
-	 */
-	if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-		return MAX_JIFFY_OFFSET;
-
-	return m * (HZ / MSEC_PER_SEC);
-#else
-	/*
-	 * Generic case - multiply, round and divide. But first
-	 * check that if we are doing a net multiplication, that
-	 * we wouldn't overflow:
-	 */
-	if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-		return MAX_JIFFY_OFFSET;
-
-	return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
-		>> MSEC_TO_HZ_SHR32;
-#endif
+	return _msecs_to_jiffies(m);
 }
-EXPORT_SYMBOL(msecs_to_jiffies);
+EXPORT_SYMBOL(__msecs_to_jiffies);
 
-unsigned long usecs_to_jiffies(const unsigned int u)
+unsigned long __usecs_to_jiffies(const unsigned int u)
 {
 	if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
 		return MAX_JIFFY_OFFSET;
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
-	return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
-	return u * (HZ / USEC_PER_SEC);
-#else
-	return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
-		>> USEC_TO_HZ_SHR32;
-#endif
+	return _usecs_to_jiffies(u);
 }
-EXPORT_SYMBOL(usecs_to_jiffies);
+EXPORT_SYMBOL(__usecs_to_jiffies);
 
 /*
  * The TICK_NSEC - 1 rounds up the value to the next resolution.  Note
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2ca..c7388dee8 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -50,7 +50,7 @@ define timeconst(hz) {
 	print "#include <linux/types.h>\n\n"
 
 	print "#if HZ != ", hz, "\n"
-	print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+	print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n"
 	print "#endif\n\n"
 
 	if (hz < 2) {
@@ -105,4 +105,5 @@ define timeconst(hz) {
 	halt
 }
 
+hz = read();
 timeconst(hz)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 946acb721..bca3667a2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
 
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-/*
- * These simple flag variables are managed
- * without locks, which is racy, but ok since
- * we don't really care about being super
- * precise about how many events were seen,
- * just that a problem was observed.
- */
-static int timekeeping_underflow_seen;
-static int timekeeping_overflow_seen;
-
-/* last_warning is only modified under the timekeeping lock */
-static long timekeeping_last_warning;
 
 static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 {
@@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 		}
 	}
 
-	if (timekeeping_underflow_seen) {
-		if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+	if (tk->underflow_seen) {
+		if (jiffies - tk->last_warning > WARNING_FREQ) {
 			printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
 			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
 			printk_deferred("         Your kernel is probably still fine.\n");
-			timekeeping_last_warning = jiffies;
+			tk->last_warning = jiffies;
 		}
-		timekeeping_underflow_seen = 0;
+		tk->underflow_seen = 0;
 	}
 
-	if (timekeeping_overflow_seen) {
-		if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+	if (tk->overflow_seen) {
+		if (jiffies - tk->last_warning > WARNING_FREQ) {
 			printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
 			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
 			printk_deferred("         Your kernel is probably still fine.\n");
-			timekeeping_last_warning = jiffies;
+			tk->last_warning = jiffies;
 		}
-		timekeeping_overflow_seen = 0;
+		tk->overflow_seen = 0;
 	}
 }
 
 static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
 {
+	struct timekeeper *tk = &tk_core.timekeeper;
 	cycle_t now, last, mask, max, delta;
 	unsigned int seq;
 
@@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
 	 * mask-relative negative values.
 	 */
 	if (unlikely((~delta & mask) < (mask >> 3))) {
-		timekeeping_underflow_seen = 1;
+		tk->underflow_seen = 1;
 		delta = 0;
 	}
 
 	/* Cap delta value to the max_cycles values to avoid mult overflows */
 	if (unlikely(delta > max)) {
-		timekeeping_overflow_seen = 1;
+		tk->overflow_seen = 1;
 		delta = tkr->clock->max_cycles;
 	}
 
@@ -330,32 +319,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
  * We want to use this from any context including NMI and tracing /
  * instrumenting the timekeeping code itself.
  *
- * So we handle this differently than the other timekeeping accessor
- * functions which retry when the sequence count has changed. The
- * update side does:
- *
- * smp_wmb();	<- Ensure that the last base[1] update is visible
- * tkf->seq++;
- * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[0], tkr);
- * smp_wmb();	<- Ensure that the base[0] update is visible
- * tkf->seq++;
- * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[1], tkr);
- *
- * The reader side does:
- *
- * do {
- *	seq = tkf->seq;
- *	smp_rmb();
- *	idx = seq & 0x01;
- *	now = now(tkf->base[idx]);
- *	smp_rmb();
- * } while (seq != tkf->seq)
- *
- * As long as we update base[0] readers are forced off to
- * base[1]. Once base[0] is updated readers are redirected to base[0]
- * and the base[1] update takes place.
+ * Employ the latch technique; see @raw_write_seqcount_latch.
  *
  * So if a NMI hits the update of base[0] then it will use base[1]
  * which is still consistent. In the worst case this can result is a
@@ -418,7 +382,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	u64 now;
 
 	do {
-		seq = raw_read_seqcount(&tkf->seq);
+		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
 	} while (read_seqcount_retry(&tkf->seq, seq));
@@ -551,6 +515,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
 
 /*
+ * tk_update_leap_state - helper to update the next_leap_ktime
+ */
+static inline void tk_update_leap_state(struct timekeeper *tk)
+{
+	tk->next_leap_ktime = ntp_get_next_leap();
+	if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+		/* Convert to monotonic time */
+		tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
+}
+
+/*
  * Update the ktime_t based scalar nsec members of the timekeeper
  */
 static inline void tk_update_ktime_data(struct timekeeper *tk)
@@ -591,17 +566,25 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 		ntp_clear();
 	}
 
+	tk_update_leap_state(tk);
 	tk_update_ktime_data(tk);
 
 	update_vsyscall(tk);
 	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
 
+	update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+	update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
+
+	if (action & TK_CLOCK_WAS_SET)
+		tk->clock_was_set_seq++;
+	/*
+	 * The mirroring of the data to the shadow-timekeeper needs
+	 * to happen last here to ensure we don't over-write the
+	 * timekeeper structure on the next update with stale data
+	 */
 	if (action & TK_MIRROR)
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
-
-	update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
-	update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
 }
 
 /**
@@ -699,6 +682,23 @@ ktime_t ktime_get(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get);
 
+u32 ktime_get_resolution_ns(void)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	unsigned int seq;
+	u32 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	return nsecs;
+}
+EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
+
 static ktime_t *offsets[TK_OFFS_MAX] = {
 	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
 	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
@@ -1179,28 +1179,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64)
 }
 
 /**
- * read_boot_clock -  Return time of the system start.
+ * read_boot_clock64 -  Return time of the system start.
  *
  * Weak dummy function for arches that do not yet support it.
  * Function to read the exact time the system has been started.
- * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported.
  *
  *  XXX - Do be sure to remove it once all arches implement it.
  */
-void __weak read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock64(struct timespec64 *ts)
 {
 	ts->tv_sec = 0;
 	ts->tv_nsec = 0;
 }
 
-void __weak read_boot_clock64(struct timespec64 *ts64)
-{
-	struct timespec ts;
-
-	read_boot_clock(&ts);
-	*ts64 = timespec_to_timespec64(ts);
-}
-
 /* Flag for if timekeeping_resume() has injected sleeptime */
 static bool sleeptime_injected;
 
@@ -1836,8 +1828,9 @@ void update_wall_time(void)
 	 * memcpy under the tk_core.seq against one before we start
 	 * updating.
 	 */
+	timekeeping_update(tk, clock_set);
 	memcpy(real_tk, tk, sizeof(*tk));
-	timekeeping_update(real_tk, clock_set);
+	/* The memcpy must come last. Do not put anything here! */
 	write_seqcount_end(&tk_core.seq);
 out:
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1926,47 +1919,20 @@ void do_timer(unsigned long ticks)
 }
 
 /**
- * ktime_get_update_offsets_tick - hrtimer helper
- * @offs_real:	pointer to storage for monotonic -> realtime offset
- * @offs_boot:	pointer to storage for monotonic -> boottime offset
- * @offs_tai:	pointer to storage for monotonic -> clock tai offset
- *
- * Returns monotonic time at last tick and various offsets
- */
-ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
-							ktime_t *offs_tai)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	unsigned int seq;
-	ktime_t base;
-	u64 nsecs;
-
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-
-		base = tk->tkr_mono.base;
-		nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
-
-		*offs_real = tk->offs_real;
-		*offs_boot = tk->offs_boot;
-		*offs_tai = tk->offs_tai;
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	return ktime_add_ns(base, nsecs);
-}
-
-#ifdef CONFIG_HIGH_RES_TIMERS
-/**
  * ktime_get_update_offsets_now - hrtimer helper
+ * @cwsseq:	pointer to check and store the clock was set sequence number
  * @offs_real:	pointer to storage for monotonic -> realtime offset
  * @offs_boot:	pointer to storage for monotonic -> boottime offset
  * @offs_tai:	pointer to storage for monotonic -> clock tai offset
  *
- * Returns current monotonic time and updates the offsets
+ * Returns current monotonic time and updates the offsets if the
+ * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
+ * different.
+ *
  * Called from hrtimer_interrupt() or retrigger_next_event()
  */
-ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
-							ktime_t *offs_tai)
+ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
+				     ktime_t *offs_boot, ktime_t *offs_tai)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned int seq;
@@ -1978,15 +1944,23 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
 
 		base = tk->tkr_mono.base;
 		nsecs = timekeeping_get_ns(&tk->tkr_mono);
+		base = ktime_add_ns(base, nsecs);
+
+		if (*cwsseq != tk->clock_was_set_seq) {
+			*cwsseq = tk->clock_was_set_seq;
+			*offs_real = tk->offs_real;
+			*offs_boot = tk->offs_boot;
+			*offs_tai = tk->offs_tai;
+		}
+
+		/* Handle leapsecond insertion adjustments */
+		if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+			*offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
 
-		*offs_real = tk->offs_real;
-		*offs_boot = tk->offs_boot;
-		*offs_tai = tk->offs_tai;
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
-	return ktime_add_ns(base, nsecs);
+	return base;
 }
-#endif
 
 /**
  * do_adjtimex() - Accessor function to NTP __do_adjtimex function
@@ -2027,6 +2001,8 @@ int do_adjtimex(struct timex *txc)
 		__timekeeping_set_tai_offset(tk, tai);
 		timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 	}
+	tk_update_leap_state(tk);
+
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index ead8794b9..704f595ce 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -3,19 +3,16 @@
 /*
  * Internal interfaces for kernel/time/
  */
-extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
-						ktime_t *offs_boot,
-						ktime_t *offs_tai);
-extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
-						ktime_t *offs_boot,
-						ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
+					    ktime_t *offs_real,
+					    ktime_t *offs_boot,
+					    ktime_t *offs_tai);
 
 extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
 extern int timekeeping_inject_offset(struct timespec *ts);
 extern s32 timekeeping_get_tai_offset(void);
 extern void timekeeping_set_tai_offset(s32 tai_offset);
-extern void timekeeping_clocktai(struct timespec *ts);
 extern int timekeeping_suspend(void);
 extern void timekeeping_resume(void);
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2ece3aa50..84190f02b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
 #include <asm/timex.h>
 #include <asm/io.h>
 
+#include "tick-internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/timer.h>
 
@@ -68,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64);
 #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
 
 struct tvec {
-	struct list_head vec[TVN_SIZE];
+	struct hlist_head vec[TVN_SIZE];
 };
 
 struct tvec_root {
-	struct list_head vec[TVR_SIZE];
+	struct hlist_head vec[TVR_SIZE];
 };
 
 struct tvec_base {
@@ -83,6 +85,8 @@ struct tvec_base {
 	unsigned long active_timers;
 	unsigned long all_timers;
 	int cpu;
+	bool migration_enabled;
+	bool nohz_active;
 	struct tvec_root tv1;
 	struct tvec tv2;
 	struct tvec tv3;
@@ -90,43 +94,60 @@ struct tvec_base {
 	struct tvec tv5;
 } ____cacheline_aligned;
 
-/*
- * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
- * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
- * pointer to per-cpu entries because we don't know where we'll map the section,
- * even for the boot cpu.
- *
- * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
- * rest of them.
- */
-struct tvec_base boot_tvec_bases;
-EXPORT_SYMBOL(boot_tvec_bases);
 
-static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
 
-/* Functions below help us manage 'deferrable' flag */
-static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
+void timers_update_migration(bool update_nohz)
 {
-	return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
+	bool on = sysctl_timer_migration && tick_nohz_active;
+	unsigned int cpu;
+
+	/* Avoid the loop, if nothing to update */
+	if (this_cpu_read(tvec_bases.migration_enabled) == on)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(tvec_bases.migration_enabled, cpu) = on;
+		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+		if (!update_nohz)
+			continue;
+		per_cpu(tvec_bases.nohz_active, cpu) = true;
+		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
+	}
 }
 
-static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
+int timer_migration_handler(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp,
+			    loff_t *ppos)
 {
-	return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
+	static DEFINE_MUTEX(mutex);
+	int ret;
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write)
+		timers_update_migration(false);
+	mutex_unlock(&mutex);
+	return ret;
 }
 
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+						int pinned)
 {
-	return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
+	if (pinned || !base->migration_enabled)
+		return this_cpu_ptr(&tvec_bases);
+	return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
 }
-
-static inline void
-timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
+#else
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+						int pinned)
 {
-	unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
-
-	timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
+	return this_cpu_ptr(&tvec_bases);
 }
+#endif
 
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
 		bool force_up)
@@ -349,26 +370,12 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
-/*
- * If the list is empty, catch up ->timer_jiffies to the current time.
- * The caller must hold the tvec_base lock.  Returns true if the list
- * was empty and therefore ->timer_jiffies was updated.
- */
-static bool catchup_timer_jiffies(struct tvec_base *base)
-{
-	if (!base->all_timers) {
-		base->timer_jiffies = jiffies;
-		return true;
-	}
-	return false;
-}
-
 static void
 __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
 	unsigned long idx = expires - base->timer_jiffies;
-	struct list_head *vec;
+	struct hlist_head *vec;
 
 	if (idx < TVR_SIZE) {
 		int i = expires & TVR_MASK;
@@ -401,25 +408,25 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 		vec = base->tv5.vec + i;
 	}
-	/*
-	 * Timers are FIFO:
-	 */
-	list_add_tail(&timer->entry, vec);
+
+	hlist_add_head(&timer->entry, vec);
 }
 
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
-	(void)catchup_timer_jiffies(base);
+	/* Advance base->timer_jiffies, if the base is empty */
+	if (!base->all_timers++)
+		base->timer_jiffies = jiffies;
+
 	__internal_add_timer(base, timer);
 	/*
 	 * Update base->active_timers and base->next_timer
 	 */
-	if (!tbase_get_deferrable(timer->base)) {
+	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		if (!base->active_timers++ ||
 		    time_before(timer->expires, base->next_timer))
 			base->next_timer = timer->expires;
 	}
-	base->all_timers++;
 
 	/*
 	 * Check whether the other CPU is in dynticks mode and needs
@@ -434,8 +441,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
-		wake_up_nohz_cpu(base->cpu);
+	if (base->nohz_active) {
+		if (!(timer->flags & TIMER_DEFERRABLE) ||
+		    tick_nohz_full_cpu(base->cpu))
+			wake_up_nohz_cpu(base->cpu);
+	}
 }
 
 #ifdef CONFIG_TIMER_STATS
@@ -451,15 +461,12 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
 
 static void timer_stats_account_timer(struct timer_list *timer)
 {
-	unsigned int flag = 0;
-
 	if (likely(!timer->start_site))
 		return;
-	if (unlikely(tbase_get_deferrable(timer->base)))
-		flag |= TIMER_STATS_FLAG_DEFERRABLE;
 
 	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
-				 timer->function, timer->start_comm, flag);
+				 timer->function, timer->start_comm,
+				 timer->flags);
 }
 
 #else
@@ -516,8 +523,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
 		 * statically initialized. We just make sure that it
 		 * is tracked in the object tracker.
 		 */
-		if (timer->entry.next == NULL &&
-		    timer->entry.prev == TIMER_ENTRY_STATIC) {
+		if (timer->entry.pprev == NULL &&
+		    timer->entry.next == TIMER_ENTRY_STATIC) {
 			debug_object_init(timer, &timer_debug_descr);
 			debug_object_activate(timer, &timer_debug_descr);
 			return 0;
@@ -563,7 +570,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+		if (timer->entry.next == TIMER_ENTRY_STATIC) {
 			/*
 			 * This is not really a fixup. The timer was
 			 * statically initialized. We just make sure that it
@@ -648,7 +655,7 @@ static inline void
 debug_activate(struct timer_list *timer, unsigned long expires)
 {
 	debug_timer_activate(timer);
-	trace_timer_start(timer, expires);
+	trace_timer_start(timer, expires, timer->flags);
 }
 
 static inline void debug_deactivate(struct timer_list *timer)
@@ -665,10 +672,8 @@ static inline void debug_assert_init(struct timer_list *timer)
 static void do_init_timer(struct timer_list *timer, unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
-	struct tvec_base *base = raw_cpu_read(tvec_bases);
-
-	timer->entry.next = NULL;
-	timer->base = (void *)((unsigned long)base | flags);
+	timer->entry.pprev = NULL;
+	timer->flags = flags | raw_smp_processor_id();
 	timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
 	timer->start_site = NULL;
@@ -699,24 +704,23 @@ EXPORT_SYMBOL(init_timer_key);
 
 static inline void detach_timer(struct timer_list *timer, bool clear_pending)
 {
-	struct list_head *entry = &timer->entry;
+	struct hlist_node *entry = &timer->entry;
 
 	debug_deactivate(timer);
 
-	__list_del(entry->prev, entry->next);
+	__hlist_del(entry);
 	if (clear_pending)
-		entry->next = NULL;
-	entry->prev = LIST_POISON2;
+		entry->pprev = NULL;
+	entry->next = LIST_POISON2;
 }
 
 static inline void
 detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
 {
 	detach_timer(timer, true);
-	if (!tbase_get_deferrable(timer->base))
+	if (!(timer->flags & TIMER_DEFERRABLE))
 		base->active_timers--;
 	base->all_timers--;
-	(void)catchup_timer_jiffies(base);
 }
 
 static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -726,13 +730,14 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
 		return 0;
 
 	detach_timer(timer, clear_pending);
-	if (!tbase_get_deferrable(timer->base)) {
+	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		base->active_timers--;
 		if (timer->expires == base->next_timer)
 			base->next_timer = base->timer_jiffies;
 	}
-	base->all_timers--;
-	(void)catchup_timer_jiffies(base);
+	/* If this was the last timer, advance base->timer_jiffies */
+	if (!--base->all_timers)
+		base->timer_jiffies = jiffies;
 	return 1;
 }
 
@@ -744,24 +749,22 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
  * So __run_timers/migrate_timers can safely modify all timers which could
  * be found on ->tvX lists.
  *
- * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * When the timer's base is locked and the timer is removed from the list,
+ * TIMER_MIGRATING is set so lock_timer_base() spins until it is cleared.
  */
 static struct tvec_base *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
 	__acquires(timer->base->lock)
 {
-	struct tvec_base *base;
-
 	for (;;) {
-		struct tvec_base *prelock_base = timer->base;
-		base = tbase_get_base(prelock_base);
-		if (likely(base != NULL)) {
+		u32 tf = timer->flags;
+		struct tvec_base *base;
+
+		if (!(tf & TIMER_MIGRATING)) {
+			base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
 			spin_lock_irqsave(&base->lock, *flags);
-			if (likely(prelock_base == timer->base))
+			if (timer->flags == tf)
 				return base;
-			/* The timer has migrated to another CPU */
 			spin_unlock_irqrestore(&base->lock, *flags);
 		}
 		cpu_relax();
@@ -770,11 +773,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires,
-						bool pending_only, int pinned)
+	    bool pending_only, int pinned)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
-	int ret = 0 , cpu;
+	int ret = 0;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
@@ -787,8 +790,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	debug_activate(timer, expires);
 
-	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu(tvec_bases, cpu);
+	new_base = get_target_base(base, pinned);
 
 	if (base != new_base) {
 		/*
@@ -800,11 +802,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 		 */
 		if (likely(base->running_timer != timer)) {
 			/* See the comment in lock_timer_base() */
-			timer_set_base(timer, NULL);
+			timer->flags |= TIMER_MIGRATING;
+
 			spin_unlock(&base->lock);
 			base = new_base;
 			spin_lock(&base->lock);
-			timer_set_base(timer, base);
+			WRITE_ONCE(timer->flags,
+				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
 		}
 	}
 
@@ -966,13 +970,13 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct tvec_base *base = per_cpu(tvec_bases, cpu);
+	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(timer_pending(timer) || !timer->function);
 	spin_lock_irqsave(&base->lock, flags);
-	timer_set_base(timer, base);
+	timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
 	spin_unlock_irqrestore(&base->lock, flags);
@@ -1037,8 +1041,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
 #ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
-
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -1093,7 +1095,7 @@ int del_timer_sync(struct timer_list *timer)
 	 * don't use it in hardirq context, because it
 	 * could lead to deadlock.
 	 */
-	WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
+	WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
@@ -1107,17 +1109,17 @@ EXPORT_SYMBOL(del_timer_sync);
 static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 {
 	/* cascade all the timers from tv up one level */
-	struct timer_list *timer, *tmp;
-	struct list_head tv_list;
+	struct timer_list *timer;
+	struct hlist_node *tmp;
+	struct hlist_head tv_list;
 
-	list_replace_init(tv->vec + index, &tv_list);
+	hlist_move_list(tv->vec + index, &tv_list);
 
 	/*
 	 * We are removing _all_ timers from the list, so we
 	 * don't have to detach them individually.
 	 */
-	list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
-		BUG_ON(tbase_get_base(timer->base) != base);
+	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
 		/* No accounting, while moving them */
 		__internal_add_timer(base, timer);
 	}
@@ -1182,14 +1184,18 @@ static inline void __run_timers(struct tvec_base *base)
 	struct timer_list *timer;
 
 	spin_lock_irq(&base->lock);
-	if (catchup_timer_jiffies(base)) {
-		spin_unlock_irq(&base->lock);
-		return;
-	}
+
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
-		struct list_head work_list;
-		struct list_head *head = &work_list;
-		int index = base->timer_jiffies & TVR_MASK;
+		struct hlist_head work_list;
+		struct hlist_head *head = &work_list;
+		int index;
+
+		if (!base->all_timers) {
+			base->timer_jiffies = jiffies;
+			break;
+		}
+
+		index = base->timer_jiffies & TVR_MASK;
 
 		/*
 		 * Cascade timers:
@@ -1200,16 +1206,16 @@ static inline void __run_timers(struct tvec_base *base)
 					!cascade(base, &base->tv4, INDEX(2)))
 			cascade(base, &base->tv5, INDEX(3));
 		++base->timer_jiffies;
-		list_replace_init(base->tv1.vec + index, head);
-		while (!list_empty(head)) {
+		hlist_move_list(base->tv1.vec + index, head);
+		while (!hlist_empty(head)) {
 			void (*fn)(unsigned long);
 			unsigned long data;
 			bool irqsafe;
 
-			timer = list_first_entry(head, struct timer_list,entry);
+			timer = hlist_entry(head->first, struct timer_list, entry);
 			fn = timer->function;
 			data = timer->data;
-			irqsafe = tbase_get_irqsafe(timer->base);
+			irqsafe = timer->flags & TIMER_IRQSAFE;
 
 			timer_stats_account_timer(timer);
 
@@ -1248,8 +1254,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
 	/* Look for timer events in tv1. */
 	index = slot = timer_jiffies & TVR_MASK;
 	do {
-		list_for_each_entry(nte, base->tv1.vec + slot, entry) {
-			if (tbase_get_deferrable(nte->base))
+		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
+			if (nte->flags & TIMER_DEFERRABLE)
 				continue;
 
 			found = 1;
@@ -1279,8 +1285,8 @@ cascade:
 
 		index = slot = timer_jiffies & TVN_MASK;
 		do {
-			list_for_each_entry(nte, varp->vec + slot, entry) {
-				if (tbase_get_deferrable(nte->base))
+			hlist_for_each_entry(nte, varp->vec + slot, entry) {
+				if (nte->flags & TIMER_DEFERRABLE)
 					continue;
 
 				found = 1;
@@ -1311,54 +1317,48 @@ cascade:
  * Check, if the next hrtimer event is before the next timer wheel
  * event:
  */
-static unsigned long cmp_next_hrtimer_event(unsigned long now,
-					    unsigned long expires)
+static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 {
-	ktime_t hr_delta = hrtimer_get_next_event();
-	struct timespec tsdelta;
-	unsigned long delta;
-
-	if (hr_delta.tv64 == KTIME_MAX)
-		return expires;
+	u64 nextevt = hrtimer_get_next_event();
 
 	/*
-	 * Expired timer available, let it expire in the next tick
+	 * If high resolution timers are enabled
+	 * hrtimer_get_next_event() returns KTIME_MAX.
 	 */
-	if (hr_delta.tv64 <= 0)
-		return now + 1;
-
-	tsdelta = ktime_to_timespec(hr_delta);
-	delta = timespec_to_jiffies(&tsdelta);
+	if (expires <= nextevt)
+		return expires;
 
 	/*
-	 * Limit the delta to the max value, which is checked in
-	 * tick_nohz_stop_sched_tick():
+	 * If the next timer is already expired, return the tick base
+	 * time so the tick is fired immediately.
 	 */
-	if (delta > NEXT_TIMER_MAX_DELTA)
-		delta = NEXT_TIMER_MAX_DELTA;
+	if (nextevt <= basem)
+		return basem;
 
 	/*
-	 * Take rounding errors in to account and make sure, that it
-	 * expires in the next tick. Otherwise we go into an endless
-	 * ping pong due to tick_nohz_stop_sched_tick() retriggering
-	 * the timer softirq
+	 * Round up to the next jiffy. High resolution timers are
+	 * off, so the hrtimers are expired in the tick and we need to
+	 * make sure that this tick really expires the timer to avoid
+	 * a ping pong of the nohz stop code.
+	 *
+	 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
 	 */
-	if (delta < 1)
-		delta = 1;
-	now += delta;
-	if (time_before(now, expires))
-		return now;
-	return expires;
+	return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
 }
 
 /**
- * get_next_timer_interrupt - return the jiffy of the next pending timer
- * @now: current time (in jiffies)
+ * get_next_timer_interrupt - return the time (clock mono) of the next timer
+ * @basej:	base time jiffies
+ * @basem:	base time clock monotonic
+ *
+ * Returns the tick aligned clock monotonic time of the next pending
+ * timer or KTIME_MAX if no timer is pending.
  */
-unsigned long get_next_timer_interrupt(unsigned long now)
+u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct tvec_base *base = __this_cpu_read(tvec_bases);
-	unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
+	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+	u64 expires = KTIME_MAX;
+	unsigned long nextevt;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1371,14 +1371,15 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 	if (base->active_timers) {
 		if (time_before_eq(base->next_timer, base->timer_jiffies))
 			base->next_timer = __next_timer_interrupt(base);
-		expires = base->next_timer;
+		nextevt = base->next_timer;
+		if (time_before_eq(nextevt, basej))
+			expires = basem;
+		else
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 	}
 	spin_unlock(&base->lock);
 
-	if (time_before_eq(expires, now))
-		return now;
-
-	return cmp_next_hrtimer_event(now, expires);
+	return cmp_next_hrtimer_event(basem, expires);
 }
 #endif
 
@@ -1407,9 +1408,7 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct tvec_base *base = __this_cpu_read(tvec_bases);
-
-	hrtimer_run_pending();
+	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
@@ -1545,15 +1544,16 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
+static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
 {
 	struct timer_list *timer;
+	int cpu = new_base->cpu;
 
-	while (!list_empty(head)) {
-		timer = list_first_entry(head, struct timer_list, entry);
+	while (!hlist_empty(head)) {
+		timer = hlist_entry(head->first, struct timer_list, entry);
 		/* We ignore the accounting on the dying cpu */
 		detach_timer(timer, false);
-		timer_set_base(timer, new_base);
+		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		internal_add_timer(new_base, timer);
 	}
 }
@@ -1565,8 +1565,8 @@ static void migrate_timers(int cpu)
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu(tvec_bases, cpu);
-	new_base = get_cpu_var(tvec_bases);
+	old_base = per_cpu_ptr(&tvec_bases, cpu);
+	new_base = get_cpu_ptr(&tvec_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
@@ -1590,7 +1590,7 @@ static void migrate_timers(int cpu)
 
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
-	put_cpu_var(tvec_bases);
+	put_cpu_ptr(&tvec_bases);
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1616,52 +1616,27 @@ static inline void timer_register_cpu_notifier(void)
 static inline void timer_register_cpu_notifier(void) { }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+static void __init init_timer_cpu(int cpu)
 {
-	int j;
-
-	BUG_ON(base != tbase_get_base(base));
+	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
 
 	base->cpu = cpu;
-	per_cpu(tvec_bases, cpu) = base;
 	spin_lock_init(&base->lock);
 
-	for (j = 0; j < TVN_SIZE; j++) {
-		INIT_LIST_HEAD(base->tv5.vec + j);
-		INIT_LIST_HEAD(base->tv4.vec + j);
-		INIT_LIST_HEAD(base->tv3.vec + j);
-		INIT_LIST_HEAD(base->tv2.vec + j);
-	}
-	for (j = 0; j < TVR_SIZE; j++)
-		INIT_LIST_HEAD(base->tv1.vec + j);
-
 	base->timer_jiffies = jiffies;
 	base->next_timer = base->timer_jiffies;
 }
 
 static void __init init_timer_cpus(void)
 {
-	struct tvec_base *base;
-	int local_cpu = smp_processor_id();
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		if (cpu == local_cpu)
-			base = &boot_tvec_bases;
-#ifdef CONFIG_SMP
-		else
-			base = per_cpu_ptr(&__tvec_bases, cpu);
-#endif
-
-		init_timer_cpu(base, cpu);
-	}
+	for_each_possible_cpu(cpu)
+		init_timer_cpu(cpu);
 }
 
 void __init init_timers(void)
 {
-	/* ensure there are enough low bits for flags in timer->base pointer */
-	BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
-
 	init_timer_cpus();
 	init_timer_stats();
 	timer_register_cpu_notifier();
@@ -1697,14 +1672,14 @@ unsigned long msleep_interruptible(unsigned int msecs)
 
 EXPORT_SYMBOL(msleep_interruptible);
 
-static int __sched do_usleep_range(unsigned long min, unsigned long max)
+static void __sched do_usleep_range(unsigned long min, unsigned long max)
 {
 	ktime_t kmin;
 	unsigned long delta;
 
 	kmin = ktime_set(0, min * NSEC_PER_USEC);
 	delta = (max - min) * NSEC_PER_USEC;
-	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+	schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
 }
 
 /**
@@ -1712,7 +1687,7 @@ static int __sched do_usleep_range(unsigned long min, unsigned long max)
  * @min: Minimum time in usecs to sleep
  * @max: Maximum time in usecs to sleep
  */
-void usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range(unsigned long min, unsigned long max)
 {
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	do_usleep_range(min, max);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index e878c2e0b..a4536e1e3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -29,19 +29,24 @@ struct timer_list_iter {
 
 typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
 
-DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
-
 /*
  * This allows printing both to /proc/timer_list and
  * to the console (on SysRq-Q):
  */
-#define SEQ_printf(m, x...)			\
- do {						\
-	if (m)					\
-		seq_printf(m, x);		\
-	else					\
-		printk(x);			\
- } while (0)
+__printf(2, 3)
+static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+
+	if (m)
+		seq_vprintf(m, fmt, args);
+	else
+		vprintk(fmt, args);
+
+	va_end(args);
+}
 
 static void print_name_offset(struct seq_file *m, void *sym)
 {
@@ -120,10 +125,10 @@ static void
 print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
 {
 	SEQ_printf(m, "  .base:       %pK\n", base);
-	SEQ_printf(m, "  .index:      %d\n",
-			base->index);
-	SEQ_printf(m, "  .resolution: %Lu nsecs\n",
-			(unsigned long long)ktime_to_ns(base->resolution));
+	SEQ_printf(m, "  .index:      %d\n", base->index);
+
+	SEQ_printf(m, "  .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+
 	SEQ_printf(m,   "  .get_time:   ");
 	print_name_offset(m, base->get_time);
 	SEQ_printf(m,   "\n");
@@ -158,7 +163,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 	P(nr_events);
 	P(nr_retries);
 	P(nr_hangs);
-	P_ns(max_hang_time);
+	P(max_hang_time);
 #endif
 #undef P
 #undef P_ns
@@ -184,7 +189,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 		P_ns(idle_sleeptime);
 		P_ns(iowait_sleeptime);
 		P(last_jiffies);
-		P(next_jiffies);
+		P(next_timer);
 		P_ns(idle_expires);
 		SEQ_printf(m, "jiffies: %Lu\n",
 			   (unsigned long long)jiffies);
@@ -251,6 +256,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 			SEQ_printf(m, "\n");
 		}
 
+		if (dev->set_state_oneshot_stopped) {
+			SEQ_printf(m, " oneshot stopped: ");
+			print_name_offset(m, dev->set_state_oneshot_stopped);
+			SEQ_printf(m, "\n");
+		}
+
 		if (dev->tick_resume) {
 			SEQ_printf(m, " resume:   ");
 			print_name_offset(m, dev->tick_resume);
@@ -269,11 +280,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
 {
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 	print_tickdevice(m, tick_get_broadcast_device(), -1);
-	SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
-		   cpumask_bits(tick_get_broadcast_mask())[0]);
+	SEQ_printf(m, "tick_broadcast_mask: %*pb\n",
+		   cpumask_pr_args(tick_get_broadcast_mask()));
 #ifdef CONFIG_TICK_ONESHOT
-	SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
-		   cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+	SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n",
+		   cpumask_pr_args(tick_get_broadcast_oneshot_mask()));
 #endif
 	SEQ_printf(m, "\n");
 #endif
@@ -282,7 +293,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
 
 static inline void timer_list_header(struct seq_file *m, u64 now)
 {
-	SEQ_printf(m, "Timer List Version: v0.7\n");
+	SEQ_printf(m, "Timer List Version: v0.8\n");
 	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
 	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
 	SEQ_printf(m, "\n");
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1fb08f213..1adecb4b8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,7 +68,7 @@ struct entry {
 	 * Number of timeout events:
 	 */
 	unsigned long		count;
-	unsigned int		timer_flag;
+	u32			flags;
 
 	/*
 	 * We save the command-line string to preserve
@@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
  * @startf:	pointer to the function which did the timer setup
  * @timerf:	pointer to the timer callback function of the timer
  * @comm:	name of the process which set up the timer
+ * @tflags:	The flags field of the timer
  *
  * When the timer is already registered, then the event counter is
  * incremented. Otherwise the timer is registered in a free slot.
  */
 void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-			      void *timerf, char *comm,
-			      unsigned int timer_flag)
+			      void *timerf, char *comm, u32 tflags)
 {
 	/*
 	 * It doesn't matter which lock we take:
@@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	input.start_func = startf;
 	input.expire_func = timerf;
 	input.pid = pid;
-	input.timer_flag = timer_flag;
+	input.flags = tflags;
 
 	raw_spin_lock_irqsave(lock, flags);
 	if (!timer_stats_active)
@@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v)
 
 	for (i = 0; i < nr_entries; i++) {
 		entry = entries + i;
-		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+		if (entry->flags & TIMER_DEFERRABLE) {
 			seq_printf(m, "%4luD, %5d %-16s ",
 				entry->count, entry->pid, entry->comm);
 		} else {
diff --git a/kernel/torture.c b/kernel/torture.c
index dd70993c2..3e4840633 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void);
  */
 void torture_shutdown_absorb(const char *title)
 {
-	while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+	while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
 		pr_notice("torture thread %s parking due to system shutdown\n",
 			  title);
 		schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1,
 				   unsigned long unused2, void *unused3)
 {
 	mutex_lock(&fullstop_mutex);
-	if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+	if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
 		VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
-		ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+		WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN);
 	} else {
 		pr_warn("Concurrent rmmod and shutdown illegal!\n");
 	}
@@ -523,13 +523,13 @@ static int stutter;
  */
 void stutter_wait(const char *title)
 {
-	while (ACCESS_ONCE(stutter_pause_test) ||
-	       (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+	while (READ_ONCE(stutter_pause_test) ||
+	       (torture_runnable && !READ_ONCE(*torture_runnable))) {
 		if (stutter_pause_test)
-			if (ACCESS_ONCE(stutter_pause_test) == 1)
+			if (READ_ONCE(stutter_pause_test) == 1)
 				schedule_timeout_interruptible(1);
 			else
-				while (ACCESS_ONCE(stutter_pause_test))
+				while (READ_ONCE(stutter_pause_test))
 					cond_resched();
 		else
 			schedule_timeout_interruptible(round_jiffies_relative(HZ));
@@ -549,14 +549,14 @@ static int torture_stutter(void *arg)
 		if (!torture_must_stop()) {
 			if (stutter > 1) {
 				schedule_timeout_interruptible(stutter - 1);
-				ACCESS_ONCE(stutter_pause_test) = 2;
+				WRITE_ONCE(stutter_pause_test, 2);
 			}
 			schedule_timeout_interruptible(1);
-			ACCESS_ONCE(stutter_pause_test) = 1;
+			WRITE_ONCE(stutter_pause_test, 1);
 		}
 		if (!torture_must_stop())
 			schedule_timeout_interruptible(stutter);
-		ACCESS_ONCE(stutter_pause_test) = 0;
+		WRITE_ONCE(stutter_pause_test, 0);
 		torture_shutdown_absorb("torture_stutter");
 	} while (!torture_must_stop());
 	torture_kthread_stopping("torture_stutter");
@@ -642,13 +642,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
 bool torture_cleanup_begin(void)
 {
 	mutex_lock(&fullstop_mutex);
-	if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+	if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
 		pr_warn("Concurrent rmmod and shutdown illegal!\n");
 		mutex_unlock(&fullstop_mutex);
 		schedule_timeout_uninterruptible(10);
 		return true;
 	}
-	ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+	WRITE_ONCE(fullstop, FULLSTOP_RMMOD);
 	mutex_unlock(&fullstop_mutex);
 	torture_shutdown_cleanup();
 	torture_shuffle_cleanup();
@@ -681,7 +681,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop);
  */
 bool torture_must_stop_irq(void)
 {
-	return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+	return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP;
 }
 EXPORT_SYMBOL_GPL(torture_must_stop_irq);
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 483cecfa5..b3e6b39b6 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -439,7 +439,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	struct dentry *dir = NULL;
-	int ret, i;
+	int ret;
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
@@ -451,9 +451,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	 * some device names have larger paths - convert the slashes
 	 * to underscores for this to work as expected
 	 */
-	for (i = 0; i < strlen(buts->name); i++)
-		if (buts->name[i] == '/')
-			buts->name[i] = '_';
+	strreplace(buts->name, '/', '_');
 
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 	if (!bt)
@@ -1450,14 +1448,14 @@ static struct trace_event trace_blk_event = {
 
 static int __init init_blk_tracer(void)
 {
-	if (!register_ftrace_event(&trace_blk_event)) {
+	if (!register_trace_event(&trace_blk_event)) {
 		pr_warning("Warning: could not register block events\n");
 		return 1;
 	}
 
 	if (register_tracer(&blk_tracer) != 0) {
 		pr_warning("Warning: could not register the block tracer\n");
-		unregister_ftrace_event(&trace_blk_event);
+		unregister_trace_event(&trace_blk_event);
 		return 1;
 	}
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d56ce501..88a041ade 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -79,18 +79,6 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-	/* NMI safe access to clock monotonic */
-	return ktime_get_mono_fast_ns();
-}
-
-static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
-	.func		= bpf_ktime_get_ns,
-	.gpl_only	= true,
-	.ret_type	= RET_INTEGER,
-};
-
 /*
  * limited trace_printk()
  * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
@@ -159,6 +147,17 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
 	.arg2_type	= ARG_CONST_STACK_SIZE,
 };
 
+const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+{
+	/*
+	 * This program might be calling bpf_trace_printk(),
+	 * so allocate the per-cpu trace_printk buffers.
+	 */
+	trace_printk_init_buffers();
+
+	return &bpf_trace_printk_proto;
+}
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -172,15 +171,18 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_probe_read_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
-
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
+	case BPF_FUNC_get_current_pid_tgid:
+		return &bpf_get_current_pid_tgid_proto;
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_get_current_comm:
+		return &bpf_get_current_comm_proto;
 	case BPF_FUNC_trace_printk:
-		/*
-		 * this program might be calling bpf_trace_printk,
-		 * so allocate per-cpu printk buffers
-		 */
-		trace_printk_init_buffers();
-
-		return &bpf_trace_printk_proto;
+		return bpf_get_trace_printk_proto();
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
 	default:
 		return NULL;
 	}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0315d4317..6260717c1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
 #include <linux/trace_seq.h>
@@ -115,63 +115,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
  *
  */
 
-/*
- * A fast way to enable or disable all ring buffers is to
- * call tracing_on or tracing_off. Turning off the ring buffers
- * prevents all ring buffers from being recorded to.
- * Turning this switch on, makes it OK to write to the
- * ring buffer, if the ring buffer is enabled itself.
- *
- * There's three layers that must be on in order to write
- * to the ring buffer.
- *
- * 1) This global flag must be set.
- * 2) The ring buffer must be enabled for recording.
- * 3) The per cpu buffer must be enabled for recording.
- *
- * In case of an anomaly, this global flag has a bit set that
- * will permantly disable all ring buffers.
- */
-
-/*
- * Global flag to disable all recording to ring buffers
- *  This has two bits: ON, DISABLED
- *
- *  ON   DISABLED
- * ---- ----------
- *   0      0        : ring buffers are off
- *   1      0        : ring buffers are on
- *   X      1        : ring buffers are permanently disabled
- */
-
-enum {
-	RB_BUFFERS_ON_BIT	= 0,
-	RB_BUFFERS_DISABLED_BIT	= 1,
-};
-
-enum {
-	RB_BUFFERS_ON		= 1 << RB_BUFFERS_ON_BIT,
-	RB_BUFFERS_DISABLED	= 1 << RB_BUFFERS_DISABLED_BIT,
-};
-
-static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
-
 /* Used for individual buffers (after the counter) */
 #define RB_BUFFER_OFF		(1 << 20)
 
 #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
 
-/**
- * tracing_off_permanent - permanently disable ring buffers
- *
- * This function, once called, will disable all ring buffers
- * permanently.
- */
-void tracing_off_permanent(void)
-{
-	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
-}
-
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -452,6 +400,23 @@ struct rb_irq_work {
 };
 
 /*
+ * Identifies which context an event is being recorded in.
+ *  NMI     = 0
+ *  IRQ     = 1
+ *  SOFTIRQ = 2
+ *  NORMAL  = 3
+ *
+ * See trace_recursive_lock() comment below for more details.
+ */
+enum {
+	RB_CTX_NMI,
+	RB_CTX_IRQ,
+	RB_CTX_SOFTIRQ,
+	RB_CTX_NORMAL,
+	RB_CTX_MAX
+};
+
+/*
  * head_page == tail_page && head == tail then buffer is empty.
  */
 struct ring_buffer_per_cpu {
@@ -462,6 +427,7 @@ struct ring_buffer_per_cpu {
 	arch_spinlock_t			lock;
 	struct lock_class_key		lock_key;
 	unsigned int			nr_pages;
+	unsigned int			current_context;
 	struct list_head		*pages;
 	struct buffer_page		*head_page;	/* read from head */
 	struct buffer_page		*tail_page;	/* write to tail */
@@ -2224,7 +2190,7 @@ static unsigned rb_calculate_event_length(unsigned length)
 
 	/* zero length can cause confusions */
 	if (!length)
-		length = 1;
+		length++;
 
 	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
 		length += sizeof(event.array[0]);
@@ -2636,8 +2602,6 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	return NULL;
 }
 
-#ifdef CONFIG_TRACING
-
 /*
  * The lock and unlock are done within a preempt disable section.
  * The current_context per_cpu variable can only be modified
@@ -2675,44 +2639,38 @@ rb_reserve_next_event(struct ring_buffer *buffer,
  * just so happens that it is the same bit corresponding to
  * the current context.
  */
-static DEFINE_PER_CPU(unsigned int, current_context);
 
-static __always_inline int trace_recursive_lock(void)
+static __always_inline int
+trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	unsigned int val = __this_cpu_read(current_context);
+	unsigned int val = cpu_buffer->current_context;
 	int bit;
 
 	if (in_interrupt()) {
 		if (in_nmi())
-			bit = 0;
+			bit = RB_CTX_NMI;
 		else if (in_irq())
-			bit = 1;
+			bit = RB_CTX_IRQ;
 		else
-			bit = 2;
+			bit = RB_CTX_SOFTIRQ;
 	} else
-		bit = 3;
+		bit = RB_CTX_NORMAL;
 
 	if (unlikely(val & (1 << bit)))
 		return 1;
 
 	val |= (1 << bit);
-	__this_cpu_write(current_context, val);
+	cpu_buffer->current_context = val;
 
 	return 0;
 }
 
-static __always_inline void trace_recursive_unlock(void)
+static __always_inline void
+trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	__this_cpu_and(current_context, __this_cpu_read(current_context) - 1);
+	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
 }
 
-#else
-
-#define trace_recursive_lock()		(0)
-#define trace_recursive_unlock()	do { } while (0)
-
-#endif
-
 /**
  * ring_buffer_lock_reserve - reserve a part of the buffer
  * @buffer: the ring buffer to reserve from
@@ -2735,41 +2693,37 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	struct ring_buffer_event *event;
 	int cpu;
 
-	if (ring_buffer_flags != RB_BUFFERS_ON)
-		return NULL;
-
 	/* If we are tracing schedule, we don't want to recurse */
 	preempt_disable_notrace();
 
-	if (atomic_read(&buffer->record_disabled))
-		goto out_nocheck;
-
-	if (trace_recursive_lock())
-		goto out_nocheck;
+	if (unlikely(atomic_read(&buffer->record_disabled)))
+		goto out;
 
 	cpu = raw_smp_processor_id();
 
-	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+	if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask)))
 		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
 
-	if (atomic_read(&cpu_buffer->record_disabled))
+	if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
 		goto out;
 
-	if (length > BUF_MAX_DATA_SIZE)
+	if (unlikely(length > BUF_MAX_DATA_SIZE))
+		goto out;
+
+	if (unlikely(trace_recursive_lock(cpu_buffer)))
 		goto out;
 
 	event = rb_reserve_next_event(buffer, cpu_buffer, length);
 	if (!event)
-		goto out;
+		goto out_unlock;
 
 	return event;
 
+ out_unlock:
+	trace_recursive_unlock(cpu_buffer);
  out:
-	trace_recursive_unlock();
-
- out_nocheck:
 	preempt_enable_notrace();
 	return NULL;
 }
@@ -2859,7 +2813,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	rb_wakeups(buffer, cpu_buffer);
 
-	trace_recursive_unlock();
+	trace_recursive_unlock(cpu_buffer);
 
 	preempt_enable_notrace();
 
@@ -2970,7 +2924,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
  out:
 	rb_end_commit(cpu_buffer);
 
-	trace_recursive_unlock();
+	trace_recursive_unlock(cpu_buffer);
 
 	preempt_enable_notrace();
 
@@ -3000,9 +2954,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	int ret = -EBUSY;
 	int cpu;
 
-	if (ring_buffer_flags != RB_BUFFERS_ON)
-		return -EBUSY;
-
 	preempt_disable_notrace();
 
 	if (atomic_read(&buffer->record_disabled))
@@ -3021,9 +2972,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	if (length > BUF_MAX_DATA_SIZE)
 		goto out;
 
+	if (unlikely(trace_recursive_lock(cpu_buffer)))
+		goto out;
+
 	event = rb_reserve_next_event(buffer, cpu_buffer, length);
 	if (!event)
-		goto out;
+		goto out_unlock;
 
 	body = rb_event_data(event);
 
@@ -3034,6 +2988,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	rb_wakeups(buffer, cpu_buffer);
 
 	ret = 0;
+
+ out_unlock:
+	trace_recursive_unlock(cpu_buffer);
+
  out:
 	preempt_enable_notrace();
 
@@ -3860,19 +3818,36 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
 
-static inline int rb_ok_to_lock(void)
+static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
+	if (likely(!in_nmi())) {
+		raw_spin_lock(&cpu_buffer->reader_lock);
+		return true;
+	}
+
 	/*
 	 * If an NMI die dumps out the content of the ring buffer
-	 * do not grab locks. We also permanently disable the ring
-	 * buffer too. A one time deal is all you get from reading
-	 * the ring buffer from an NMI.
+	 * trylock must be used to prevent a deadlock if the NMI
+	 * preempted a task that holds the ring buffer locks. If
+	 * we get the lock then all is fine. If not, we continue
+	 * to do the read, but this can corrupt the ring buffer,
+	 * so it must be permanently disabled from future writes.
+	 * Reading from NMI is a one-shot deal.
 	 */
-	if (likely(!in_nmi()))
-		return 1;
+	if (raw_spin_trylock(&cpu_buffer->reader_lock))
+		return true;
 
-	tracing_off_permanent();
-	return 0;
+	/* Continue without locking, but disable the ring buffer */
+	atomic_inc(&cpu_buffer->record_disabled);
+	return false;
+}
+
+static inline void
+rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
+{
+	if (likely(locked))	/* false if rb_reader_lock()'s trylock failed */
+		raw_spin_unlock(&cpu_buffer->reader_lock);
+	return;
 }
 
 /**
@@ -3892,21 +3867,18 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
 	unsigned long flags;
-	int dolock;
+	bool dolock;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
-	dolock = rb_ok_to_lock();
  again:
 	local_irq_save(flags);
-	if (dolock)
-		raw_spin_lock(&cpu_buffer->reader_lock);
+	dolock = rb_reader_lock(cpu_buffer);
 	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
 	if (event && event->type_len == RINGBUF_TYPE_PADDING)
 		rb_advance_reader(cpu_buffer);
-	if (dolock)
-		raw_spin_unlock(&cpu_buffer->reader_lock);
+	rb_reader_unlock(cpu_buffer, dolock);
 	local_irq_restore(flags);
 
 	if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3959,9 +3931,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event = NULL;
 	unsigned long flags;
-	int dolock;
-
-	dolock = rb_ok_to_lock();
+	bool dolock;
 
  again:
 	/* might be called in atomic */
@@ -3972,8 +3942,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 
 	cpu_buffer = buffer->buffers[cpu];
 	local_irq_save(flags);
-	if (dolock)
-		raw_spin_lock(&cpu_buffer->reader_lock);
+	dolock = rb_reader_lock(cpu_buffer);
 
 	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
 	if (event) {
@@ -3981,8 +3950,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 		rb_advance_reader(cpu_buffer);
 	}
 
-	if (dolock)
-		raw_spin_unlock(&cpu_buffer->reader_lock);
+	rb_reader_unlock(cpu_buffer, dolock);
 	local_irq_restore(flags);
 
  out:
@@ -4263,21 +4231,17 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	unsigned long flags;
-	int dolock;
+	bool dolock;
 	int cpu;
 	int ret;
 
-	dolock = rb_ok_to_lock();
-
 	/* yes this is racy, but if you don't like the race, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		local_irq_save(flags);
-		if (dolock)
-			raw_spin_lock(&cpu_buffer->reader_lock);
+		dolock = rb_reader_lock(cpu_buffer);
 		ret = rb_per_cpu_empty(cpu_buffer);
-		if (dolock)
-			raw_spin_unlock(&cpu_buffer->reader_lock);
+		rb_reader_unlock(cpu_buffer, dolock);
 		local_irq_restore(flags);
 
 		if (!ret)
@@ -4297,21 +4261,17 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	unsigned long flags;
-	int dolock;
+	bool dolock;
 	int ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 1;
 
-	dolock = rb_ok_to_lock();
-
 	cpu_buffer = buffer->buffers[cpu];
 	local_irq_save(flags);
-	if (dolock)
-		raw_spin_lock(&cpu_buffer->reader_lock);
+	dolock = rb_reader_lock(cpu_buffer);
 	ret = rb_per_cpu_empty(cpu_buffer);
-	if (dolock)
-		raw_spin_unlock(&cpu_buffer->reader_lock);
+	rb_reader_unlock(cpu_buffer, dolock);
 	local_irq_restore(flags);
 
 	return ret;
@@ -4349,9 +4309,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 
 	ret = -EAGAIN;
 
-	if (ring_buffer_flags != RB_BUFFERS_ON)
-		goto out;
-
 	if (atomic_read(&buffer_a->record_disabled))
 		goto out;
 
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 1b28df2d9..a1503a027 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -32,11 +32,11 @@ static struct task_struct *producer;
 static struct task_struct *consumer;
 static unsigned long read;
 
-static int disable_reader;
+static unsigned int disable_reader;
 module_param(disable_reader, uint, 0644);
 MODULE_PARM_DESC(disable_reader, "only run producer");
 
-static int write_iteration = 50;
+static unsigned int write_iteration = 50;
 module_param(write_iteration, uint, 0644);
 MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
 
@@ -46,16 +46,16 @@ static int consumer_nice = MAX_NICE;
 static int producer_fifo = -1;
 static int consumer_fifo = -1;
 
-module_param(producer_nice, uint, 0644);
+module_param(producer_nice, int, 0644);
 MODULE_PARM_DESC(producer_nice, "nice prio for producer");
 
-module_param(consumer_nice, uint, 0644);
+module_param(consumer_nice, int, 0644);
 MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
 
-module_param(producer_fifo, uint, 0644);
+module_param(producer_fifo, int, 0644);
 MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
 
-module_param(consumer_fifo, uint, 0644);
+module_param(consumer_fifo, int, 0644);
 MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
 
 static int read_events;
@@ -263,6 +263,8 @@ static void ring_buffer_producer(void)
 		if (cnt % wakeup_interval)
 			cond_resched();
 #endif
+		if (kthread_should_stop())
+			kill_test = 1;
 
 	} while (ktime_before(end_time, timeout) && !kill_test);
 	trace_printk("End ring buffer hammer\n");
@@ -285,7 +287,7 @@ static void ring_buffer_producer(void)
 	entries = ring_buffer_entries(buffer);
 	overruns = ring_buffer_overruns(buffer);
 
-	if (kill_test)
+	if (kill_test && !kthread_should_stop())
 		trace_printk("ERROR!\n");
 
 	if (!disable_reader) {
@@ -379,7 +381,7 @@ static int ring_buffer_consumer_thread(void *arg)
 	}
 	__set_current_state(TASK_RUNNING);
 
-	if (kill_test)
+	if (!kthread_should_stop())
 		wait_to_die();
 
 	return 0;
@@ -399,13 +401,16 @@ static int ring_buffer_producer_thread(void *arg)
 		}
 
 		ring_buffer_producer();
+		if (kill_test)
+			goto out_kill;
 
 		trace_printk("Sleeping for 10 secs\n");
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ * SLEEP_TIME);
 	}
 
-	if (kill_test)
+out_kill:
+	if (!kthread_should_stop())
 		wait_to_die();
 
 	return 0;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 05330494a..abcbf7ff8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -297,11 +297,11 @@ void trace_array_put(struct trace_array *this_tr)
 	mutex_unlock(&trace_types_lock);
 }
 
-int filter_check_discard(struct ftrace_event_file *file, void *rec,
+int filter_check_discard(struct trace_event_file *file, void *rec,
 			 struct ring_buffer *buffer,
 			 struct ring_buffer_event *event)
 {
-	if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
+	if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
 	    !filter_match_preds(file->filter, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
@@ -311,7 +311,7 @@ int filter_check_discard(struct ftrace_event_file *file, void *rec,
 }
 EXPORT_SYMBOL_GPL(filter_check_discard);
 
-int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
+int call_filter_check_discard(struct trace_event_call *call, void *rec,
 			      struct ring_buffer *buffer,
 			      struct ring_buffer_event *event)
 {
@@ -876,6 +876,7 @@ static struct {
 	{ trace_clock_jiffies,		"uptime",	0 },
 	{ trace_clock,			"perf",		1 },
 	{ ktime_get_mono_fast_ns,	"mono",		1 },
+	{ ktime_get_raw_fast_ns,	"mono_raw",	1 },
 	ARCH_TRACE_CLOCKS
 };
 
@@ -1693,13 +1694,13 @@ static struct ring_buffer *temp_buffer;
 
 struct ring_buffer_event *
 trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
-			  struct ftrace_event_file *ftrace_file,
+			  struct trace_event_file *trace_file,
 			  int type, unsigned long len,
 			  unsigned long flags, int pc)
 {
 	struct ring_buffer_event *entry;
 
-	*current_rb = ftrace_file->tr->trace_buffer.buffer;
+	*current_rb = trace_file->tr->trace_buffer.buffer;
 	entry = trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 	/*
@@ -1708,7 +1709,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 	 * to store the trace event for the tigger to use. It's recusive
 	 * safe and will not be recorded anywhere.
 	 */
-	if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+	if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
 		*current_rb = temp_buffer;
 		entry = trace_buffer_lock_reserve(*current_rb,
 						  type, len, flags, pc);
@@ -1760,7 +1761,7 @@ trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       int pc)
 {
-	struct ftrace_event_call *call = &event_function;
+	struct trace_event_call *call = &event_function;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
@@ -1795,7 +1796,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 				 unsigned long flags,
 				 int skip, int pc, struct pt_regs *regs)
 {
-	struct ftrace_event_call *call = &event_kernel_stack;
+	struct trace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
@@ -1923,7 +1924,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
 void
 ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 {
-	struct ftrace_event_call *call = &event_user_stack;
+	struct trace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
@@ -2129,7 +2130,7 @@ static void trace_printk_start_stop_comm(int enabled)
  */
 int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 {
-	struct ftrace_event_call *call = &event_bprint;
+	struct trace_event_call *call = &event_bprint;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
 	struct trace_array *tr = &global_trace;
@@ -2187,7 +2188,7 @@ static int
 __trace_array_vprintk(struct ring_buffer *buffer,
 		      unsigned long ip, const char *fmt, va_list args)
 {
-	struct ftrace_event_call *call = &event_print;
+	struct trace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
 	int len = 0, size, pc;
 	struct print_entry *entry;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 921691c5c..74bde8160 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -12,7 +12,7 @@
 #include <linux/ftrace.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/trace_seq.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/compiler.h>
 #include <linux/trace_seq.h>
 
@@ -211,8 +211,8 @@ struct trace_array {
 #ifdef CONFIG_FTRACE_SYSCALLS
 	int			sys_refcount_enter;
 	int			sys_refcount_exit;
-	struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
-	struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
+	struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
+	struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
 #endif
 	int			stop_count;
 	int			clock_id;
@@ -859,7 +859,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops);
 #define ftrace_destroy_filter_files(ops) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
 
-int ftrace_event_is_function(struct ftrace_event_call *call);
+int ftrace_event_is_function(struct trace_event_call *call);
 
 /*
  * struct trace_parser - servers for reading the user input separated by spaces
@@ -993,7 +993,7 @@ struct event_subsystem {
 	int			ref_count;
 };
 
-struct ftrace_subsystem_dir {
+struct trace_subsystem_dir {
 	struct list_head		list;
 	struct event_subsystem		*subsystem;
 	struct trace_array		*tr;
@@ -1053,30 +1053,30 @@ struct filter_pred {
 
 extern enum regex_type
 filter_parse_regex(char *buff, int len, char **search, int *not);
-extern void print_event_filter(struct ftrace_event_file *file,
+extern void print_event_filter(struct trace_event_file *file,
 			       struct trace_seq *s);
-extern int apply_event_filter(struct ftrace_event_file *file,
+extern int apply_event_filter(struct trace_event_file *file,
 			      char *filter_string);
-extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
 					char *filter_string);
 extern void print_subsystem_event_filter(struct event_subsystem *system,
 					 struct trace_seq *s);
 extern int filter_assign_type(const char *type);
-extern int create_event_filter(struct ftrace_event_call *call,
+extern int create_event_filter(struct trace_event_call *call,
 			       char *filter_str, bool set_str,
 			       struct event_filter **filterp);
 extern void free_event_filter(struct event_filter *filter);
 
 struct ftrace_event_field *
-trace_find_event_field(struct ftrace_event_call *call, char *name);
+trace_find_event_field(struct trace_event_call *call, char *name);
 
 extern void trace_event_enable_cmd_record(bool enable);
 extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
 extern int event_trace_del_tracer(struct trace_array *tr);
 
-extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
-						 const char *system,
-						 const char *event);
+extern struct trace_event_file *find_event_file(struct trace_array *tr,
+						const char *system,
+						const char *event);
 
 static inline void *event_file_data(struct file *filp)
 {
@@ -1181,7 +1181,7 @@ struct event_trigger_ops {
  *	commands need to do this if they themselves log to the trace
  *	buffer (see the @post_trigger() member below).  @trigger_type
  *	values are defined by adding new values to the trigger_type
- *	enum in include/linux/ftrace_event.h.
+ *	enum in include/linux/trace_events.h.
  *
  * @post_trigger: A flag that says whether or not this command needs
  *	to have its action delayed until after the current event has
@@ -1243,23 +1243,23 @@ struct event_command {
 	enum event_trigger_type	trigger_type;
 	bool			post_trigger;
 	int			(*func)(struct event_command *cmd_ops,
-					struct ftrace_event_file *file,
+					struct trace_event_file *file,
 					char *glob, char *cmd, char *params);
 	int			(*reg)(char *glob,
 				       struct event_trigger_ops *ops,
 				       struct event_trigger_data *data,
-				       struct ftrace_event_file *file);
+				       struct trace_event_file *file);
 	void			(*unreg)(char *glob,
 					 struct event_trigger_ops *ops,
 					 struct event_trigger_data *data,
-					 struct ftrace_event_file *file);
+					 struct trace_event_file *file);
 	int			(*set_filter)(char *filter_str,
 					      struct event_trigger_data *data,
-					      struct ftrace_event_file *file);
+					      struct trace_event_file *file);
 	struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
 };
 
-extern int trace_event_enable_disable(struct ftrace_event_file *file,
+extern int trace_event_enable_disable(struct trace_event_file *file,
 				      int enable, int soft_disable);
 extern int tracing_alloc_snapshot(void);
 
@@ -1287,7 +1287,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\
-	extern struct ftrace_event_call					\
+	extern struct trace_event_call					\
 	__aligned(4) event_##call;
 #undef FTRACE_ENTRY_DUP
 #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter)	\
@@ -1296,7 +1296,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 #include "trace_entries.h"
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
-int perf_ftrace_event_register(struct ftrace_event_call *call,
+int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data);
 #else
 #define perf_ftrace_event_register NULL
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 1879980f0..e2e12ad31 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -29,7 +29,7 @@ static struct trace_array *branch_tracer;
 static void
 probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
-	struct ftrace_event_call *call = &event_branch;
+	struct trace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
 	struct trace_array_cpu *data;
 	struct ring_buffer_event *event;
@@ -194,7 +194,7 @@ __init static int init_branch_tracer(void)
 {
 	int ret;
 
-	ret = register_ftrace_event(&trace_branch_event);
+	ret = register_trace_event(&trace_branch_event);
 	if (!ret) {
 		printk(KERN_WARNING "Warning: could not register "
 				    "branch events\n");
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 57b67b1f2..0f06532a7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -56,6 +56,7 @@ u64 notrace trace_clock(void)
 {
 	return local_clock();
 }
+EXPORT_SYMBOL_GPL(trace_clock);
 
 /*
  * trace_jiffy_clock(): Simply use jiffies as a clock counter.
@@ -68,6 +69,7 @@ u64 notrace trace_clock_jiffies(void)
 {
 	return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
 }
+EXPORT_SYMBOL_GPL(trace_clock_jiffies);
 
 /*
  * trace_clock_global(): special globally coherent trace clock
@@ -123,6 +125,7 @@ u64 notrace trace_clock_global(void)
 
 	return now;
 }
+EXPORT_SYMBOL_GPL(trace_clock_global);
 
 static atomic64_t trace_counter;
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 6fa484de2..abfc903e7 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,7 +21,7 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
 /* Count the events in use (per event id, not per instance) */
 static int	total_ref_count;
 
-static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+static int perf_trace_event_perm(struct trace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
 	if (tp_event->perf_perm) {
@@ -83,7 +83,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
 	return 0;
 }
 
-static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
+static int perf_trace_event_reg(struct trace_event_call *tp_event,
 				struct perf_event *p_event)
 {
 	struct hlist_head __percpu *list;
@@ -143,7 +143,7 @@ fail:
 
 static void perf_trace_event_unreg(struct perf_event *p_event)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	int i;
 
 	if (--tp_event->perf_refcount > 0)
@@ -172,17 +172,17 @@ out:
 
 static int perf_trace_event_open(struct perf_event *p_event)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
 }
 
 static void perf_trace_event_close(struct perf_event *p_event)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
 }
 
-static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+static int perf_trace_event_init(struct trace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
 	int ret;
@@ -206,7 +206,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 
 int perf_trace_init(struct perf_event *p_event)
 {
-	struct ftrace_event_call *tp_event;
+	struct trace_event_call *tp_event;
 	u64 event_id = p_event->attr.config;
 	int ret = -EINVAL;
 
@@ -236,7 +236,7 @@ void perf_trace_destroy(struct perf_event *p_event)
 
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	struct hlist_head __percpu *pcpu_list;
 	struct hlist_head *list;
 
@@ -255,7 +255,7 @@ int perf_trace_add(struct perf_event *p_event, int flags)
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	hlist_del_rcu(&p_event->hlist_entry);
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
@@ -357,7 +357,7 @@ static void perf_ftrace_function_disable(struct perf_event *event)
 	ftrace_function_local_disable(&event->ftrace_ops);
 }
 
-int perf_ftrace_event_register(struct ftrace_event_call *call,
+int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data)
 {
 	switch (type) {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c4de47fc5..404a372ad 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -61,14 +61,14 @@ static int system_refcount_dec(struct event_subsystem *system)
 
 #define do_for_each_event_file_safe(tr, file)			\
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\
-		struct ftrace_event_file *___n;				\
+		struct trace_event_file *___n;				\
 		list_for_each_entry_safe(file, ___n, &tr->events, list)
 
 #define while_for_each_event_file()		\
 	}
 
 static struct list_head *
-trace_get_fields(struct ftrace_event_call *event_call)
+trace_get_fields(struct trace_event_call *event_call)
 {
 	if (!event_call->class->get_fields)
 		return &event_call->class->fields;
@@ -89,7 +89,7 @@ __find_event_field(struct list_head *head, char *name)
 }
 
 struct ftrace_event_field *
-trace_find_event_field(struct ftrace_event_call *call, char *name)
+trace_find_event_field(struct trace_event_call *call, char *name)
 {
 	struct ftrace_event_field *field;
 	struct list_head *head;
@@ -129,7 +129,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
 	return 0;
 }
 
-int trace_define_field(struct ftrace_event_call *call, const char *type,
+int trace_define_field(struct trace_event_call *call, const char *type,
 		       const char *name, int offset, int size, int is_signed,
 		       int filter_type)
 {
@@ -166,7 +166,7 @@ static int trace_define_common_fields(void)
 	return ret;
 }
 
-static void trace_destroy_fields(struct ftrace_event_call *call)
+static void trace_destroy_fields(struct trace_event_call *call)
 {
 	struct ftrace_event_field *field, *next;
 	struct list_head *head;
@@ -178,11 +178,11 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
 	}
 }
 
-int trace_event_raw_init(struct ftrace_event_call *call)
+int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;
 
-	id = register_ftrace_event(&call->event);
+	id = register_trace_event(&call->event);
 	if (!id)
 		return -ENODEV;
 
@@ -190,18 +190,18 @@ int trace_event_raw_init(struct ftrace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(trace_event_raw_init);
 
-void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
-				  struct ftrace_event_file *ftrace_file,
-				  unsigned long len)
+void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
+				 struct trace_event_file *trace_file,
+				 unsigned long len)
 {
-	struct ftrace_event_call *event_call = ftrace_file->event_call;
+	struct trace_event_call *event_call = trace_file->event_call;
 
 	local_save_flags(fbuffer->flags);
 	fbuffer->pc = preempt_count();
-	fbuffer->ftrace_file = ftrace_file;
+	fbuffer->trace_file = trace_file;
 
 	fbuffer->event =
-		trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
+		trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
 						event_call->event.type, len,
 						fbuffer->flags, fbuffer->pc);
 	if (!fbuffer->event)
@@ -210,13 +210,13 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
 	fbuffer->entry = ring_buffer_event_data(fbuffer->event);
 	return fbuffer->entry;
 }
-EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+EXPORT_SYMBOL_GPL(trace_event_buffer_reserve);
 
 static DEFINE_SPINLOCK(tracepoint_iter_lock);
 
-static void output_printk(struct ftrace_event_buffer *fbuffer)
+static void output_printk(struct trace_event_buffer *fbuffer)
 {
-	struct ftrace_event_call *event_call;
+	struct trace_event_call *event_call;
 	struct trace_event *event;
 	unsigned long flags;
 	struct trace_iterator *iter = tracepoint_print_iter;
@@ -224,12 +224,12 @@ static void output_printk(struct ftrace_event_buffer *fbuffer)
 	if (!iter)
 		return;
 
-	event_call = fbuffer->ftrace_file->event_call;
+	event_call = fbuffer->trace_file->event_call;
 	if (!event_call || !event_call->event.funcs ||
 	    !event_call->event.funcs->trace)
 		return;
 
-	event = &fbuffer->ftrace_file->event_call->event;
+	event = &fbuffer->trace_file->event_call->event;
 
 	spin_lock_irqsave(&tracepoint_iter_lock, flags);
 	trace_seq_init(&iter->seq);
@@ -241,21 +241,21 @@ static void output_printk(struct ftrace_event_buffer *fbuffer)
 	spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
 }
 
-void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
 {
 	if (tracepoint_printk)
 		output_printk(fbuffer);
 
-	event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
+	event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
 				    fbuffer->event, fbuffer->entry,
 				    fbuffer->flags, fbuffer->pc);
 }
-EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
+EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
 
-int ftrace_event_reg(struct ftrace_event_call *call,
-		     enum trace_reg type, void *data)
+int trace_event_reg(struct trace_event_call *call,
+		    enum trace_reg type, void *data)
 {
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
 	switch (type) {
@@ -288,34 +288,34 @@ int ftrace_event_reg(struct ftrace_event_call *call,
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(ftrace_event_reg);
+EXPORT_SYMBOL_GPL(trace_event_reg);
 
 void trace_event_enable_cmd_record(bool enable)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr;
 
 	mutex_lock(&event_mutex);
 	do_for_each_event_file(tr, file) {
 
-		if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
+		if (!(file->flags & EVENT_FILE_FL_ENABLED))
 			continue;
 
 		if (enable) {
 			tracing_start_cmdline_record();
-			set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+			set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 		} else {
 			tracing_stop_cmdline_record();
-			clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+			clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 		}
 	} while_for_each_event_file();
 	mutex_unlock(&event_mutex);
 }
 
-static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
+static int __ftrace_event_enable_disable(struct trace_event_file *file,
 					 int enable, int soft_disable)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 	int ret = 0;
 	int disable;
 
@@ -337,24 +337,24 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
 		if (soft_disable) {
 			if (atomic_dec_return(&file->sm_ref) > 0)
 				break;
-			disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
-			clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+			disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
+			clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
 		} else
-			disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
+			disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
 
-		if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
-			clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
-			if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
+		if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
+			clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
+			if (file->flags & EVENT_FILE_FL_RECORDED_CMD) {
 				tracing_stop_cmdline_record();
-				clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+				clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 			}
 			call->class->reg(call, TRACE_REG_UNREGISTER, file);
 		}
 		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
-		if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
-			set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+		if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+			set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 		else
-			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+			clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 		break;
 	case 1:
 		/*
@@ -366,31 +366,31 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
 		 * it still seems to be disabled.
 		 */
 		if (!soft_disable)
-			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+			clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 		else {
 			if (atomic_inc_return(&file->sm_ref) > 1)
 				break;
-			set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+			set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
 		}
 
-		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
+		if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
 
 			/* Keep the event disabled, when going to SOFT_MODE. */
 			if (soft_disable)
-				set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+				set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 
 			if (trace_flags & TRACE_ITER_RECORD_CMD) {
 				tracing_start_cmdline_record();
-				set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+				set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 			}
 			ret = call->class->reg(call, TRACE_REG_REGISTER, file);
 			if (ret) {
 				tracing_stop_cmdline_record();
 				pr_info("event trace: Could not enable event "
-					"%s\n", ftrace_event_name(call));
+					"%s\n", trace_event_name(call));
 				break;
 			}
-			set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+			set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
 
 			/* WAS_ENABLED gets set but never cleared. */
 			call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
@@ -401,13 +401,13 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
 	return ret;
 }
 
-int trace_event_enable_disable(struct ftrace_event_file *file,
+int trace_event_enable_disable(struct trace_event_file *file,
 			       int enable, int soft_disable)
 {
 	return __ftrace_event_enable_disable(file, enable, soft_disable);
 }
 
-static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+static int ftrace_event_enable_disable(struct trace_event_file *file,
 				       int enable)
 {
 	return __ftrace_event_enable_disable(file, enable, 0);
@@ -415,7 +415,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file,
 
 static void ftrace_clear_events(struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(file, &tr->events, list) {
@@ -449,14 +449,14 @@ static void __get_system(struct event_subsystem *system)
 	system_refcount_inc(system);
 }
 
-static void __get_system_dir(struct ftrace_subsystem_dir *dir)
+static void __get_system_dir(struct trace_subsystem_dir *dir)
 {
 	WARN_ON_ONCE(dir->ref_count == 0);
 	dir->ref_count++;
 	__get_system(dir->subsystem);
 }
 
-static void __put_system_dir(struct ftrace_subsystem_dir *dir)
+static void __put_system_dir(struct trace_subsystem_dir *dir)
 {
 	WARN_ON_ONCE(dir->ref_count == 0);
 	/* If the subsystem is about to be freed, the dir must be too */
@@ -467,14 +467,14 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
 		kfree(dir);
 }
 
-static void put_system(struct ftrace_subsystem_dir *dir)
+static void put_system(struct trace_subsystem_dir *dir)
 {
 	mutex_lock(&event_mutex);
 	__put_system_dir(dir);
 	mutex_unlock(&event_mutex);
 }
 
-static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+static void remove_subsystem(struct trace_subsystem_dir *dir)
 {
 	if (!dir)
 		return;
@@ -486,7 +486,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
 	}
 }
 
-static void remove_event_file_dir(struct ftrace_event_file *file)
+static void remove_event_file_dir(struct trace_event_file *file)
 {
 	struct dentry *dir = file->dir;
 	struct dentry *child;
@@ -515,15 +515,15 @@ static int
 __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
 			      const char *sub, const char *event, int set)
 {
-	struct ftrace_event_file *file;
-	struct ftrace_event_call *call;
+	struct trace_event_file *file;
+	struct trace_event_call *call;
 	const char *name;
 	int ret = -EINVAL;
 
 	list_for_each_entry(file, &tr->events, list) {
 
 		call = file->event_call;
-		name = ftrace_event_name(call);
+		name = trace_event_name(call);
 
 		if (!name || !call->class || !call->class->reg)
 			continue;
@@ -671,8 +671,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_file *file = v;
-	struct ftrace_event_call *call;
+	struct trace_event_file *file = v;
+	struct trace_event_call *call;
 	struct trace_array *tr = m->private;
 
 	(*pos)++;
@@ -692,13 +692,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr = m->private;
 	loff_t l;
 
 	mutex_lock(&event_mutex);
 
-	file = list_entry(&tr->events, struct ftrace_event_file, list);
+	file = list_entry(&tr->events, struct trace_event_file, list);
 	for (l = 0; l <= *pos; ) {
 		file = t_next(m, file, &l);
 		if (!file)
@@ -710,13 +710,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_file *file = v;
+	struct trace_event_file *file = v;
 	struct trace_array *tr = m->private;
 
 	(*pos)++;
 
 	list_for_each_entry_continue(file, &tr->events, list) {
-		if (file->flags & FTRACE_EVENT_FL_ENABLED)
+		if (file->flags & EVENT_FILE_FL_ENABLED)
 			return file;
 	}
 
@@ -725,13 +725,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr = m->private;
 	loff_t l;
 
 	mutex_lock(&event_mutex);
 
-	file = list_entry(&tr->events, struct ftrace_event_file, list);
+	file = list_entry(&tr->events, struct trace_event_file, list);
 	for (l = 0; l <= *pos; ) {
 		file = s_next(m, file, &l);
 		if (!file)
@@ -742,12 +742,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
 static int t_show(struct seq_file *m, void *v)
 {
-	struct ftrace_event_file *file = v;
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_file *file = v;
+	struct trace_event_call *call = file->event_call;
 
 	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
 		seq_printf(m, "%s:", call->class->system);
-	seq_printf(m, "%s\n", ftrace_event_name(call));
+	seq_printf(m, "%s\n", trace_event_name(call));
 
 	return 0;
 }
@@ -761,7 +761,7 @@ static ssize_t
 event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	unsigned long flags;
 	char buf[4] = "0";
 
@@ -774,12 +774,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	if (!file)
 		return -ENODEV;
 
-	if (flags & FTRACE_EVENT_FL_ENABLED &&
-	    !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+	if (flags & EVENT_FILE_FL_ENABLED &&
+	    !(flags & EVENT_FILE_FL_SOFT_DISABLED))
 		strcpy(buf, "1");
 
-	if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
-	    flags & FTRACE_EVENT_FL_SOFT_MODE)
+	if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
+	    flags & EVENT_FILE_FL_SOFT_MODE)
 		strcat(buf, "*");
 
 	strcat(buf, "\n");
@@ -791,7 +791,7 @@ static ssize_t
 event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	unsigned long val;
 	int ret;
 
@@ -828,10 +828,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
 	const char set_to_char[4] = { '?', '0', '1', 'X' };
-	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct trace_subsystem_dir *dir = filp->private_data;
 	struct event_subsystem *system = dir->subsystem;
-	struct ftrace_event_call *call;
-	struct ftrace_event_file *file;
+	struct trace_event_call *call;
+	struct trace_event_file *file;
 	struct trace_array *tr = dir->tr;
 	char buf[2];
 	int set = 0;
@@ -840,7 +840,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	mutex_lock(&event_mutex);
 	list_for_each_entry(file, &tr->events, list) {
 		call = file->event_call;
-		if (!ftrace_event_name(call) || !call->class || !call->class->reg)
+		if (!trace_event_name(call) || !call->class || !call->class->reg)
 			continue;
 
 		if (system && strcmp(call->class->system, system->name) != 0)
@@ -851,7 +851,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		 * or if all events or cleared, or if we have
 		 * a mixture.
 		 */
-		set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
+		set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED));
 
 		/*
 		 * If we have a mixture, no need to look further.
@@ -873,7 +873,7 @@ static ssize_t
 system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		    loff_t *ppos)
 {
-	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct trace_subsystem_dir *dir = filp->private_data;
 	struct event_subsystem *system = dir->subsystem;
 	const char *name = NULL;
 	unsigned long val;
@@ -917,7 +917,7 @@ enum {
 
 static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = event_file_data(m->private);
+	struct trace_event_call *call = event_file_data(m->private);
 	struct list_head *common_head = &ftrace_common_fields;
 	struct list_head *head = trace_get_fields(call);
 	struct list_head *node = v;
@@ -949,13 +949,13 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 
 static int f_show(struct seq_file *m, void *v)
 {
-	struct ftrace_event_call *call = event_file_data(m->private);
+	struct trace_event_call *call = event_file_data(m->private);
 	struct ftrace_event_field *field;
 	const char *array_descriptor;
 
 	switch ((unsigned long)v) {
 	case FORMAT_HEADER:
-		seq_printf(m, "name: %s\n", ftrace_event_name(call));
+		seq_printf(m, "name: %s\n", trace_event_name(call));
 		seq_printf(m, "ID: %d\n", call->event.type);
 		seq_puts(m, "format:\n");
 		return 0;
@@ -1062,7 +1062,7 @@ static ssize_t
 event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_seq *s;
 	int r = -ENODEV;
 
@@ -1095,7 +1095,7 @@ static ssize_t
 event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	char *buf;
 	int err = -ENODEV;
 
@@ -1132,7 +1132,7 @@ static LIST_HEAD(event_subsystems);
 static int subsystem_open(struct inode *inode, struct file *filp)
 {
 	struct event_subsystem *system = NULL;
-	struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
+	struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */
 	struct trace_array *tr;
 	int ret;
 
@@ -1181,7 +1181,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 
 static int system_tr_open(struct inode *inode, struct file *filp)
 {
-	struct ftrace_subsystem_dir *dir;
+	struct trace_subsystem_dir *dir;
 	struct trace_array *tr = inode->i_private;
 	int ret;
 
@@ -1214,7 +1214,7 @@ static int system_tr_open(struct inode *inode, struct file *filp)
 
 static int subsystem_release(struct inode *inode, struct file *file)
 {
-	struct ftrace_subsystem_dir *dir = file->private_data;
+	struct trace_subsystem_dir *dir = file->private_data;
 
 	trace_array_put(dir->tr);
 
@@ -1235,7 +1235,7 @@ static ssize_t
 subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 		      loff_t *ppos)
 {
-	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct trace_subsystem_dir *dir = filp->private_data;
 	struct event_subsystem *system = dir->subsystem;
 	struct trace_seq *s;
 	int r;
@@ -1262,7 +1262,7 @@ static ssize_t
 subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 {
-	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct trace_subsystem_dir *dir = filp->private_data;
 	char *buf;
 	int err;
 
@@ -1497,9 +1497,9 @@ create_new_subsystem(const char *name)
 
 static struct dentry *
 event_subsystem_dir(struct trace_array *tr, const char *name,
-		    struct ftrace_event_file *file, struct dentry *parent)
+		    struct trace_event_file *file, struct dentry *parent)
 {
-	struct ftrace_subsystem_dir *dir;
+	struct trace_subsystem_dir *dir;
 	struct event_subsystem *system;
 	struct dentry *entry;
 
@@ -1571,9 +1571,9 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 }
 
 static int
-event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
+event_create_dir(struct dentry *parent, struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 	struct trace_array *tr = file->tr;
 	struct list_head *head;
 	struct dentry *d_events;
@@ -1591,7 +1591,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
 	} else
 		d_events = parent;
 
-	name = ftrace_event_name(call);
+	name = trace_event_name(call);
 	file->dir = tracefs_create_dir(name, d_events);
 	if (!file->dir) {
 		pr_warn("Could not create tracefs '%s' directory\n", name);
@@ -1634,9 +1634,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
 	return 0;
 }
 
-static void remove_event_from_tracers(struct ftrace_event_call *call)
+static void remove_event_from_tracers(struct trace_event_call *call)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr;
 
 	do_for_each_event_file_safe(tr, file) {
@@ -1654,10 +1654,10 @@ static void remove_event_from_tracers(struct ftrace_event_call *call)
 	} while_for_each_event_file();
 }
 
-static void event_remove(struct ftrace_event_call *call)
+static void event_remove(struct trace_event_call *call)
 {
 	struct trace_array *tr;
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	do_for_each_event_file(tr, file) {
 		if (file->event_call != call)
@@ -1673,17 +1673,17 @@ static void event_remove(struct ftrace_event_call *call)
 	} while_for_each_event_file();
 
 	if (call->event.funcs)
-		__unregister_ftrace_event(&call->event);
+		__unregister_trace_event(&call->event);
 	remove_event_from_tracers(call);
 	list_del(&call->list);
 }
 
-static int event_init(struct ftrace_event_call *call)
+static int event_init(struct trace_event_call *call)
 {
 	int ret = 0;
 	const char *name;
 
-	name = ftrace_event_name(call);
+	name = trace_event_name(call);
 	if (WARN_ON(!name))
 		return -EINVAL;
 
@@ -1697,7 +1697,7 @@ static int event_init(struct ftrace_event_call *call)
 }
 
 static int
-__register_event(struct ftrace_event_call *call, struct module *mod)
+__register_event(struct trace_event_call *call, struct module *mod)
 {
 	int ret;
 
@@ -1733,7 +1733,7 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
 	return ptr + elen;
 }
 
-static void update_event_printk(struct ftrace_event_call *call,
+static void update_event_printk(struct trace_event_call *call,
 				struct trace_enum_map *map)
 {
 	char *ptr;
@@ -1811,7 +1811,7 @@ static void update_event_printk(struct ftrace_event_call *call,
 
 void trace_event_enum_update(struct trace_enum_map **map, int len)
 {
-	struct ftrace_event_call *call, *p;
+	struct trace_event_call *call, *p;
 	const char *last_system = NULL;
 	int last_i;
 	int i;
@@ -1836,11 +1836,11 @@ void trace_event_enum_update(struct trace_enum_map **map, int len)
 	up_write(&trace_event_sem);
 }
 
-static struct ftrace_event_file *
-trace_create_new_event(struct ftrace_event_call *call,
+static struct trace_event_file *
+trace_create_new_event(struct trace_event_call *call,
 		       struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
 	if (!file)
@@ -1858,9 +1858,9 @@ trace_create_new_event(struct ftrace_event_call *call,
 
 /* Add an event to a trace directory */
 static int
-__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
+__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	file = trace_create_new_event(call, tr);
 	if (!file)
@@ -1875,10 +1875,10 @@ __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
  * the filesystem is initialized.
  */
 static __init int
-__trace_early_add_new_event(struct ftrace_event_call *call,
+__trace_early_add_new_event(struct trace_event_call *call,
 			    struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	file = trace_create_new_event(call, tr);
 	if (!file)
@@ -1888,10 +1888,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
 }
 
 struct ftrace_module_file_ops;
-static void __add_event_to_tracers(struct ftrace_event_call *call);
+static void __add_event_to_tracers(struct trace_event_call *call);
 
 /* Add an additional event_call dynamically */
-int trace_add_event_call(struct ftrace_event_call *call)
+int trace_add_event_call(struct trace_event_call *call)
 {
 	int ret;
 	mutex_lock(&trace_types_lock);
@@ -1910,7 +1910,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
  * Must be called under locking of trace_types_lock, event_mutex and
  * trace_event_sem.
  */
-static void __trace_remove_event_call(struct ftrace_event_call *call)
+static void __trace_remove_event_call(struct trace_event_call *call)
 {
 	event_remove(call);
 	trace_destroy_fields(call);
@@ -1918,10 +1918,10 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
 	call->filter = NULL;
 }
 
-static int probe_remove_event_call(struct ftrace_event_call *call)
+static int probe_remove_event_call(struct trace_event_call *call)
 {
 	struct trace_array *tr;
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 #ifdef CONFIG_PERF_EVENTS
 	if (call->perf_refcount)
@@ -1932,10 +1932,10 @@ static int probe_remove_event_call(struct ftrace_event_call *call)
 			continue;
 		/*
 		 * We can't rely on ftrace_event_enable_disable(enable => 0)
-		 * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
+		 * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
 		 * TRACE_REG_UNREGISTER.
 		 */
-		if (file->flags & FTRACE_EVENT_FL_ENABLED)
+		if (file->flags & EVENT_FILE_FL_ENABLED)
 			return -EBUSY;
 		/*
 		 * The do_for_each_event_file_safe() is
@@ -1952,7 +1952,7 @@ static int probe_remove_event_call(struct ftrace_event_call *call)
 }
 
 /* Remove an event_call */
-int trace_remove_event_call(struct ftrace_event_call *call)
+int trace_remove_event_call(struct trace_event_call *call)
 {
 	int ret;
 
@@ -1976,7 +1976,7 @@ int trace_remove_event_call(struct ftrace_event_call *call)
 
 static void trace_module_add_events(struct module *mod)
 {
-	struct ftrace_event_call **call, **start, **end;
+	struct trace_event_call **call, **start, **end;
 
 	if (!mod->num_trace_events)
 		return;
@@ -1999,7 +1999,7 @@ static void trace_module_add_events(struct module *mod)
 
 static void trace_module_remove_events(struct module *mod)
 {
-	struct ftrace_event_call *call, *p;
+	struct trace_event_call *call, *p;
 	bool clear_trace = false;
 
 	down_write(&trace_event_sem);
@@ -2055,28 +2055,28 @@ static struct notifier_block trace_module_nb = {
 static void
 __trace_add_event_dirs(struct trace_array *tr)
 {
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	int ret;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		ret = __trace_add_new_event(call, tr);
 		if (ret < 0)
 			pr_warn("Could not create directory for event %s\n",
-				ftrace_event_name(call));
+				trace_event_name(call));
 	}
 }
 
-struct ftrace_event_file *
+struct trace_event_file *
 find_event_file(struct trace_array *tr, const char *system,  const char *event)
 {
-	struct ftrace_event_file *file;
-	struct ftrace_event_call *call;
+	struct trace_event_file *file;
+	struct trace_event_call *call;
 	const char *name;
 
 	list_for_each_entry(file, &tr->events, list) {
 
 		call = file->event_call;
-		name = ftrace_event_name(call);
+		name = trace_event_name(call);
 
 		if (!name || !call->class || !call->class->reg)
 			continue;
@@ -2098,7 +2098,7 @@ find_event_file(struct trace_array *tr, const char *system,  const char *event)
 #define DISABLE_EVENT_STR	"disable_event"
 
 struct event_probe_data {
-	struct ftrace_event_file	*file;
+	struct trace_event_file	*file;
 	unsigned long			count;
 	int				ref;
 	bool				enable;
@@ -2114,9 +2114,9 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
 		return;
 
 	if (data->enable)
-		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+		clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
 	else
-		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+		set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
 }
 
 static void
@@ -2132,7 +2132,7 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data
 		return;
 
 	/* Skip if the event is in a state we want to switch to */
-	if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+	if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
 		return;
 
 	if (data->count != -1)
@@ -2152,7 +2152,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
 	seq_printf(m, "%s:%s:%s",
 		   data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
 		   data->file->event_call->class->system,
-		   ftrace_event_name(data->file->event_call));
+		   trace_event_name(data->file->event_call));
 
 	if (data->count == -1)
 		seq_puts(m, ":unlimited\n");
@@ -2226,7 +2226,7 @@ event_enable_func(struct ftrace_hash *hash,
 		  char *glob, char *cmd, char *param, int enabled)
 {
 	struct trace_array *tr = top_trace_array();
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct ftrace_probe_ops *ops;
 	struct event_probe_data *data;
 	const char *system;
@@ -2358,7 +2358,7 @@ static inline int register_event_cmds(void) { return 0; }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /*
- * The top level array has already had its ftrace_event_file
+ * The top level array has already had its trace_event_file
  * descriptors created in order to allow for early events to
  * be recorded. This function is called after the tracefs has been
  * initialized, and we now have to create the files associated
@@ -2367,7 +2367,7 @@ static inline int register_event_cmds(void) { return 0; }
 static __init void
 __trace_early_add_event_dirs(struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	int ret;
 
 
@@ -2375,7 +2375,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
 		ret = event_create_dir(tr->event_dir, file);
 		if (ret < 0)
 			pr_warn("Could not create directory for event %s\n",
-				ftrace_event_name(file->event_call));
+				trace_event_name(file->event_call));
 	}
 }
 
@@ -2388,7 +2388,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
 static __init void
 __trace_early_add_events(struct trace_array *tr)
 {
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	int ret;
 
 	list_for_each_entry(call, &ftrace_events, list) {
@@ -2399,7 +2399,7 @@ __trace_early_add_events(struct trace_array *tr)
 		ret = __trace_early_add_new_event(call, tr);
 		if (ret < 0)
 			pr_warn("Could not create early event %s\n",
-				ftrace_event_name(call));
+				trace_event_name(call));
 	}
 }
 
@@ -2407,13 +2407,13 @@ __trace_early_add_events(struct trace_array *tr)
 static void
 __trace_remove_event_dirs(struct trace_array *tr)
 {
-	struct ftrace_event_file *file, *next;
+	struct trace_event_file *file, *next;
 
 	list_for_each_entry_safe(file, next, &tr->events, list)
 		remove_event_file_dir(file);
 }
 
-static void __add_event_to_tracers(struct ftrace_event_call *call)
+static void __add_event_to_tracers(struct trace_event_call *call)
 {
 	struct trace_array *tr;
 
@@ -2421,8 +2421,8 @@ static void __add_event_to_tracers(struct ftrace_event_call *call)
 		__trace_add_new_event(call, tr);
 }
 
-extern struct ftrace_event_call *__start_ftrace_events[];
-extern struct ftrace_event_call *__stop_ftrace_events[];
+extern struct trace_event_call *__start_ftrace_events[];
+extern struct trace_event_call *__stop_ftrace_events[];
 
 static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
 
@@ -2557,7 +2557,7 @@ int event_trace_del_tracer(struct trace_array *tr)
 static __init int event_trace_memsetup(void)
 {
 	field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
-	file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+	file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC);
 	return 0;
 }
 
@@ -2593,7 +2593,7 @@ early_enable_events(struct trace_array *tr, bool disable_first)
 static __init int event_trace_enable(void)
 {
 	struct trace_array *tr = top_trace_array();
-	struct ftrace_event_call **iter, *call;
+	struct trace_event_call **iter, *call;
 	int ret;
 
 	if (!tr)
@@ -2754,9 +2754,9 @@ static __init void event_test_stuff(void)
  */
 static __init void event_trace_self_tests(void)
 {
-	struct ftrace_subsystem_dir *dir;
-	struct ftrace_event_file *file;
-	struct ftrace_event_call *call;
+	struct trace_subsystem_dir *dir;
+	struct trace_event_file *file;
+	struct trace_event_call *call;
 	struct event_subsystem *system;
 	struct trace_array *tr;
 	int ret;
@@ -2787,13 +2787,13 @@ static __init void event_trace_self_tests(void)
 			continue;
 #endif
 
-		pr_info("Testing event %s: ", ftrace_event_name(call));
+		pr_info("Testing event %s: ", trace_event_name(call));
 
 		/*
 		 * If an event is already enabled, someone is using
 		 * it and the self test should not be on.
 		 */
-		if (file->flags & FTRACE_EVENT_FL_ENABLED) {
+		if (file->flags & EVENT_FILE_FL_ENABLED) {
 			pr_warn("Enabled event during self test!\n");
 			WARN_ON_ONCE(1);
 			continue;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 52adf02d7..d81d6f302 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -643,7 +643,7 @@ static void append_filter_err(struct filter_parse_state *ps,
 	free_page((unsigned long) buf);
 }
 
-static inline struct event_filter *event_filter(struct ftrace_event_file *file)
+static inline struct event_filter *event_filter(struct trace_event_file *file)
 {
 	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		return file->event_call->filter;
@@ -652,7 +652,7 @@ static inline struct event_filter *event_filter(struct ftrace_event_file *file)
 }
 
 /* caller must hold event_mutex */
-void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
+void print_event_filter(struct trace_event_file *file, struct trace_seq *s)
 {
 	struct event_filter *filter = event_filter(file);
 
@@ -780,14 +780,14 @@ static void __free_preds(struct event_filter *filter)
 	filter->n_preds = 0;
 }
 
-static void filter_disable(struct ftrace_event_file *file)
+static void filter_disable(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags &= ~TRACE_EVENT_FL_FILTERED;
 	else
-		file->flags &= ~FTRACE_EVENT_FL_FILTERED;
+		file->flags &= ~EVENT_FILE_FL_FILTERED;
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -837,9 +837,9 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 	return 0;
 }
 
-static inline void __remove_filter(struct ftrace_event_file *file)
+static inline void __remove_filter(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	filter_disable(file);
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
@@ -848,10 +848,10 @@ static inline void __remove_filter(struct ftrace_event_file *file)
 		remove_filter_string(file->filter);
 }
 
-static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
+static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
 					struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list) {
 		if (file->system != dir)
@@ -860,9 +860,9 @@ static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
 	}
 }
 
-static inline void __free_subsystem_filter(struct ftrace_event_file *file)
+static inline void __free_subsystem_filter(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
 		__free_filter(call->filter);
@@ -873,10 +873,10 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
 	}
 }
 
-static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
+static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
 					  struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list) {
 		if (file->system != dir)
@@ -1342,7 +1342,7 @@ parse_operand:
 }
 
 static struct filter_pred *create_pred(struct filter_parse_state *ps,
-				       struct ftrace_event_call *call,
+				       struct trace_event_call *call,
 				       int op, char *operand1, char *operand2)
 {
 	struct ftrace_event_field *field;
@@ -1564,7 +1564,7 @@ static int fold_pred_tree(struct event_filter *filter,
 			      filter->preds);
 }
 
-static int replace_preds(struct ftrace_event_call *call,
+static int replace_preds(struct trace_event_call *call,
 			 struct event_filter *filter,
 			 struct filter_parse_state *ps,
 			 bool dry_run)
@@ -1677,20 +1677,20 @@ fail:
 	return err;
 }
 
-static inline void event_set_filtered_flag(struct ftrace_event_file *file)
+static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags |= TRACE_EVENT_FL_FILTERED;
 	else
-		file->flags |= FTRACE_EVENT_FL_FILTERED;
+		file->flags |= EVENT_FILE_FL_FILTERED;
 }
 
-static inline void event_set_filter(struct ftrace_event_file *file,
+static inline void event_set_filter(struct trace_event_file *file,
 				    struct event_filter *filter)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		rcu_assign_pointer(call->filter, filter);
@@ -1698,9 +1698,9 @@ static inline void event_set_filter(struct ftrace_event_file *file,
 		rcu_assign_pointer(file->filter, filter);
 }
 
-static inline void event_clear_filter(struct ftrace_event_file *file)
+static inline void event_clear_filter(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		RCU_INIT_POINTER(call->filter, NULL);
@@ -1709,33 +1709,33 @@ static inline void event_clear_filter(struct ftrace_event_file *file)
 }
 
 static inline void
-event_set_no_set_filter_flag(struct ftrace_event_file *file)
+event_set_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
 	else
-		file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
+		file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline void
-event_clear_no_set_filter_flag(struct ftrace_event_file *file)
+event_clear_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
 	else
-		file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
+		file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline bool
-event_no_set_filter_flag(struct ftrace_event_file *file)
+event_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
-	if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
+	if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
 		return true;
 
 	if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
@@ -1750,12 +1750,12 @@ struct filter_list {
 	struct event_filter	*filter;
 };
 
-static int replace_system_preds(struct ftrace_subsystem_dir *dir,
+static int replace_system_preds(struct trace_subsystem_dir *dir,
 				struct trace_array *tr,
 				struct filter_parse_state *ps,
 				char *filter_string)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct filter_list *filter_item;
 	struct filter_list *tmp;
 	LIST_HEAD(filter_list);
@@ -1899,8 +1899,8 @@ static void create_filter_finish(struct filter_parse_state *ps)
 }
 
 /**
- * create_filter - create a filter for a ftrace_event_call
- * @call: ftrace_event_call to create a filter for
+ * create_filter - create a filter for a trace_event_call
+ * @call: trace_event_call to create a filter for
  * @filter_str: filter string
  * @set_str: remember @filter_str and enable detailed error in filter
  * @filterp: out param for created filter (always updated on return)
@@ -1914,7 +1914,7 @@ static void create_filter_finish(struct filter_parse_state *ps)
  * information if @set_str is %true and the caller is responsible for
  * freeing it.
  */
-static int create_filter(struct ftrace_event_call *call,
+static int create_filter(struct trace_event_call *call,
 			 char *filter_str, bool set_str,
 			 struct event_filter **filterp)
 {
@@ -1934,7 +1934,7 @@ static int create_filter(struct ftrace_event_call *call,
 	return err;
 }
 
-int create_event_filter(struct ftrace_event_call *call,
+int create_event_filter(struct trace_event_call *call,
 			char *filter_str, bool set_str,
 			struct event_filter **filterp)
 {
@@ -1950,7 +1950,7 @@ int create_event_filter(struct ftrace_event_call *call,
  * Identical to create_filter() except that it creates a subsystem filter
  * and always remembers @filter_str.
  */
-static int create_system_filter(struct ftrace_subsystem_dir *dir,
+static int create_system_filter(struct trace_subsystem_dir *dir,
 				struct trace_array *tr,
 				char *filter_str, struct event_filter **filterp)
 {
@@ -1976,9 +1976,9 @@ static int create_system_filter(struct ftrace_subsystem_dir *dir,
 }
 
 /* caller must hold event_mutex */
-int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
+int apply_event_filter(struct trace_event_file *file, char *filter_string)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 	struct event_filter *filter;
 	int err;
 
@@ -2027,7 +2027,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
 	return err;
 }
 
-int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
 				 char *filter_string)
 {
 	struct event_subsystem *system = dir->subsystem;
@@ -2090,7 +2090,7 @@ struct function_filter_data {
 static char **
 ftrace_function_filter_re(char *buf, int len, int *count)
 {
-	char *str, *sep, **re;
+	char *str, **re;
 
 	str = kstrndup(buf, len, GFP_KERNEL);
 	if (!str)
@@ -2100,8 +2100,7 @@ ftrace_function_filter_re(char *buf, int len, int *count)
 	 * The argv_split function takes white space
 	 * as a separator, so convert ',' into spaces.
 	 */
-	while ((sep = strchr(str, ',')))
-		*sep = ' ';
+	strreplace(str, ',', ' ');
 
 	re = argv_split(GFP_KERNEL, str, count);
 	kfree(str);
@@ -2227,7 +2226,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 {
 	int err;
 	struct event_filter *filter;
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 
 	mutex_lock(&event_mutex);
 
@@ -2283,7 +2282,7 @@ out_unlock:
 
 static struct test_filter_data_t {
 	char *filter;
-	struct ftrace_raw_ftrace_test_filter rec;
+	struct trace_event_raw_ftrace_test_filter rec;
 	int match;
 	char *not_visited;
 } test_filter_data[] = {
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8712df9de..42a4009fd 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -40,7 +40,7 @@ trigger_data_free(struct event_trigger_data *data)
 
 /**
  * event_triggers_call - Call triggers associated with a trace event
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  * @rec: The trace entry for the event, NULL for unconditional invocation
  *
  * For each trigger associated with an event, invoke the trigger
@@ -63,7 +63,7 @@ trigger_data_free(struct event_trigger_data *data)
  * any trigger that should be deferred, ETT_NONE if nothing to defer.
  */
 enum event_trigger_type
-event_triggers_call(struct ftrace_event_file *file, void *rec)
+event_triggers_call(struct trace_event_file *file, void *rec)
 {
 	struct event_trigger_data *data;
 	enum event_trigger_type tt = ETT_NONE;
@@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
 
 /**
  * event_triggers_post_call - Call 'post_triggers' for a trace event
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
  *
  * For each trigger associated with an event, invoke the trigger
@@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
  * Called from tracepoint handlers (with rcu_read_lock_sched() held).
  */
 void
-event_triggers_post_call(struct ftrace_event_file *file,
+event_triggers_post_call(struct trace_event_file *file,
 			 enum event_trigger_type tt)
 {
 	struct event_trigger_data *data;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(event_triggers_post_call);
 
 static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
 {
-	struct ftrace_event_file *event_file = event_file_data(m->private);
+	struct trace_event_file *event_file = event_file_data(m->private);
 
 	if (t == SHOW_AVAILABLE_TRIGGERS)
 		return NULL;
@@ -129,7 +129,7 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
 
 static void *trigger_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_event_file *event_file;
+	struct trace_event_file *event_file;
 
 	/* ->stop() is called even if ->start() fails */
 	mutex_lock(&event_mutex);
@@ -201,7 +201,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
+static int trigger_process_regex(struct trace_event_file *file, char *buff)
 {
 	char *command, *next = buff;
 	struct event_command *p;
@@ -227,7 +227,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
 					 const char __user *ubuf,
 					 size_t cnt, loff_t *ppos)
 {
-	struct ftrace_event_file *event_file;
+	struct trace_event_file *event_file;
 	ssize_t ret;
 	char *buf;
 
@@ -430,7 +430,7 @@ event_trigger_free(struct event_trigger_ops *ops,
 		trigger_data_free(data);
 }
 
-static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
+static int trace_event_trigger_enable_disable(struct trace_event_file *file,
 					      int trigger_enable)
 {
 	int ret = 0;
@@ -438,12 +438,12 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
 	if (trigger_enable) {
 		if (atomic_inc_return(&file->tm_ref) > 1)
 			return ret;
-		set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+		set_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags);
 		ret = trace_event_enable_disable(file, 1, 1);
 	} else {
 		if (atomic_dec_return(&file->tm_ref) > 0)
 			return ret;
-		clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+		clear_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags);
 		ret = trace_event_enable_disable(file, 0, 1);
 	}
 
@@ -466,7 +466,7 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
 void
 clear_event_triggers(struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list) {
 		struct event_trigger_data *data;
@@ -480,7 +480,7 @@ clear_event_triggers(struct trace_array *tr)
 
 /**
  * update_cond_flag - Set or reset the TRIGGER_COND bit
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  *
  * If an event has triggers and any of those triggers has a filter or
  * a post_trigger, trigger invocation needs to be deferred until after
@@ -488,7 +488,7 @@ clear_event_triggers(struct trace_array *tr)
  * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
  * cleared.
  */
-static void update_cond_flag(struct ftrace_event_file *file)
+static void update_cond_flag(struct trace_event_file *file)
 {
 	struct event_trigger_data *data;
 	bool set_cond = false;
@@ -501,9 +501,9 @@ static void update_cond_flag(struct ftrace_event_file *file)
 	}
 
 	if (set_cond)
-		set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+		set_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
 	else
-		clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+		clear_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
 }
 
 /**
@@ -511,7 +511,7 @@ static void update_cond_flag(struct ftrace_event_file *file)
  * @glob: The raw string used to register the trigger
  * @ops: The trigger ops associated with the trigger
  * @data: Trigger-specific data to associate with the trigger
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  *
  * Common implementation for event trigger registration.
  *
@@ -522,7 +522,7 @@ static void update_cond_flag(struct ftrace_event_file *file)
  */
 static int register_trigger(char *glob, struct event_trigger_ops *ops,
 			    struct event_trigger_data *data,
-			    struct ftrace_event_file *file)
+			    struct trace_event_file *file)
 {
 	struct event_trigger_data *test;
 	int ret = 0;
@@ -557,7 +557,7 @@ out:
  * @glob: The raw string used to register the trigger
  * @ops: The trigger ops associated with the trigger
  * @test: Trigger-specific data used to find the trigger to remove
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  *
  * Common implementation for event trigger unregistration.
  *
@@ -566,7 +566,7 @@ out:
  */
 static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
 			       struct event_trigger_data *test,
-			       struct ftrace_event_file *file)
+			       struct trace_event_file *file)
 {
 	struct event_trigger_data *data;
 	bool unregistered = false;
@@ -588,7 +588,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
 /**
  * event_trigger_callback - Generic event_command @func implementation
  * @cmd_ops: The command ops, used for trigger registration
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  * @glob: The raw string used to register the trigger
  * @cmd: The cmd portion of the string used to register the trigger
  * @param: The params portion of the string used to register the trigger
@@ -603,7 +603,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
  */
 static int
 event_trigger_callback(struct event_command *cmd_ops,
-		       struct ftrace_event_file *file,
+		       struct trace_event_file *file,
 		       char *glob, char *cmd, char *param)
 {
 	struct event_trigger_data *trigger_data;
@@ -688,7 +688,7 @@ event_trigger_callback(struct event_command *cmd_ops,
  * set_trigger_filter - Generic event_command @set_filter implementation
  * @filter_str: The filter string for the trigger, NULL to remove filter
  * @trigger_data: Trigger-specific data
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  *
  * Common implementation for event command filter parsing and filter
  * instantiation.
@@ -702,7 +702,7 @@ event_trigger_callback(struct event_command *cmd_ops,
  */
 static int set_trigger_filter(char *filter_str,
 			      struct event_trigger_data *trigger_data,
-			      struct ftrace_event_file *file)
+			      struct trace_event_file *file)
 {
 	struct event_trigger_data *data = trigger_data;
 	struct event_filter *filter = NULL, *tmp;
@@ -900,7 +900,7 @@ snapshot_count_trigger(struct event_trigger_data *data)
 static int
 register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
 			  struct event_trigger_data *data,
-			  struct ftrace_event_file *file)
+			  struct trace_event_file *file)
 {
 	int ret = register_trigger(glob, ops, data, file);
 
@@ -968,7 +968,7 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
  * Skip 3:
  *   stacktrace_trigger()
  *   event_triggers_post_call()
- *   ftrace_raw_event_xxx()
+ *   trace_event_raw_event_xxx()
  */
 #define STACK_SKIP 3
 
@@ -1053,7 +1053,7 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
 #define DISABLE_EVENT_STR	"disable_event"
 
 struct enable_trigger_data {
-	struct ftrace_event_file	*file;
+	struct trace_event_file		*file;
 	bool				enable;
 };
 
@@ -1063,9 +1063,9 @@ event_enable_trigger(struct event_trigger_data *data)
 	struct enable_trigger_data *enable_data = data->private_data;
 
 	if (enable_data->enable)
-		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+		clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
 	else
-		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+		set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
 }
 
 static void
@@ -1077,7 +1077,7 @@ event_enable_count_trigger(struct event_trigger_data *data)
 		return;
 
 	/* Skip if the event is in a state we want to switch to */
-	if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+	if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
 		return;
 
 	if (data->count != -1)
@@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
 	seq_printf(m, "%s:%s:%s",
 		   enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
 		   enable_data->file->event_call->class->system,
-		   ftrace_event_name(enable_data->file->event_call));
+		   trace_event_name(enable_data->file->event_call));
 
 	if (data->count == -1)
 		seq_puts(m, ":unlimited");
@@ -1159,10 +1159,10 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
 
 static int
 event_enable_trigger_func(struct event_command *cmd_ops,
-			  struct ftrace_event_file *file,
+			  struct trace_event_file *file,
 			  char *glob, char *cmd, char *param)
 {
-	struct ftrace_event_file *event_enable_file;
+	struct trace_event_file *event_enable_file;
 	struct enable_trigger_data *enable_data;
 	struct event_trigger_data *trigger_data;
 	struct event_trigger_ops *trigger_ops;
@@ -1294,7 +1294,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 static int event_enable_register_trigger(char *glob,
 					 struct event_trigger_ops *ops,
 					 struct event_trigger_data *data,
-					 struct ftrace_event_file *file)
+					 struct trace_event_file *file)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 	struct enable_trigger_data *test_enable_data;
@@ -1331,7 +1331,7 @@ out:
 static void event_enable_unregister_trigger(char *glob,
 					    struct event_trigger_ops *ops,
 					    struct event_trigger_data *test,
-					    struct ftrace_event_file *file)
+					    struct trace_event_file *file)
 {
 	struct enable_trigger_data *test_enable_data = test->private_data;
 	struct enable_trigger_data *enable_data;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 174a6a711..adabf7da9 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,7 +125,7 @@ static void __always_unused ____ftrace_check_##name(void)		\
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\
 static int __init							\
-ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
+ftrace_define_fields_##name(struct trace_event_call *event_call)	\
 {									\
 	struct struct_name field;					\
 	int ret;							\
@@ -163,14 +163,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
 			 regfn)						\
 									\
-struct ftrace_event_class __refdata event_class_ftrace_##call = {	\
+struct trace_event_class __refdata event_class_ftrace_##call = {	\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.define_fields		= ftrace_define_fields_##call,		\
 	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
 	.reg			= regfn,				\
 };									\
 									\
-struct ftrace_event_call __used event_##call = {			\
+struct trace_event_call __used event_##call = {				\
 	.class			= &event_class_ftrace_##call,		\
 	{								\
 		.name			= #call,			\
@@ -179,7 +179,7 @@ struct ftrace_event_call __used event_##call = {			\
 	.print_fmt		= print,				\
 	.flags			= TRACE_EVENT_FL_IGNORE_ENABLE,		\
 };									\
-struct ftrace_event_call __used						\
+struct trace_event_call __used						\
 __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
 
 #undef FTRACE_ENTRY
@@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
 	FTRACE_ENTRY_REG(call, struct_name, etype,			\
 			 PARAMS(tstruct), PARAMS(print), filter, NULL)
 
-int ftrace_event_is_function(struct ftrace_event_call *call)
+int ftrace_event_is_function(struct trace_event_call *call)
 {
 	return call == &event_function;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a51e79688..8968bf720 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -278,7 +278,7 @@ int __trace_graph_entry(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
-	struct ftrace_event_call *call = &event_funcgraph_entry;
+	struct trace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ftrace_graph_ent_entry *entry;
@@ -393,7 +393,7 @@ void __trace_graph_return(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
-	struct ftrace_event_call *call = &event_funcgraph_exit;
+	struct trace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ftrace_graph_ret_entry *entry;
@@ -1454,12 +1454,12 @@ static __init int init_graph_trace(void)
 {
 	max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
 
-	if (!register_ftrace_event(&graph_trace_entry_event)) {
+	if (!register_trace_event(&graph_trace_entry_event)) {
 		pr_warning("Warning: could not register graph trace events\n");
 		return 1;
 	}
 
-	if (!register_ftrace_event(&graph_trace_ret_event)) {
+	if (!register_trace_event(&graph_trace_ret_event)) {
 		pr_warning("Warning: could not register graph trace events\n");
 		return 1;
 	}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d0ce590f0..b7d0cdd99 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -348,7 +348,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
 	struct trace_kprobe *tk;
 
 	list_for_each_entry(tk, &probe_list, list)
-		if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
+		if (strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
 		    strcmp(tk->tp.call.class->system, group) == 0)
 			return tk;
 	return NULL;
@@ -359,7 +359,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
  * if the file is NULL, enable "perf" handler, or enable "trace" handler.
  */
 static int
-enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 {
 	int ret = 0;
 
@@ -394,7 +394,7 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
  * if the file is NULL, disable "perf" handler, or disable "trace" handler.
  */
 static int
-disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 {
 	struct event_file_link *link = NULL;
 	int wait = 0;
@@ -523,7 +523,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
 	mutex_lock(&probe_lock);
 
 	/* Delete old (same name) event if exist */
-	old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
+	old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call),
 			tk->tp.call.class->system);
 	if (old_tk) {
 		ret = unregister_trace_kprobe(old_tk);
@@ -572,7 +572,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
 			if (ret)
 				pr_warning("Failed to re-register probe %s on"
 					   "%s: %d\n",
-					   ftrace_event_name(&tk->tp.call),
+					   trace_event_name(&tk->tp.call),
 					   mod->name, ret);
 		}
 	}
@@ -829,7 +829,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
 
 	seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
 	seq_printf(m, ":%s/%s", tk->tp.call.class->system,
-			ftrace_event_name(&tk->tp.call));
+			trace_event_name(&tk->tp.call));
 
 	if (!tk->symbol)
 		seq_printf(m, " 0x%p", tk->rp.kp.addr);
@@ -888,7 +888,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 	struct trace_kprobe *tk = v;
 
 	seq_printf(m, "  %-44s %15lu %15lu\n",
-		   ftrace_event_name(&tk->tp.call), tk->nhit,
+		   trace_event_name(&tk->tp.call), tk->nhit,
 		   tk->rp.kp.nmissed);
 
 	return 0;
@@ -917,18 +917,18 @@ static const struct file_operations kprobe_profile_ops = {
 /* Kprobe handler */
 static nokprobe_inline void
 __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
-		    struct ftrace_event_file *ftrace_file)
+		    struct trace_event_file *trace_file)
 {
 	struct kprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
 	int size, dsize, pc;
 	unsigned long irq_flags;
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 
-	WARN_ON(call != ftrace_file->event_call);
+	WARN_ON(call != trace_file->event_call);
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	local_save_flags(irq_flags);
@@ -937,7 +937,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 	dsize = __get_data_size(&tk->tp, regs);
 	size = sizeof(*entry) + tk->tp.size + dsize;
 
-	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
 						call->event.type,
 						size, irq_flags, pc);
 	if (!event)
@@ -947,7 +947,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 
-	event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+	event_trigger_unlock_commit_regs(trace_file, buffer, event,
 					 entry, irq_flags, pc, regs);
 }
 
@@ -965,18 +965,18 @@ NOKPROBE_SYMBOL(kprobe_trace_func);
 static nokprobe_inline void
 __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		       struct pt_regs *regs,
-		       struct ftrace_event_file *ftrace_file)
+		       struct trace_event_file *trace_file)
 {
 	struct kretprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
 	int size, pc, dsize;
 	unsigned long irq_flags;
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 
-	WARN_ON(call != ftrace_file->event_call);
+	WARN_ON(call != trace_file->event_call);
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	local_save_flags(irq_flags);
@@ -985,7 +985,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	dsize = __get_data_size(&tk->tp, regs);
 	size = sizeof(*entry) + tk->tp.size + dsize;
 
-	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
 						call->event.type,
 						size, irq_flags, pc);
 	if (!event)
@@ -996,7 +996,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 
-	event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+	event_trigger_unlock_commit_regs(trace_file, buffer, event,
 					 entry, irq_flags, pc, regs);
 }
 
@@ -1025,7 +1025,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
 	field = (struct kprobe_trace_entry_head *)iter->ent;
 	tp = container_of(event, struct trace_probe, call.event);
 
-	trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+	trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
 
 	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto out;
@@ -1056,7 +1056,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
 	field = (struct kretprobe_trace_entry_head *)iter->ent;
 	tp = container_of(event, struct trace_probe, call.event);
 
-	trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+	trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
 
 	if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto out;
@@ -1081,7 +1081,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
 }
 
 
-static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int kprobe_event_define_fields(struct trace_event_call *event_call)
 {
 	int ret, i;
 	struct kprobe_trace_entry_head field;
@@ -1104,7 +1104,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
 	return 0;
 }
 
-static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 {
 	int ret, i;
 	struct kretprobe_trace_entry_head field;
@@ -1134,7 +1134,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
 static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
@@ -1169,7 +1169,7 @@ static void
 kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
@@ -1206,11 +1206,11 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
  * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
  * lockless, but we can't race with this __init function.
  */
-static int kprobe_register(struct ftrace_event_call *event,
+static int kprobe_register(struct trace_event_call *event,
 			   enum trace_reg type, void *data)
 {
 	struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
@@ -1276,10 +1276,10 @@ static struct trace_event_functions kprobe_funcs = {
 
 static int register_kprobe_event(struct trace_kprobe *tk)
 {
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 	int ret;
 
-	/* Initialize ftrace_event_call */
+	/* Initialize trace_event_call */
 	INIT_LIST_HEAD(&call->class->fields);
 	if (trace_kprobe_is_return(tk)) {
 		call->event.funcs = &kretprobe_funcs;
@@ -1290,7 +1290,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 	}
 	if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
 		return -ENOMEM;
-	ret = register_ftrace_event(&call->event);
+	ret = register_trace_event(&call->event);
 	if (!ret) {
 		kfree(call->print_fmt);
 		return -ENODEV;
@@ -1301,9 +1301,9 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 	ret = trace_add_event_call(call);
 	if (ret) {
 		pr_info("Failed to register kprobe event: %s\n",
-			ftrace_event_name(call));
+			trace_event_name(call));
 		kfree(call->print_fmt);
-		unregister_ftrace_event(&call->event);
+		unregister_trace_event(&call->event);
 	}
 	return ret;
 }
@@ -1364,10 +1364,10 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
 	return a1 + a2 + a3 + a4 + a5 + a6;
 }
 
-static struct ftrace_event_file *
+static struct trace_event_file *
 find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list)
 		if (file->event_call == &tk->tp.call)
@@ -1385,7 +1385,7 @@ static __init int kprobe_trace_self_tests_init(void)
 	int ret, warn = 0;
 	int (*target)(int, int, int, int, int, int);
 	struct trace_kprobe *tk;
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	if (tracing_is_disabled())
 		return -ENODEV;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 7a9ba62e9..638e110c5 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -298,7 +298,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_rw *rw)
 {
-	struct ftrace_event_call *call = &event_mmiotrace_rw;
+	struct trace_event_call *call = &event_mmiotrace_rw;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
@@ -328,7 +328,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_map *map)
 {
-	struct ftrace_event_call *call = &event_mmiotrace_map;
+	struct trace_event_call *call = &event_mmiotrace_map;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 25a086bcb..dfab25372 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -60,9 +60,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
 }
 
 const char *
-ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
-		       unsigned long flags,
-		       const struct trace_print_flags *flag_array)
+trace_print_flags_seq(struct trace_seq *p, const char *delim,
+		      unsigned long flags,
+		      const struct trace_print_flags *flag_array)
 {
 	unsigned long mask;
 	const char *str;
@@ -95,11 +95,11 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_flags_seq);
+EXPORT_SYMBOL(trace_print_flags_seq);
 
 const char *
-ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
-			 const struct trace_print_flags *symbol_array)
+trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+			const struct trace_print_flags *symbol_array)
 {
 	int i;
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -120,11 +120,11 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_symbols_seq);
+EXPORT_SYMBOL(trace_print_symbols_seq);
 
 #if BITS_PER_LONG == 32
 const char *
-ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
 			 const struct trace_print_flags_u64 *symbol_array)
 {
 	int i;
@@ -146,12 +146,12 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+EXPORT_SYMBOL(trace_print_symbols_seq_u64);
 #endif
 
 const char *
-ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
-			 unsigned int bitmask_size)
+trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+			unsigned int bitmask_size)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
 
@@ -160,10 +160,10 @@ ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
+EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
 
 const char *
-ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 {
 	int i;
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -175,11 +175,11 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_hex_seq);
+EXPORT_SYMBOL(trace_print_hex_seq);
 
 const char *
-ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
-		       size_t el_size)
+trace_print_array_seq(struct trace_seq *p, const void *buf, int count,
+		      size_t el_size)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
 	const char *prefix = "";
@@ -220,17 +220,17 @@ ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_array_seq);
+EXPORT_SYMBOL(trace_print_array_seq);
 
-int ftrace_raw_output_prep(struct trace_iterator *iter,
-			   struct trace_event *trace_event)
+int trace_raw_output_prep(struct trace_iterator *iter,
+			  struct trace_event *trace_event)
 {
-	struct ftrace_event_call *event;
+	struct trace_event_call *event;
 	struct trace_seq *s = &iter->seq;
 	struct trace_seq *p = &iter->tmp_seq;
 	struct trace_entry *entry;
 
-	event = container_of(trace_event, struct ftrace_event_call, event);
+	event = container_of(trace_event, struct trace_event_call, event);
 	entry = iter->ent;
 
 	if (entry->type != event->event.type) {
@@ -239,14 +239,14 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
 	}
 
 	trace_seq_init(p);
-	trace_seq_printf(s, "%s: ", ftrace_event_name(event));
+	trace_seq_printf(s, "%s: ", trace_event_name(event));
 
 	return trace_handle_return(s);
 }
-EXPORT_SYMBOL(ftrace_raw_output_prep);
+EXPORT_SYMBOL(trace_raw_output_prep);
 
-static int ftrace_output_raw(struct trace_iterator *iter, char *name,
-			     char *fmt, va_list ap)
+static int trace_output_raw(struct trace_iterator *iter, char *name,
+			    char *fmt, va_list ap)
 {
 	struct trace_seq *s = &iter->seq;
 
@@ -256,18 +256,18 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
 	return trace_handle_return(s);
 }
 
-int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
+int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
 {
 	va_list ap;
 	int ret;
 
 	va_start(ap, fmt);
-	ret = ftrace_output_raw(iter, name, fmt, ap);
+	ret = trace_output_raw(iter, name, fmt, ap);
 	va_end(ap);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(ftrace_output_call);
+EXPORT_SYMBOL_GPL(trace_output_call);
 
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
@@ -675,7 +675,7 @@ static int trace_search_list(struct list_head **list)
 	}
 
 	/* Did we used up all 65 thousand events??? */
-	if ((last + 1) > FTRACE_MAX_EVENT)
+	if ((last + 1) > TRACE_EVENT_TYPE_MAX)
 		return 0;
 
 	*list = &e->list;
@@ -693,7 +693,7 @@ void trace_event_read_unlock(void)
 }
 
 /**
- * register_ftrace_event - register output for an event type
+ * register_trace_event - register output for an event type
  * @event: the event type to register
  *
  * Event types are stored in a hash and this hash is used to
@@ -707,7 +707,7 @@ void trace_event_read_unlock(void)
  *
  * Returns the event type number or zero on error.
  */
-int register_ftrace_event(struct trace_event *event)
+int register_trace_event(struct trace_event *event)
 {
 	unsigned key;
 	int ret = 0;
@@ -725,7 +725,7 @@ int register_ftrace_event(struct trace_event *event)
 	if (!event->type) {
 		struct list_head *list = NULL;
 
-		if (next_event_type > FTRACE_MAX_EVENT) {
+		if (next_event_type > TRACE_EVENT_TYPE_MAX) {
 
 			event->type = trace_search_list(&list);
 			if (!event->type)
@@ -771,12 +771,12 @@ int register_ftrace_event(struct trace_event *event)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(register_ftrace_event);
+EXPORT_SYMBOL_GPL(register_trace_event);
 
 /*
  * Used by module code with the trace_event_sem held for write.
  */
-int __unregister_ftrace_event(struct trace_event *event)
+int __unregister_trace_event(struct trace_event *event)
 {
 	hlist_del(&event->node);
 	list_del(&event->list);
@@ -784,18 +784,18 @@ int __unregister_ftrace_event(struct trace_event *event)
 }
 
 /**
- * unregister_ftrace_event - remove a no longer used event
+ * unregister_trace_event - remove a no longer used event
  * @event: the event to remove
  */
-int unregister_ftrace_event(struct trace_event *event)
+int unregister_trace_event(struct trace_event *event)
 {
 	down_write(&trace_event_sem);
-	__unregister_ftrace_event(event);
+	__unregister_trace_event(event);
 	up_write(&trace_event_sem);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(unregister_ftrace_event);
+EXPORT_SYMBOL_GPL(unregister_trace_event);
 
 /*
  * Standard events
@@ -1243,7 +1243,7 @@ __init static int init_events(void)
 	for (i = 0; events[i]; i++) {
 		event = events[i];
 
-		ret = register_ftrace_event(event);
+		ret = register_trace_event(event);
 		if (!ret) {
 			printk(KERN_WARNING "event %d failed to register\n",
 			       event->type);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 8ef2c40ef..4cbfe85b9 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -32,7 +32,7 @@ extern int
 trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
 
 /* used by module unregistering */
-extern int __unregister_ftrace_event(struct trace_event *event);
+extern int __unregister_trace_event(struct trace_event *event);
 extern struct rw_semaphore trace_event_sem;
 
 #define SEQ_PUT_FIELD(s, x)				\
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index ab283e146..b98dee914 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -272,8 +272,8 @@ struct probe_arg {
 
 struct trace_probe {
 	unsigned int			flags;	/* For TP_FLAG_* */
-	struct ftrace_event_class	class;
-	struct ftrace_event_call	call;
+	struct trace_event_class	class;
+	struct trace_event_call		call;
 	struct list_head 		files;
 	ssize_t				size;	/* trace entry size */
 	unsigned int			nr_args;
@@ -281,7 +281,7 @@ struct trace_probe {
 };
 
 struct event_file_link {
-	struct ftrace_event_file	*file;
+	struct trace_event_file		*file;
 	struct list_head		list;
 };
 
@@ -314,7 +314,7 @@ static inline int is_good_name(const char *name)
 }
 
 static inline struct event_file_link *
-find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
+find_event_file_link(struct trace_probe *tp, struct trace_event_file *file)
 {
 	struct event_file_link *link;
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index d6e100372..9b33dd117 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -369,7 +369,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
 {
-	struct ftrace_event_call *call = &event_context_switch;
+	struct trace_event_call *call = &event_context_switch;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
@@ -397,7 +397,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *curr,
 			   unsigned long flags, int pc)
 {
-	struct ftrace_event_call *call = &event_wakeup;
+	struct trace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 287cf721c..b0f86ea77 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1039,15 +1039,10 @@ static int trace_wakeup_test_thread(void *data)
 {
 	/* Make this a -deadline thread */
 	static const struct sched_attr attr = {
-#ifdef CONFIG_SCHED_BFS
-		/* No deadline on BFS, use RR */
-		.sched_policy = SCHED_RR,
-#else
 		.sched_policy = SCHED_DEADLINE,
 		.sched_runtime = 100000ULL,
 		.sched_deadline = 10000000ULL,
 		.sched_period = 10000000ULL
-#endif
 	};
 	struct wakeup_test_data *x = data;
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f97f6e3a6..7d567a4b9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -13,13 +13,13 @@
 
 static DEFINE_MUTEX(syscall_trace_lock);
 
-static int syscall_enter_register(struct ftrace_event_call *event,
+static int syscall_enter_register(struct trace_event_call *event,
 				 enum trace_reg type, void *data);
-static int syscall_exit_register(struct ftrace_event_call *event,
+static int syscall_exit_register(struct trace_event_call *event,
 				 enum trace_reg type, void *data);
 
 static struct list_head *
-syscall_get_enter_fields(struct ftrace_event_call *call)
+syscall_get_enter_fields(struct trace_event_call *call)
 {
 	struct syscall_metadata *entry = call->data;
 
@@ -219,7 +219,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	return pos;
 }
 
-static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
+static int __init set_syscall_print_fmt(struct trace_event_call *call)
 {
 	char *print_fmt;
 	int len;
@@ -244,7 +244,7 @@ static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
 	return 0;
 }
 
-static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
+static void __init free_syscall_print_fmt(struct trace_event_call *call)
 {
 	struct syscall_metadata *entry = call->data;
 
@@ -252,7 +252,7 @@ static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
 		kfree(call->print_fmt);
 }
 
-static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
+static int __init syscall_enter_define_fields(struct trace_event_call *call)
 {
 	struct syscall_trace_enter trace;
 	struct syscall_metadata *meta = call->data;
@@ -275,7 +275,7 @@ static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
 	return ret;
 }
 
-static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
+static int __init syscall_exit_define_fields(struct trace_event_call *call)
 {
 	struct syscall_trace_exit trace;
 	int ret;
@@ -293,7 +293,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 {
 	struct trace_array *tr = data;
-	struct ftrace_event_file *ftrace_file;
+	struct trace_event_file *trace_file;
 	struct syscall_trace_enter *entry;
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
@@ -308,11 +308,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 		return;
 
 	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
-	ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
-	if (!ftrace_file)
+	trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+	if (!trace_file)
 		return;
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -334,14 +334,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	entry->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
 
-	event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+	event_trigger_unlock_commit(trace_file, buffer, event, entry,
 				    irq_flags, pc);
 }
 
 static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 {
 	struct trace_array *tr = data;
-	struct ftrace_event_file *ftrace_file;
+	struct trace_event_file *trace_file;
 	struct syscall_trace_exit *entry;
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
@@ -355,11 +355,11 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 		return;
 
 	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
-	ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
-	if (!ftrace_file)
+	trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+	if (!trace_file)
 		return;
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -380,12 +380,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	entry->nr = syscall_nr;
 	entry->ret = syscall_get_return_value(current, regs);
 
-	event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+	event_trigger_unlock_commit(trace_file, buffer, event, entry,
 				    irq_flags, pc);
 }
 
-static int reg_event_syscall_enter(struct ftrace_event_file *file,
-				   struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct trace_event_file *file,
+				   struct trace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
 	int ret = 0;
@@ -405,8 +405,8 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,
 	return ret;
 }
 
-static void unreg_event_syscall_enter(struct ftrace_event_file *file,
-				      struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct trace_event_file *file,
+				      struct trace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
 	int num;
@@ -422,8 +422,8 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static int reg_event_syscall_exit(struct ftrace_event_file *file,
-				  struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct trace_event_file *file,
+				  struct trace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
 	int ret = 0;
@@ -443,8 +443,8 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,
 	return ret;
 }
 
-static void unreg_event_syscall_exit(struct ftrace_event_file *file,
-				     struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct trace_event_file *file,
+				     struct trace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
 	int num;
@@ -460,7 +460,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static int __init init_syscall_trace(struct ftrace_event_call *call)
+static int __init init_syscall_trace(struct trace_event_call *call)
 {
 	int id;
 	int num;
@@ -493,7 +493,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
 	.trace		= print_syscall_exit,
 };
 
-struct ftrace_event_class __refdata event_class_syscall_enter = {
+struct trace_event_class __refdata event_class_syscall_enter = {
 	.system		= "syscalls",
 	.reg		= syscall_enter_register,
 	.define_fields	= syscall_enter_define_fields,
@@ -501,7 +501,7 @@ struct ftrace_event_class __refdata event_class_syscall_enter = {
 	.raw_init	= init_syscall_trace,
 };
 
-struct ftrace_event_class __refdata event_class_syscall_exit = {
+struct trace_event_class __refdata event_class_syscall_exit = {
 	.system		= "syscalls",
 	.reg		= syscall_exit_register,
 	.define_fields	= syscall_exit_define_fields,
@@ -584,7 +584,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 
-static int perf_sysenter_enable(struct ftrace_event_call *call)
+static int perf_sysenter_enable(struct trace_event_call *call)
 {
 	int ret = 0;
 	int num;
@@ -605,7 +605,7 @@ static int perf_sysenter_enable(struct ftrace_event_call *call)
 	return ret;
 }
 
-static void perf_sysenter_disable(struct ftrace_event_call *call)
+static void perf_sysenter_disable(struct trace_event_call *call)
 {
 	int num;
 
@@ -656,7 +656,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 
-static int perf_sysexit_enable(struct ftrace_event_call *call)
+static int perf_sysexit_enable(struct trace_event_call *call)
 {
 	int ret = 0;
 	int num;
@@ -677,7 +677,7 @@ static int perf_sysexit_enable(struct ftrace_event_call *call)
 	return ret;
 }
 
-static void perf_sysexit_disable(struct ftrace_event_call *call)
+static void perf_sysexit_disable(struct trace_event_call *call)
 {
 	int num;
 
@@ -693,10 +693,10 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
 
 #endif /* CONFIG_PERF_EVENTS */
 
-static int syscall_enter_register(struct ftrace_event_call *event,
+static int syscall_enter_register(struct trace_event_call *event,
 				 enum trace_reg type, void *data)
 {
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
@@ -721,10 +721,10 @@ static int syscall_enter_register(struct ftrace_event_call *event,
 	return 0;
 }
 
-static int syscall_exit_register(struct ftrace_event_call *event,
+static int syscall_exit_register(struct trace_event_call *event,
 				 enum trace_reg type, void *data)
 {
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 6dd022c7b..aa1ea7b36 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -293,7 +293,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
 	struct trace_uprobe *tu;
 
 	list_for_each_entry(tu, &uprobe_list, list)
-		if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
+		if (strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
 		    strcmp(tu->tp.call.class->system, group) == 0)
 			return tu;
 
@@ -323,7 +323,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
 	mutex_lock(&uprobe_lock);
 
 	/* register as an event */
-	old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
+	old_tu = find_probe_event(trace_event_name(&tu->tp.call),
 			tu->tp.call.class->system);
 	if (old_tu) {
 		/* delete old event */
@@ -600,7 +600,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	int i;
 
 	seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
-			ftrace_event_name(&tu->tp.call));
+			trace_event_name(&tu->tp.call));
 	seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
 
 	for (i = 0; i < tu->tp.nr_args; i++)
@@ -651,7 +651,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 	struct trace_uprobe *tu = v;
 
 	seq_printf(m, "  %s %-44s %15lu\n", tu->filename,
-			ftrace_event_name(&tu->tp.call), tu->nhit);
+			trace_event_name(&tu->tp.call), tu->nhit);
 	return 0;
 }
 
@@ -770,26 +770,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
 static void __uprobe_trace_func(struct trace_uprobe *tu,
 				unsigned long func, struct pt_regs *regs,
 				struct uprobe_cpu_buffer *ucb, int dsize,
-				struct ftrace_event_file *ftrace_file)
+				struct trace_event_file *trace_file)
 {
 	struct uprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
 	void *data;
 	int size, esize;
-	struct ftrace_event_call *call = &tu->tp.call;
+	struct trace_event_call *call = &tu->tp.call;
 
-	WARN_ON(call != ftrace_file->event_call);
+	WARN_ON(call != trace_file->event_call);
 
 	if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
 		return;
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	size = esize + tu->tp.size + dsize;
-	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
 						call->event.type, size, 0, 0);
 	if (!event)
 		return;
@@ -806,7 +806,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
 	memcpy(data, ucb->buf, tu->tp.size + dsize);
 
-	event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
+	event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
 }
 
 /* uprobe handler */
@@ -853,12 +853,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
 
 	if (is_ret_probe(tu)) {
 		trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
-				 ftrace_event_name(&tu->tp.call),
+				 trace_event_name(&tu->tp.call),
 				 entry->vaddr[1], entry->vaddr[0]);
 		data = DATAOF_TRACE_ENTRY(entry, true);
 	} else {
 		trace_seq_printf(s, "%s: (0x%lx)",
-				 ftrace_event_name(&tu->tp.call),
+				 trace_event_name(&tu->tp.call),
 				 entry->vaddr[0]);
 		data = DATAOF_TRACE_ENTRY(entry, false);
 	}
@@ -881,7 +881,7 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
 				struct mm_struct *mm);
 
 static int
-probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
+probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
 		   filter_func_t filter)
 {
 	bool enabled = trace_probe_is_enabled(&tu->tp);
@@ -938,7 +938,7 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
 }
 
 static void
-probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
+probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
 {
 	if (!trace_probe_is_enabled(&tu->tp))
 		return;
@@ -967,7 +967,7 @@ probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
 	uprobe_buffer_disable();
 }
 
-static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int uprobe_event_define_fields(struct trace_event_call *event_call)
 {
 	int ret, i, size;
 	struct uprobe_trace_entry_head field;
@@ -1093,7 +1093,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 			       unsigned long func, struct pt_regs *regs,
 			       struct uprobe_cpu_buffer *ucb, int dsize)
 {
-	struct ftrace_event_call *call = &tu->tp.call;
+	struct trace_event_call *call = &tu->tp.call;
 	struct uprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	void *data;
@@ -1159,11 +1159,11 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
 #endif	/* CONFIG_PERF_EVENTS */
 
 static int
-trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
+trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
 		      void *data)
 {
 	struct trace_uprobe *tu = event->data;
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
@@ -1272,10 +1272,10 @@ static struct trace_event_functions uprobe_funcs = {
 
 static int register_uprobe_event(struct trace_uprobe *tu)
 {
-	struct ftrace_event_call *call = &tu->tp.call;
+	struct trace_event_call *call = &tu->tp.call;
 	int ret;
 
-	/* Initialize ftrace_event_call */
+	/* Initialize trace_event_call */
 	INIT_LIST_HEAD(&call->class->fields);
 	call->event.funcs = &uprobe_funcs;
 	call->class->define_fields = uprobe_event_define_fields;
@@ -1283,7 +1283,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 	if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
 		return -ENOMEM;
 
-	ret = register_ftrace_event(&call->event);
+	ret = register_trace_event(&call->event);
 	if (!ret) {
 		kfree(call->print_fmt);
 		return -ENODEV;
@@ -1295,9 +1295,9 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 
 	if (ret) {
 		pr_info("Failed to register uprobe event: %s\n",
-			ftrace_event_name(call));
+			trace_event_name(call));
 		kfree(call->print_fmt);
-		unregister_ftrace_event(&call->event);
+		unregister_trace_event(&call->event);
 	}
 
 	return ret;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 581a68a04..a6ffa43f2 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,6 +19,7 @@
 #include <linux/sysctl.h>
 #include <linux/smpboot.h>
 #include <linux/sched/rt.h>
+#include <linux/tick.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
@@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace;
 #else
 #define sysctl_softlockup_all_cpu_backtrace 0
 #endif
+static struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
 static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
@@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void)
 	 * do we care if a 0 races with a timestamp?
 	 * all it means is the softlock check starts one cycle later
 	 */
-	for_each_online_cpu(cpu)
+	for_each_watchdog_cpu(cpu)
 		per_cpu(watchdog_touch_ts, cpu) = 0;
 }
 
@@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void)
 		goto unlock;
 
 	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_watchdog_cpu(cpu)
 		watchdog_nmi_enable(cpu);
 	put_online_cpus();
 
@@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void)
 		goto unlock;
 
 	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_watchdog_cpu(cpu)
 		watchdog_nmi_disable(cpu);
 	put_online_cpus();
 
@@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void)
 	int cpu;
 
 	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_watchdog_cpu(cpu)
 		update_watchdog(cpu);
 	put_online_cpus();
 }
@@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void)
 		err = smpboot_register_percpu_thread(&watchdog_threads);
 		if (err)
 			pr_err("Failed to create watchdog threads, disabled\n");
-		else
+		else {
+			if (smpboot_update_cpumask_percpu_thread(
+				    &watchdog_threads, &watchdog_cpumask))
+				pr_err("Failed to set cpumask for watchdog threads\n");
 			watchdog_running = 1;
+		}
 	} else {
 		/*
 		 * Enable/disable the lockup detectors or
@@ -879,12 +890,58 @@ out:
 	mutex_unlock(&watchdog_proc_mutex);
 	return err;
 }
+
+/*
+ * The cpumask is the mask of possible cpus that the watchdog can run
+ * on, not the mask of cpus it is actually running on.  This allows the
+ * user to specify a mask that will include cpus that have not yet
+ * been brought online, if desired.
+ */
+int proc_watchdog_cpumask(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int err;
+
+	mutex_lock(&watchdog_proc_mutex);
+	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+	if (!err && write) {
+		/* Remove impossible cpus to keep sysctl output cleaner. */
+		cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
+			    cpu_possible_mask);
+
+		if (watchdog_running) {
+			/*
+			 * Failure would be due to being unable to allocate
+			 * a temporary cpumask, so we are likely not in a
+			 * position to do much else to make things better.
+			 */
+			if (smpboot_update_cpumask_percpu_thread(
+				    &watchdog_threads, &watchdog_cpumask) != 0)
+				pr_err("cpumask update failed\n");
+		}
+	}
+	mutex_unlock(&watchdog_proc_mutex);
+	return err;
+}
+
 #endif /* CONFIG_SYSCTL */
 
 void __init lockup_detector_init(void)
 {
 	set_sample_period();
 
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled()) {
+		if (!cpumask_empty(tick_nohz_full_mask))
+			pr_info("Disabling watchdog on nohz_full cores by default\n");
+		cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
+			       tick_nohz_full_mask);
+	} else
+		cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#else
+	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#endif
+
 	if (watchdog_enabled)
 		watchdog_enable_all_cpus();
 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 586ad9130..a413acb59 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -127,6 +127,11 @@ enum {
  *
  * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
  *
+ * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
+ *
+ * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
+ *      sched-RCU for reads.
+ *
  * WQ: wq->mutex protected.
  *
  * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
@@ -247,8 +252,8 @@ struct workqueue_struct {
 	int			nr_drainers;	/* WQ: drain in progress */
 	int			saved_max_active; /* WQ: saved pwq max_active */
 
-	struct workqueue_attrs	*unbound_attrs;	/* WQ: only for unbound wqs */
-	struct pool_workqueue	*dfl_pwq;	/* WQ: only for unbound wqs */
+	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
+	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */
 
 #ifdef CONFIG_SYSFS
 	struct wq_device	*wq_dev;	/* I: for sysfs interface */
@@ -268,7 +273,7 @@ struct workqueue_struct {
 	/* hot fields used during command issue, aligned to cacheline */
 	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
 	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
-	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
+	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
 };
 
 static struct kmem_cache *pwq_cache;
@@ -280,12 +285,7 @@ static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
 /* see the comment above the definition of WQ_POWER_EFFICIENT */
-#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
-static bool wq_power_efficient = true;
-#else
-static bool wq_power_efficient;
-#endif
-
+static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
 module_param_named(power_efficient, wq_power_efficient, bool, 0444);
 
 static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */
@@ -299,6 +299,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
+static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -330,8 +332,6 @@ struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
 
 static int worker_thread(void *__worker);
-static void copy_workqueue_attrs(struct workqueue_attrs *to,
-				 const struct workqueue_attrs *from);
 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 
 #define CREATE_TRACE_POINTS
@@ -347,6 +347,12 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 			   lockdep_is_held(&wq->mutex),			\
 			   "sched RCU or wq->mutex should be held")
 
+#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
+	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
+			   lockdep_is_held(&wq->mutex) ||		\
+			   lockdep_is_held(&wq_pool_mutex),		\
+			   "sched RCU, wq->mutex or wq_pool_mutex should be held")
+
 #define for_each_cpu_worker_pool(pool, cpu)				\
 	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
 	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -551,7 +557,8 @@ static int worker_pool_assign_id(struct worker_pool *pool)
  * @wq: the target workqueue
  * @node: the node ID
  *
- * This must be called either with pwq_lock held or sched RCU read locked.
+ * This must be called with wq_pool_mutex or wq->mutex held, or with sched
+ * RCU read locked.
  * If the pwq needs to be used beyond the locking in effect, the caller is
  * responsible for guaranteeing that the pwq stays online.
  *
@@ -560,7 +567,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
 static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
 						  int node)
 {
-	assert_rcu_or_wq_mutex(wq);
+	assert_rcu_or_wq_mutex_or_pool_mutex(wq);
 	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
 }
 
@@ -976,7 +983,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
  * move_linked_works - move linked works to a list
  * @work: start of series of works to be scheduled
  * @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
+ * @nextp: out parameter for nested worklist walking
  *
  * Schedule linked works starting from @work to @head.  Work series to
  * be scheduled starts at @work and includes any consecutive work with
@@ -2607,7 +2614,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 out_unlock:
 	mutex_unlock(&wq->mutex);
 }
-EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue);
 
 /**
  * drain_workqueue - drain a workqueue
@@ -2616,7 +2623,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
  * Wait until the workqueue becomes empty.  While draining is in progress,
  * only chain queueing is allowed.  IOW, only currently pending or running
  * work items on @wq can queue further work items on it.  @wq is flushed
- * repeatedly until it becomes empty.  The number of flushing is detemined
+ * repeatedly until it becomes empty.  The number of flushing is determined
  * by the depth of chaining and should be relatively short.  Whine if it
  * takes too long.
  */
@@ -2947,36 +2954,6 @@ int schedule_on_each_cpu(work_func_t func)
 }
 
 /**
- * flush_scheduled_work - ensure that any scheduled work has run to completion.
- *
- * Forces execution of the kernel-global workqueue and blocks until its
- * completion.
- *
- * Think twice before calling this function!  It's very easy to get into
- * trouble if you don't take great care.  Either of the following situations
- * will lead to deadlock:
- *
- *	One of the work items currently on the workqueue needs to acquire
- *	a lock held by your code or its caller.
- *
- *	Your code is running in the context of a work routine.
- *
- * They will be detected by lockdep when they occur, but the first might not
- * occur very often.  It depends on what work items are on the workqueue and
- * what locks they need, which you have no control over.
- *
- * In most situations flushing the entire workqueue is overkill; you merely
- * need to know that a particular work item isn't queued and isn't running.
- * In such cases you should use cancel_delayed_work_sync() or
- * cancel_work_sync() instead.
- */
-void flush_scheduled_work(void)
-{
-	flush_workqueue(system_wq);
-}
-EXPORT_SYMBOL(flush_scheduled_work);
-
-/**
  * execute_in_process_context - reliably execute the routine with user context
  * @fn:		the function to execute
  * @ew:		guaranteed storage for the execute work structure (must
@@ -3081,7 +3058,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
  * init_worker_pool - initialize a newly zalloc'd worker_pool
  * @pool: worker_pool to initialize
  *
- * Initiailize a newly zalloc'd @pool.  It also allocates @pool->attrs.
+ * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
  *
  * Return: 0 on success, -errno on failure.  Even on failure, all fields
  * inside @pool proper are initialized and put_unbound_pool() can be called
@@ -3425,20 +3402,9 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
 	return pwq;
 }
 
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
-	lockdep_assert_held(&wq_pool_mutex);
-
-	if (pwq) {
-		put_unbound_pool(pwq->pool);
-		kmem_cache_free(pwq_cache, pwq);
-	}
-}
-
 /**
- * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
- * @attrs: the wq_attrs of interest
+ * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of the default pwq of the target workqueue
  * @node: the target NUMA node
  * @cpu_going_down: if >= 0, the CPU to consider as offline
  * @cpumask: outarg, the resulting cpumask
@@ -3488,6 +3454,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 {
 	struct pool_workqueue *old_pwq;
 
+	lockdep_assert_held(&wq_pool_mutex);
 	lockdep_assert_held(&wq->mutex);
 
 	/* link_pwq() can handle duplicate calls */
@@ -3498,46 +3465,59 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 	return old_pwq;
 }
 
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on.  Older pwqs are released as in-flight work
- * items finish.  Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.
- *
- * Return: 0 on success and -errno on failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
-			  const struct workqueue_attrs *attrs)
+/* context to store the prepared attrs & pwqs before applying */
+struct apply_wqattrs_ctx {
+	struct workqueue_struct	*wq;		/* target workqueue */
+	struct workqueue_attrs	*attrs;		/* attrs to apply */
+	struct list_head	list;		/* queued for batching commit */
+	struct pool_workqueue	*dfl_pwq;
+	struct pool_workqueue	*pwq_tbl[];
+};
+
+/* free the resources after success or abort */
+static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
+{
+	if (ctx) {
+		int node;
+
+		for_each_node(node)
+			put_pwq_unlocked(ctx->pwq_tbl[node]);
+		put_pwq_unlocked(ctx->dfl_pwq);
+
+		free_workqueue_attrs(ctx->attrs);
+
+		kfree(ctx);
+	}
+}
+
+/* allocate the attrs and pwqs for later installation */
+static struct apply_wqattrs_ctx *
+apply_wqattrs_prepare(struct workqueue_struct *wq,
+		      const struct workqueue_attrs *attrs)
 {
+	struct apply_wqattrs_ctx *ctx;
 	struct workqueue_attrs *new_attrs, *tmp_attrs;
-	struct pool_workqueue **pwq_tbl, *dfl_pwq;
-	int node, ret;
+	int node;
 
-	/* only unbound workqueues can change attributes */
-	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
-		return -EINVAL;
+	lockdep_assert_held(&wq_pool_mutex);
 
-	/* creating multiple pwqs breaks ordering guarantee */
-	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
-		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+		      GFP_KERNEL);
 
-	pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-	if (!pwq_tbl || !new_attrs || !tmp_attrs)
-		goto enomem;
+	if (!ctx || !new_attrs || !tmp_attrs)
+		goto out_free;
 
-	/* make a copy of @attrs and sanitize it */
+	/*
+	 * Calculate the attrs of the default pwq.
+	 * If the user configured cpumask doesn't overlap with the
+	 * wq_unbound_cpumask, we fall back to the wq_unbound_cpumask.
+	 */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+	if (unlikely(cpumask_empty(new_attrs->cpumask)))
+		cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3547,75 +3527,129 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	copy_workqueue_attrs(tmp_attrs, new_attrs);
 
 	/*
-	 * CPUs should stay stable across pwq creations and installations.
-	 * Pin CPUs, determine the target cpumask for each node and create
-	 * pwqs accordingly.
-	 */
-	get_online_cpus();
-
-	mutex_lock(&wq_pool_mutex);
-
-	/*
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
 	 */
-	dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
-	if (!dfl_pwq)
-		goto enomem_pwq;
+	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (!ctx->dfl_pwq)
+		goto out_free;
 
 	for_each_node(node) {
-		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
-			pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
-			if (!pwq_tbl[node])
-				goto enomem_pwq;
+		if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
+			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+			if (!ctx->pwq_tbl[node])
+				goto out_free;
 		} else {
-			dfl_pwq->refcnt++;
-			pwq_tbl[node] = dfl_pwq;
+			ctx->dfl_pwq->refcnt++;
+			ctx->pwq_tbl[node] = ctx->dfl_pwq;
 		}
 	}
 
-	mutex_unlock(&wq_pool_mutex);
+	/* save the user configured attrs and sanitize it. */
+	copy_workqueue_attrs(new_attrs, attrs);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	ctx->attrs = new_attrs;
+
+	ctx->wq = wq;
+	free_workqueue_attrs(tmp_attrs);
+	return ctx;
+
+out_free:
+	free_workqueue_attrs(tmp_attrs);
+	free_workqueue_attrs(new_attrs);
+	apply_wqattrs_cleanup(ctx);
+	return NULL;
+}
+
+/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
+static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
+{
+	int node;
 
 	/* all pwqs have been created successfully, let's install'em */
-	mutex_lock(&wq->mutex);
+	mutex_lock(&ctx->wq->mutex);
 
-	copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+	copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
 
 	/* save the previous pwq and install the new one */
 	for_each_node(node)
-		pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+		ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+							  ctx->pwq_tbl[node]);
 
 	/* @dfl_pwq might not have been used, ensure it's linked */
-	link_pwq(dfl_pwq);
-	swap(wq->dfl_pwq, dfl_pwq);
+	link_pwq(ctx->dfl_pwq);
+	swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
 
-	mutex_unlock(&wq->mutex);
+	mutex_unlock(&ctx->wq->mutex);
+}
 
-	/* put the old pwqs */
-	for_each_node(node)
-		put_pwq_unlocked(pwq_tbl[node]);
-	put_pwq_unlocked(dfl_pwq);
+static void apply_wqattrs_lock(void)
+{
+	/* CPUs should stay stable across pwq creations and installations */
+	get_online_cpus();
+	mutex_lock(&wq_pool_mutex);
+}
 
+static void apply_wqattrs_unlock(void)
+{
+	mutex_unlock(&wq_pool_mutex);
 	put_online_cpus();
-	ret = 0;
-	/* fall through */
-out_free:
-	free_workqueue_attrs(tmp_attrs);
-	free_workqueue_attrs(new_attrs);
-	kfree(pwq_tbl);
+}
+
+static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
+					const struct workqueue_attrs *attrs)
+{
+	struct apply_wqattrs_ctx *ctx;
+	int ret = -ENOMEM;
+
+	/* only unbound workqueues can change attributes */
+	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+		return -EINVAL;
+
+	/* creating multiple pwqs breaks ordering guarantee */
+	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+		return -EINVAL;
+
+	ctx = apply_wqattrs_prepare(wq, attrs);
+
+	/* the ctx has been prepared successfully, let's commit it */
+	if (ctx) {
+		apply_wqattrs_commit(ctx);
+		ret = 0;
+	}
+
+	apply_wqattrs_cleanup(ctx);
+
 	return ret;
+}
 
-enomem_pwq:
-	free_unbound_pwq(dfl_pwq);
-	for_each_node(node)
-		if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
-			free_unbound_pwq(pwq_tbl[node]);
-	mutex_unlock(&wq_pool_mutex);
-	put_online_cpus();
-enomem:
-	ret = -ENOMEM;
-	goto out_free;
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possible CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+			  const struct workqueue_attrs *attrs)
+{
+	int ret;
+
+	apply_wqattrs_lock();
+	ret = apply_workqueue_attrs_locked(wq, attrs);
+	apply_wqattrs_unlock();
+
+	return ret;
 }
 
 /**
@@ -3651,7 +3685,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 
 	lockdep_assert_held(&wq_pool_mutex);
 
-	if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+	if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
+	    wq->unbound_attrs->no_numa)
 		return;
 
 	/*
@@ -3662,48 +3697,37 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	target_attrs = wq_update_unbound_numa_attrs_buf;
 	cpumask = target_attrs->cpumask;
 
-	mutex_lock(&wq->mutex);
-	if (wq->unbound_attrs->no_numa)
-		goto out_unlock;
-
 	copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
 	pwq = unbound_pwq_by_node(wq, node);
 
 	/*
 	 * Let's determine what needs to be done.  If the target cpumask is
-	 * different from wq's, we need to compare it to @pwq's and create
-	 * a new one if they don't match.  If the target cpumask equals
-	 * wq's, the default pwq should be used.
+	 * different from the default pwq's, we need to compare it to @pwq's
+	 * and create a new one if they don't match.  If the target cpumask
+	 * equals the default pwq's, the default pwq should be used.
 	 */
-	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+	if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
 		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
-			goto out_unlock;
+			return;
 	} else {
 		goto use_dfl_pwq;
 	}
 
-	mutex_unlock(&wq->mutex);
-
 	/* create a new pwq */
 	pwq = alloc_unbound_pwq(wq, target_attrs);
 	if (!pwq) {
 		pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
 			wq->name);
-		mutex_lock(&wq->mutex);
 		goto use_dfl_pwq;
 	}
 
-	/*
-	 * Install the new pwq.  As this function is called only from CPU
-	 * hotplug callbacks and applying a new attrs is wrapped with
-	 * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
-	 * inbetween.
-	 */
+	/* Install the new pwq. */
 	mutex_lock(&wq->mutex);
 	old_pwq = numa_pwq_tbl_install(wq, node, pwq);
 	goto out_unlock;
 
 use_dfl_pwq:
+	mutex_lock(&wq->mutex);
 	spin_lock_irq(&wq->dfl_pwq->pool->lock);
 	get_pwq(wq->dfl_pwq);
 	spin_unlock_irq(&wq->dfl_pwq->pool->lock);
@@ -4385,7 +4409,7 @@ static void rebind_workers(struct worker_pool *pool)
 	/*
 	 * Restore CPU affinity of all workers.  As all idle workers should
 	 * be on the run-queue of the associated CPU before any local
-	 * wake-ups for concurrency management happen, restore CPU affinty
+	 * wake-ups for concurrency management happen, restore CPU affinity
 	 * of all workers first and then clear UNBOUND.  As we're called
 	 * from CPU_ONLINE, the following shouldn't fail.
 	 */
@@ -4698,6 +4722,82 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
+static int workqueue_apply_unbound_cpumask(void)
+{
+	LIST_HEAD(ctxs);
+	int ret = 0;
+	struct workqueue_struct *wq;
+	struct apply_wqattrs_ctx *ctx, *n;
+
+	lockdep_assert_held(&wq_pool_mutex);
+
+	list_for_each_entry(wq, &workqueues, list) {
+		if (!(wq->flags & WQ_UNBOUND))
+			continue;
+		/* creating multiple pwqs breaks ordering guarantee */
+		if (wq->flags & __WQ_ORDERED)
+			continue;
+
+		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+		if (!ctx) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		list_add_tail(&ctx->list, &ctxs);
+	}
+
+	list_for_each_entry_safe(ctx, n, &ctxs, list) {
+		if (!ret)
+			apply_wqattrs_commit(ctx);
+		apply_wqattrs_cleanup(ctx);
+	}
+
+	return ret;
+}
+
+/**
+ *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ *  @cpumask: the cpumask to set
+ *
+ *  The low-level workqueues cpumask is a global cpumask that limits
+ *  the affinity of all unbound workqueues.  This function checks the @cpumask,
+ *  applies it to all unbound workqueues and updates all of their pwqs.
+ *
+ *  Return:	0	- Success
+ *  		-EINVAL	- Invalid @cpumask
+ *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+	int ret = -EINVAL;
+	cpumask_var_t saved_cpumask;
+
+	if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_and(cpumask, cpumask, cpu_possible_mask);
+	if (!cpumask_empty(cpumask)) {
+		apply_wqattrs_lock();
+
+		/* save the old wq_unbound_cpumask. */
+		cpumask_copy(saved_cpumask, wq_unbound_cpumask);
+
+		/* update wq_unbound_cpumask first, then apply it to wqs. */
+		cpumask_copy(wq_unbound_cpumask, cpumask);
+		ret = workqueue_apply_unbound_cpumask();
+
+		/* restore the wq_unbound_cpumask when failed. */
+		if (ret < 0)
+			cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+
+		apply_wqattrs_unlock();
+	}
+
+	free_cpumask_var(saved_cpumask);
+	return ret;
+}
+
 #ifdef CONFIG_SYSFS
 /*
  * Workqueues with WQ_SYSFS flag set is visible to userland via
@@ -4802,13 +4902,13 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
 {
 	struct workqueue_attrs *attrs;
 
+	lockdep_assert_held(&wq_pool_mutex);
+
 	attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	if (!attrs)
 		return NULL;
 
-	mutex_lock(&wq->mutex);
 	copy_workqueue_attrs(attrs, wq->unbound_attrs);
-	mutex_unlock(&wq->mutex);
 	return attrs;
 }
 
@@ -4817,18 +4917,22 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
 {
 	struct workqueue_struct *wq = dev_to_wq(dev);
 	struct workqueue_attrs *attrs;
-	int ret;
+	int ret = -ENOMEM;
+
+	apply_wqattrs_lock();
 
 	attrs = wq_sysfs_prep_attrs(wq);
 	if (!attrs)
-		return -ENOMEM;
+		goto out_unlock;
 
 	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
 	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
-		ret = apply_workqueue_attrs(wq, attrs);
+		ret = apply_workqueue_attrs_locked(wq, attrs);
 	else
 		ret = -EINVAL;
 
+out_unlock:
+	apply_wqattrs_unlock();
 	free_workqueue_attrs(attrs);
 	return ret ?: count;
 }
@@ -4852,16 +4956,20 @@ static ssize_t wq_cpumask_store(struct device *dev,
 {
 	struct workqueue_struct *wq = dev_to_wq(dev);
 	struct workqueue_attrs *attrs;
-	int ret;
+	int ret = -ENOMEM;
+
+	apply_wqattrs_lock();
 
 	attrs = wq_sysfs_prep_attrs(wq);
 	if (!attrs)
-		return -ENOMEM;
+		goto out_unlock;
 
 	ret = cpumask_parse(buf, attrs->cpumask);
 	if (!ret)
-		ret = apply_workqueue_attrs(wq, attrs);
+		ret = apply_workqueue_attrs_locked(wq, attrs);
 
+out_unlock:
+	apply_wqattrs_unlock();
 	free_workqueue_attrs(attrs);
 	return ret ?: count;
 }
@@ -4885,18 +4993,22 @@ static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
 {
 	struct workqueue_struct *wq = dev_to_wq(dev);
 	struct workqueue_attrs *attrs;
-	int v, ret;
+	int v, ret = -ENOMEM;
+
+	apply_wqattrs_lock();
 
 	attrs = wq_sysfs_prep_attrs(wq);
 	if (!attrs)
-		return -ENOMEM;
+		goto out_unlock;
 
 	ret = -EINVAL;
 	if (sscanf(buf, "%d", &v) == 1) {
 		attrs->no_numa = !v;
-		ret = apply_workqueue_attrs(wq, attrs);
+		ret = apply_workqueue_attrs_locked(wq, attrs);
 	}
 
+out_unlock:
+	apply_wqattrs_unlock();
 	free_workqueue_attrs(attrs);
 	return ret ?: count;
 }
@@ -4914,9 +5026,49 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	int written;
+
+	mutex_lock(&wq_pool_mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq_unbound_cpumask));
+	mutex_unlock(&wq_pool_mutex);
+
+	return written;
+}
+
+static ssize_t wq_unbound_cpumask_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	cpumask_var_t cpumask;
+	int ret;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, cpumask);
+	if (!ret)
+		ret = workqueue_set_unbound_cpumask(cpumask);
+
+	free_cpumask_var(cpumask);
+	return ret ? ret : count;
+}
+
+static struct device_attribute wq_sysfs_cpumask_attr =
+	__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
+	       wq_unbound_cpumask_store);
+
 static int __init wq_sysfs_init(void)
 {
-	return subsys_virtual_register(&wq_subsys, NULL);
+	int err;
+
+	err = subsys_virtual_register(&wq_subsys, NULL);
+	if (err)
+		return err;
+
+	return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
 }
 core_initcall(wq_sysfs_init);
 
@@ -4948,7 +5100,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
 	int ret;
 
 	/*
-	 * Adjusting max_active or creating new pwqs by applyting
+	 * Adjusting max_active or creating new pwqs by applying
 	 * attributes breaks ordering guarantee.  Disallow exposing ordered
 	 * workqueues.
 	 */
@@ -5064,6 +5216,9 @@ static int __init init_workqueues(void)
 
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
+	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);