author | André Fabian Silva Delgado &lt;emulatorman@parabola.nu&gt; | 2016-03-25 03:53:42 -0300 |
---|---|---|
committer | André Fabian Silva Delgado &lt;emulatorman@parabola.nu&gt; | 2016-03-25 03:53:42 -0300 |
commit | 03dd4cb26d967f9588437b0fc9cc0e8353322bb7 (patch) | |
tree | fa581f6dc1c0596391690d1f67eceef3af8246dc /kernel | |
parent | d4e493caf788ef44982e131ff9c786546904d934 (diff) | |
Linux-libre 4.5-gnu
Diffstat (limited to 'kernel')
155 files changed, 5394 insertions, 21496 deletions
diff --git a/kernel/async.c b/kernel/async.c index 4c3773c0b..d2edd6efe 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -326,3 +326,4 @@ bool current_is_async(void) return worker && worker->current_func == async_run_entry_fn; } +EXPORT_SYMBOL_GPL(current_is_async); diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354..3a3e5deed 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -110,7 +110,6 @@ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; -static u32 audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ kuid_t audit_sig_uid = INVALID_UID; @@ -509,8 +508,7 @@ static void flush_hold_queue(void) * if auditd just disappeared but we * dequeued an skb we need to drop ref */ - if (skb) - consume_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) @@ -524,7 +522,8 @@ static int kauditd_thread(void *dummy) skb = skb_dequeue(&audit_skb_queue); if (skb) { - if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit) + if (!audit_backlog_limit || + (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) wake_up(&audit_backlog_wait); if (audit_pid) kauditd_send_skb(skb); @@ -1232,9 +1231,7 @@ static void audit_buffer_free(struct audit_buffer *ab) if (!ab) return; - if (ab->skb) - kfree_skb(ab->skb); - + kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); @@ -1372,7 +1369,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; if (gfp_mask & __GFP_DIRECT_RECLAIM) { - if (audit_pid && audit_pid == current->pid) + if (audit_pid && audit_pid == current->tgid) gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; @@ -1395,12 +1392,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = audit_backlog_wait_overflow; + audit_backlog_wait_time = 0; wake_up(&audit_backlog_wait); return NULL; } - if (!reserve) + if (!reserve && !audit_backlog_wait_time) audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); @@ -1722,7 +1719,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, /* Copy inode data into an audit_names. 
*/ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) + struct inode *inode) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; diff --git a/kernel/audit.h b/kernel/audit.h index de6cbb7cf..cbbe6bb64 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -207,7 +207,7 @@ extern u32 audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode); + struct inode *inode); extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 27c6046c2..f84f8d06e 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa if (IS_ERR(dentry)) return (void *)dentry; /* returning an error */ inode = path.dentry->d_inode; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93a..9f194aad0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex); + inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d_backing_inode(d)->i_sb->s_dev; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc0495..195ffaee5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; @@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. 
*/ -void __audit_inode_child(const struct inode *parent, +void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b0799bced..89ebbc4d1 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; + + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) @@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) goto err; if (attr->type == PERF_TYPE_RAW) - return event; + return file; if (attr->type == PERF_TYPE_HARDWARE) - return event; + return file; if (attr->type == PERF_TYPE_SOFTWARE && attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return event; + return file; err: - perf_event_release_kernel(event); + fput(file); return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334b1bdd5..972d9a8e4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Registers used in classic BPF programs need to be reset first. 
*/ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - select_insn: goto *jumptable[insn->code]; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34777b374..c5b30fd8a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,11 +14,15 @@ #include <linux/filter.h> #include <linux/vmalloc.h> +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + struct bpf_htab { struct bpf_map map; - struct hlist_head *buckets; - raw_spinlock_t lock; - u32 count; /* number of elements in this hashtable */ + struct bucket *buckets; + atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; @@ -79,34 +83,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + if ((u64) htab->n_buckets * sizeof(struct bucket) + (u64) htab->elem_size * htab->map.max_entries >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + htab->elem_size * htab->map.max_entries, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); if (!htab->buckets) goto free_htab; } - for (i = 0; i < htab->n_buckets; i++) - INIT_HLIST_HEAD(&htab->buckets[i]); + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } - raw_spin_lock_init(&htab->lock); - htab->count = 0; + atomic_set(&htab->count, 0); return &htab->map; @@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len) return jhash(key, key_len, 0); } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { @@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old; struct hlist_head *head; + struct bucket *b; unsigned long flags; u32 key_size; int ret; @@ -248,15 +259,15 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); l_new->hash = htab_map_hash(l_new->key, key_size); + b = __select_bucket(htab, l_new->hash); + head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, l_new->hash); + raw_spin_lock_irqsave(&b->lock, flags); l_old = lookup_elem_raw(head, l_new->hash, key, key_size); - if 
(!l_old && unlikely(htab->count >= map->max_entries)) { + if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { /* if elem with this 'key' doesn't exist and we've reached * max_entries limit, fail insertion of new elem */ @@ -284,13 +295,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_del_rcu(&l_old->hash_node); kfree_rcu(l_old, rcu); } else { - htab->count++; + atomic_inc(&htab->count); } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; err: - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); kfree(l_new); return ret; } @@ -300,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; + struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; @@ -310,21 +322,21 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, hash); + raw_spin_lock_irqsave(&b->lock, flags); l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree_rcu(l, rcu); ret = 0; } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } @@ -339,7 +351,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree(l); } } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8a797d5..f2ece3c17 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } +static int bpf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_link(old_dentry, dir, new_dentry); +} + +static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations bpf_dir_iops = { .lookup = simple_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, + .rename = bpf_rename, + .link = bpf_link, .unlink = simple_unlink, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3b39550d8..637397059 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -113,8 +113,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_map *map = filp->private_data; + + seq_printf(m, + "map_type:\t%u\n" + "key_size:\t%u\n" + "value_size:\t%u\n" + "max_entries:\t%u\n", + map->map_type, + map->key_size, + map->value_size, + map->max_entries); +} +#endif + static const struct file_operations bpf_map_fops = { - .release = bpf_map_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_map_show_fdinfo, +#endif + .release = bpf_map_release, }; int bpf_map_new_fd(struct bpf_map *map) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 
fb1ecfd2d..d27904c19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,9 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> #include <linux/delay.h> -#include <linux/cpuset.h> #include <linux/atomic.h> +#include <linux/cpuset.h> +#include <net/sock.h> /* * pidlists linger the following amount before being destroyed. The goal @@ -211,6 +212,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -440,11 +442,6 @@ static bool cgroup_tryget(struct cgroup *cgrp) return css_tryget(&cgrp->self); } -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -465,25 +462,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -1647,10 +1625,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1717,15 +1691,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -1924,6 +1889,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -2004,6 +1970,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; @@ -2020,6 +1987,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of 
subsystems */ @@ -2027,15 +2005,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the @@ -2146,9 +2115,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2191,6 +2161,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -4063,7 +4039,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) goto out_err; /* - * Migrate tasks one-by-one until @form is empty. This fails iff + * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { @@ -4681,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); @@ -4909,11 +4886,11 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent, *cgrp; + struct cgroup *parent, *cgrp, *tcgrp; struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int level, ssid, ret; /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. 
*/ @@ -4924,9 +4901,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (!parent) return -ENODEV; root = parent->root; + level = parent->level + 1; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); if (!cgrp) { ret = -ENOMEM; goto out_unlock; @@ -4950,6 +4929,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5201,7 +5184,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; - printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + pr_debug("Initializing cgroup subsys %s\n", ss->name); mutex_lock(&cgroup_mutex); @@ -5359,6 +5342,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; @@ -5502,19 +5486,6 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; -static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) - return &ss_priv[i - CGROUP_CANFORK_START]; - return NULL; -} - -static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - void **private = subsys_canfork_priv_p(ss_priv, i); - return private ? *private : NULL; -} - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5537,14 +5508,13 @@ void cgroup_fork(struct task_struct *child) * returns an error, the fork aborts with that error code. This allows for * a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +int cgroup_can_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i, j, ret; for_each_subsys_which(ss, i, &have_canfork_callback) { - ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + ret = ss->can_fork(child); if (ret) goto out_revert; } @@ -5556,7 +5526,7 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + ss->cancel_fork(child); } return ret; @@ -5569,15 +5539,14 @@ out_revert: * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeded. */ -void cgroup_cancel_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_cancel_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); + ss->cancel_fork(child); } /** @@ -5590,8 +5559,7 @@ void cgroup_cancel_fork(struct task_struct *child, * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. 
*/ -void cgroup_post_fork(struct task_struct *child, - void *old_ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; @@ -5635,7 +5603,7 @@ void cgroup_post_fork(struct task_struct *child, * and addition to css_set. */ for_each_subsys_which(ss, i, &have_fork_callback) - ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); + ss->fork(child); } /** @@ -5835,6 +5803,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2d3df82c5..1b72d56ed 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset) * to do anything as freezer_attach() will put @task into the appropriate * state. */ -static void freezer_fork(struct task_struct *task, void *private) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index b50d5a167..303097b37 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge - * succeded, otherwise -EAGAIN. + * succeeded, otherwise -EAGAIN. 
*/ static int pids_try_charge(struct pids_cgroup *pids, int num) { @@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on threadgroup_change_begin() held by the copy_process(). */ -static int pids_can_fork(struct task_struct *task, void **priv_p) +static int pids_can_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; @@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p) return pids_try_charge(pids, 1); } -static void pids_cancel_fork(struct task_struct *task, void *priv) +static void pids_cancel_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index d8560ee3b..9ad37b9e4 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -24,7 +24,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/context_tracking.h> -struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(context_tracking_enabled); EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); @@ -191,7 +191,7 @@ void __init context_tracking_cpu_set(int cpu) if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; - static_key_slow_inc(&context_tracking_enabled); + static_branch_inc(&context_tracking_enabled); } if (initialized) diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e26e..5b9d39633 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -759,71 +759,33 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly - = CPU_BITS_ALL; +struct cpumask __cpu_possible_mask __read_mostly + = {CPU_BITS_ALL}; #else -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +struct cpumask __cpu_possible_mask __read_mostly; #endif -const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); -EXPORT_SYMBOL(cpu_possible_mask); +EXPORT_SYMBOL(__cpu_possible_mask); -static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); -EXPORT_SYMBOL(cpu_online_mask); +struct cpumask __cpu_online_mask __read_mostly; +EXPORT_SYMBOL(__cpu_online_mask); -static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); -EXPORT_SYMBOL(cpu_present_mask); +struct cpumask __cpu_present_mask __read_mostly; +EXPORT_SYMBOL(__cpu_present_mask); -static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); -EXPORT_SYMBOL(cpu_active_mask); - -void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); -} - -void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); -} - -void set_cpu_online(unsigned int cpu, bool online) -{ - if (online) { - cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - } else { - cpumask_clear_cpu(cpu, 
to_cpumask(cpu_online_bits)); - } -} - -void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); -} +struct cpumask __cpu_active_mask __read_mostly; +EXPORT_SYMBOL(__cpu_active_mask); void init_cpu_present(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_present_bits), src); + cpumask_copy(&__cpu_present_mask, src); } void init_cpu_possible(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_possible_bits), src); + cpumask_copy(&__cpu_possible_mask, src); } void init_cpu_online(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_online_bits), src); + cpumask_copy(&__cpu_online_mask, src); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2ade63219..41989ab4d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -51,6 +51,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/time.h> +#include <linux/time64.h> #include <linux/backing-dev.h> #include <linux/sort.h> @@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; struct fmeter { int cnt; /* unprocessed events count */ int val; /* most recent output value */ - time_t time; /* clock (secs) when val computed */ + time64_t time; /* clock (secs) when val computed */ spinlock_t lock; /* guards read or write of above */ }; @@ -1397,7 +1398,7 @@ out: */ #define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ #define FM_SCALE 1000 /* faux fixed point scale */ @@ -1413,8 +1414,11 @@ static void fmeter_init(struct fmeter *fmp) /* Internal meter update - process cnt events and update value */ static void fmeter_update(struct fmeter *fmp) { - time_t now = get_seconds(); - time_t ticks = now - fmp->time; + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; if (ticks == 0) return; diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c..0c0cd8a62 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds); void __init cred_init(void) { /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 412134549..2a20c0dfd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv) continue; kdb_printf("%-20s%8u 0x%p ", mod->name, - mod->core_size, (void *)mod); + mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); #endif @@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->module_core); + kdb_printf(" 0x%p", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ef90b04d7..435c14a45 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = 
KMEM_CACHE(task_delay_info, SLAB_PANIC); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 1087bbeb1..614614821 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,8 +49,6 @@ #include <asm/irq_regs.h> -static struct workqueue_struct *perf_wq; - typedef int (*remote_function_f)(void *); struct remote_function_call { @@ -66,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. + */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -126,11 +137,168 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -#define EVENT_OWNER_KERNEL ((void *) -1) +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + +#define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { - return event->owner == EVENT_OWNER_KERNEL; + return READ_ONCE(event->owner) == TASK_TOMBSTONE; +} + +/* + * On task ctx scheduling... + * + * When !ctx->nr_events a task context will not be scheduled. This means + * we can disable the scheduler hooks (for performance) without leaving + * pending task ctx state. + * + * This however results in two special cases: + * + * - removing the last event from a task ctx; this is relatively straight + * forward and is done in __perf_remove_from_context. + * + * - adding the first event to a task ctx; this is tricky because we cannot + * rely on ctx->is_active and therefore cannot use event_function_call(). + * See perf_install_in_context(). + * + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. 
+ */ + +typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, + struct perf_event_context *, void *); + +struct event_function_struct { + struct perf_event *event; + event_f func; + void *data; +}; + +static int event_function(void *info) +{ + struct event_function_struct *efs = info; + struct perf_event *event = efs->event; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + int ret = 0; + + WARN_ON_ONCE(!irqs_disabled()); + + perf_ctx_lock(cpuctx, task_ctx); + /* + * Since we do the IPI call without holding ctx->lock things can have + * changed, double check we hit the task we set out to hit. + */ + if (ctx->task) { + if (ctx->task != current) { + ret = -ESRCH; + goto unlock; + } + + /* + * We only use event_function_call() on established contexts, + * and event_function() is only ever called when active (or + * rather, we'll have bailed in task_function_call() or the + * above ctx->task != current test), therefore we must have + * ctx->is_active here. + */ + WARN_ON_ONCE(!ctx->is_active); + /* + * And since we have ctx->is_active, cpuctx->task_ctx must + * match. + */ + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + efs->func(event, cpuctx, ctx, efs->data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); + + return ret; +} + +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + int ret = event_function(&efs); + WARN_ON_ONCE(ret); +} + +static void event_function_call(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + if (!event->parent) { + /* + * If this is a !child event, we must hold ctx::mutex to + * stabilize the the event->ctx relation. See + * perf_event_ctx_lock(). + */ + lockdep_assert_held(&ctx->mutex); + } + + if (!task) { + cpu_function_call(event->cpu, event_function, &efs); + return; + } + + if (task == TASK_TOMBSTONE) + return; + +again: + if (!task_function_call(task, event_function, &efs)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). 
+ */ + task = ctx->task; + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + func(event, NULL, ctx, data); + raw_spin_unlock_irq(&ctx->lock); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ @@ -148,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event) enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -155,7 +324,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -337,28 +512,6 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -548,13 +701,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* - * next is NULL when called from perf_event_enable_on_exec() - * that will systematically cause a cgroup_switch() - */ - if (next) - cgrp2 = perf_cgroup_from_task(next, NULL); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -580,8 +727,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* prev can never be NULL */ cgrp2 = perf_cgroup_from_task(prev, NULL); /* @@ -886,7 +1031,7 @@ static void put_ctx(struct perf_event_context *ctx) if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - if (ctx->task) + if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } @@ -903,9 +1048,8 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] - * __perf_event_exit_task() - * sync_child_event() - * put_event() [ parent, 1 ] + * perf_event_exit_event() + * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() @@ -948,8 +1092,8 @@ static void put_ctx(struct perf_event_context *ctx) * Lock order: * task_struct::perf_event_mutex * perf_event_context::mutex - * perf_event_context::lock * perf_event::child_mutex; + * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem */ @@ -1047,6 +1191,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. 
+ * * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ @@ -1087,9 +1232,12 @@ retry: goto retry; } - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (ctx->task == TASK_TOMBSTONE || + !atomic_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; + } else { + WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); @@ -1149,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -1215,6 +1365,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; @@ -1417,11 +1569,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) { ctx->nr_cgroups--; + /* + * Because cgroup events are always per-cpu events, this will + * always be called from the right CPU. + */ cpuctx = __get_cpu_context(ctx); /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() + * If there are no more cgroup events then clear cgrp to avoid + * stale pointer in update_cgrp_time_from_cpuctx(). */ if (!ctx->nr_cgroups) cpuctx->cgrp = NULL; @@ -1499,45 +1654,11 @@ out: perf_event__header_size(tmp); } -/* - * User event without the task. - */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !event->owner; -} - -/* - * Event has a parent but parent's task finished and it's - * alive only because of children holding refference. - */ -static bool is_orphaned_child(struct perf_event *event) -{ - return is_orphaned_event(event->parent); -} - -static void orphans_remove_work(struct work_struct *work); - -static void schedule_orphans_remove(struct perf_event_context *ctx) -{ - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) - return; - - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { - get_ctx(ctx); - ctx->orphans_remove_sched = true; - } -} - -static int __init perf_workqueue_init(void) -{ - perf_wq = create_singlethread_workqueue("perf"); - WARN(!perf_wq, "failed to create perf workqueue\n"); - return perf_wq ? 
0 : -1; + return event->state == PERF_EVENT_STATE_DEAD; } -core_initcall(perf_workqueue_init); - static inline int pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; @@ -1580,14 +1701,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1598,9 +1719,6 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - perf_pmu_enable(event->pmu); } @@ -1624,10 +1742,7 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -struct remove_event { - struct perf_event *event; - bool detach_group; -}; +#define DETACH_GROUP 0x01UL /* * Cross CPU call to remove a performance event @@ -1635,34 +1750,31 @@ struct remove_event { * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static int __perf_remove_from_context(void *info) +static void +__perf_remove_from_context(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + unsigned long flags = (unsigned long)info; - raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); - if (re->detach_group) + if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + + if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; - cpuctx->task_ctx = NULL; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + cpuctx->task_ctx = NULL; + } } - raw_spin_unlock(&ctx->lock); - - return 0; } - /* * Remove the event from a task's (or a CPU's) list of events. * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since @@ -1670,96 +1782,32 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - struct remove_event re = { - .event = event, - .detach_group = detach_group, - }; + lockdep_assert_held(&event->ctx->mutex); - lockdep_assert_held(&ctx->mutex); - - if (!task) { - /* - * Per cpu events are removed via an smp call. The removal can - * fail if the CPU is currently offline, but in that case we - * already called __perf_remove_from_context from - * perf_event_exit_cpu. 
- */ - cpu_function_call(event->cpu, __perf_remove_from_context, &re); - return; - } - -retry: - if (!task_function_call(task, __perf_remove_from_context, &re)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since the task isn't running, its safe to remove the event, us - * holding the ctx->lock ensures the task won't get scheduled in. - */ - if (detach_group) - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ -int __perf_event_disable(void *info) +static void __perf_event_disable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE) + return; - return 0; + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } /* @@ -1770,7 +1818,8 @@ int __perf_event_disable(void *info) * remains valid. This condition is satisifed when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. + * goes to exit will block in perf_event_exit_event(). + * * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. @@ -1778,43 +1827,20 @@ int __perf_event_disable(void *info) static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_disable, event); - return; - } - -retry: - if (!task_function_call(task, __perf_event_disable, event)) - return; raw_spin_lock_irq(&ctx->lock); - /* - * If the event is still active, we need to retry the cross-call. 
- */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; + return; } raw_spin_unlock_irq(&ctx->lock); + + event_function_call(event, __perf_event_disable, NULL); +} + +void perf_event_disable_local(struct perf_event *event) +{ + event_function_local(event, __perf_event_disable, NULL); } /* @@ -1927,9 +1953,6 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - out: perf_pmu_enable(event->pmu); @@ -2048,13 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2067,10 +2104,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } +static void ctx_resched(struct perf_cpu_context *cpuctx, + struct perf_event_context *task_ctx) +{ + perf_pmu_disable(cpuctx->ctx.pmu); + if (task_ctx) + task_ctx_sched_out(cpuctx, task_ctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + perf_event_sched_in(cpuctx, task_ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); +} + /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { @@ -2078,72 +2127,59 @@ static int __perf_install_in_context(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - struct task_struct *task = current; - - perf_ctx_lock(cpuctx, task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - /* - * If there was an active task_ctx schedule it out. - */ - if (task_ctx) - task_ctx_sched_out(task_ctx); + bool activate = true; + int ret = 0; - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. 
- */ - if (ctx->task && task_ctx != ctx) { - if (task_ctx) - raw_spin_unlock(&task_ctx->lock); + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; - } - - if (task_ctx) { - cpuctx->task_ctx = task_ctx; - task = task_ctx->task; - } - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; + goto unlock; + } - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); + /* + * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. + */ + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); - add_event_to_ctx(event, ctx); + } else if (task_ctx) { + raw_spin_lock(&task_ctx->lock); + } - /* - * Schedule everything back in - */ - perf_event_sched_in(cpuctx, task_ctx, task); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } - perf_pmu_enable(cpuctx->ctx.pmu); +unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context + * Attach a performance event to a context. * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. - * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { - struct task_struct *task = ctx->task; + struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); @@ -2152,39 +2188,45 @@ perf_install_in_context(struct perf_event_context *ctx, event->cpu = cpu; if (!task) { - /* - * Per cpu events are installed via an smp call and - * the install is always successful. - */ cpu_function_call(cpu, __perf_install_in_context, event); return; } -retry: - if (!task_function_call(task, __perf_install_in_context, event)) + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; - raw_spin_lock_irq(&ctx->lock); /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. + * Installing events is tricky because we cannot rely on ctx->is_active + * to be set in case this is the nr_events 0 -> 1 transition. */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); +again: + /* + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. + */ + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). 
*/ - task = ctx->task; - goto retry; + raw_spin_unlock_irq(&ctx->lock); + return; } - + raw_spin_unlock_irq(&ctx->lock); /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. */ - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); + goto again; } /* @@ -2211,80 +2253,47 @@ static void __perf_event_mark_enabled(struct perf_event *event) /* * Cross CPU call to enable a performance event */ -static int __perf_event_enable(void *info) +static void __perf_event_enable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; + struct perf_event_context *task_ctx; - /* - * There's a time window between 'ctx->is_active' check - * in perf_event_enable function and this place having: - * - IRQs on - * - ctx->lock unlocked - * - * where the task could be killed and 'ctx' deactivated - * by perf_event_exit_task. - */ - if (!ctx->is_active) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - update_context_time(ctx); - - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state <= PERF_EVENT_STATE_ERROR) + return; - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); __perf_event_mark_enabled(event); + if (!ctx->is_active) + return; + if (!event_filter_match(event)) { if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - goto unlock; + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. 
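
The slimmed-down enable/disable paths lean on the deliberate ordering of the state constants: DEAD and EXIT sit below ERROR, ERROR below OFF, OFF below INACTIVE and ACTIVE, so "already disabled", "beyond recovery", and "hung up" each become a single comparison. A sketch of those predicates; the values mirror enum perf_event_active_state around this release but should be treated as illustrative:

#include <assert.h>

/* Ordered state values: everything at or below OFF is not schedulable,
 * everything at or below EXIT is beyond recovery. */
enum state {
	STATE_DEAD	= -4,
	STATE_EXIT	= -3,
	STATE_ERROR	= -2,
	STATE_OFF	= -1,
	STATE_INACTIVE	=  0,
	STATE_ACTIVE	=  1,
};

static int is_disabled(enum state s) { return s <= STATE_OFF; }	/* disable is a no-op */
static int can_enable(enum state s)  { return s > STATE_ERROR && s < STATE_INACTIVE; }
static int is_hup(enum state s)      { return s <= STATE_EXIT; }	/* EXIT or DEAD */

int main(void)
{
	assert(is_disabled(STATE_ERROR));
	assert(can_enable(STATE_OFF));
	assert(!can_enable(STATE_ERROR));	/* ERROR must be cleared to OFF first */
	assert(is_hup(STATE_DEAD) && !is_hup(STATE_ERROR));
	return 0;
}
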
- */ - if (leader != event) { - group_sched_out(leader, cpuctx, ctx); - perf_mux_hrtimer_restart(cpuctx); - } - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } -unlock: - raw_spin_unlock(&ctx->lock); + task_ctx = cpuctx->task_ctx; + if (ctx->task) + WARN_ON_ONCE(task_ctx != ctx); - return 0; + ctx_resched(cpuctx, task_ctx); } /* @@ -2299,58 +2308,26 @@ unlock: static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - if (!task) { - /* - * Enable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_enable, event); + raw_spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state < PERF_EVENT_STATE_ERROR) { + raw_spin_unlock_irq(&ctx->lock); return; } - raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto out; - /* * If the event is in error state, clear that first. - * That way, if we see the event in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. + * + * That way, if we see the event in error state below, we know that it + * has gone back into error state, as distinct from the task having + * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - -retry: - if (!ctx->is_active) { - __perf_event_mark_enabled(event); - goto out; - } - raw_spin_unlock_irq(&ctx->lock); - if (!task_function_call(task, __perf_event_enable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the event is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { - /* - * task could have been flipped by a concurrent - * perf_event_context_sched_out() - */ - task = ctx->task; - goto retry; - } - -out: - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_event_enable, NULL); } /* @@ -2400,25 +2377,49 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - struct perf_event *event; int is_active = ctx->is_active; + struct perf_event *event; - ctx->is_active &= ~event_type; - if (likely(!ctx->nr_events)) + lockdep_assert_held(&ctx->lock); + + if (likely(!ctx->nr_events)) { + /* + * See __perf_remove_from_context(). 
+ */ + WARN_ON_ONCE(ctx->is_active); + if (ctx->task) + WARN_ON_ONCE(cpuctx->task_ctx); return; + } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2576,17 +2577,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; + WRITE_ONCE(ctx->task, next); + WRITE_ONCE(next_ctx->task, task); swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * RCU_INIT_POINTER here is safe because we've not + * modified the ctx and the above modification of + * ctx->task and ctx->task_ctx_data are immaterial + * since those values are always verified under + * ctx->lock which we're now holding. 
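
ctx_sched_out() and ctx_sched_in() now compute is_active ^= ctx->is_active after adjusting the mask, so is_active holds exactly the bits that changed, and only those event classes (plus the new EVENT_TIME pseudo-class) are acted on. The XOR-for-changed-bits idiom in isolation, with illustrative flag values:

#include <assert.h>

enum { EVENT_PINNED = 1, EVENT_FLEXIBLE = 2, EVENT_TIME = 4,
       EVENT_ALL = EVENT_PINNED | EVENT_FLEXIBLE };

int main(void)
{
	int is_active = EVENT_PINNED | EVENT_FLEXIBLE | EVENT_TIME; /* before */
	int ctx_active = is_active;

	/* Schedule out only the flexible groups. */
	ctx_active &= ~EVENT_FLEXIBLE;
	if (!(ctx_active & EVENT_ALL))	/* no event class left: drop TIME too */
		ctx_active = 0;

	is_active ^= ctx_active;	/* changed bits */
	assert(is_active == EVENT_FLEXIBLE);	/* only flexible was turned off */
	return 0;
}
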
+ */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -2599,8 +2604,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; + task_ctx_sched_out(cpuctx, ctx); raw_spin_unlock(&ctx->lock); } } @@ -2695,20 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; -} - /* * Called with IRQs disabled */ @@ -2783,25 +2773,40 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task) { - u64 now; int is_active = ctx->is_active; + u64 now; + + lockdep_assert_held(&ctx->lock); - ctx->is_active |= event_type; if (likely(!ctx->nr_events)) return; - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + ctx->is_active |= (event_type | EVENT_TIME); + if (ctx->task) { + if (!is_active) + cpuctx->task_ctx = ctx; + else + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } @@ -2831,12 +2836,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * cpu flexible, task flexible. */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - if (ctx->nr_events) - cpuctx->task_ctx = ctx; - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); - + perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } @@ -2858,6 +2858,16 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; + /* + * If cgroup events exist on this CPU, then we need to check if we have + * to switch in PMU state; cgroup event are system-wide mode only. + * + * Since cgroup events are CPU events, we must schedule these in before + * we schedule in the task events. + */ + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_sched_in(prev, task); + for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -2865,13 +2875,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, perf_event_context_sched_in(ctx, task); } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. 
- * cgroup event are system-wide mode only - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -3157,46 +3160,31 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; - int ret; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current, NULL); - - raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx); - - list_for_each_entry(event, &ctx->event_list, event_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } + cpuctx = __get_cpu_context(ctx); + perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + list_for_each_entry(event, &ctx->event_list, event_entry) + enabled |= event_enable_on_exec(event, ctx); /* - * Unclone this context if we enabled any event. + * Unclone and reschedule this context if we enabled any event. */ - if (enabled) + if (enabled) { clone_ctx = unclone_ctx(ctx); + ctx_resched(cpuctx, ctx); + } + perf_ctx_unlock(cpuctx, ctx); - raw_spin_unlock(&ctx->lock); - - /* - * Also calls ctxswin for cgroup events, if any: - */ - perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -3392,7 +3380,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3579,11 +3566,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) static void unaccount_event(struct perf_event *event) { + bool dec = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -3593,17 +3582,30 @@ static void unaccount_event(struct perf_event *event) if (event->attr.freq) atomic_dec(&nr_freq_events); if (event->attr.context_switch) { - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (has_branch_stack(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; + + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -3614,7 +3616,7 @@ static void unaccount_event(struct perf_event *event) * 3) two matching events on the same context. 
* * The former two cases are handled in the allocation path (perf_event_alloc(), - * __free_event()), the latter -- before the first perf_install_in_context(). + * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { @@ -3689,29 +3691,6 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void __free_event(struct perf_event *event) -{ - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } - - call_rcu(&event->rcu_head, free_event_rcu); -} - static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3733,7 +3712,25 @@ static void _free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } + + perf_event_free_bpf_prog(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + if (event->pmu) { + exclusive_event_destroy(event); + module_put(event->pmu->module); + } + + call_rcu(&event->rcu_head, free_event_rcu); } /* @@ -3760,14 +3757,13 @@ static void perf_remove_from_owner(struct perf_event *event) struct task_struct *owner; rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on + * Matches the smp_store_release() in perf_event_exit_task(). If we + * observe !owner it means the list deletion is complete and we can + * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - smp_read_barrier_depends(); + owner = lockless_dereference(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -3795,8 +3791,10 @@ static void perf_remove_from_owner(struct perf_event *event) * ensured they're done, and we can proceed with freeing the * event. */ - if (event->owner) + if (event->owner) { list_del_init(&event->owner_entry); + smp_store_release(&event->owner, NULL); + } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } @@ -3804,37 +3802,111 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { - struct perf_event_context *ctx; - if (!atomic_long_dec_and_test(&event->refcount)) return; + _free_event(event); +} + +/* + * Kill an event dead; while event:refcount will preserve the event + * object, it will not preserve its functionality. Once the last 'user' + * gives up the object, we'll destroy the thing. + */ +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *child, *tmp; + + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. 
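
perf_remove_from_owner() now pairs a lockless_dereference() of event->owner with the smp_store_release() that clears it, replacing the old smp_wmb()/smp_read_barrier_depends() combination. In C11 terms that is a release store observed by an acquire load; a minimal sketch of the pattern, not the kernel primitives themselves:

#include <stdatomic.h>
#include <stdio.h>

struct owner { int pid; };

static struct owner boss = { 42 };
static _Atomic(struct owner *) event_owner = &boss;

/* Teardown side: publish "list deletion complete" by storing NULL with
 * release semantics (cf. smp_store_release(&event->owner, NULL)). */
static void owner_exit(void)
{
	atomic_store_explicit(&event_owner, NULL, memory_order_release);
}

/* Reader side: an acquire load (cf. lockless_dereference()); observing
 * NULL guarantees the owner's earlier teardown writes are visible too. */
static void try_remove(void)
{
	struct owner *o = atomic_load_explicit(&event_owner, memory_order_acquire);

	if (o)
		printf("serialize against owner %d\n", o->pid);
	else
		printf("owner gone, free directly\n");
}

int main(void)
{
	try_remove();
	owner_exit();
	try_remove();
	return 0;
}
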
+ */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(ctx->parent_ctx); + perf_remove_from_context(event, DETACH_GROUP); + + raw_spin_lock_irq(&ctx->lock); /* - * There are two ways this annotation is useful: + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * - * 2) there is a lock-inversion with mmap_sem through - * perf_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. + * Thus this guarantees that we will in fact observe and kill _ALL_ + * child events. */ - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); - WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, true); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + perf_event_ctx_unlock(event, ctx); - _free_event(event); -} +again: + mutex_lock(&event->child_mutex); + list_for_each_entry(child, &event->child_list, child_list) { -int perf_event_release_kernel(struct perf_event *event) -{ - put_event(event); + /* + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ + ctx = lockless_dereference(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. + * + * Since the event cannot get freed while we hold the + * child_mutex, the context must also exist and have a !0 + * reference count. + */ + get_ctx(ctx); + + /* + * Now that we have a ctx ref, we can drop child_mutex, and + * acquire ctx::mutex without fear of it going away. Then we + * can re-acquire child_mutex. + */ + mutex_unlock(&event->child_mutex); + mutex_lock(&ctx->mutex); + mutex_lock(&event->child_mutex); + + /* + * Now that we hold ctx::mutex and child_mutex, revalidate our + * state, if child is still the first entry, it didn't get freed + * and we can continue doing so. + */ + tmp = list_first_entry_or_null(&event->child_list, + struct perf_event, child_list); + if (tmp == child) { + perf_remove_from_context(child, DETACH_GROUP); + list_del(&child->child_list); + free_event(child); + /* + * This matches the refcount bump in inherit_event(); + * this can't be the last reference. + */ + put_event(event); + } + + mutex_unlock(&event->child_mutex); + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + mutex_unlock(&event->child_mutex); + +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -3844,46 +3916,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { - put_event(file->private_data); + perf_event_release_kernel(file->private_data); return 0; } -/* - * Remove all orphanes events from the context. 
- */ -static void orphans_remove_work(struct work_struct *work) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = container_of(work, struct perf_event_context, - orphans_remove.work); - - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { - struct perf_event *parent_event = event->parent; - - if (!is_orphaned_child(event)) - continue; - - perf_remove_from_context(event, true); - - mutex_lock(&parent_event->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent_event->child_mutex); - - free_event(event); - put_event(parent_event); - } - - raw_spin_lock_irq(&ctx->lock); - ctx->orphans_remove_sched = false; - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -4027,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -4112,7 +4148,7 @@ static void _perf_event_reset(struct perf_event *event) /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the + * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, @@ -4144,20 +4180,14 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -struct period_event { - struct perf_event *event; - u64 value; -}; - -static int __perf_event_period(void *info) +static void __perf_event_period(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct period_event *pe = info; - struct perf_event *event = pe->event; - struct perf_event_context *ctx = event->ctx; - u64 value = pe->value; + u64 value = *((u64 *)info); bool active; - raw_spin_lock(&ctx->lock); if (event->attr.freq) { event->attr.sample_freq = value; } else { @@ -4177,16 +4207,10 @@ static int __perf_event_period(void *info) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } - raw_spin_unlock(&ctx->lock); - - return 0; } static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct period_event pe = { .event = event, }; - struct perf_event_context *ctx = event->ctx; - struct task_struct *task; u64 value; if (!is_sampling_event(event)) @@ -4201,34 +4225,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - task = ctx->task; - pe.value = value; - - if (!task) { - cpu_function_call(event->cpu, __perf_event_period, &pe); - return 0; - } - -retry: - if (!task_function_call(task, __perf_event_period, &pe)) - return 0; - - raw_spin_lock_irq(&ctx->lock); - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - task = ctx->task; - goto retry; - } - - if (event->attr.freq) { - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } - - local64_set(&event->hw.period_left, 0); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_event_period, &value); return 0; } @@ -4940,9 +4937,9 @@ static int 
perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval < 0) return retval; @@ -5000,7 +4997,7 @@ static void perf_pending_event(struct irq_work *entry) if (event->pending_disable) { event->pending_disable = 0; - __perf_event_disable(event); + perf_event_disable_local(event); } if (event->pending_wakeup) { @@ -7821,11 +7818,13 @@ static void account_event_cpu(struct perf_event *event, int cpu) static void account_event(struct perf_event *event) { + bool inc = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -7838,12 +7837,35 @@ static void account_event(struct perf_event *event) } if (event->attr.context_switch) { atomic_inc(&nr_switch_events); - static_key_slow_inc(&perf_sched_events.key); + inc = true; } if (has_branch_stack(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (is_cgroup_event(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; + + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -8462,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8490,11 +8521,11 @@ SYSCALL_DEFINE5(perf_event_open, * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - perf_remove_from_context(group_leader, false); + perf_remove_from_context(group_leader, 0); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling, false); + perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -8547,6 +8578,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__header_size(event); perf_event__id_header_size(event); + event->owner = current; + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8556,8 +8589,6 @@ SYSCALL_DEFINE5(perf_event_open, put_online_cpus(); - event->owner = current; - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -8582,7 +8613,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. 
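
account_event() and unaccount_event() replace the rate-limited static key with the perf_sched_count counter: atomic_inc_not_zero() fast-paths every user after the first, the 0 -> 1 transition takes perf_sched_mutex and synchronizes, and the final decrement is deferred to perf_sched_delayed(). A C11 sketch of both sides with the compare-and-swap loops written out; the mutex and synchronize steps are reduced to prints:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int sched_count;

/* atomic_inc_not_zero(): increment unless the counter is 0. */
static bool inc_not_zero(atomic_int *v)
{
	int c = atomic_load(v);

	while (c != 0)
		if (atomic_compare_exchange_weak(v, &c, c + 1))
			return true;
	return false;
}

/* atomic_add_unless(v, -1, 1): decrement unless that would drop the
 * last reference (value 1). */
static bool dec_unless_one(atomic_int *v)
{
	int c = atomic_load(v);

	while (c != 1)
		if (atomic_compare_exchange_weak(v, &c, c - 1))
			return true;
	return false;
}

static void account(void)
{
	if (inc_not_zero(&sched_count))
		return;			/* fast path: hooks already enabled */
	/* slow path: the kernel does this under perf_sched_mutex, then
	 * increments so later callers bypass the mutex */
	if (atomic_load(&sched_count) == 0)
		printf("enable sched hooks + synchronize\n");
	atomic_fetch_add(&sched_count, 1);
}

static void unaccount(void)
{
	if (!dec_unless_one(&sched_count))
		printf("last user: schedule delayed disable\n");
}

int main(void)
{
	account(); account(); unaccount(); unaccount();
	return 0;
}
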
+ */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: @@ -8624,7 +8660,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } /* Mark owner so we could distinguish it from user events. */ - event->owner = EVENT_OWNER_KERNEL; + event->owner = TASK_TOMBSTONE; account_event(event); @@ -8636,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8650,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: @@ -8674,7 +8716,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event, false); + perf_remove_from_context(event, 0); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -8741,33 +8783,15 @@ static void sync_child_event(struct perf_event *child_event, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Make sure user/parent get notified, that we just - * lost one event. - */ - perf_event_wakeup(parent_event); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - put_event(parent_event); } static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) { + struct perf_event *parent_event = child_event->parent; + /* * Do not destroy the 'original' grouping; because of the context * switch optimization the original events could've ended up in a @@ -8780,57 +8804,86 @@ __perf_event_exit_task(struct perf_event *child_event, * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - perf_remove_from_context(child_event, !!child_event->parent); + raw_spin_lock_irq(&child_ctx->lock); + WARN_ON_ONCE(child_ctx->is_active); + + if (parent_event) + perf_group_detach(child_event); + list_del_event(child_event, child_ctx); + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + raw_spin_unlock_irq(&child_ctx->lock); /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. + * Parent events are governed by their filedesc, retain them. */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } else { - child_event->state = PERF_EVENT_STATE_EXIT; + if (!parent_event) { perf_event_wakeup(child_event); + return; } + /* + * Child events can be cleaned up. 
+ */ + + sync_child_event(child_event, child); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Kick perf_poll() for is_event_hup(). + */ + perf_event_wakeup(parent_event); + free_event(child_event); + put_event(parent_event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; - unsigned long flags; + struct perf_event *child_event, *next; - if (likely(!child->perf_event_ctxp[ctxn])) + WARN_ON_ONCE(child != current); + + child_ctx = perf_pin_task_context(child, ctxn); + if (!child_ctx) return; - local_irq_save(flags); /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. + * In order to reduce the amount of tricky in ctx tear-down, we hold + * ctx::mutex over the entire thing. This serializes against almost + * everything that wants to access the ctx. + * + * The exception is sys_perf_event_open() / + * perf_event_create_kernel_count() which does find_get_context() + * without ctx::mutex (it cannot because of the move_group double mutex + * lock thing). See the comments in perf_install_in_context(). */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + mutex_lock(&child_ctx->mutex); /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. + * In a single ctx::lock section, de-schedule the events and detach the + * context from the task such that we cannot ever get it scheduled back + * in. */ - raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx); - child->perf_event_ctxp[ctxn] = NULL; + raw_spin_lock_irq(&child_ctx->lock); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. + * Now that the context is inactive, destroy the task <-> ctx relation + * and mark the context dead. */ + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + put_ctx(child_ctx); /* cannot be last */ + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); + put_task_struct(current); /* cannot be last */ + clone_ctx = unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -8842,20 +8895,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ perf_event_task(child, child_ctx, 0); - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. 
- */ - mutex_lock(&child_ctx->mutex); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - __perf_event_exit_task(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); @@ -8880,8 +8921,7 @@ void perf_event_exit_task(struct task_struct *child) * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ - smp_wmb(); - event->owner = NULL; + smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); @@ -8964,21 +9004,20 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } -struct perf_event *perf_event_get(unsigned int fd) +struct file *perf_event_get(unsigned int fd) { - int err; - struct fd f; - struct perf_event *event; + struct file *file; - err = perf_fget_light(fd, &f); - if (err) - return ERR_PTR(err); + file = fget_raw(fd); + if (!file) + return ERR_PTR(-EBADF); - event = f.file->private_data; - atomic_long_inc(&event->refcount); - fdput(f); + if (file->f_op != &perf_fops) { + fput(file); + return ERR_PTR(-EBADF); + } - return event; + return file; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) @@ -9021,8 +9060,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + /* + * is_orphaned_event() and list_add_tail(&parent_event->child_list) + * must be under the same lock in order to serialize against + * perf_event_release_kernel(), such that either we must observe + * is_orphaned_event() or they will observe us on the child_list. + */ + mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { + mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } @@ -9070,8 +9117,6 @@ inherit_event(struct perf_event *parent_event, /* * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); @@ -9276,7 +9321,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -9289,13 +9334,14 @@ static void perf_event_init_cpu(int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *event; - rcu_read_lock(); - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) - __perf_remove_from_context(&re); - rcu_read_unlock(); + raw_spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->event_list, event_entry) + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); + raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) @@ -9351,11 +9397,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case 
CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; @@ -9384,9 +9428,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4cc..3f8cb1e14 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) - __perf_event_disable(bp); + perf_event_disable_local(bp); else perf_event_disable(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index adfdc0536..1faad2cfd 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { @@ -547,30 +566,11 @@ out: if (!ret) rb->aux_pgoff = pgoff; else - rb_free_aux(rb); + __rb_free_aux(rb); return ret; } -static void __rb_free_aux(struct ring_buffer *rb) -{ - int pg; - - if (rb->aux_priv) { - rb->free_aux(rb->aux_priv); - rb->free_aux = NULL; - rb->aux_priv = NULL; - } - - if (rb->aux_nr_pages) { - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); - - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; - } -} - void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913..016767918 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + false); if (err) return err; @@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, goto unlock; get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr); - mem_cgroup_commit_charge(kpage, memcg, false); + page_add_new_anon_rmap(kpage, vma, addr, false); + mem_cgroup_commit_charge(kpage, memcg, false, false); lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); pte_unmap_unlock(ptep, ptl); @@ -199,7 +200,7 @@ static int 
__replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg); + mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; diff --git a/kernel/exit.c b/kernel/exit.c index 07110c602..10e088237 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,8 +59,6 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct *tsk); - static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -1120,8 +1118,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p) && - !(p->jobctl & JOBCTL_LISTENING)) + if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) diff --git a/kernel/fork.c b/kernel/fork.c index 0b59aed29..f91740137 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -138,7 +138,7 @@ static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) { - return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node); + return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } static inline void free_task_struct(struct task_struct *tsk) @@ -300,9 +300,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; - mm->shared_vm = oldmm->shared_vm; + mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; @@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; @@ -465,7 +464,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; - vma_get_file(tmp); + get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); i_mmap_lock_write(mapping); @@ -1250,7 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1349,9 +1347,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqlock_init(&p->vtime_seqlock); + seqcount_init(&p->vtime_seqcount); p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_SLEEPING; + p->vtime_snap_whence = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) @@ -1527,7 +1525,7 @@ static struct task_struct 
*copy_process(unsigned long clone_flags, * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p, cgrp_ss_priv); + retval = cgroup_can_fork(p); if (retval) goto bad_fork_free_pid; @@ -1609,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p, cgrp_ss_priv); + cgroup_post_fork(p); threadgroup_change_end(current); perf_event_fork(p); @@ -1619,7 +1617,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, cgrp_ss_priv); + cgroup_cancel_fork(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); @@ -1849,16 +1847,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1868,8 +1869,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/futex.c b/kernel/futex.c index 461c72b2d..5d6ce6413 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *page_head; + struct page *page; + struct address_space *mapping; int err, ro = 0; /* @@ -519,46 +520,9 @@ again: else err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - page_head = page; - if (unlikely(PageTail(page))) { - put_page(page); - /* serialize against __split_huge_page_splitting() */ - local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { - page_head = compound_head(page); - /* - * page_head is valid pointer but we must pin - * it before taking the PG_lock and/or - * PG_compound_lock. The moment we re-enable - * irqs __split_huge_page_splitting() can - * return and the head page can be freed from - * under us. We can't take the PG_lock and/or - * PG_compound_lock on a page that could be - * freed from under us. 
- */ - if (page != page_head) { - get_page(page_head); - put_page(page); - } - local_irq_enable(); - } else { - local_irq_enable(); - goto again; - } - } -#else - page_head = compound_head(page); - if (page != page_head) { - get_page(page_head); - put_page(page); - } -#endif - - lock_page(page_head); - + lock_page(page); /* - * If page_head->mapping is NULL, then it cannot be a PageAnon + * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast @@ -570,12 +534,13 @@ again: * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page_head->mapping. + * an unlikely race, but we do need to retry for page->mapping. */ - if (!page_head->mapping) { - int shmem_swizzled = PageSwapCache(page_head); - unlock_page(page_head); - put_page(page_head); + mapping = compound_head(page)->mapping; + if (!mapping) { + int shmem_swizzled = PageSwapCache(page); + unlock_page(page); + put_page(page); if (shmem_swizzled) goto again; return -EFAULT; @@ -588,7 +553,7 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page_head)) { + if (PageAnon(page)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -603,15 +568,15 @@ again: key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page_head->mapping->host; + key->shared.inode = mapping->host; key->shared.pgoff = basepage_index(page); } get_futex_key_refs(key); /* implies MB (B) */ out: - unlock_page(page_head); - put_page(page_head); + unlock_page(page); + put_page(page); return err; } @@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) down_read(&mm->mmap_sem); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE, NULL); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; @@ -725,9 +690,12 @@ static struct futex_pi_state * alloc_pi_state(void) } /* + * Drops a reference to the pi_state object and frees or caches it + * when the last reference is gone. + * * Must be called with the hb lock held. 
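
The free_pi_state() -> put_pi_state() rename matches what the function actually does: drop one reference and, only when the last one goes, either recycle the object into the task's one-slot pi_state_cache or free it. A reduced userspace sketch of that put-with-cache pattern (single global slot, no locking, purely illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct pi_state {
	atomic_int refcount;
};

static struct pi_state *cached;		/* one slot, cf. task->pi_state_cache */

static void put_pi_state(struct pi_state *ps)
{
	if (!ps)
		return;
	if (atomic_fetch_sub(&ps->refcount, 1) != 1)
		return;			/* not the last reference */

	if (!cached) {			/* recycle for the next allocation */
		atomic_store(&ps->refcount, 1);
		cached = ps;
		printf("cached\n");
	} else {
		free(ps);
		printf("freed\n");
	}
}

int main(void)
{
	struct pi_state *ps = calloc(1, sizeof(*ps));

	atomic_store(&ps->refcount, 2);
	put_pi_state(ps);		/* drops to 1 */
	put_pi_state(ps);		/* last reference: cached */
	return 0;
}
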
*/ -static void free_pi_state(struct futex_pi_state *pi_state) +static void put_pi_state(struct futex_pi_state *pi_state) { if (!pi_state) return; @@ -1223,7 +1191,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (pi_state->owner != current) return -EINVAL; - raw_spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -1249,22 +1217,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else if (curval != uval) ret = -EINVAL; if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); - raw_spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); @@ -1706,31 +1674,35 @@ retry_private: * exist yet, look it up one more time to ensure we have a * reference to it. If the lock was taken, ret contains the * vpid of the top waiter task. + * If the lock was not taken, we have pi_state and an initial + * refcount on it. In case of an error we have nothing. */ if (ret > 0) { WARN_ON(pi_state); drop_count++; task_count++; /* - * If we acquired the lock, then the user - * space value of uaddr2 should be vpid. It - * cannot be changed by the top waiter as it - * is blocked on hb2 lock if it tries to do - * so. If something fiddled with it behind our - * back the pi state lookup might unearth - * it. So we rather use the known value than - * rereading and handing potential crap to - * lookup_pi_state. + * If we acquired the lock, then the user space value + * of uaddr2 should be vpid. It cannot be changed by + * the top waiter as it is blocked on hb2 lock if it + * tries to do so. If something fiddled with it behind + * our back the pi state lookup might unearth it. So + * we rather use the known value than rereading and + * handing potential crap to lookup_pi_state. + * + * If that call succeeds then we have pi_state and an + * initial refcount on it. */ ret = lookup_pi_state(ret, hb2, &key2, &pi_state); } switch (ret) { case 0: + /* We hold a reference on the pi state. */ break; + + /* If the above failed, then pi_state is NULL */ case -EFAULT: - free_pi_state(pi_state); - pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1746,8 +1718,6 @@ retry_private: * exit to complete. * - The user space value changed. */ - free_pi_state(pi_state); - pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1801,30 +1771,58 @@ retry_private: * of requeue_pi if we couldn't acquire the lock atomically. */ if (requeue_pi) { - /* Prepare the waiter to take the rt_mutex. */ + /* + * Prepare the waiter to take the rt_mutex. Take a + * refcount on the pi_state and store the pointer in + * the futex_q object of the waiter. 
+ */ atomic_inc(&pi_state->refcount); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, this->task); if (ret == 1) { - /* We got the lock. */ + /* + * We got the lock. We do neither drop the + * refcount on pi_state nor clear + * this->pi_state because the waiter needs the + * pi_state for cleaning up the user space + * value. It will drop the refcount after + * doing so. + */ requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; } else if (ret) { - /* -EDEADLK */ + /* + * rt_mutex_start_proxy_lock() detected a + * potential deadlock when we tried to queue + * that waiter. Drop the pi_state reference + * which we took above and remove the pointer + * to the state from the waiters futex_q + * object. + */ this->pi_state = NULL; - free_pi_state(pi_state); - goto out_unlock; + put_pi_state(pi_state); + /* + * We stop queueing more waiters and let user + * space deal with the mess. + */ + break; } } requeue_futex(this, hb1, hb2, &key2); drop_count++; } + /* + * We took an extra initial reference to the pi_state either + * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We + * need to drop it here again. + */ + put_pi_state(pi_state); + out_unlock: - free_pi_state(pi_state); double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); @@ -1973,7 +1971,7 @@ static void unqueue_me_pi(struct futex_q *q) __unqueue_futex(q); BUG_ON(!q->pi_state); - free_pi_state(q->pi_state); + put_pi_state(q->pi_state); q->pi_state = NULL; spin_unlock(q->lock_ptr); @@ -2129,11 +2127,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * we returned due to timeout or signal without taking the * rt_mutex. Too late. */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); if (!owner) owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } @@ -2759,7 +2757,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * Drop the reference to the pi state which * the requeue_pi() code acquired for us. */ - free_pi_state(q.pi_state); + put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); } } else { @@ -3051,7 +3049,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ + cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; } diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 7080ae1eb..2f9df3794 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -123,11 +123,6 @@ void gcov_enable_events(void) } #ifdef CONFIG_MODULES -static inline int within(void *addr, void *start, unsigned long size) -{ - return ((addr >= start) && (addr < start + size)); -} - /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) @@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. 
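
gcov's module notifier drops its private within() helper for the generic within_module(), which also covers a module's init region. The underlying test is a half-open address-range check, sketched below; within_module() itself is real kernel API, the rest is illustrative:

#include <assert.h>
#include <stdint.h>

/* Half-open containment test, start <= addr < start + size, the same
 * shape as the removed gcov helper and within_module_core(). */
static int within(uintptr_t addr, uintptr_t start, uintptr_t size)
{
	return addr >= start && addr - start < size;
}

int main(void)
{
	assert(within(0x1000, 0x1000, 0x100));	/* first byte counts */
	assert(!within(0x1100, 0x1000, 0x100));	/* one past the end does not */
	return 0;
}
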
*/ while ((info = gcov_info_next(info))) { - if (within(info, mod->module_core, mod->core_size)) { + if (within_module((unsigned long)info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 15206453b..5797909f4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -338,7 +338,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -346,6 +345,7 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -412,13 +412,13 @@ void handle_simple_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); out_unlock: @@ -462,7 +462,6 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -473,6 +472,7 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); @@ -532,7 +532,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -544,6 +543,7 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; } + kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); @@ -950,6 +950,7 @@ void irq_chip_ack_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_ack(data); } +EXPORT_SYMBOL_GPL(irq_chip_ack_parent); /** * irq_chip_mask_parent - Mask the parent interrupt diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 239e2ae2c..0409da0bc 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -159,6 +159,7 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_rcu_head(&desc->rcu); desc_set_defaults(irq, desc, node, owner); @@ -171,6 +172,15 @@ err_desc: return NULL; } +static void delayed_free_desc(struct rcu_head *rhp) +{ + struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + free_masks(desc); + free_percpu(desc->kstat_irqs); + kfree(desc); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -187,9 +197,12 @@ static void free_desc(unsigned int irq) delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); - free_masks(desc); - free_percpu(desc->kstat_irqs); - kfree(desc); + /* + * We free the descriptor, masks and stat fields via RCU. That + * allows demultiplex interrupts to do rcu based management of + * the child interrupts. 
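+ *
+ * Reader side this enables (assumed usage, for illustration):
+ *
+ *	rcu_read_lock();
+ *	desc = irq_to_desc(irq);
+ *	if (desc)
+ *		... desc cannot be freed here ...
+ *	rcu_read_unlock();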
+ */ + call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 22aa9612e..3e56d2f03 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -60,6 +60,7 @@ struct fwnode_handle *irq_domain_alloc_fwnode(void *data) fwid->fwnode.type = FWNODE_IRQCHIP; return &fwid->fwnode; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle @@ -70,13 +71,14 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; - if (WARN_ON(fwnode->type != FWNODE_IRQCHIP)) + if (WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); kfree(fwid->name); kfree(fwid); } +EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); /** * __irq_domain_add() - Allocate a new irq_domain data structure @@ -573,10 +575,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) unsigned int type = IRQ_TYPE_NONE; int virq; - if (fwspec->fwnode) - domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); - else + if (fwspec->fwnode) { + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_WIRED); + if (!domain) + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_ANY); + } else { domain = irq_default_domain; + } if (!domain) { pr_warn("no irq domain found for %s !\n", @@ -1013,6 +1020,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return NULL; } +EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain @@ -1058,6 +1066,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, __irq_set_handler(virq, handler, 0, handler_name); irq_set_handler_data(virq, handler_data); } +EXPORT_SYMBOL(irq_domain_set_info); /** * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data @@ -1125,9 +1134,9 @@ static void irq_domain_free_irqs_recursive(struct irq_domain *domain, } } -static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg) { int ret = 0; struct irq_domain *parent = domain->parent; @@ -1343,6 +1352,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } +EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_info - Set the complete data for a @virq in @domain diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6ead20037..841187239 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1743,6 +1743,31 @@ out: } EXPORT_SYMBOL_GPL(enable_percpu_irq); +/** + * irq_percpu_is_enabled - Check whether the per cpu irq is enabled + * @irq: Linux irq number to check for + * + * Must be called from a non migratable context. Returns the enable + * state of a per cpu interrupt on the current cpu. 
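+ *
+ * Example (hypothetical caller, for illustration):
+ *
+ *	get_cpu();
+ *	enabled = irq_percpu_is_enabled(irq);
+ *	put_cpu();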
+ */ +bool irq_percpu_is_enabled(unsigned int irq) +{ + unsigned int cpu = smp_processor_id(); + struct irq_desc *desc; + unsigned long flags; + bool is_enabled; + + desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return false; + + is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); + irq_put_desc_unlock(desc, flags); + + return is_enabled; +} +EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); + void disable_percpu_irq(unsigned int irq) { unsigned int cpu = smp_processor_id(); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6b0c0b74a..38e89ce7b 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, if (irq_find_mapping(domain, hwirq) > 0) return -EEXIST; - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret < 0) - return ret; + if (domain->parent) { + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + } for (i = 0; i < nr_irqs; i++) { ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); @@ -252,6 +254,60 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, &msi_domain_ops, info); } +int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + int ret; + + ret = ops->msi_check(domain, info, dev); + if (ret == 0) + ret = ops->msi_prepare(domain, dev, nvec, arg); + + return ret; +} + +int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, + int virq, int nvec, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + struct msi_desc *desc; + int ret = 0; + + for_each_msi_entry(desc, dev) { + /* Don't even try the multi-MSI brain damage. */ + if (WARN_ON(!desc->irq || desc->nvec_used != 1)) { + ret = -EINVAL; + break; + } + + if (!(desc->irq >= virq && desc->irq < (virq + nvec))) + continue; + + ops->set_desc(arg, desc); + /* Assumes the domain mutex is held! */ + ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg); + if (ret) + break; + + irq_set_msi_desc_off(virq, 0, desc); + } + + if (ret) { + /* Mop up the damage */ + for_each_msi_entry(desc, dev) { + if (!(desc->irq >= virq && desc->irq < (virq + nvec))) + continue; + + irq_domain_free_irqs_common(domain, desc->irq, 1); + } + } + + return ret; +} + /** * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -270,9 +326,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_desc *desc; int i, ret, virq = -1; - ret = ops->msi_check(domain, info, dev); - if (ret == 0) - ret = ops->msi_prepare(domain, dev, nvec, &arg); + ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) return ret; diff --git a/kernel/kexec.c b/kernel/kexec.c index d873b64fb..ee70aef5c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -63,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (ret) goto out_free_image; - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_image; - - /* Enable the special crash kernel control page allocation policy. */ if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. 
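+ * (control pages must come from the reserved crashk_res range so
+ * they remain usable after the production kernel has crashed)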
*/ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 11b64a63c..8dc659144 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -310,12 +310,9 @@ static void kimage_free_pages(struct page *page) void kimage_free_page_list(struct list_head *list) { - struct list_head *pos, *next; + struct page *page, *next; - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); + list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } @@ -853,7 +850,12 @@ struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; -void crash_kexec(struct pt_regs *regs) +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -876,6 +878,29 @@ void crash_kexec(struct pt_regs *regs) } } +void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + size_t crash_get_memory_size(void) { size_t size = 0; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada002..007b791f6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -109,11 +109,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return -EINVAL; } +#ifdef CONFIG_KEXEC_VERIFY_SIG int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return -EKEYREJECTED; } +#endif /* Apply relocations of type RELA */ int __weak diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index e4392a698..0a52315d9 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. 
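+ *
+ * buffer/bufsz describe the caller's data and mem/memsz the
+ * (possibly larger) destination in the kexec image; buf_min,
+ * buf_max and buf_align constrain where that destination may be
+ * placed.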
+ */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; + void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b26464..152da4a48 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -20,7 +20,7 @@ #include <linux/capability.h> #include <linux/compiler.h> -#include <linux/rcupdate.h> /* rcu_expedited */ +#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +#ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", rcu_expedited); + return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, } KERNEL_ATTR_RW(rcu_expedited); +int rcu_normal; +static ssize_t rcu_normal_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); +} +static ssize_t rcu_normal_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_normal)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_normal); +#endif /* #ifndef CONFIG_TINY_RCU */ + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
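* (tools such as perf read this file to obtain the kernel's
* build ID notes)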
*/ @@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif +#ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, +#endif NULL }; diff --git a/kernel/kthread.c b/kernel/kthread.c index 12d8a8f88..9ff173dca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -275,7 +275,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), - GFP_KERNEL | ___GFP_TOI_NOTRACK); + GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a02812743..b5c30d9f4 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -47,12 +47,12 @@ * of times) */ -#include <linux/latencytop.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> +#include <linux/latencytop.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/list.h> @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } + +int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} device_initcall(init_lstats_procfs); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index db545cbcd..bc2c85c06 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/kallsyms.h> #include <linux/livepatch.h> +#include <asm/cacheflush.h> /** * struct klp_ops - structure for tracking registered ftrace ops structs @@ -135,13 +136,8 @@ struct klp_find_arg { const char *objname; const char *name; unsigned long addr; - /* - * If count == 0, the symbol was not found. If count == 1, a unique - * match was found and addr is set. If count > 1, there is - * unresolvable ambiguity among "count" number of symbols with the same - * name in the same object. - */ unsigned long count; + unsigned long pos; }; static int klp_find_callback(void *data, const char *name, @@ -158,37 +154,48 @@ static int klp_find_callback(void *data, const char *name, if (args->objname && strcmp(args->objname, mod->name)) return 0; - /* - * args->addr might be overwritten if another match is found - * but klp_find_object_symbol() handles this and only returns the - * addr if count == 1. - */ args->addr = addr; args->count++; + /* + * Finish the search when the symbol is found for the desired position + * or the position is not defined for a non-unique symbol. + */ + if ((args->pos && (args->count == args->pos)) || + (!args->pos && (args->count > 1))) + return 1; + return 0; } static int klp_find_object_symbol(const char *objname, const char *name, - unsigned long *addr) + unsigned long sympos, unsigned long *addr) { struct klp_find_arg args = { .objname = objname, .name = name, .addr = 0, - .count = 0 + .count = 0, + .pos = sympos, }; mutex_lock(&module_mutex); kallsyms_on_each_symbol(klp_find_callback, &args); mutex_unlock(&module_mutex); - if (args.count == 0) + /* + * Ensure an address was found. If sympos is 0, ensure symbol is unique; + * otherwise ensure the symbol position count matches sympos. 
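+ *
+ * Example: if "foo" occurs twice in the object, sympos == 0 fails
+ * as ambiguous, while sympos == 1 or sympos == 2 selects the
+ * first or second occurrence in kallsyms order.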
+ */ + if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); - else if (args.count > 1) + else if (args.count > 1 && sympos == 0) { pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", args.count, name, objname); - else { + } else if (sympos != args.count && sympos > 0) { + pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", + sympos, name, objname ? objname : "vmlinux"); + } else { *addr = args.addr; return 0; } @@ -197,66 +204,6 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -struct klp_verify_args { - const char *name; - const unsigned long addr; -}; - -static int klp_verify_callback(void *data, const char *name, - struct module *mod, unsigned long addr) -{ - struct klp_verify_args *args = data; - - if (!mod && - !strcmp(args->name, name) && - args->addr == addr) - return 1; - - return 0; -} - -static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) -{ - struct klp_verify_args args = { - .name = name, - .addr = addr, - }; - int ret; - - mutex_lock(&module_mutex); - ret = kallsyms_on_each_symbol(klp_verify_callback, &args); - mutex_unlock(&module_mutex); - - if (!ret) { - pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", - name, addr); - return -EINVAL; - } - - return 0; -} - -static int klp_find_verify_func_addr(struct klp_object *obj, - struct klp_func *func) -{ - int ret; - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old_addr accordingly */ - if (kaslr_enabled() && func->old_addr) - func->old_addr += kaslr_offset(); -#endif - - if (!func->old_addr || klp_is_module(obj)) - ret = klp_find_object_symbol(obj->name, func->old_name, - &func->old_addr); - else - ret = klp_verify_vmlinux_symbol(func->old_name, - func->old_addr); - - return ret; -} - /* * external symbols are located outside the parent object (where the parent * object is either vmlinux or the kmod being patched). @@ -276,14 +223,18 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, } preempt_enable(); - /* otherwise check if it's in another .o within the patch module */ - return klp_find_object_symbol(pmod->name, name, addr); + /* + * Check if it's in another .o within the patch module. This also + * checks that the external symbol is unique. 
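+ * (passing sympos == 0 below is what enforces that uniqueness)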
+ */ + return klp_find_object_symbol(pmod->name, name, 0, addr); } static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { - int ret; + int ret = 0; + unsigned long val; struct klp_reloc *reloc; if (WARN_ON(!klp_is_object_loaded(obj))) @@ -292,41 +243,38 @@ static int klp_write_object_relocations(struct module *pmod, if (WARN_ON(!obj->relocs)) return -EINVAL; + module_disable_ro(pmod); + for (reloc = obj->relocs; reloc->name; reloc++) { - if (!klp_is_module(obj)) { - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old value accordingly */ - if (kaslr_enabled()) - reloc->val += kaslr_offset(); -#endif - ret = klp_verify_vmlinux_symbol(reloc->name, - reloc->val); - if (ret) - return ret; - } else { - /* module, reloc->val needs to be discovered */ - if (reloc->external) - ret = klp_find_external_symbol(pmod, - reloc->name, - &reloc->val); - else - ret = klp_find_object_symbol(obj->mod->name, - reloc->name, - &reloc->val); - if (ret) - return ret; - } + /* discover the address of the referenced symbol */ + if (reloc->external) { + if (reloc->sympos > 0) { + pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", + reloc->name); + ret = -EINVAL; + goto out; + } + ret = klp_find_external_symbol(pmod, reloc->name, &val); + } else + ret = klp_find_object_symbol(obj->name, + reloc->name, + reloc->sympos, + &val); + if (ret) + goto out; + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, - reloc->val + reloc->addend); + val + reloc->addend); if (ret) { pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", - reloc->name, reloc->val, ret); - return ret; + reloc->name, val, ret); + goto out; } } - return 0; +out: + module_enable_ro(pmod); + return ret; } static void notrace klp_ftrace_handler(unsigned long ip, @@ -593,7 +541,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/<object> - * /sys/kernel/livepatch/<patch>/<object>/<func> + * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -738,8 +686,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; + /* The format for the sysfs directory is <function,sympos> where sympos + * is the nth occurrence of this symbol in kallsyms for the patched + * object. If the user selects 0 for old_sympos, then 1 will be used + * since a unique symbol will be the first occurrence. + */ return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s", func->old_name); + &obj->kobj, "%s,%lu", func->old_name, + func->old_sympos ? 
func->old_sympos : 1); } /* parts of the initialization that is done only when the object is loaded */ @@ -756,7 +710,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, } klp_for_each_func(obj, func) { - ret = klp_find_verify_func_addr(obj, func); + ret = klp_find_object_symbol(obj->name, func->old_name, + func->old_sympos, + &func->old_addr); if (ret) return ret; } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace5661..716547fdb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -666,7 +666,7 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP @@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? 
Did someone trample @@ -742,7 +742,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) + struct held_lock *next, int distance, int *stack_saved) { struct lock_list *entry; int ret; @@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } - if (!trylock_loop && !save_trace(&trace)) - return 0; + if (!*stack_saved) { + if (!save_trace(&trace)) + return 0; + *stack_saved = 1; + } /* * Ok, all validations passed, add the new lock @@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + /* We drop graph lock, so another thread can overwrite trace. 
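+ * Clearing *stack_saved forces the next check_prev_add() call to
+ * save a fresh trace.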
*/ + *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1929,7 +1934,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int trylock_loop = 0; + int stack_saved = 0; struct held_lock *hlock; /* @@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) + distance, &stack_saved)) return 0; /* * Stop after the first non-trylock entry, @@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; - trylock_loop = 1; } return 1; out_bug: @@ -2017,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr; int i, j; @@ -2033,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2057,7 +2061,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2091,7 +2095,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -3875,7 +3879,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3894,7 +3898,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3917,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3930,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3962,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3987,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); 
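	/* Scan every hash bucket for classes cached by this lock. */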
for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4027,10 +4027,10 @@ void lockdep_init(void) return; for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); + INIT_HLIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 87e9ce6a6..393d1874b 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -14,8 +14,9 @@ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long <waiman.long@hp.com> + * Authors: Waiman Long <waiman.long@hpe.com> * Peter Zijlstra <peterz@infradead.org> */ @@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { struct __qspinlock *l = (void *)lock; - return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; + /* + * Use release semantics to make sure that the MCS node is properly + * initialized before changing the tail code. + */ + return (u32)xchg_release(&l->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ @@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Use release semantics to make sure that the MCS node is + * properly initialized before changing the tail code. + */ + old = atomic_cmpxchg_release(&lock->val, val, new); if (old == val) break; @@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock) */ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node, + struct mcs_spinlock *prev) { } static __always_inline void __pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_head(struct qspinlock *lock, - struct mcs_spinlock *node) { } +static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, + struct mcs_spinlock *node) + { return 0; } #define pv_enabled() false #define pv_init_node __pv_init_node #define pv_wait_node __pv_wait_node #define pv_kick_node __pv_kick_node -#define pv_wait_head __pv_wait_head +#define pv_wait_head_or_lock __pv_wait_head_or_lock #ifdef CONFIG_PARAVIRT_SPINLOCKS #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath @@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (val == new) new |= _Q_PENDING_VAL; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Acquire semantic is required here as the function may + * return immediately if the lock was free. 
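+ * (when val == 0 a successful cmpxchg takes the lock outright, so
+ * the critical section must be ordered after this access)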
+ */ + old = atomic_cmpxchg_acquire(&lock->val, val, new); if (old == val) break; @@ -382,6 +398,7 @@ queue: * p,*,* -> n,*,* */ old = xchg_tail(lock, tail); + next = NULL; /* * if there was a previous node; link it and wait until reaching the @@ -391,8 +408,18 @@ queue: prev = decode_tail(old); WRITE_ONCE(prev->next, node); - pv_wait_node(node); + pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We optimistically load + * the next pointer & prefetch the cacheline for writing + * to reduce latency in the upcoming MCS unlock operation. + */ + next = READ_ONCE(node->next); + if (next) + prefetchw(next); } /* @@ -406,11 +433,22 @@ queue: * sequentiality; this is because the set_locked() function below * does not imply a full barrier. * + * The PV pv_wait_head_or_lock function, if active, will acquire + * the lock and return a non-zero value. So we have to skip the + * smp_load_acquire() call. As the next PV queue head hasn't been + * designated yet, there is no way for the locked value to become + * _Q_SLOW_VAL. So both the set_locked() and the + * atomic_cmpxchg_relaxed() calls will be safe. + * + * If PV isn't active, 0 will be returned instead. + * */ - pv_wait_head(lock, node); - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) - cpu_relax(); + if ((val = pv_wait_head_or_lock(lock, node))) + goto locked; + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + +locked: /* * claim the lock: * @@ -422,11 +460,17 @@ queue: * to grab the lock. */ for (;;) { - if (val != tail) { + /* In the PV case we might already have _Q_LOCKED_VAL set */ + if ((val & _Q_TAIL_MASK) != tail) { set_locked(lock); break; } - old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); + /* + * The smp_load_acquire() call above has provided the necessary + * acquire semantics required for locking. At most two + * iterations of this loop may be ran. + */ + old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) goto release; /* No contention */ @@ -434,10 +478,12 @@ queue: } /* - * contended path; wait for next, release. + * contended path; wait for next if not observed yet, release. */ - while (!(next = READ_ONCE(node->next))) - cpu_relax(); + if (!next) { + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + } arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); @@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #undef pv_init_node #undef pv_wait_node #undef pv_kick_node -#undef pv_wait_head +#undef pv_wait_head_or_lock #undef queued_spin_lock_slowpath #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f0450ff48..87bb235c3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -23,6 +23,20 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) /* + * Queue Node Adaptive Spinning + * + * A queue node vCPU will stop spinning if the vCPU in the previous node is + * not running. The one lock stealing attempt allowed at slowpath entry + * mitigates the slight slowdown for non-overcommitted guest with this + * aggressive wait-early mechanism. + * + * The status of the previous node will be checked at fixed interval + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't + * pound on the cacheline of the previous node too heavily. 
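+ *
+ * With a mask of 0xff the previous node's state is sampled once
+ * every 256 spin iterations.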
+ */ +#define PV_PREV_CHECK_MASK 0xff + +/* * Queue node uses: vcpu_running & vcpu_halted. * Queue head uses: vcpu_running & vcpu_hashed. */ @@ -41,6 +55,94 @@ struct pv_node { }; /* + * By replacing the regular queued_spin_trylock() with the function below, + * it will be called once when a lock waiter enter the PV slowpath before + * being queued. By allowing one lock stealing attempt here when the pending + * bit is off, it helps to reduce the performance impact of lock waiter + * preemption without the drawback of lock starvation. + */ +#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) +static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); +} + +/* + * The pending bit is used by the queue head vCPU to indicate that it + * is actively spinning on the lock and no lock stealing is allowed. + */ +#if _Q_PENDING_BITS == 8 +static __always_inline void set_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 1); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 0); +} + +/* + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock + * just to be sure that it will get it. + */ +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !READ_ONCE(l->locked) && + (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) + == _Q_PENDING_VAL); +} +#else /* _Q_PENDING_BITS == 8 */ +static __always_inline void set_pending(struct qspinlock *lock) +{ + atomic_set_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + int val = atomic_read(&lock->val); + + for (;;) { + int old, new; + + if (val & _Q_LOCKED_MASK) + break; + + /* + * Try to clear pending bit & set locked bit + */ + old = val; + new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + val = atomic_cmpxchg(&lock->val, old, new); + + if (val == old) + return 1; + } + return 0; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * Lock and MCS node addresses hash table for fast lookup * * Hashing is done on a per-cacheline basis to minimize the need to access @@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) { unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); struct pv_hash_entry *he; + int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); + qstat_hop(hopcnt); return &he->lock; } } @@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) } /* + * Return true if when it is time to check the previous node which is not + * in a running state. + */ +static inline bool +pv_wait_early(struct pv_node *prev, int loop) +{ + + if ((loop & PV_PREV_CHECK_MASK) != 0) + return false; + + return READ_ONCE(prev->state) != vcpu_running; +} + +/* * Initialize the PV part of the mcs_spinlock node. 
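+ * (struct pv_node overlays the generic mcs_spinlock slots and
+ * must not grow beyond them)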
*/ static void pv_init_node(struct mcs_spinlock *node) @@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node) * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its * behalf. */ -static void pv_wait_node(struct mcs_spinlock *node) +static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; + struct pv_node *pp = (struct pv_node *)prev; + int waitcnt = 0; int loop; + bool wait_early; - for (;;) { - for (loop = SPIN_THRESHOLD; loop; loop--) { + /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ + for (;; waitcnt++) { + for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; + if (pv_wait_early(pp, loop)) { + wait_early = true; + break; + } cpu_relax(); } @@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node) */ smp_store_mb(pn->state, vcpu_halted); - if (!READ_ONCE(node->locked)) + if (!READ_ONCE(node->locked)) { + qstat_inc(qstat_pv_wait_node, true); + qstat_inc(qstat_pv_wait_again, waitcnt); + qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); + } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that value - * so that pv_wait_head() knows to not also try to hash this lock. + * If pv_kick_node() changed us to vcpu_hashed, retain that + * value so that pv_wait_head_or_lock() knows to not also try + * to hash this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ + qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); } /* @@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * Called after setting next->locked = 1 when we're the lock owner. * - * Instead of waking the waiters stuck in pv_wait_node() advance their state such - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + * Instead of waking the waiters stuck in pv_wait_node() advance their state + * such that they're waiting in pv_wait_head_or_lock(), this avoids a + * wake/sleep cycle. */ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { @@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * Wait for l->locked to become clear; halt the vcpu after a short spin. + * Wait for l->locked to become clear and acquire the lock; + * halt the vcpu after a short spin. * __pv_queued_spin_unlock() will wake us. + * + * The current value of the lock will be returned for additional processing. */ -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +static u32 +pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; + int waitcnt = 0; int loop; /* @@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; - for (;;) { + for (;; waitcnt++) { + /* + * Set correct vCPU state to be used by queue node wait-early + * mechanism. + */ + WRITE_ONCE(pn->state, vcpu_running); + + /* + * Set the pending bit in the active lock spinning loop to + * disable lock stealing before attempting to acquire the lock. 
+ */ + set_pending(lock); for (loop = SPIN_THRESHOLD; loop; loop--) { - if (!READ_ONCE(l->locked)) - return; + if (trylock_clear_pending(lock)) + goto gotlock; cpu_relax(); } + clear_pending(lock); + if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); @@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { /* - * The lock is free and _Q_SLOW_VAL has never - * been set. Therefore we need to unhash before - * getting the lock. + * The lock was free and now we own the lock. + * Change the lock value back to _Q_LOCKED_VAL + * and unhash the table. */ + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); - return; + goto gotlock; } } + WRITE_ONCE(pn->state, vcpu_halted); + qstat_inc(qstat_pv_wait_head, true); + qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious - * wakeup and so the vCPU should wait again after spinning for - * a while. + * wakeup or another vCPU has stolen the lock. The current + * vCPU should spin again. */ + qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* - * Lock is unlocked now; the caller will acquire it without waiting. - * As with pv_wait_node() we rely on the caller to do a load-acquire - * for us. + * The cmpxchg() or xchg() call before coming here provides the + * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL + * here is to indicate to the compiler that the value will always + * be nozero to enable better code optimization. */ +gotlock: + return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL); } /* - * PV version of the unlock function to be used in stead of - * queued_spin_unlock(). + * PV versions of the unlock fastpath and slowpath functions to be used + * instead of queued_spin_unlock(). */ -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible void +__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 locked; - - /* - * We must not unlock if SLOW, because in that case we must first - * unhash. Otherwise it would be possible to have multiple @lock - * entries, which would be BAD. - */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) - return; if (unlikely(locked != _Q_SLOW_VAL)) { WARN(!debug_locks_silent, @@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * so we need a barrier to order the read of the node data in * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. * - * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL. */ smp_rmb(); @@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * vCPU is harmless other than the additional latency in completing * the unlock. */ + qstat_inc(qstat_pv_kick_unlock, true); pv_kick(node->cpu); } + /* * Include the architecture specific callee-save thunk of the * __pv_queued_spin_unlock(). This thunk is put together with - * __pv_queued_spin_unlock() near the top of the file to make sure - * that the callee-save thunk and the real unlock function are close - * to each other sharing consecutive instruction cachelines. 
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined. */ #include <asm/qspinlock_paravirt.h> +#ifndef __pv_queued_spin_unlock +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + u8 locked; + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. + */ + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) + return; + + __pv_queued_spin_unlock_slowpath(lock, locked); +} +#endif /* __pv_queued_spin_unlock */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h new file mode 100644 index 000000000..640dcecdd --- /dev/null +++ b/kernel/locking/qspinlock_stat.h @@ -0,0 +1,300 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * When queued spinlock statistical counters are enabled, the following + * debugfs files will be created for reporting the counter values: + * + * <debugfs>/qlockstat/ + * pv_hash_hops - average # of hops per hashing operation + * pv_kick_unlock - # of vCPU kicks issued at unlock time + * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake + * pv_latency_kick - average latency (ns) of vCPU kick operation + * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_stealing - # of lock stealing operations + * pv_spurious_wakeup - # of spurious wakeups + * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_wait_early - # of early vCPU wait's + * pv_wait_head - # of vCPU wait's at the queue head + * pv_wait_node - # of vCPU wait's at a non-head queue node + * + * Writing to the "reset_counters" file will reset all the above counter + * values. + * + * These statistical counters are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counters usable even in a production + * environment. + * + * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
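+ * A kick that arrives while the target vCPU is not actually
+ * sleeping in pv_wait() wakes nobody, so it is counted at unlock
+ * time but never as a wakeup.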
+ */ +enum qlock_stats { + qstat_pv_hash_hops, + qstat_pv_kick_unlock, + qstat_pv_kick_wake, + qstat_pv_latency_kick, + qstat_pv_latency_wake, + qstat_pv_lock_stealing, + qstat_pv_spurious_wakeup, + qstat_pv_wait_again, + qstat_pv_wait_early, + qstat_pv_wait_head, + qstat_pv_wait_node, + qstat_num, /* Total number of statistical counters */ + qstat_reset_cnts = qstat_num, +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Collect pvqspinlock statistics + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/fs.h> + +static const char * const qstat_names[qstat_num + 1] = { + [qstat_pv_hash_hops] = "pv_hash_hops", + [qstat_pv_kick_unlock] = "pv_kick_unlock", + [qstat_pv_kick_wake] = "pv_kick_wake", + [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", + [qstat_pv_latency_kick] = "pv_latency_kick", + [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_stealing] = "pv_lock_stealing", + [qstat_pv_wait_again] = "pv_wait_again", + [qstat_pv_wait_early] = "pv_wait_early", + [qstat_pv_wait_head] = "pv_wait_head", + [qstat_pv_wait_node] = "pv_wait_node", + [qstat_reset_cnts] = "reset_counters", +}; + +/* + * Per-cpu counters + */ +static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); +static DEFINE_PER_CPU(u64, pv_kick_time); + +/* + * Function to read and return the qlock statistical counter values + * + * The following counters are handled specially: + * 1. qstat_pv_latency_kick + * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock + * 2. qstat_pv_latency_wake + * Average wake latency (ns) = pv_latency_wake/pv_kick_wake + * 3. qstat_pv_hash_hops + * Average hops/hash = pv_hash_hops/pv_kick_unlock + */ +static ssize_t qstat_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, counter, len; + u64 stat = 0, kicks = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + counter = (long)(file->f_inode->i_private); + + if (counter >= qstat_num) + return -EBADF; + + for_each_possible_cpu(cpu) { + stat += per_cpu(qstats[counter], cpu); + /* + * Need to sum additional counter for some of them + */ + switch (counter) { + + case qstat_pv_latency_kick: + case qstat_pv_hash_hops: + kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + break; + + case qstat_pv_latency_wake: + kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + break; + } + } + + if (counter == qstat_pv_hash_hops) { + u64 frac; + + frac = 100ULL * do_div(stat, kicks); + frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + + /* + * Return a X.XX decimal number + */ + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + } else { + /* + * Round to the nearest ns + */ + if ((counter == qstat_pv_latency_kick) || + (counter == qstat_pv_latency_wake)) { + stat = 0; + if (kicks) + stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + } + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + } + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When counter = reset_cnts, reset all the counter values. + * Since the counter updates aren't atomic, the resetting is done twice + * to make sure that the counters are very likely to be all cleared. 
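+ * (an update racing with the first pass may be written back after
+ * it; the second pass shrinks that window without fully closing
+ * it)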
+ */ +static ssize_t qstat_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(qstats, cpu); + + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_qstat = { + .read = qstat_read, + .write = qstat_write, + .llseek = default_llseek, +}; + +/* + * Initialize debugfs for the qspinlock statistical counters + */ +static int __init init_qspinlock_stat(void) +{ + struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); + int i; + + if (!d_qstat) { + pr_warn("Could not create 'qlockstat' debugfs directory\n"); + return 0; + } + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. + */ + for (i = 0; i < qstat_num; i++) + debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat); + + debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat); + return 0; +} +fs_initcall(init_qspinlock_stat); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void qstat_inc(enum qlock_stats stat, bool cond) +{ + if (cond) + this_cpu_inc(qstats[stat]); +} + +/* + * PV hash hop count + */ +static inline void qstat_hop(int hopcnt) +{ + this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); +} + +/* + * Replacement function for pv_kick() + */ +static inline void __pv_kick(int cpu) +{ + u64 start = sched_clock(); + + per_cpu(pv_kick_time, cpu) = start; + pv_kick(cpu); + this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); +} + +/* + * Replacement function for pv_wait() + */ +static inline void __pv_wait(u8 *ptr, u8 val) +{ + u64 *pkick_time = this_cpu_ptr(&pv_kick_time); + + *pkick_time = 0; + pv_wait(ptr, val); + if (*pkick_time) { + this_cpu_add(qstats[qstat_pv_latency_wake], + sched_clock() - *pkick_time); + qstat_inc(qstat_pv_kick_wake, true); + } +} + +#define pv_kick(c) __pv_kick(c) +#define pv_wait(p, v) __pv_wait(p, v) + +/* + * PV unfair trylock count tracking function + */ +static inline int qstat_spin_steal_lock(struct qspinlock *lock) +{ + int ret = pv_queued_spin_steal_lock(lock); + + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; +} +#undef queued_spin_trylock +#define queued_spin_trylock(l) qstat_spin_steal_lock(l) + +#else /* CONFIG_QUEUED_LOCK_STAT */ + +static inline void qstat_inc(enum qlock_stats stat, bool cond) { } +static inline void qstat_hop(int hopcnt) { } + +#endif /* CONFIG_QUEUED_LOCK_STAT */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8251e75dd..3e746607a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); 
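	/* Clear RT_MUTEX_HAS_WAITERS while wait_lock is still held. */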
clear_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * If a new waiter comes in between the unlock and the cmpxchg * we have two situations: @@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; } #endif @@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int ret = 0, depth = 0; struct rt_mutex *lock; bool detect_deadlock; - unsigned long flags; bool requeue = true; detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); @@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * [1] Task cannot go away as we did a get_task() before ! */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irq(&task->pi_lock); /* * [2] Get the waiter on which @task is blocked on. @@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * operations. */ if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); cpu_relax(); goto retry; } @@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * No requeue[7] here. Just release @task [8] */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * If there is no owner of the lock, end of chain. */ if (!rt_mutex_owner(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * No requeue [11] here. We just do deadlock detection. @@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* If owner is not blocked, end of chain. */ if (!next_lock) @@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. 
the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { @@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop the locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* * Make the actual exit decisions [12], based on the stored @@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); out_put_task: put_task_struct(task); @@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Try to take an rt-mutex * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held and interrupts disabled * * @lock: The lock to be acquired. * @task: The task which wants to acquire the lock @@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { - unsigned long flags; - /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * case, but conditionals are more expensive than a redundant * store. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); task->pi_blocked_on = NULL; /* * Finish the lock acquisition. @task is the new owner. If @@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); takeit: /* We got the lock. */ @@ -883,7 +882,7 @@ takeit: * * Prepare waiter and propagate pi chain * - * This must be called with lock->wait_lock held. + * This must be called with lock->wait_lock held and interrupts disabled */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *top_waiter = waiter; struct rt_mutex *next_lock; int chain_walk = 0, res; - unsigned long flags; /* * Early deadlock detection. 
We really don't want the task to @@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (owner == task) return -EDEADLK; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (!owner) return 0; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); @@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Even if full deadlock detection is on, if the owner is not * blocked itself, we can avoid finding this out in the chain @@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } @@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * Remove the top waiter from the current task's pi waiter tree and * queue it up. * - * Called with lock->wait_lock held. + * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - unsigned long flags; - raw_spin_lock_irqsave(&current->pi_lock, flags); + raw_spin_lock(&current->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock_irqrestore(&current->pi_lock, flags); + raw_spin_unlock(&current->pi_lock); wake_q_add(wake_q, waiter->task); } @@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and + * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex().
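Note that wake_q_add() above only queues the top waiter; the wakeup itself is deferred until wait_lock has been dropped. A minimal sketch of that pattern (editor's illustration with a hypothetical lock and task; WAKE_Q(), wake_q_add() and wake_up_q() are the 4.5-era API from <linux/sched.h>):

#include <linux/sched.h>
#include <linux/spinlock.h>

static void demo_unlock_and_wake(raw_spinlock_t *lock, struct task_struct *t)
{
	WAKE_Q(wake_q);			/* on-stack queue of tasks to wake */

	raw_spin_lock_irq(lock);
	wake_q_add(&wake_q, t);		/* takes a task reference, no wakeup yet */
	raw_spin_unlock_irq(lock);

	wake_up_q(&wake_q);		/* wake and release the queued tasks */
}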
*/ static void remove_waiter(struct rt_mutex *lock, @@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock, bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; - unsigned long flags; - raw_spin_lock_irqsave(&current->pi_lock, flags); + raw_spin_lock(&current->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(&current->pi_lock, flags); + raw_spin_unlock(&current->pi_lock); /* * Only update priority if the waiter was the highest priority @@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock, if (!owner || !is_top_waiter) return; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); rt_mutex_dequeue_pi(owner, waiter); @@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Don't walk the chain, if the owner task is not blocked @@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, next_lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * - * lock->wait_lock must be held by the caller. + * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, @@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); schedule(); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); } @@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; + unsigned long flags; int ret = 0; debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); - raw_spin_lock(&lock->wait_lock); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants.
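To make the distinction concrete, a brief sketch of the two locking flavors the comment is contrasting (editor's illustration; demo() is hypothetical):

#include <linux/spinlock.h>

static void demo(raw_spinlock_t *lock)
{
	unsigned long flags;

	raw_spin_lock_irq(lock);		/* correct only if IRQs were on... */
	raw_spin_unlock_irq(lock);		/* ...since unlock re-enables them */

	raw_spin_lock_irqsave(lock, flags);	/* records the current IRQ state */
	raw_spin_unlock_irqrestore(lock, flags);	/* and restores it, on or off */
}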
+ */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } @@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret; /* @@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return 0; /* - * The mutex has currently no owner. Lock the wait lock and - * try to acquire the lock. + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock) == true) + if (unlock_rt_mutex_safe(lock, flags) == true) return false; /* Relock the rtmutex and try again */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } /* @@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* check PI boosting */ return true; @@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 1; } @@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 25ced161e..6cf54615a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -10,8 +10,11 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * General Public License for more details. */ +#include <linux/radix-tree.h> +#include <linux/memremap.h> #include <linux/device.h> #include <linux/types.h> +#include <linux/pfn_t.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> @@ -26,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) static void *try_ram_remap(resource_size_t offset, size_t size) { - struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (!PageHighMem(page)) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); return NULL; /* fallback to ioremap_cache */ } @@ -149,25 +152,134 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); +pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) +{ + return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); +} +EXPORT_SYMBOL(phys_to_pfn_t); + #ifdef CONFIG_ZONE_DEVICE +static DEFINE_MUTEX(pgmap_lock); +static RADIX_TREE(pgmap_radix, GFP_KERNEL); +#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) +#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) + struct page_map { struct resource res; + struct percpu_ref *ref; + struct dev_pagemap pgmap; + struct vmem_altmap altmap; }; -static void devm_memremap_pages_release(struct device *dev, void *res) +void get_zone_device_page(struct page *page) +{ + percpu_ref_get(page->pgmap->ref); +} +EXPORT_SYMBOL(get_zone_device_page); + +void put_zone_device_page(struct page *page) +{ + put_dev_pagemap(page->pgmap); +} +EXPORT_SYMBOL(put_zone_device_page); + +static void pgmap_radix_release(struct resource *res) +{ + resource_size_t key, align_start, align_size, align_end; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; + + mutex_lock(&pgmap_lock); + for (key = res->start; key <= res->end; key += SECTION_SIZE) + radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&pgmap_lock); +} + +static unsigned long pfn_first(struct page_map *page_map) +{ + struct dev_pagemap *pgmap = &page_map->pgmap; + const struct resource *res = &page_map->res; + struct vmem_altmap *altmap = pgmap->altmap; + unsigned long pfn; + + pfn = res->start >> PAGE_SHIFT; + if (altmap) + pfn += vmem_altmap_offset(altmap); + return pfn; +} + +static unsigned long pfn_end(struct page_map *page_map) { - struct page_map *page_map = res; + const struct resource *res = &page_map->res; + + return (res->start + resource_size(res)) >> PAGE_SHIFT; +} + +#define for_each_device_pfn(pfn, map) \ + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + +static void devm_memremap_pages_release(struct device *dev, void *data) +{ + struct page_map *page_map = data; + struct resource *res = &page_map->res; + resource_size_t align_start, align_size; + struct dev_pagemap *pgmap = &page_map->pgmap; + + if (percpu_ref_tryget_live(pgmap->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(pgmap->ref); + } /* pages are dead and unused, undo the arch mapping */ - arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + arch_remove_memory(align_start, align_size); + pgmap_radix_release(res); + dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, + "%s: failed to free all reserved pages\n", 
__func__); } -void *devm_memremap_pages(struct device *dev, struct resource *res) +/* assumes rcu_read_lock() held at entry */ +struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { - int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); struct page_map *page_map; - int error, nid; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + return page_map ? &page_map->pgmap : NULL; +} + +/** + * devm_memremap_pages - remap and provide memmap backing for the given resource + * @dev: hosting device for @res + * @res: "host memory" address range + * @ref: a live per-cpu reference count + * @altmap: optional descriptor for allocating the memmap from @res + * + * Notes: + * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time + * (or devm release event). + * + * 2/ @res is expected to be a host memory range that could feasibly be + * treated as a "System RAM" range, i.e. not a device mmio range, but + * this is not enforced. + */ +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct percpu_ref *ref, struct vmem_altmap *altmap) +{ + resource_size_t key, align_start, align_size, align_end; + struct dev_pagemap *pgmap; + struct page_map *page_map; + int error, nid, is_ram; + unsigned long pfn; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; + is_ram = region_intersects(align_start, align_size, "System RAM"); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -178,25 +290,124 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); + if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { + dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", + __func__); + return ERR_PTR(-ENXIO); + } + + if (!ref) + return ERR_PTR(-EINVAL); + page_map = devres_alloc_node(devm_memremap_pages_release, sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); + pgmap = &page_map->pgmap; memcpy(&page_map->res, res, sizeof(*res)); + pgmap->dev = dev; + if (altmap) { + memcpy(&page_map->altmap, altmap, sizeof(*altmap)); + pgmap->altmap = &page_map->altmap; + } + pgmap->ref = ref; + pgmap->res = &page_map->res; + + mutex_lock(&pgmap_lock); + error = 0; + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += SECTION_SIZE) { + struct dev_pagemap *dup; + + rcu_read_lock(); + dup = find_dev_pagemap(key); + rcu_read_unlock(); + if (dup) { + dev_err(dev, "%s: %pr collides with mapping for %s\n", + __func__, res, dev_name(dup->dev)); + error = -EBUSY; + break; + } + error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, + page_map); + if (error) { + dev_err(dev, "%s: failed: %d\n", __func__, error); + break; + } + } + mutex_unlock(&pgmap_lock); + if (error) + goto err_radix; + nid = dev_to_node(dev); if (nid < 0) nid = numa_mem_id(); - error = arch_add_memory(nid, res->start, resource_size(res), true); - if (error) { - devres_free(page_map); - return ERR_PTR(error); - } + error = arch_add_memory(nid, align_start, align_size, true); + if (error) + goto err_add_memory; + for_each_device_pfn(pfn, page_map) { + struct page *page = pfn_to_page(pfn); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer. It is a bug if a ZONE_DEVICE page is ever + * freed or placed on a driver-private list. 
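For a caller's view of the new interface (editor's sketch; demo_map() is hypothetical, modeled on how a pmem-style driver uses it, and per the kernel-doc above @ref must already be live):

#include <linux/device.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>

static void *demo_map(struct device *dev, struct resource *res,
		      struct percpu_ref *ref)
{
	/* NULL altmap: the memmap pages come from regular memory */
	return devm_memremap_pages(dev, res, ref, NULL);
}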
Seed the + * storage with LIST_POISON* values. + */ + list_del(&page->lru); + page->pgmap = pgmap; + } devres_add(dev, page_map); return __va(res->start); + + err_add_memory: + err_radix: + pgmap_radix_release(res); + devres_free(page_map); + return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + return altmap->reserve + altmap->free; +} + +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + /* + * 'memmap_start' is the virtual address for the first "struct + * page" in this range of the vmemmap array. In the case of + * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * pointer arithmetic, so we can perform this to_vmem_altmap() + * conversion without concern for the initialization state of + * the struct page fields. + */ + struct page *page = (struct page *) memmap_start; + struct dev_pagemap *pgmap; + + /* + * Uncoditionally retrieve a dev_pagemap associated with the + * given physical address, this is only for use in the + * arch_{add|remove}_memory() for setting up and tearing down + * the memmap. + */ + rcu_read_lock(); + pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + rcu_read_unlock(); + + return pgmap ? pgmap->altmap : NULL; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module.c b/kernel/module.c index 0e5c71195..794ebe8e8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -80,15 +80,6 @@ # define debug_align(X) (X) #endif -/* - * Given BASE and SIZE this macro calculates the number of pages the - * memory regions occupies - */ -#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ - (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ - PFN_DOWN((unsigned long)BASE) + 1) \ - : (0UL)) - /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -108,13 +99,6 @@ static LIST_HEAD(modules); * Use a latched RB-tree for __module_address(); this allows us to use * RCU-sched lookups of the address from any context. * - * Because modules have two address ranges: init and core, we need two - * latch_tree_nodes entries. Therefore we need the back-pointer from - * mod_tree_node. - * - * Because init ranges are short lived we mark them unlikely and have placed - * them outside the critical cacheline in struct module. - * * This is conditional on PERF_EVENTS || TRACING because those can really hit * __module_address() hard by doing a lot of stack unwinding; potentially from * NMI context. 
@@ -122,24 +106,16 @@ static LIST_HEAD(modules); static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->module_init; - - return (unsigned long)mod->module_core; + return (unsigned long)layout->base; } static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; - - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->init_size; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - return (unsigned long)mod->core_size; + return (unsigned long)layout->size; } static __always_inline bool @@ -197,23 +173,23 @@ static void __mod_tree_remove(struct mod_tree_node *node) */ static void mod_tree_insert(struct module *mod) { - mod->mtn_core.mod = mod; - mod->mtn_init.mod = mod; + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; - __mod_tree_insert(&mod->mtn_core); - if (mod->init_size) - __mod_tree_insert(&mod->mtn_init); + __mod_tree_insert(&mod->core_layout.mtn); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn); } static void mod_tree_remove_init(struct module *mod) { - if (mod->init_size) - __mod_tree_remove(&mod->mtn_init); + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn); } static void mod_tree_remove(struct module *mod) { - __mod_tree_remove(&mod->mtn_core); + __mod_tree_remove(&mod->core_layout.mtn); mod_tree_remove_init(mod); } @@ -267,9 +243,9 @@ static void __mod_update_bounds(void *base, unsigned int size) static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->module_core, mod->core_size); - if (mod->init_size) - __mod_update_bounds(mod->module_init, mod->init_size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + if (mod->init_layout.size) + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); } #ifdef CONFIG_KGDB_KDB @@ -1008,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -1217,7 +1195,7 @@ struct module_attribute module_uevent = static ssize_t show_coresize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->core_size); + return sprintf(buffer, "%u\n", mk->mod->core_layout.size); } static struct module_attribute modinfo_coresize = @@ -1226,7 +1204,7 @@ static struct module_attribute modinfo_coresize = static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->init_size); + return sprintf(buffer, "%u\n", mk->mod->init_layout.size); } static struct module_attribute modinfo_initsize = @@ -1876,64 +1854,75 @@ static void mod_sysfs_teardown(struct module *mod) /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. 
+ * + * General layout of module is: + * [text] [read-only-data] [writable data] + * text_size -----^ ^ ^ + * ro_size ------------------------| | + * size -------------------------------------------| + * + * These values are always page-aligned (as is base) */ -void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - unsigned long begin_pfn = PFN_DOWN((unsigned long)start); - unsigned long end_pfn = PFN_DOWN((unsigned long)end); + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base, + layout->text_size >> PAGE_SHIFT); +} - if (end_pfn > begin_pfn) - set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } -static void set_section_ro_nx(void *base, - unsigned long text_size, - unsigned long ro_size, - unsigned long total_size) +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - /* begin and end PFNs of the current subsection */ - unsigned long begin_pfn; - unsigned long end_pfn; + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->size - layout->ro_size) >> PAGE_SHIFT); +} - /* - * Set RO for module text and RO-data: - * - Always protect first page. - * - Do not protect last partial page. - */ - if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_memory_ro); +/* livepatching wants to disable read-only so it can frob module. */ +void module_disable_ro(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); + frob_rodata(&mod->init_layout, set_memory_rw); +} - /* - * Set NX permissions for module data: - * - Do not protect first partial page. - * - Always protect last page. 
- */ - if (total_size > text_size) { - begin_pfn = PFN_UP((unsigned long)base + text_size); - end_pfn = PFN_UP((unsigned long)base + total_size); - if (end_pfn > begin_pfn) - set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); - } +void module_enable_ro(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_ro); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); } -static void unset_module_core_ro_nx(struct module *mod) +static void module_enable_nx(const struct module *mod) { - set_page_attributes(mod->module_core + mod->core_text_size, - mod->module_core + mod->core_size, - set_memory_x); - set_page_attributes(mod->module_core, - mod->module_core + mod->core_ro_size, - set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_nx); + frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); } -static void unset_module_init_ro_nx(struct module *mod) +static void module_disable_nx(const struct module *mod) { - set_page_attributes(mod->module_init + mod->init_text_size, - mod->module_init + mod->init_size, - set_memory_x); - set_page_attributes(mod->module_init, - mod->module_init + mod->init_ro_size, - set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_x); + frob_writable_data(&mod->core_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_x); + frob_writable_data(&mod->init_layout, set_memory_x); } /* Iterate through all modules and set each module's text as RW */ @@ -1945,16 +1934,9 @@ void set_all_modules_text_rw(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_rw); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_rw); - } + + frob_text(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); } mutex_unlock(&module_mutex); } @@ -1968,23 +1950,25 @@ void set_all_modules_text_ro(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_ro); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_ro); - } + + frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); } mutex_unlock(&module_mutex); } + +static void disable_ro_nx(const struct module_layout *layout) +{ + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_rodata(layout, set_memory_x); + frob_writable_data(layout, set_memory_x); +} + #else -static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } -static void unset_module_core_ro_nx(struct module *mod) { } -static void unset_module_init_ro_nx(struct module *mod) { } +static void disable_ro_nx(const struct module_layout *layout) { } +static void module_enable_nx(const struct module *mod) { } +static void module_disable_nx(const struct module *mod) { } #endif void __weak module_memfree(void 
*module_region) @@ -2036,19 +2020,19 @@ static void free_module(struct module *mod) synchronize_sched(); mutex_unlock(&module_mutex); - /* This may be NULL, but that's OK */ - unset_module_init_ro_nx(mod); + /* This may be empty, but that's OK */ + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); - module_memfree(mod->module_init); + module_memfree(mod->init_layout.base); kfree(mod->args); percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - unset_module_core_ro_nx(mod); - module_memfree(mod->module_core); + disable_ro_nx(&mod->core_layout); + module_memfree(mod->core_layout.base); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2251,20 +2235,20 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || strstarts(sname, ".init")) continue; - s->sh_entsize = get_offset(mod, &mod->core_size, s, i); + s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->core_size = debug_align(mod->core_size); - mod->core_text_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_size = debug_align(mod->core_size); - mod->core_ro_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_size = mod->core_layout.size; break; case 3: /* whole core */ - mod->core_size = debug_align(mod->core_size); + mod->core_layout.size = debug_align(mod->core_layout.size); break; } } @@ -2280,21 +2264,21 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !strstarts(sname, ".init")) continue; - s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) + s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->init_size = debug_align(mod->init_size); - mod->init_text_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_size = debug_align(mod->init_size); - mod->init_ro_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.ro_size = mod->init_layout.size; break; case 3: /* whole init */ - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); break; } } @@ -2404,7 +2388,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } if (sym->st_shndx == SHN_UNDEF) return 'U'; - if (sym->st_shndx == SHN_ABS) + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) return 'a'; if (sym->st_shndx >= SHN_LORESERVE) return '?'; @@ -2433,7 +2417,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; @@ -2442,6 +2426,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, || !src->st_name) return false; +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + 
return true; +#endif + sec = sechdrs + src->st_shndx; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL @@ -2469,7 +2458,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, info->index.sym) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + symsect->sh_name); @@ -2479,30 +2468,31 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_size += strtab_size; - mod->core_size = debug_align(mod->core_size); + info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_layout.size += strtab_size; + mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + strsect->sh_name); /* We'll tack temporary mod_kallsyms on the end. */ - mod->init_size = ALIGN(mod->init_size, - __alignof__(struct mod_kallsyms)); - info->mod_kallsyms_init_off = mod->init_size; - mod->init_size += sizeof(struct mod_kallsyms); - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + mod->init_layout.size = debug_align(mod->init_layout.size); } /* @@ -2519,7 +2509,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; /* Set up to point into init section. */ - mod->kallsyms = mod->module_init + info->mod_kallsyms_init_off; + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; mod->kallsyms->symtab = (void *)symsec->sh_addr; mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); @@ -2532,12 +2522,13 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) = elf_type(&mod->kallsyms->symtab[i], info); /* Now populate the cut down core kallsyms for after init. 
*/ - mod->core_kallsyms.symtab = dst = mod->module_core + info->symoffs; - mod->core_kallsyms.strtab = s = mod->module_core + info->stroffs; + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -2983,7 +2974,7 @@ static int move_module(struct module *mod, struct load_info *info) void *ptr; /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc(mod->core_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2993,11 +2984,11 @@ static int move_module(struct module *mod, struct load_info *info) if (!ptr) return -ENOMEM; - memset(ptr, 0, mod->core_size); - mod->module_core = ptr; + memset(ptr, 0, mod->core_layout.size); + mod->core_layout.base = ptr; - if (mod->init_size) { - ptr = module_alloc(mod->init_size); + if (mod->init_layout.size) { + ptr = module_alloc(mod->init_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. This block doesn't need to be @@ -3006,13 +2997,13 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_memfree(mod->module_core); + module_memfree(mod->core_layout.base); return -ENOMEM; } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + memset(ptr, 0, mod->init_layout.size); + mod->init_layout.base = ptr; } else - mod->module_init = NULL; + mod->init_layout.base = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -3024,10 +3015,10 @@ static int move_module(struct module *mod, struct load_info *info) continue; if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->module_init + dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + shdr->sh_entsize; + dest = mod->core_layout.base + shdr->sh_entsize; if (shdr->sh_type != SHT_NOBITS) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); @@ -3089,12 +3080,12 @@ static void flush_module_icache(const struct module *mod) * Do it before processing of module parameters, so the module * can provide parameter accessor functions of its own. 
*/ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); + if (mod->init_layout.base) + flush_icache_range((unsigned long)mod->init_layout.base, + (unsigned long)mod->init_layout.base + + mod->init_layout.size); + flush_icache_range((unsigned long)mod->core_layout.base, + (unsigned long)mod->core_layout.base + mod->core_layout.size); set_fs(old_fs); } @@ -3152,8 +3143,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); - module_memfree(mod->module_core); + module_memfree(mod->init_layout.base); + module_memfree(mod->core_layout.base); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3240,7 +3231,7 @@ static noinline int do_init_module(struct module *mod) ret = -ENOMEM; goto fail; } - freeinit->module_init = mod->module_init; + freeinit->module_init = mod->init_layout.base; /* * We want to find out whether @mod uses async during init. Clear @@ -3297,12 +3288,12 @@ static noinline int do_init_module(struct module *mod) rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); - unset_module_init_ro_nx(mod); + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; + mod->init_layout.base = NULL; + mod->init_layout.size = 0; + mod->init_layout.ro_size = 0; + mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we @@ -3324,6 +3315,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3391,23 +3383,16 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); + /* Set RO and NX regions */ + module_enable_ro(mod); + module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. 
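The notifier chain fired a few lines below is how other subsystems observe these state transitions; a minimal consumer sketch (demo_* names are hypothetical, register_module_notifier() is the real API):

#include <linux/module.h>
#include <linux/notifier.h>

static int demo_module_cb(struct notifier_block *nb, unsigned long state,
			  void *data)
{
	struct module *mod = data;

	if (state == MODULE_STATE_COMING)	/* fully formed, init about to run */
		pr_info("%s: coming\n", mod->name);
	else if (state == MODULE_STATE_GOING)	/* init failed or being unloaded */
		pr_info("%s: going\n", mod->name);
	return NOTIFY_OK;
}

static struct notifier_block demo_module_nb = {
	.notifier_call = demo_module_cb,
};

/* from some subsystem init: register_module_notifier(&demo_module_nb); */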
*/ mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); + ftrace_module_enable(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; @@ -3566,8 +3551,8 @@ static int load_module(struct load_info *info, const char __user *uargs, MODULE_STATE_GOING, mod); /* we can't deallocate the module until we clear memory protection */ - unset_module_init_ro_nx(mod); - unset_module_core_ro_nx(mod); + module_disable_ro(mod); + module_disable_nx(mod); ddebug_cleanup: dynamic_debug_remove(info->debug); @@ -3596,7 +3581,7 @@ static int load_module(struct load_info *info, const char __user *uargs, */ ftrace_release_mod(mod); /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); module_deallocate(mod, info); free_copy: @@ -3680,9 +3665,9 @@ static const char *get_ksymbol(struct module *mod, /* At worst, next value is at end of module */ if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->module_init+mod->init_text_size; + nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; else - nextval = (unsigned long)mod->module_core+mod->core_text_size; + nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ @@ -3935,7 +3920,7 @@ static int m_show(struct seq_file *m, void *p) return 0; seq_printf(m, "%s %u", - mod->name, mod->init_size + mod->core_size); + mod->name, mod->init_layout.size + mod->core_layout.size); print_unload_info(m, mod); /* Informative for users. */ @@ -3944,7 +3929,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->module_core); + seq_printf(m, " 0x%pK", mod->core_layout.base); /* Taints info */ if (mod->taints) @@ -4087,8 +4072,8 @@ struct module *__module_text_address(unsigned long addr) struct module *mod = __module_address(addr); if (mod) { /* Make sure it's within the text section. */ - if (!within(addr, mod->module_init, mod->init_text_size) - && !within(addr, mod->module_core, mod->core_text_size)) + if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) + && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) mod = NULL; } return mod; } diff --git a/kernel/panic.c b/kernel/panic.c index 41e2b54f3..d96469de7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -61,6 +61,17 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +/* + * Stop ourselves in NMI context if another CPU has already panicked. Arch code + * may override this to prepare for crash dumping, e.g. save regs info. + */ +void __weak nmi_panic_self_stop(struct pt_regs *regs) +{ + panic_smp_self_stop(); +} + +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); + /** * panic - halt the system * @fmt: The text string to print * @@ -71,17 +82,17 @@ void __weak panic_smp_self_stop(void) */ void panic(const char *fmt, ...) { - static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; int state = 0; + int old_cpu, this_cpu; /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since * there is nothing to prevent an interrupt handler (that runs - * after the panic_lock is acquired) from invoking panic again.
+ * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); @@ -94,8 +105,16 @@ void panic(const char *fmt, ...) * multiple parallel invocations of panic, all other CPUs either * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). + * + * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which + * comes here, so go ahead. + * `old_cpu == this_cpu' means we came from nmi_panic() which sets + * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - if (!spin_trylock(&panic_lock)) + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); console_verbose(); @@ -117,9 +136,11 @@ void panic(const char *fmt, ...) * everything else. * If we want to run this after calling panic_notifiers, pass * the "crash_kexec_post_notifiers" option to the kernel. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -142,9 +163,11 @@ void panic(const char *fmt, ...) * panic_notifiers and dumping kmsg before kdump. * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); bust_spinlocks(0); diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80..4d73a834c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -588,7 +588,7 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - /* Veryify no one has done anything silly */ + /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); /* bump default and minimum pid_max based on number of cpus */ @@ -604,5 +604,5 @@ void __init pidmap_init(void) atomic_dec(&init_pid_ns.pidmap[0].nr_free); init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9e2ee0cb1..68d3ebc12 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -101,284 +101,6 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. -menuconfig TOI_CORE - bool "Enhanced Hibernation (TuxOnIce)" - depends on HIBERNATION - default y - ---help--- - TuxOnIce is the 'new and improved' suspend support. - - See the TuxOnIce home page (tuxonice.net) - for FAQs, HOWTOs and other documentation. - - comment "Image Storage (you need at least one allocator)" - depends on TOI_CORE - - config TOI_FILE - bool "File Allocator" - depends on TOI_CORE - default y - ---help--- - This option enables support for storing an image in a - simple file. You might want this if your swap is - sometimes full enough that you don't have enough spare - space to store an image. - - config TOI_SWAP - bool "Swap Allocator" - depends on TOI_CORE && SWAP - default y - ---help--- - This option enables support for storing an image in your - swap space. - - comment "General Options" - depends on TOI_CORE - - config TOI_PRUNE - bool "Image pruning support" - depends on TOI_CORE && CRYPTO && BROKEN - default y - ---help--- - This option adds support for using cryptoapi hashing - algorithms to identify pages with the same content. 
We - then write a much smaller pointer to the first copy of - the data instead of a complete (perhaps compressed) - additional copy. - - You probably want this, so say Y here. - - comment "No image pruning support available without Cryptoapi support." - depends on TOI_CORE && !CRYPTO - - config TOI_CRYPTO - bool "Compression support" - depends on TOI_CORE && CRYPTO - default y - ---help--- - This option adds support for using cryptoapi compression - algorithms. Compression is particularly useful as it can - more than double your suspend and resume speed (depending - upon how well your image compresses). - - You probably want this, so say Y here. - - comment "No compression support available without Cryptoapi support." - depends on TOI_CORE && !CRYPTO - - config TOI_USERUI - bool "Userspace User Interface support" - depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE) - default y - ---help--- - This option enabled support for a userspace based user interface - to TuxOnIce, which allows you to have a nice display while suspending - and resuming, and also enables features such as pressing escape to - cancel a cycle or interactive debugging. - - config TOI_USERUI_DEFAULT_PATH - string "Default userui program location" - default "/usr/local/sbin/tuxoniceui_text" - depends on TOI_USERUI - ---help--- - This entry allows you to specify a default path to the userui binary. - - config TOI_DEFAULT_IMAGE_SIZE_LIMIT - int "Default image size limit" - range -2 65536 - default "-2" - depends on TOI_CORE - ---help--- - This entry allows you to specify a default image size limit. It can - be overridden at run-time using /sys/power/tuxonice/image_size_limit. - - config TOI_KEEP_IMAGE - bool "Allow Keep Image Mode" - depends on TOI_CORE - ---help--- - This option allows you to keep and image and reuse it. It is intended - __ONLY__ for use with systems where all filesystems are mounted read- - only (kiosks, for example). To use it, compile this option in and boot - normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend. - When you resume, the image will not be removed. You will be unable to turn - off swap partitions (assuming you are using the swap allocator), but future - suspends simply do a power-down. The image can be updated using the - kernel command line parameter suspend_act= to turn off the keep image - bit. Keep image mode is a little less user friendly on purpose - it - should not be used without thought! - - config TOI_INCREMENTAL - bool "Incremental Image Support" - depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE - default n - ---help--- - This option enables the work in progress toward using the dirty page - tracking to record changes to pages. It is hoped that - this will be an initial step toward implementing storing just - the differences between consecutive images, which will - increase the amount of storage needed for the image, but also - increase the speed at which writing an image occurs and - reduce the wear and tear on drives. - - At the moment, all that is implemented is the first step of keeping - an existing image and then comparing it to the contents in memory - (by setting /sys/power/tuxonice/verify_image to 1 and triggering a - (fake) resume) to see what the page change tracking should find to be - different. If you have verify_image set to 1, TuxOnIce will automatically - invalidate the old image when you next try to hibernate, so there's no - greater chance of disk corruption than normal. 
- - comment "No incremental image support available without Keep Image support." - depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT - - config TOI_REPLACE_SWSUSP - bool "Replace swsusp by default" - default y - depends on TOI_CORE - ---help--- - TuxOnIce can replace swsusp. This option makes that the default state, - requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want - to use the vanilla kernel functionality. Note that your initrd/ramfs will - need to do this before trying to resume, too. - With overriding swsusp enabled, echoing disk to /sys/power/state will - start a TuxOnIce cycle. If resume= doesn't specify an allocator and both - the swap and file allocators are compiled in, the swap allocator will be - used by default. - - config TOI_IGNORE_LATE_INITCALL - bool "Wait for initrd/ramfs to run, by default" - default n - depends on TOI_CORE - ---help--- - When booting, TuxOnIce can check for an image and start to resume prior - to any initrd/ramfs running (via a late initcall). - - If you don't have an initrd/ramfs, this is what you want to happen - - otherwise you won't be able to safely resume. You should set this option - to 'No'. - - If, however, you want your initrd/ramfs to run anyway before resuming, - you need to tell TuxOnIce to ignore that earlier opportunity to resume. - This can be done either by using this compile time option, or by - overriding this option with the boot-time parameter toi_initramfs_resume_only=1. - - Note that if TuxOnIce can't resume at the earlier opportunity, the - value of this option won't matter - the initramfs/initrd (if any) will - run anyway. - - menuconfig TOI_CLUSTER - bool "Cluster support" - default n - depends on TOI_CORE && NET && BROKEN - ---help--- - Support for linking multiple machines in a cluster so that they suspend - and resume together. - - config TOI_DEFAULT_CLUSTER_INTERFACE - string "Default cluster interface" - depends on TOI_CLUSTER - ---help--- - The default interface on which to communicate with other nodes in - the cluster. - - If no value is set here, cluster support will be disabled by default. - - config TOI_DEFAULT_CLUSTER_KEY - string "Default cluster key" - default "Default" - depends on TOI_CLUSTER - ---help--- - The default key used by this node. All nodes in the same cluster - have the same key. Multiple clusters may coexist on the same lan - by using different values for this key. - - config TOI_CLUSTER_IMAGE_TIMEOUT - int "Timeout when checking for image" - default 15 - depends on TOI_CLUSTER - ---help--- - Timeout (seconds) before continuing to boot when waiting to see - whether other nodes might have an image. Set to -1 to wait - indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue - booting sooner than this timeout. - - config TOI_CLUSTER_WAIT_UNTIL_NODES - int "Nodes without image before continuing" - default 0 - depends on TOI_CLUSTER - ---help--- - When booting and no image is found, we wait to see if other nodes - have an image before continuing to boot. This value lets us - continue after seeing a certain number of nodes without an image, - instead of continuing to wait for the timeout. Set to 0 to only - use the timeout. - - config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE - string "Default pre-hibernate script" - depends on TOI_CLUSTER - ---help--- - The default script to be called when starting to hibernate. 
- - config TOI_DEFAULT_CLUSTER_POST_HIBERNATE - string "Default post-hibernate script" - depends on TOI_CLUSTER - ---help--- - The default script to be called after resuming from hibernation. - - config TOI_DEFAULT_WAIT - int "Default waiting time for emergency boot messages" - default "25" - range -1 32768 - depends on TOI_CORE - help - TuxOnIce can display warnings very early in the process of resuming, - if (for example) it appears that you have booted a kernel that doesn't - match an image on disk. It can then give you the opportunity to either - continue booting that kernel, or reboot the machine. This option can be - used to control how long to wait in such circumstances. -1 means wait - forever. 0 means don't wait at all (do the default action, which will - generally be to continue booting and remove the image). Values of 1 or - more indicate a number of seconds (up to 255) to wait before doing the - default. - - config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE - int "Default extra pages allowance" - default "2000" - range 500 32768 - depends on TOI_CORE - help - This value controls the default for the allowance TuxOnIce makes for - drivers to allocate extra memory during the atomic copy. The default - value of 2000 will be okay in most cases. If you are using - DRI, the easiest way to find what value to use is to try to hibernate - and look at how many pages were actually needed in the sysfs entry - /sys/power/tuxonice/debug_info (first number on the last line), adding - a little extra because the value is not always the same. - - config TOI_CHECKSUM - bool "Checksum pageset2" - default n - depends on TOI_CORE - select CRYPTO - select CRYPTO_ALGAPI - select CRYPTO_MD4 - ---help--- - Adds support for checksumming pageset2 pages, to ensure you really get an - atomic copy. Since some filesystems (XFS especially) change metadata even - when there's no other activity, we need this to check for pages that have - been changed while we were saving the page cache. If your debugging output - always says no pages were resaved, you may be able to safely disable this - option. - -config TOI - bool - depends on TOI_CORE!=n - default y - -config TOI_ZRAM_SUPPORT - def_bool y - depends on TOI && ZRAM!=n - config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS @@ -513,7 +235,7 @@ config PM_TRACE_RTC config APM_EMULATION tristate "Advanced Power Management Emulation" - depends on PM && SYS_SUPPORTS_APM_EMULATION + depends on SYS_SUPPORTS_APM_EMULATION help APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 82c4795e8..cb880a14c 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,38 +1,6 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -tuxonice_core-y := tuxonice_modules.o - -obj-$(CONFIG_TOI) += tuxonice_builtin.o -obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o \ - tuxonice_copy_before_write.o - -tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o - -# Compile these in after allocation debugging, if used. 
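The TOI_CHECKSUM option above selects CRYPTO_MD4 and checksums pageset2 pages so that pages modified during the save (XFS metadata being the cited culprit) can be detected and resaved. As a hedged sketch of how a single page could be digested with the kernel crypto API of this era (illustrative only; checksum_page() is an invented name, not the TuxOnIce code):

#include <crypto/hash.h>
#include <linux/highmem.h>
#include <linux/err.h>

/*
 * Hypothetical sketch: MD4-digest one page so that a later pass can
 * detect modification. @out must hold 16 bytes (the MD4 digest size).
 */
static int checksum_page(struct page *page, u8 *out)
{
	struct crypto_shash *tfm = crypto_alloc_shash("md4", 0, 0);
	void *vaddr;
	int ret;

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = 0;	/* field present in 4.5-era kernels */
		vaddr = kmap(page);	/* process context, so kmap() is fine */
		ret = crypto_shash_digest(desc, vaddr, PAGE_SIZE, out);
		kunmap(page);
	}

	crypto_free_shash(tfm);
	return ret;
}

A page whose digest differs between the save pass and the atomic copy would need to be resaved, which is what the "no pages were resaved" debug output mentioned in the help text reports on.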
- -tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \ - tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \ - tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \ - tuxonice_power_off.o tuxonice_atomic_copy.o - -tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o - -tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o - -obj-$(CONFIG_TOI_CORE) += tuxonice_core.o -obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o -obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o - -tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \ - tuxonice_bio_signature.o - -obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o -obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o -obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o - -obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o - obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 153e51db5..b7342a24f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -31,7 +31,7 @@ #include <linux/ktime.h> #include <trace/events/power.h> -#include "tuxonice.h" +#include "power.h" static int nocompress; @@ -39,7 +39,7 @@ static int noresume; static int nohibernate; static int resume_wait; static unsigned int resume_delay; -char resume_file[256] = CONFIG_PM_STD_PARTITION; +static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; __visible int in_suspend __nosavedata; @@ -123,7 +123,7 @@ static int hibernation_test(int level) { return 0; } * platform_begin - Call platform to start hibernation. * @platform_mode: Whether or not to use the platform driver. */ -int platform_begin(int platform_mode) +static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->begin() : 0; @@ -133,7 +133,7 @@ int platform_begin(int platform_mode) * platform_end - Call platform to finish transition to the working state. * @platform_mode: Whether or not to use the platform driver. */ -void platform_end(int platform_mode) +static void platform_end(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->end(); @@ -147,7 +147,7 @@ void platform_end(int platform_mode) * if so configured, and return an error code if that fails. */ -int platform_pre_snapshot(int platform_mode) +static int platform_pre_snapshot(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_snapshot() : 0; @@ -162,7 +162,7 @@ int platform_pre_snapshot(int platform_mode) * * This routine is called on one CPU with interrupts disabled. */ -void platform_leave(int platform_mode) +static void platform_leave(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->leave(); @@ -177,7 +177,7 @@ void platform_leave(int platform_mode) * * This routine must be called after platform_prepare(). */ -void platform_finish(int platform_mode) +static void platform_finish(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->finish(); @@ -193,7 +193,7 @@ void platform_finish(int platform_mode) * If the restore fails after this function has been called, * platform_restore_cleanup() must be called. */ -int platform_pre_restore(int platform_mode) +static int platform_pre_restore(int platform_mode) { return (platform_mode && hibernation_ops) ? 
hibernation_ops->pre_restore() : 0; @@ -210,7 +210,7 @@ int platform_pre_restore(int platform_mode) * function must be called too, regardless of the result of * platform_pre_restore(). */ -void platform_restore_cleanup(int platform_mode) +static void platform_restore_cleanup(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->restore_cleanup(); @@ -220,7 +220,7 @@ void platform_restore_cleanup(int platform_mode) * platform_recover - Recover from a failure to suspend devices. * @platform_mode: Whether or not to use the platform driver. */ -void platform_recover(int platform_mode) +static void platform_recover(int platform_mode) { if (platform_mode && hibernation_ops && hibernation_ops->recover) hibernation_ops->recover(); @@ -648,9 +648,6 @@ int hibernate(void) { int error; - if (test_action_state(TOI_REPLACE_SWSUSP)) - return try_tuxonice_hibernate(); - if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); return -EPERM; @@ -740,19 +737,11 @@ int hibernate(void) * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ -int software_resume(void) +static int software_resume(void) { int error; unsigned int flags; - resume_attempted = 1; - - /* - * We can't know (until an image header - if any - is loaded), whether - * we did override swsusp. We therefore ensure that both are tried. - */ - try_tuxonice_resume(); - /* * If the user said "noresume".. bail out early. */ @@ -1139,7 +1128,6 @@ static int __init hibernate_setup(char *str) static int __init noresume_setup(char *str) { noresume = 1; - set_toi_state(TOI_NORESUME_SPECIFIED); return 1; } diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999..27946975e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index b5c9efb36..efe1b3b17 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -36,12 +36,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) return arch_hibernation_header_restore(info) ? "architecture specific data" : NULL; } -#else -extern char *check_image_kernel(struct swsusp_info *info); #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ -extern int init_header(struct swsusp_info *info); -extern char resume_file[256]; /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] 
@@ -81,7 +77,14 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } -extern struct pbe *restore_pblist; +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; @@ -266,31 +269,6 @@ static inline void suspend_thaw_processes(void) } #endif -extern struct page *saveable_page(struct zone *z, unsigned long p); -#ifdef CONFIG_HIGHMEM -struct page *saveable_highmem_page(struct zone *z, unsigned long p); -#else -static -inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} -#endif - -#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe)) -extern struct list_head nosave_regions; - -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. - */ - -struct nosave_region { - struct list_head list; - unsigned long start_pfn; - unsigned long end_pfn; -}; - #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ @@ -317,10 +295,3 @@ extern int pm_wake_lock(const char *buf); extern int pm_wake_unlock(const char *buf); #endif /* !CONFIG_PM_WAKELOCKS */ - -#ifdef CONFIG_TOI -unsigned long toi_get_nonconflicting_page(void); -#define BM_END_OF_MAP (~0UL) -#else -#define toi_get_nonconflicting_page() (0) -#endif diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 542163a01..3a9706043 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -36,9 +36,6 @@ #include <asm/tlbflush.h> #include <asm/io.h> -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" #include "power.h" static int swsusp_page_is_free(struct page *); @@ -101,9 +98,6 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; - if (toi_running) - return (void *) toi_get_nonconflicting_page(); - res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) while (res && swsusp_page_is_free(virt_to_page(res))) { @@ -149,11 +143,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free) page = virt_to_page(addr); - if (toi_running) { - toi__free_page(29, page); - return; - } - swsusp_unset_page_forbidden(page); if (clear_nosave_free) swsusp_unset_page_free(page); @@ -313,15 +302,13 @@ struct bm_position { int node_bit; }; -#define BM_POSITION_SLOTS (NR_CPUS * 2) - struct memory_bitmap { struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects */ - struct bm_position cur[BM_POSITION_SLOTS]; /* most recently used bit position */ + struct bm_position cur; /* most recently used bit position */ }; /* Functions that operate on memory bitmaps */ @@ -486,39 +473,16 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, free_image_page(node->data, clear_nosave_free); } -void memory_bm_position_reset(struct memory_bitmap *bm) +static void memory_bm_position_reset(struct memory_bitmap *bm) { - int index; - - for (index = 0; index < BM_POSITION_SLOTS; index++) { - bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, list); - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, + bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; - } + bm->cur.node_pfn = 
0; + bm->cur.node_bit = 0; } -static void memory_bm_clear_current(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); - -/** - * memory_bm_clear - * @param bm - The bitmap to clear - * - * Only run while single threaded - locking not needed - */ -void memory_bm_clear(struct memory_bitmap *bm) -{ - memory_bm_position_reset(bm); - - while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) { - memory_bm_clear_current(bm, 0); - } - - memory_bm_position_reset(bm); -} static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); struct mem_extent { @@ -631,8 +595,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } bm->p_list = ca.chain; - - memory_bm_position_reset(bm); + memory_bm_position_reset(bm); Exit: free_mem_extents(&mem_extents); return error; @@ -668,24 +631,14 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * It walks the radix tree to find the page which contains the bit for * pfn and returns the bit position in **addr and *bit_nr. */ -int memory_bm_find_bit(struct memory_bitmap *bm, int index, - unsigned long pfn, void **addr, unsigned int *bit_nr) +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) { struct mem_zone_bm_rtree *curr, *zone; struct rtree_node *node; int i, block_nr; - if (!bm->cur[index].zone) { - // Reset - bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, - list); - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, - struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; - } - - zone = bm->cur[index].zone; + zone = bm->cur.zone; if (pfn >= zone->start_pfn && pfn < zone->end_pfn) goto zone_found; @@ -709,8 +662,8 @@ zone_found: * node for our pfn. 
*/ - node = bm->cur[index].node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn) + node = bm->cur.node; + if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; node = zone->rtree; @@ -727,9 +680,9 @@ zone_found: node_found: /* Update last position */ - bm->cur[index].zone = zone; - bm->cur[index].node = node; - bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; /* Set return values */ *addr = node->data; @@ -738,66 +691,66 @@ node_found: return 0; } -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); set_bit(bit, addr); } -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn) +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); return error; } -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); clear_bit(bit, addr); } -static void memory_bm_clear_current(struct memory_bitmap *bm, int index) +static void memory_bm_clear_current(struct memory_bitmap *bm) { int bit; - bit = max(bm->cur[index].node_bit - 1, 0); - clear_bit(bit, bm->cur[index].node->data); + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); } -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); return test_bit(bit, addr); } -static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn) +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; - return !memory_bm_find_bit(bm, index, pfn, &addr, &bit); + return !memory_bm_find_bit(bm, pfn, &addr, &bit); } /* @@ -810,25 +763,25 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned * * Returns true if there is a next node, false otherwise. 
*/ -static bool rtree_next_node(struct memory_bitmap *bm, int index) +static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur[index].node = list_entry(bm->cur[index].node->list.next, + bm->cur.node = list_entry(bm->cur.node->list.next, struct rtree_node, list); - if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) { - bm->cur[index].node_pfn += BM_BITS_PER_BLOCK; - bm->cur[index].node_bit = 0; + if (&bm->cur.node->list != &bm->cur.zone->leaves) { + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; touch_softlockup_watchdog(); return true; } /* No more nodes, goto next zone */ - bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next, + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur[index].zone->list != &bm->zones) { - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, + if (&bm->cur.zone->list != &bm->zones) { + bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; return true; } @@ -846,29 +799,38 @@ static bool rtree_next_node(struct memory_bitmap *bm, int index) * It is required to run memory_bm_position_reset() before the * first call to this function. */ -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index) +static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { unsigned long bits, pfn, pages; int bit; - index += NR_CPUS; /* Iteration state is separated from get/set/test */ - do { - pages = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn; - bits = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK); - bit = find_next_bit(bm->cur[index].node->data, bits, - bm->cur[index].node_bit); + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); if (bit < bits) { - pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit; - bm->cur[index].node_bit = bit + 1; + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; return pfn; } - } while (rtree_next_node(bm, index)); + } while (rtree_next_node(bm)); return BM_END_OF_MAP; } -LIST_HEAD(nosave_regions); +/** + * This structure represents a range of page frames the contents of which + * should not be saved during the suspend. + */ + +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static LIST_HEAD(nosave_regions); /** * register_nosave_region - register a range of page frames the contents @@ -927,37 +889,37 @@ static struct memory_bitmap *free_pages_map; void swsusp_set_page_free(struct page *page) { if (free_pages_map) - memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page)); + memory_bm_set_bit(free_pages_map, page_to_pfn(page)); } static int swsusp_page_is_free(struct page *page) { return free_pages_map ? 
- memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0; + memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; } void swsusp_unset_page_free(struct page *page) { if (free_pages_map) - memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page)); + memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); } static void swsusp_set_page_forbidden(struct page *page) { if (forbidden_pages_map) - memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page)); + memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); } int swsusp_page_is_forbidden(struct page *page) { return forbidden_pages_map ? - memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0; + memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; } static void swsusp_unset_page_forbidden(struct page *page) { if (forbidden_pages_map) - memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page)); + memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); } /** @@ -988,7 +950,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) * touch the PFNs for which the error is * returned anyway. */ - mem_bm_set_bit_check(bm, 0, pfn); + mem_bm_set_bit_check(bm, pfn); } } } @@ -1116,7 +1078,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ -struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -1163,6 +1125,11 @@ static unsigned int count_highmem_pages(void) } return n; } +#else +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -1173,7 +1140,7 @@ static unsigned int count_highmem_pages(void) * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ -struct page *saveable_page(struct zone *zone, unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -1311,15 +1278,15 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) - memory_bm_set_bit(orig_bm, 0, pfn); + memory_bm_set_bit(orig_bm, pfn); } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); for(;;) { - pfn = memory_bm_next_pfn(orig_bm, 0); + pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; - copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn); + copy_data_page(memory_bm_next_pfn(copy_bm), pfn); } } @@ -1365,8 +1332,8 @@ void swsusp_free(void) memory_bm_position_reset(free_pages_map); loop: - fr_pfn = memory_bm_next_pfn(free_pages_map, 0); - fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0); + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); /* * Find the next bit set in both bitmaps. 
This is guaranteed to @@ -1374,16 +1341,16 @@ loop: */ do { if (fb_pfn < fr_pfn) - fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); if (fr_pfn < fb_pfn) - fr_pfn = memory_bm_next_pfn(free_pages_map, 0); + fr_pfn = memory_bm_next_pfn(free_pages_map); } while (fb_pfn != fr_pfn); if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { struct page *page = pfn_to_page(fr_pfn); - memory_bm_clear_current(forbidden_pages_map, 0); - memory_bm_clear_current(free_pages_map, 0); + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); __free_page(page); goto loop; } @@ -1418,7 +1385,7 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) page = alloc_image_page(mask); if (!page) break; - memory_bm_set_bit(©_bm, 0, page_to_pfn(page)); + memory_bm_set_bit(©_bm, page_to_pfn(page)); if (PageHighMem(page)) alloc_highmem++; else @@ -1514,7 +1481,7 @@ static unsigned long free_unnecessary_pages(void) memory_bm_position_reset(©_bm); while (to_free_normal > 0 || to_free_highmem > 0) { - unsigned long pfn = memory_bm_next_pfn(©_bm, 0); + unsigned long pfn = memory_bm_next_pfn(©_bm); struct page *page = pfn_to_page(pfn); if (PageHighMem(page)) { @@ -1528,7 +1495,7 @@ static unsigned long free_unnecessary_pages(void) to_free_normal--; alloc_normal--; } - memory_bm_clear_bit(©_bm, 0, pfn); + memory_bm_clear_bit(©_bm, pfn); swsusp_unset_page_forbidden(page); swsusp_unset_page_free(page); __free_page(page); @@ -1813,7 +1780,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) struct page *page; page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); - memory_bm_set_bit(bm, 0, page_to_pfn(page)); + memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; } @@ -1856,7 +1823,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); if (!page) goto err_out; - memory_bm_set_bit(copy_bm, 0, page_to_pfn(page)); + memory_bm_set_bit(copy_bm, page_to_pfn(page)); } } @@ -1871,9 +1838,6 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - if (toi_running) - return toi_post_context_save(); - printk(KERN_INFO "PM: Creating hibernation image:\n"); drain_local_pages(NULL); @@ -1921,7 +1885,7 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -char *check_image_kernel(struct swsusp_info *info) +static char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -1942,7 +1906,7 @@ unsigned long snapshot_get_image_size(void) return nr_copy_pages + nr_meta_pages + 1; } -int init_header(struct swsusp_info *info) +static int init_header(struct swsusp_info *info) { memset(info, 0, sizeof(struct swsusp_info)); info->num_physpages = get_num_physpages(); @@ -1964,7 +1928,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - buf[j] = memory_bm_next_pfn(bm, 0); + buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; /* Save page key for data page (s390 only). 
*/ @@ -2015,7 +1979,7 @@ int snapshot_read_next(struct snapshot_handle *handle) } else { struct page *page; - page = pfn_to_page(memory_bm_next_pfn(©_bm, 0)); + page = pfn_to_page(memory_bm_next_pfn(©_bm)); if (PageHighMem(page)) { /* Highmem pages are copied to the buffer, * because we can't return with a kmapped @@ -2057,7 +2021,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) /* Mark pages that correspond to the "original" pfns as "unsafe" */ memory_bm_position_reset(bm); do { - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { if (likely(pfn_valid(pfn))) swsusp_set_page_free(pfn_to_page(pfn)); @@ -2077,10 +2041,10 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) unsigned long pfn; memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src, 0); + pfn = memory_bm_next_pfn(src); while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, 0, pfn); - pfn = memory_bm_next_pfn(src, 0); + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); } } @@ -2131,8 +2095,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) /* Extract and buffer page key for data page (s390 only). */ page_key_memorize(buf + j); - if (memory_bm_pfn_present(bm, 0, buf[j])) - memory_bm_set_bit(bm, 0, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); else return -EFAULT; } @@ -2175,12 +2139,12 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) unsigned int cnt = 0; memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); while (pfn != BM_END_OF_MAP) { if (PageHighMem(pfn_to_page(pfn))) cnt++; - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); } return cnt; } @@ -2225,7 +2189,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) page = alloc_page(__GFP_HIGHMEM); if (!swsusp_page_is_free(page)) { /* The page is "safe", set its bit the bitmap */ - memory_bm_set_bit(bm, 0, page_to_pfn(page)); + memory_bm_set_bit(bm, page_to_pfn(page)); safe_highmem_pages++; } /* Mark the page as allocated */ @@ -2283,7 +2247,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) /* Copy of the page will be stored in high memory */ kaddr = buffer; - tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0)); + tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); safe_highmem_pages--; last_highmem_page = tmp; pbe->copy_page = tmp; @@ -2454,7 +2418,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; struct page *page; - unsigned long pfn = memory_bm_next_pfn(bm, 0); + unsigned long pfn = memory_bm_next_pfn(bm); if (pfn == BM_END_OF_MAP) return ERR_PTR(-EFAULT); @@ -2641,82 +2605,3 @@ int restore_highmem(void) return 0; } #endif /* CONFIG_HIGHMEM */ - -struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map, - *pageset1_copy_map, *io_map, *page_resave_map, *compare_map; - -int resume_attempted; - -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) -{ - int result; - - memory_bm_position_reset(bm); - - do { - result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE); - - if (result) - return result; - } while (rtree_next_node(bm, 0)); - return 0; -} - -int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) -{ - int result; 
- - memory_bm_position_reset(bm); - - do { - result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE); - - if (result) - return result; - - } while (rtree_next_node(bm, 0)); - return 0; -} - -int memory_bm_space_needed(struct memory_bitmap *bm) -{ - unsigned long bytes = 0; - - memory_bm_position_reset(bm); - do { - bytes += PAGE_SIZE; - } while (rtree_next_node(bm, 0)); - return bytes; -} - -int toi_alloc_bitmap(struct memory_bitmap **bm) -{ - int error; - struct memory_bitmap *bm1; - - bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); - if (!bm1) - return -ENOMEM; - - error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); - if (error) { - printk("Error returned - %d.\n", error); - kfree(bm1); - return -ENOMEM; - } - - *bm = bm1; - return 0; -} - -void toi_free_bitmap(struct memory_bitmap **bm) -{ - if (!*bm) - return; - - memory_bm_free(*bm, 0); - kfree(*bm); - *bm = NULL; -} diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h deleted file mode 100644 index 10b65633f..000000000 --- a/kernel/power/tuxonice.h +++ /dev/null @@ -1,260 +0,0 @@ -/* - * kernel/power/tuxonice.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations used throughout swsusp. - * - */ - -#ifndef KERNEL_POWER_TOI_H -#define KERNEL_POWER_TOI_H - -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/suspend.h> -#include <linux/fs.h> -#include <asm/setup.h> -#include "tuxonice_pageflags.h" -#include "power.h" - -#define TOI_CORE_VERSION "3.3" -#define TOI_HEADER_VERSION 3 -#define MY_BOOT_KERNEL_DATA_VERSION 4 - -struct toi_boot_kernel_data { - int version; - int size; - unsigned long toi_action; - unsigned long toi_debug_state; - u32 toi_default_console_level; - int toi_io_time[2][2]; - char toi_nosave_commandline[COMMAND_LINE_SIZE]; - unsigned long pages_used[33]; - unsigned long incremental_bytes_in; - unsigned long incremental_bytes_out; - unsigned long compress_bytes_in; - unsigned long compress_bytes_out; - unsigned long pruned_pages; -}; - -extern struct toi_boot_kernel_data toi_bkd; - -/* Location of book kernel data struct in kernel being resumed */ -extern unsigned long boot_kernel_data_buffer; - -/* == Action states == */ - -enum { - TOI_REBOOT, - TOI_PAUSE, - TOI_LOGALL, - TOI_CAN_CANCEL, - TOI_KEEP_IMAGE, - TOI_FREEZER_TEST, - TOI_SINGLESTEP, - TOI_PAUSE_NEAR_PAGESET_END, - TOI_TEST_FILTER_SPEED, - TOI_TEST_BIO, - TOI_NO_PAGESET2, - TOI_IGNORE_ROOTFS, - TOI_REPLACE_SWSUSP, - TOI_PAGESET2_FULL, - TOI_ABORT_ON_RESAVE_NEEDED, - TOI_NO_MULTITHREADED_IO, - TOI_NO_DIRECT_LOAD, /* Obsolete */ - TOI_LATE_CPU_HOTPLUG, /* Obsolete */ - TOI_GET_MAX_MEM_ALLOCD, - TOI_NO_FLUSHER_THREAD, - TOI_NO_PS2_IF_UNNEEDED, - TOI_POST_RESUME_BREAKPOINT, - TOI_NO_READAHEAD, - TOI_TRACE_DEBUG_ON, - TOI_INCREMENTAL_IMAGE, -}; - -extern unsigned long toi_bootflags_mask; - -#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action)) - -/* == Result states == */ - -enum { - TOI_ABORTED, - TOI_ABORT_REQUESTED, - TOI_NOSTORAGE_AVAILABLE, - TOI_INSUFFICIENT_STORAGE, - TOI_FREEZING_FAILED, - TOI_KEPT_IMAGE, - TOI_WOULD_EAT_MEMORY, - TOI_UNABLE_TO_FREE_ENOUGH_MEMORY, - TOI_PM_SEM, - TOI_DEVICE_REFUSED, - TOI_SYSDEV_REFUSED, - TOI_EXTRA_PAGES_ALLOW_TOO_SMALL, - TOI_UNABLE_TO_PREPARE_IMAGE, - TOI_FAILED_MODULE_INIT, - TOI_FAILED_MODULE_CLEANUP, - TOI_FAILED_IO, - TOI_OUT_OF_MEMORY, - TOI_IMAGE_ERROR, - TOI_PLATFORM_PREP_FAILED, - TOI_CPU_HOTPLUG_FAILED, - 
TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */ - TOI_RESAVE_NEEDED, - TOI_CANT_SUSPEND, - TOI_NOTIFIERS_PREPARE_FAILED, - TOI_PRE_SNAPSHOT_FAILED, - TOI_PRE_RESTORE_FAILED, - TOI_USERMODE_HELPERS_ERR, - TOI_CANT_USE_ALT_RESUME, - TOI_HEADER_TOO_BIG, - TOI_WAKEUP_EVENT, - TOI_SYSCORE_REFUSED, - TOI_DPM_PREPARE_FAILED, - TOI_DPM_SUSPEND_FAILED, - TOI_NUM_RESULT_STATES /* Used in printing debug info only */ -}; - -extern unsigned long toi_result; - -#define set_result_state(bit) (test_and_set_bit(bit, &toi_result)) -#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \ - test_and_set_bit(bit, &toi_result)) -#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result)) -#define test_result_state(bit) (test_bit(bit, &toi_result)) - -/* == Debug sections and levels == */ - -/* debugging levels. */ -enum { - TOI_STATUS = 0, - TOI_ERROR = 2, - TOI_LOW, - TOI_MEDIUM, - TOI_HIGH, - TOI_VERBOSE, -}; - -enum { - TOI_ANY_SECTION, - TOI_EAT_MEMORY, - TOI_IO, - TOI_HEADER, - TOI_WRITER, - TOI_MEMORY, - TOI_PAGEDIR, - TOI_COMPRESS, - TOI_BIO, -}; - -#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) -#define clear_debug_state(bit) \ - (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) -#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) - -/* == Steps in hibernating == */ - -enum { - STEP_HIBERNATE_PREPARE_IMAGE, - STEP_HIBERNATE_SAVE_IMAGE, - STEP_HIBERNATE_POWERDOWN, - STEP_RESUME_CAN_RESUME, - STEP_RESUME_LOAD_PS1, - STEP_RESUME_DO_RESTORE, - STEP_RESUME_READ_PS2, - STEP_RESUME_GO, - STEP_RESUME_ALT_IMAGE, - STEP_CLEANUP, - STEP_QUIET_CLEANUP -}; - -/* == TuxOnIce states == - (see also include/linux/suspend.h) */ - -#define get_toi_state() (toi_state) -#define restore_toi_state(saved_state) \ - do { toi_state = saved_state; } while (0) - -/* == Module support == */ - -struct toi_core_fns { - int (*post_context_save)(void); - unsigned long (*get_nonconflicting_page)(void); - int (*try_hibernate)(void); - void (*try_resume)(void); -}; - -extern struct toi_core_fns *toi_core_fns; - -/* == All else == */ -#define KB(x) ((x) << (PAGE_SHIFT - 10)) -#define MB(x) ((x) >> (20 - PAGE_SHIFT)) - -extern int toi_start_anything(int toi_or_resume); -extern void toi_finish_anything(int toi_or_resume); - -extern int save_image_part1(void); -extern int toi_atomic_restore(void); - -extern int toi_try_hibernate(void); -extern void toi_try_resume(void); - -extern int __toi_post_context_save(void); - -extern unsigned int nr_hibernates; -extern char alt_resume_param[256]; - -extern void copyback_post(void); -extern int toi_hibernate(void); -extern unsigned long extra_pd1_pages_used; - -#define SECTOR_SIZE 512 - -extern void toi_early_boot_message(int can_erase_image, int default_answer, - char *warning_reason, ...); - -extern int do_check_can_resume(void); -extern int do_toi_step(int step); -extern int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug); - -extern char tuxonice_signature[9]; - -extern int toi_start_other_threads(void); -extern void toi_stop_other_threads(void); - -extern int toi_trace_index; -#define TOI_TRACE_DEBUG(PFN, DESC, ...) 
\ - do { \ - if (test_action_state(TOI_TRACE_DEBUG_ON)) { \ - printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \ - } \ - } while(0) - -#ifdef CONFIG_TOI_KEEP_IMAGE -#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE)) -#else -#define toi_keeping_image (0) -#endif - -#ifdef CONFIG_TOI_INCREMENTAL -extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose); -extern int toi_reset_dirtiness(int verbose); -extern void toi_cbw_write(void); -extern void toi_cbw_restore(void); -extern int toi_allocate_cbw_data(void); -extern void toi_free_cbw_data(void); -extern int toi_cbw_init(void); -extern void toi_mark_tasks_cbw(void); -#else -static inline int toi_reset_dirtiness(int verbose) { return 0; } -#define toi_cbw_write() do { } while(0) -#define toi_cbw_restore() do { } while(0) -#define toi_allocate_cbw_data() do { } while(0) -#define toi_free_cbw_data() do { } while(0) -static inline int toi_cbw_init(void) { return 0; } -#endif -#endif diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c deleted file mode 100644 index 1d8b1cbda..000000000 --- a/kernel/power/tuxonice_alloc.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.c - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include <linux/export.h> -#include <linux/slab.h> -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -#define TOI_ALLOC_PATHS 41 - -static DEFINE_MUTEX(toi_alloc_mutex); - -static struct toi_module_ops toi_alloc_ops; - -static int toi_fail_num; - -static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], - toi_free_count[TOI_ALLOC_PATHS], - toi_test_count[TOI_ALLOC_PATHS], - toi_fail_count[TOI_ALLOC_PATHS]; -static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; -static int cur_allocd, max_allocd; - -static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { - "", /* 0 */ - "get_io_info_struct", - "extent", - "extent (loading chain)", - "userui channel", - "userui arg", /* 5 */ - "attention list metadata", - "extra pagedir memory metadata", - "bdev metadata", - "extra pagedir memory", - "header_locations_read", /* 10 */ - "bio queue", - "prepare_readahead", - "i/o buffer", - "writer buffer in bio_init", - "checksum buffer", /* 15 */ - "compression buffer", - "filewriter signature op", - "set resume param alloc1", - "set resume param alloc2", - "debugging info buffer", /* 20 */ - "check can resume buffer", - "write module config buffer", - "read module config buffer", - "write image header buffer", - "read pageset1 buffer", /* 25 */ - "get_have_image_data buffer", - "checksum page", - "worker rw loop", - "get nonconflicting page", - "ps1 load addresses", /* 30 */ - "remove swap image", - "swap image exists", - "swap parse sig location", - "sysfs kobj", - "swap mark resume attempted buffer", /* 35 */ - "cluster member", - "boot kernel data buffer", - "setting swap signature", - "block i/o bdev struct", - "copy before write", /* 40 */ -}; - -#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ - do { \ - BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ - \ - if (FAIL_NUM == toi_fail_num) { \ - atomic_inc(&toi_test_count[FAIL_NUM]); \ - toi_fail_num = 0; \ - return FAIL_VAL; \ - } \ - } while (0) - -static void alloc_update_stats(int fail_num, void *result, int size) -{ - if (!result) { - atomic_inc(&toi_fail_count[fail_num]); - return; - } - - 
atomic_inc(&toi_alloc_count[fail_num]); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - toi_cur_allocd[fail_num]++; - cur_allocd += size; - if (unlikely(cur_allocd > max_allocd)) { - int i; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - toi_max_allocd[i] = toi_cur_allocd[i]; - max_allocd = cur_allocd; - } - mutex_unlock(&toi_alloc_mutex); - } -} - -static void free_update_stats(int fail_num, int size) -{ - BUG_ON(fail_num >= TOI_ALLOC_PATHS); - atomic_inc(&toi_free_count[fail_num]); - if (unlikely(atomic_read(&toi_free_count[fail_num]) > - atomic_read(&toi_alloc_count[fail_num]))) - dump_stack(); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - cur_allocd -= size; - toi_cur_allocd[fail_num]--; - mutex_unlock(&toi_alloc_mutex); - } -} - -void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) -{ - void *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - result = kzalloc(size, flags); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, result, size); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order) -{ - unsigned long result; - - mask |= ___GFP_TOI_NOTRACK; - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - result = __get_free_pages(mask, order); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, - PAGE_SIZE << order); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -struct page *toi_alloc_page(int fail_num, gfp_t mask) -{ - struct page *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - mask |= ___GFP_TOI_NOTRACK; - result = alloc_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) -{ - unsigned long result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - mask |= ___GFP_TOI_NOTRACK; - result = get_zeroed_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -void toi_kfree(int fail_num, const void *arg, int size) -{ - if (arg && toi_alloc_ops.enabled) - free_update_stats(fail_num, size); - - if (fail_num == toi_trace_allocs) - dump_stack(); - kfree(arg); -} - -void toi_free_page(int fail_num, unsigned long virt) -{ - if (virt && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - free_page(virt); -} - -void toi__free_page(int fail_num, struct page *page) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_page(page); -} - -void toi_free_pages(int fail_num, struct page *page, int order) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE << order); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_pages(page, order); -} - -void toi_alloc_print_debug_stats(void) -{ - int i, header_done = 0; - - if (!toi_alloc_ops.enabled) - return; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - if (atomic_read(&toi_alloc_count[i]) != - atomic_read(&toi_free_count[i])) { - if (!header_done) { - printk(KERN_INFO "Idx Allocs Frees Tests " - " Fails Max Description\n"); - header_done = 1; - } - - 
printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, - atomic_read(&toi_alloc_count[i]), - atomic_read(&toi_free_count[i]), - atomic_read(&toi_test_count[i]), - atomic_read(&toi_fail_count[i]), - toi_max_allocd[i], - toi_alloc_desc[i]); - } -} - -static int toi_alloc_initialise(int starting_cycle) -{ - int i; - - if (!starting_cycle) - return 0; - - if (toi_trace_allocs) - dump_stack(); - - for (i = 0; i < TOI_ALLOC_PATHS; i++) { - atomic_set(&toi_alloc_count[i], 0); - atomic_set(&toi_free_count[i], 0); - atomic_set(&toi_test_count[i], 0); - atomic_set(&toi_fail_count[i], 0); - toi_cur_allocd[i] = 0; - toi_max_allocd[i] = 0; - }; - - max_allocd = 0; - cur_allocd = 0; - return 0; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), - SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, - NULL), - SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, - TOI_GET_MAX_MEM_ALLOCD, 0), - SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, - NULL) -}; - -static struct toi_module_ops toi_alloc_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "allocation debugging", - .directory = "alloc", - .module = THIS_MODULE, - .early = 1, - .initialise = toi_alloc_initialise, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_alloc_init(void) -{ - int result = toi_register_module(&toi_alloc_ops); - return result; -} - -void toi_alloc_exit(void) -{ - toi_unregister_module(&toi_alloc_ops); -} diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h deleted file mode 100644 index 0cd6b686f..000000000 --- a/kernel/power/tuxonice_alloc.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.h - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - */ - -#include <linux/slab.h> -#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) -#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) - -#ifdef CONFIG_PM_DEBUG -extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); -extern void toi_kfree(int fail_num, const void *arg, int size); - -extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order); -#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) -extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); -extern void toi_free_page(int fail_num, unsigned long buf); -extern void toi__free_page(int fail_num, struct page *page); -extern void toi_free_pages(int fail_num, struct page *page, int order); -extern struct page *toi_alloc_page(int fail_num, gfp_t mask); -extern int toi_alloc_init(void); -extern void toi_alloc_exit(void); - -extern void toi_alloc_print_debug_stats(void); - -#else /* CONFIG_PM_DEBUG */ - -#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) -#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) - -#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) -#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) -#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) -#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) -#define toi__free_page(FAIL, PAGE) __free_page(PAGE) -#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) -#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) -static inline int toi_alloc_init(void) -{ - return 0; -} - -static inline void toi_alloc_exit(void) { } - -static inline void toi_alloc_print_debug_stats(void) { } - -#endif - -extern int toi_trace_allocs; diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c deleted file mode 100644 index 5845217f8..000000000 --- a/kernel/power/tuxonice_atomic_copy.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.c - * - * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/cpu.h> -#include <linux/freezer.h> -#include <linux/console.h> -#include <linux/syscore_ops.h> -#include <linux/ftrace.h> -#include <asm/suspend.h> -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_power_off.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" - -unsigned long extra_pd1_pages_used; - -/** - * free_pbe_list - free page backup entries used by the atomic copy code. - * @list: List to free. - * @highmem: Whether the list is in highmem. - * - * Normally, this function isn't used. If, however, we need to abort before - * doing the atomic copy, we use this to free the pbes previously allocated. 
- **/ -static void free_pbe_list(struct pbe **list, int highmem) -{ - while (*list) { - int i; - struct pbe *free_pbe, *next_page = NULL; - struct page *page; - - if (highmem) { - page = (struct page *) *list; - free_pbe = (struct pbe *) kmap(page); - } else { - page = virt_to_page(*list); - free_pbe = *list; - } - - for (i = 0; i < PBES_PER_PAGE; i++) { - if (!free_pbe) - break; - if (highmem) - toi__free_page(29, free_pbe->address); - else - toi_free_page(29, - (unsigned long) free_pbe->address); - free_pbe = free_pbe->next; - } - - if (highmem) { - if (free_pbe) - next_page = free_pbe; - kunmap(page); - } else { - if (free_pbe) - next_page = free_pbe; - } - - toi__free_page(29, page); - *list = (struct pbe *) next_page; - }; -} - -/** - * copyback_post - post atomic-restore actions - * - * After doing the atomic restore, we have a few more things to do: - * 1) We want to retain some values across the restore, so we now copy - * these from the nosave variables to the normal ones. - * 2) Set the status flags. - * 3) Resume devices. - * 4) Tell userui so it can redraw & restore settings. - * 5) Reread the page cache. - **/ -void copyback_post(void) -{ - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - toi_post_atomic_restore_modules(bkd); - - toi_cond_pause(1, "About to reload secondary pagedir."); - - if (read_pageset2(0)) - panic("Unable to successfully reread the page cache."); - - /* - * If the user wants to sleep again after resuming from full-off, - * it's most likely to be in order to suspend to ram, so we'll - * do this check after loading pageset2, to give them the fastest - * wakeup when they are ready to use the computer again. - */ - toi_check_resleep(); - - if (test_action_state(TOI_INCREMENTAL_IMAGE)) - toi_reset_dirtiness(1); -} - -/** - * toi_copy_pageset1 - do the atomic copy of pageset1 - * - * Make the atomic copy of pageset1. We can't use copy_page (as we once did) - * because we can't be sure what side effects it has. On my old Duron, with - * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt - * count at resume time 4 instead of 3. - * - * We don't want to call kmap_atomic unconditionally because it has the side - * effect of incrementing the preempt count, which will leave it one too high - * post resume (the page containing the preempt count will be copied after - * its incremented. This is essentially the same problem. - **/ -void toi_copy_pageset1(void) -{ - int i; - unsigned long source_index, dest_index; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - - for (i = 0; i < pagedir1.size; i++) { - unsigned long *origvirt, *copyvirt; - struct page *origpage, *copypage; - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1, - was_present1, was_present2; - - origpage = pfn_to_page(source_index); - copypage = pfn_to_page(dest_index); - - origvirt = PageHighMem(origpage) ? - kmap_atomic(origpage) : - page_address(origpage); - - copyvirt = PageHighMem(copypage) ? 
- kmap_atomic(copypage) : - page_address(copypage); - - was_present1 = kernel_page_present(origpage); - if (!was_present1) - kernel_map_pages(origpage, 1, 1); - - was_present2 = kernel_page_present(copypage); - if (!was_present2) - kernel_map_pages(copypage, 1, 1); - - while (loop >= 0) { - *(copyvirt + loop) = *(origvirt + loop); - loop--; - } - - if (!was_present1) - kernel_map_pages(origpage, 1, 0); - - if (!was_present2) - kernel_map_pages(copypage, 1, 0); - - if (PageHighMem(origpage)) - kunmap_atomic(origvirt); - - if (PageHighMem(copypage)) - kunmap_atomic(copyvirt); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - } -} - -/** - * __toi_post_context_save - steps after saving the cpu context - * - * Steps taken after saving the CPU state to make the actual - * atomic copy. - * - * Called from swsusp_save in snapshot.c via toi_post_context_save. - **/ -int __toi_post_context_save(void) -{ - unsigned long old_ps1_size = pagedir1.size; - - check_checksums(); - - free_checksum_pages(); - - toi_recalculate_image_contents(1); - - extra_pd1_pages_used = pagedir1.size > old_ps1_size ? - pagedir1.size - old_ps1_size : 0; - - if (extra_pd1_pages_used > extra_pd1_pages_allowance) { - printk(KERN_INFO "Pageset1 has grown by %lu pages. " - "extra_pages_allowance is currently only %lu.\n", - pagedir1.size - old_ps1_size, - extra_pd1_pages_allowance); - - /* - * Highlevel code will see this, clear the state and - * retry if we haven't already done so twice. - */ - if (any_to_free(1)) { - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - if (try_allocate_extra_memory()) { - printk(KERN_INFO "Failed to allocate the extra memory" - " needed. Restarting the process."); - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - printk(KERN_INFO "However it looks like there's enough" - " free ram and storage to handle this, so " - " continuing anyway."); - /* - * What if try_allocate_extra_memory above calls - * toi_allocate_extra_pagedir_memory and it allocs a new - * slab page via toi_kzalloc which should be in ps1? So... - */ - toi_recalculate_image_contents(1); - } - - if (!test_action_state(TOI_TEST_FILTER_SPEED) && - !test_action_state(TOI_TEST_BIO)) - toi_copy_pageset1(); - - return 0; -} - -/** - * toi_hibernate - high level code for doing the atomic copy - * - * High-level code which prepares to do the atomic copy. Loosely based - * on the swsusp version, but with the following twists: - * - We set toi_running so the swsusp code uses our code paths. - * - We give better feedback regarding what goes wrong if there is a - * problem. - * - We use an extra function to call the assembly, just in case this code - * is in a module (return address). - **/ -int toi_hibernate(void) -{ - int error; - - error = toi_lowlevel_builtin(); - - if (!error) { - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - /* - * The boot kernel's data may be larger (newer version) or - * smaller (older version) than ours. Copy the minimum - * of the two sizes, so that we don't overwrite valid values - * from pre-atomic copy. - */ - - memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, - min_t(int, sizeof(struct toi_boot_kernel_data), - bkd->size)); - } - - return error; -} - -/** - * toi_atomic_restore - prepare to do the atomic restore - * - * Get ready to do the atomic restore. 
This part gets us into the same - * state we are in prior to do calling do_toi_lowlevel while - * hibernating: hot-unplugging secondary cpus and freeze processes, - * before starting the thread that will do the restore. - **/ -int toi_atomic_restore(void) -{ - int error; - - toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore."); - - memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line, - strlen(saved_command_line)); - - toi_pre_atomic_restore_modules(&toi_bkd); - - if (add_boot_kernel_data_pbe()) - goto Failed; - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - if (toi_go_atomic(PMSG_QUIESCE, 0)) - goto Failed; - - /* We'll ignore saved state, but this gets preempt count (etc) right */ - save_processor_state(); - - error = swsusp_arch_resume(); - /* - * Code below is only ever reached in case of failure. Otherwise - * execution continues at place where swsusp_arch_suspend was called. - * - * We don't know whether it's safe to continue (this shouldn't happen), - * so lets err on the side of caution. - */ - BUG(); - -Failed: - free_pbe_list(&restore_pblist, 0); -#ifdef CONFIG_HIGHMEM - free_pbe_list(&restore_highmem_pblist, 1); -#endif - return 1; -} - -/** - * toi_go_atomic - do the actual atomic copy/restore - * @state: The state to use for dpm_suspend_start & power_down calls. - * @suspend_time: Whether we're suspending or resuming. - **/ -int toi_go_atomic(pm_message_t state, int suspend_time) -{ - if (suspend_time) { - if (platform_begin(1)) { - set_abort_result(TOI_PLATFORM_PREP_FAILED); - toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); - return 1; - } - - if (dpm_prepare(PMSG_FREEZE)) { - set_abort_result(TOI_DPM_PREPARE_FAILED); - dpm_complete(PMSG_RECOVER); - toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); - return 1; - } - } - - suspend_console(); - pm_restrict_gfp_mask(); - - if (suspend_time) { - if (dpm_suspend(state)) { - set_abort_result(TOI_DPM_SUSPEND_FAILED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); - return 1; - } - } else { - if (dpm_suspend_start(state)) { - set_abort_result(TOI_DPM_SUSPEND_FAILED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); - return 1; - } - } - - /* At this point, dpm_suspend_start() has been called, but *not* - * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. 
- */ - - if (dpm_suspend_end(state)) { - set_abort_result(TOI_DEVICE_REFUSED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); - return 1; - } - - if (suspend_time) { - if (platform_pre_snapshot(1)) - set_abort_result(TOI_PRE_SNAPSHOT_FAILED); - } else { - if (platform_pre_restore(1)) - set_abort_result(TOI_PRE_RESTORE_FAILED); - } - - if (test_result_state(TOI_ABORTED)) { - toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); - return 1; - } - - if (disable_nonboot_cpus()) { - set_abort_result(TOI_CPU_HOTPLUG_FAILED); - toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, - suspend_time, 1); - return 1; - } - - local_irq_disable(); - - if (syscore_suspend()) { - set_abort_result(TOI_SYSCORE_REFUSED); - toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); - return 1; - } - - if (suspend_time && pm_wakeup_pending()) { - set_abort_result(TOI_WAKEUP_EVENT); - toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); - return 1; - } - return 0; -} - -/** - * toi_end_atomic - post atomic copy/restore routines - * @stage: What step to start at. - * @suspend_time: Whether we're suspending or resuming. - * @error: Whether we're recovering from an error. - **/ -void toi_end_atomic(int stage, int suspend_time, int error) -{ - pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) : - PMSG_RESTORE; - - switch (stage) { - case ATOMIC_ALL_STEPS: - if (!suspend_time) { - events_check_enabled = false; - } - platform_leave(1); - case ATOMIC_STEP_SYSCORE_RESUME: - syscore_resume(); - case ATOMIC_STEP_IRQS: - local_irq_enable(); - case ATOMIC_STEP_CPU_HOTPLUG: - enable_nonboot_cpus(); - case ATOMIC_STEP_PLATFORM_FINISH: - if (!suspend_time && error & 2) - platform_restore_cleanup(1); - else - platform_finish(1); - dpm_resume_start(msg); - case ATOMIC_STEP_DEVICE_RESUME: - if (suspend_time && (error & 2)) - platform_recover(1); - dpm_resume(msg); - if (!toi_in_suspend()) { - dpm_resume_end(PMSG_RECOVER); - } - if (error || !toi_in_suspend()) { - pm_restore_gfp_mask(); - } - resume_console(); - case ATOMIC_STEP_DPM_COMPLETE: - dpm_complete(msg); - case ATOMIC_STEP_PLATFORM_END: - platform_end(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); - } -} diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h deleted file mode 100644 index e2d2b4fb3..000000000 --- a/kernel/power/tuxonice_atomic_copy.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.h - * - * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -enum { - ATOMIC_ALL_STEPS, - ATOMIC_STEP_SYSCORE_RESUME, - ATOMIC_STEP_IRQS, - ATOMIC_STEP_CPU_HOTPLUG, - ATOMIC_STEP_PLATFORM_FINISH, - ATOMIC_STEP_DEVICE_RESUME, - ATOMIC_STEP_DPM_COMPLETE, - ATOMIC_STEP_PLATFORM_END, -}; - -int toi_go_atomic(pm_message_t state, int toi_time); -void toi_end_atomic(int stage, int toi_time, int error); - -extern void platform_recover(int platform_mode); diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h deleted file mode 100644 index 9d52a3b69..000000000 --- a/kernel/power/tuxonice_bio.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * kernel/power/tuxonice_bio.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
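One structural note before leaving the atomic-copy code: the switch in toi_end_atomic() above intentionally contains no break statements, so entering at any stage also runs every later teardown stage. The idiom reduced to a sketch (the undo_* helpers are hypothetical stand-ins):

static void undo_step3(void) { /* e.g. syscore_resume() */ }
static void undo_step2(void) { /* e.g. local_irq_enable() */ }
static void undo_step1(void) { /* e.g. enable_nonboot_cpus() */ }

static void sketch_unwind(int from_stage)
{
        switch (from_stage) {
        case 3:
                undo_step3();   /* fall through */
        case 2:
                undo_step2();   /* fall through */
        case 1:
                undo_step1();
        }
}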
- */ - -#include <linux/buffer_head.h> -#include "tuxonice_extent.h" - -void toi_put_extent_chain(struct hibernate_extent_chain *chain); -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end); - -struct hibernate_extent_saved_state { - int extent_num; - struct hibernate_extent *extent_ptr; - unsigned long offset; -}; - -struct toi_bdev_info { - struct toi_bdev_info *next; - struct hibernate_extent_chain blocks; - struct block_device *bdev; - struct toi_module_ops *allocator; - int allocator_index; - struct hibernate_extent_chain allocations; - char name[266]; /* "swap on " or "file " + up to 256 chars */ - - /* Saved in header */ - char uuid[17]; - dev_t dev_t; - int prio; - int bmap_shift; - int blocks_per_page; - unsigned long pages_used; - struct hibernate_extent_saved_state saved_state[4]; -}; - -struct toi_extent_iterate_state { - struct toi_bdev_info *current_chain; - int num_chains; - int saved_chain_number[4]; - struct toi_bdev_info *saved_chain_ptr[4]; -}; - -/* - * Our exported interface so the swapwriter and filewriter don't - * need these functions duplicated. - */ -struct toi_bio_ops { - int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, - struct page *page); - int (*register_storage)(struct toi_bdev_info *new); - void (*free_storage)(void); -}; - -struct toi_allocator_ops { - unsigned long (*toi_swap_storage_available) (void); -}; - -extern struct toi_bio_ops toi_bio_ops; - -extern char *toi_writer_buffer; -extern int toi_writer_buffer_posn; - -struct toi_bio_allocator_ops { - int (*register_storage) (void); - unsigned long (*storage_available)(void); - int (*allocate_storage) (struct toi_bdev_info *, unsigned long); - int (*bmap) (struct toi_bdev_info *); - void (*free_storage) (struct toi_bdev_info *); - unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used); -}; diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c deleted file mode 100644 index 086a5527d..000000000 --- a/kernel/power/tuxonice_bio_chains.c +++ /dev/null @@ -1,1126 +0,0 @@ -/* - * kernel/power/tuxonice_bio_devinfo.c - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - */ - -#include <linux/mm_types.h> -#include "tuxonice_bio.h" -#include "tuxonice_bio_internal.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" -#include "tuxonice_io.h" - -static struct toi_bdev_info *prio_chain_head; -static int num_chains; - -/* Pointer to current entry being loaded/saved. */ -struct toi_extent_iterate_state toi_writer_posn; - -#define metadata_size (sizeof(struct toi_bdev_info) - \ - offsetof(struct toi_bdev_info, uuid)) - -/* - * After section 0 (header) comes 2 => next_section[0] = 2 - */ -static int next_section[3] = { 2, 3, 1 }; - -/** - * dump_block_chains - print the contents of the bdev info array. - **/ -void dump_block_chains(void) -{ - int i = 0; - int j; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - struct hibernate_extent *this = cur_chain->blocks.first; - - printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); - - while (this) { - printk(KERN_CONT " [%lu-%lu]%s", this->start, - this->end, this->next ? 
"," : ""); - this = this->next; - } - - printk("\n"); - cur_chain = cur_chain->next; - i++; - } - - printk(KERN_DEBUG "Saved states:\n"); - for (i = 0; i < 4; i++) { - printk(KERN_DEBUG "Slot %d: Chain %d.\n", - i, toi_writer_posn.saved_chain_number[i]); - - cur_chain = prio_chain_head; - j = 0; - while (cur_chain) { - printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", - j, cur_chain->saved_state[i].extent_num, - cur_chain->saved_state[i].offset); - cur_chain = cur_chain->next; - j++; - } - printk(KERN_CONT "\n"); - } -} - -/** - * - **/ -static void toi_extent_chain_next(void) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain; - - if (!this->blocks.current_extent) - return; - - if (this->blocks.current_offset == this->blocks.current_extent->end) { - if (this->blocks.current_extent->next) { - this->blocks.current_extent = - this->blocks.current_extent->next; - this->blocks.current_offset = - this->blocks.current_extent->start; - } else { - this->blocks.current_extent = NULL; - this->blocks.current_offset = 0; - } - } else - this->blocks.current_offset++; -} - -/** - * - */ - -static struct toi_bdev_info *__find_next_chain_same_prio(void) -{ - struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; - struct toi_bdev_info *this = start_chain; - int orig_prio = this->prio; - - do { - this = this->next; - - if (!this) - this = prio_chain_head; - - /* Back on original chain? Use it again. */ - if (this == start_chain) - return start_chain; - - } while (!this->blocks.current_extent || this->prio != orig_prio); - - return this; -} - -static void find_next_chain(void) -{ - struct toi_bdev_info *this; - - this = __find_next_chain_same_prio(); - - /* - * If we didn't get another chain of the same priority that we - * can use, look for the next priority. - */ - while (this && !this->blocks.current_extent) - this = this->next; - - toi_writer_posn.current_chain = this; -} - -/** - * toi_extent_state_next - go to the next extent - * @blocks: The number of values to progress. - * @stripe_mode: Whether to spread usage across all chains. - * - * Given a state, progress to the next valid entry. We may begin in an - * invalid state, as we do when invoked after extent_state_goto_start below. - * - * When using compression and expected_compression > 0, we let the image size - * be larger than storage, so we can validly run out of data to return. - **/ -static unsigned long toi_extent_state_next(int blocks, int current_stream) -{ - int i; - - if (!toi_writer_posn.current_chain) - return -ENOSPC; - - /* Assume chains always have lengths that are multiples of @blocks */ - for (i = 0; i < blocks; i++) - toi_extent_chain_next(); - - /* The header stream is not striped */ - if (current_stream || - !toi_writer_posn.current_chain->blocks.current_extent) - find_next_chain(); - - return toi_writer_posn.current_chain ? 
0 : -ENOSPC; -} - -static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) -{ - struct toi_bdev_info **prev_ptr; - struct toi_bdev_info *cur; - - /* Loop through the existing chain, finding where to insert it */ - prev_ptr = &prio_chain_head; - cur = prio_chain_head; - - while (cur && cur->prio >= this->prio) { - prev_ptr = &cur->next; - cur = cur->next; - } - - this->next = *prev_ptr; - *prev_ptr = this; - - this = prio_chain_head; - while (this) - this = this->next; - num_chains++; -} - -/** - * toi_extent_state_goto_start - reinitialize an extent chain iterator - * @state: Iterator to reinitialize - **/ -void toi_extent_state_goto_start(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current extent to %p.", this->blocks.first); - this->blocks.current_extent = this->blocks.first; - if (this->blocks.current_extent) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current offset to %lu.", - this->blocks.current_extent->start); - this->blocks.current_offset = - this->blocks.current_extent->start; - } - - this = this->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", - prio_chain_head); - toi_writer_posn.current_chain = prio_chain_head; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); -} - -/** - * toi_extent_state_save - save state of the iterator - * @state: Current state of the chain - * @saved_state: Iterator to populate - * - * Given a state and a struct hibernate_extent_state_store, save the current - * position in a format that can be used with relocated chains (at - * resume time). - **/ -void toi_extent_state_save(int slot) -{ - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent *extent; - struct hibernate_extent_saved_state *chain_state; - int i = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", - slot); - - if (!toi_writer_posn.current_chain) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " - "chain_num = -1."); - toi_writer_posn.saved_chain_number[slot] = -1; - return; - } - - while (cur_chain) { - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - chain_state->offset = cur_chain->blocks.current_offset; - - if (toi_writer_posn.current_chain == cur_chain) { - toi_writer_posn.saved_chain_number[slot] = i; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " - "we were on => chain_num is %d.", i); - } - - if (!cur_chain->blocks.current_extent) { - chain_state->extent_num = 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " - "for this chain => extent_num %d is 0.", - i); - cur_chain = cur_chain->next; - continue; - } - - extent = cur_chain->blocks.first; - chain_state->extent_num = 1; - - while (extent != cur_chain->blocks.current_extent) { - chain_state->extent_num++; - extent = extent->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, - chain_state->extent_num); - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Completed saving extent state slot %d.", slot); -} - -/** - * toi_extent_state_restore - restore the position saved by extent_state_save - * @state: State to populate - * @saved_state: Iterator saved to restore - **/ -void toi_extent_state_restore(int slot) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent_saved_state 
*chain_state; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "toi_extent_state_restore - slot %d.", slot); - - if (toi_writer_posn.saved_chain_number[slot] == -1) { - toi_writer_posn.current_chain = NULL; - return; - } - - while (cur_chain) { - int posn; - int j; - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - posn = chain_state->extent_num; - - cur_chain->blocks.current_extent = cur_chain->blocks.first; - cur_chain->blocks.current_offset = chain_state->offset; - - if (i == toi_writer_posn.saved_chain_number[slot]) { - toi_writer_posn.current_chain = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found current chain."); - } - - for (j = 0; j < 4; j++) - if (i == toi_writer_posn.saved_chain_number[j]) { - toi_writer_posn.saved_chain_ptr[j] = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found saved chain ptr %d (%p) (offset" - " %d).", j, cur_chain, - cur_chain->saved_state[j].offset); - } - - if (posn) { - while (--posn) - cur_chain->blocks.current_extent = - cur_chain->blocks.current_extent->next; - } else - cur_chain->blocks.current_extent = NULL; - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); -} - -/* - * Storage needed - * - * Returns amount of space in the image header required - * for the chain data. This ignores the links between - * pages, which we factor in when allocating the space. - */ -int toi_bio_devinfo_storage_needed(void) -{ - int result = sizeof(num_chains); - struct toi_bdev_info *chain = prio_chain_head; - - while (chain) { - result += metadata_size; - - /* Chain size */ - result += sizeof(int); - - /* Extents */ - result += (2 * sizeof(unsigned long) * - chain->blocks.num_extents); - - chain = chain->next; - } - - result += 4 * sizeof(int); - return result; -} - -static unsigned long chain_pages_used(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this = chain->blocks.first; - struct hibernate_extent_saved_state *state = &chain->saved_state[3]; - unsigned long size = 0; - int extent_idx = 1; - - if (!state->extent_num) { - if (!this) - return 0; - else - return chain->blocks.size; - } - - while (extent_idx < state->extent_num) { - size += (this->end - this->start + 1); - this = this->next; - extent_idx++; - } - - /* We didn't use the one we're sitting on, so don't count it */ - return size + state->offset - this->start; -} - -void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain) -{ - unsigned long used = chain_pages_used(chain); - - /* Free the storage */ - unsigned long first_freed = 0; - - if (chain->allocator->bio_allocator_ops->free_unused_storage) - first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used); - - printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed); - - /* Adjust / free the extents. */ - toi_put_extent_chain_from(&chain->blocks, first_freed); - - { - struct hibernate_extent *this = chain->blocks.first; - while (this) { - printk("Extent %lx-%lx.\n", this->start, this->end); - this = this->next; - } - } -} - -/** - * toi_serialise_extent_chain - write a chain in the image - * @chain: Chain to write. 
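Taking the writer below together with its reader, toi_load_extent_chain(), each chain's on-disk record works out to the following layout (an editorial summary inferred from the code, not a declaration from the source):

        [uuid .. end of struct toi_bdev_info]    metadata_size bytes
        [num_extents]                            sizeof(int) bytes
        [start][end] ... [start][end]            2 * sizeof(unsigned long) per extent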
- **/ -static int toi_serialise_extent_chain(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this; - int ret; - int i = 1; - - chain->pages_used = chain_pages_used(chain); - - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", - chain->dev_t); - /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->uuid, metadata_size); - if (ret) - return ret; - - /* Num extents */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) - return ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - this = chain->blocks.first; - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); - ret = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) this, 2 * sizeof(this->start)); - if (ret) - return ret; - this = this->next; - i++; - } - - return ret; -} - -int toi_serialise_extent_chains(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - /* Write the number of chains */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", - num_chains); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, (char *) &num_chains, - sizeof(int)); - if (result) - return result; - - /* Then the chains themselves */ - while (this) { - result = toi_serialise_extent_chain(this); - if (result) - return result; - this = this->next; - } - - /* - * Finally, the chain we should be on at the start of each - * section. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -int toi_register_storage_chain(struct toi_bdev_info *new) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", - new); - toi_insert_chain_in_prio_list(new); - return 0; -} - -static void free_bdev_info(struct toi_bdev_info *chain) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); - toi_put_extent_chain(&chain->blocks); - - /* - * The allocator may need to do more than just free the chains - * (swap_free, for example). Don't call from boot kernel. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); - if (chain->allocator) - chain->allocator->bio_allocator_ops->free_storage(chain); - - /* - * Dropping out of reading atomic copy? Need to undo - * toi_open_by_devnum. 
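The conditional close that follows reads more easily as an ownership predicate. A hedged restatement (the helper name is invented; the real test below additionally requires test_toi_state(TOI_TRYING_TO_RESUME), since only the resume path opened these devices):

static bool sketch_chain_owns_bdev(struct toi_bdev_info *chain)
{
        /* resume_block_device and header_block_device are shared,
         * borrowed references; anything else was opened by
         * toi_open_bdev() for this chain alone and must be closed. */
        return chain->bdev && !IS_ERR(chain->bdev) &&
               chain->bdev != resume_block_device &&
               chain->bdev != header_block_device;
}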
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); - if (chain->bdev && !IS_ERR(chain->bdev) && - chain->bdev != resume_block_device && - chain->bdev != header_block_device && - test_toi_state(TOI_TRYING_TO_RESUME)) - toi_close_bdev(chain->bdev); - - /* Poison */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); - toi_kfree(39, chain, sizeof(*chain)); - - if (prio_chain_head == chain) - prio_chain_head = NULL; - - num_chains--; -} - -void free_all_bdev_info(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - struct toi_bdev_info *next = this->next; - free_bdev_info(this); - this = next; - } - - memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); - prio_chain_head = NULL; -} - -static void set_up_start_position(void) -{ - toi_writer_posn.current_chain = prio_chain_head; - go_next_page(0, 0); -} - -/** - * toi_load_extent_chain - read back a chain saved in the image - * @chain: Chain to load - * - * The linked list of extents is reconstructed from the disk. chain will point - * to the first entry. - **/ -int toi_load_extent_chain(int index, int *num_loaded) -{ - struct toi_bdev_info *chain = toi_kzalloc(39, - sizeof(struct toi_bdev_info), GFP_ATOMIC); - struct hibernate_extent *this, *last = NULL; - int i, ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); - /* Get dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->uuid, metadata_size); - - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_bkd.pages_used[index] = chain->pages_used; - - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - for (i = 0; i < chain->blocks.num_extents; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); - - this = toi_kzalloc(2, sizeof(struct hibernate_extent), - TOI_ATOMIC_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate a new extent.\n"); - free_bdev_info(chain); - return -ENOMEM; - } - this->next = NULL; - /* Get the next page */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - NULL, (char *) this, 2 * sizeof(this->start)); - if (ret) { - printk(KERN_INFO "Failed to read an extent.\n"); - toi_kfree(2, this, sizeof(struct hibernate_extent)); - free_bdev_info(chain); - return 1; - } - - if (last) - last->next = this; - else { - char b1[32], b2[32], b3[32]; - /* - * Open the bdev - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Chain dev_t is %s. Resume dev t is %s. 
Header" - " bdev_t is %s.\n", - format_dev_t(b1, chain->dev_t), - format_dev_t(b2, resume_dev_t), - format_dev_t(b3, toi_sig_data->header_dev_t)); - - if (chain->dev_t == resume_dev_t) - chain->bdev = resume_block_device; - else if (chain->dev_t == toi_sig_data->header_dev_t) - chain->bdev = header_block_device; - else { - chain->bdev = toi_open_bdev(chain->uuid, - chain->dev_t, 1); - if (IS_ERR(chain->bdev)) { - free_bdev_info(chain); - return -ENODEV; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " - "is %d and blocks per page is %d.", - chain->bmap_shift, - chain->blocks_per_page); - - chain->blocks.first = this; - - /* - * Couldn't do this earlier, but can't do - * goto_start now - we may have already used blocks - * in the first chain. - */ - chain->blocks.current_extent = this; - chain->blocks.current_offset = this->start; - - /* - * Can't wait until we've read the whole chain - * before we insert it in the list. We might need - * this chain to read the next page in the header - */ - toi_insert_chain_in_prio_list(chain); - } - - /* - * We have to wait until 2 extents are loaded before setting up - * properly because if the first extent has only one page, we - * will need to put the position on the second extent. Sounds - * obvious, but it wasn't! - */ - (*num_loaded)++; - if ((*num_loaded) == 2) - set_up_start_position(); - last = this; - } - - /* - * Shouldn't get empty chains, but it's not impossible. Link them in so - * they get freed properly later. - */ - if (!chain->blocks.num_extents) - toi_insert_chain_in_prio_list(chain); - - if (!chain->blocks.current_extent) { - chain->blocks.current_extent = chain->blocks.first; - if (chain->blocks.current_extent) - chain->blocks.current_offset = - chain->blocks.current_extent->start; - } - return 0; -} - -int toi_load_extent_chains(void) -{ - int result; - int to_load; - int i; - int extents_loaded = 0; - - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &to_load, - sizeof(int)); - if (result) - return result; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); - - for (i = 0; i < to_load; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", - i, to_load); - result = toi_load_extent_chain(i, &extents_loaded); - if (result) - return result; - } - - /* If we never got to a second extent, we still need to do this. */ - if (extents_loaded == 1) - set_up_start_position(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -static int toi_end_of_stream(int writing, int section_barrier) -{ - struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; - int compare_to = next_section[current_stream]; - struct toi_bdev_info *compare_chain = - toi_writer_posn.saved_chain_ptr[compare_to]; - int compare_offset = compare_chain ? 
-                compare_chain->saved_state[compare_to].offset : 0;
-
-        if (!section_barrier)
-                return 0;
-
-        if (!cur_chain)
-                return 1;
-
-        if (cur_chain == compare_chain &&
-            cur_chain->blocks.current_offset == compare_offset) {
-                if (writing) {
-                        if (!current_stream) {
-                                debug_broken_header();
-                                return 1;
-                        }
-                } else {
-                        more_readahead = 0;
-                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
-                                        "Reached the end of stream %d "
-                                        "(not an error).", current_stream);
-                        return 1;
-                }
-        }
-
-        return 0;
-}
-
-/**
- * go_next_page - skip blocks to the start of the next page
- * @writing: Whether we're reading or writing the image.
- *
- * Go forward one page.
- **/
-int go_next_page(int writing, int section_barrier)
-{
-        struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
-        int max = cur_chain ? cur_chain->blocks_per_page : 1;
-
-        /* Nope. Go forward a page - or maybe two. Don't stripe the header,
-         * so that bad fragmentation doesn't put the extent data containing
-         * the location of the second page out of the first header page.
-         */
-        if (toi_extent_state_next(max, current_stream)) {
-                /* Don't complain if readahead falls off the end */
-                if (writing && section_barrier) {
-                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
-                                "Expected compression ratio too optimistic?");
-                        if (test_action_state(TOI_LOGALL))
-                                dump_block_chains();
-                }
-                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
-                        "read/write. (Not necessarily a fatal error.)");
-                return -ENOSPC;
-        }
-
-        return 0;
-}
-
-int devices_of_same_priority(struct toi_bdev_info *this)
-{
-        struct toi_bdev_info *check = prio_chain_head;
-        int i = 0;
-
-        while (check) {
-                if (check->prio == this->prio)
-                        i++;
-                check = check->next;
-        }
-
-        return i;
-}
-
-/**
- * toi_bio_rw_page - do i/o on the next disk page in the image
- * @writing: Whether reading or writing.
- * @page: Page to do i/o on.
- * @is_readahead: Whether we're doing readahead
- * @free_group: The group used in allocating the page
- *
- * Submit a page for reading or writing, possibly readahead.
- * Pass the group used in allocating the page as well, as it should
- * be freed on completion of the bio if we're writing the page.
- **/
-int toi_bio_rw_page(int writing, struct page *page,
-                int is_readahead, int free_group)
-{
-        int result = toi_end_of_stream(writing, 1);
-        struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
-
-        if (result) {
-                if (writing)
-                        abort_hibernate(TOI_INSUFFICIENT_STORAGE,
-                                        "Insufficient storage for your image.");
-                else
-                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
-                                        "read/write another page when stream has "
-                                        "ended.");
-                return -ENOSPC;
-        }
-
-        toi_message(TOI_BIO, TOI_VERBOSE, 0,
-                        "%s %lx:%ld",
-                        writing ?
"Write" : "Read", - dev_info->dev_t, dev_info->blocks.current_offset); - - result = toi_do_io(writing, dev_info->bdev, - dev_info->blocks.current_offset << dev_info->bmap_shift, - page, is_readahead, 0, free_group); - - /* Ignore the result here - will check end of stream if come in again */ - go_next_page(writing, 1); - - if (result) - printk(KERN_ERR "toi_do_io returned %d.\n", result); - return result; -} - -dev_t get_header_dev_t(void) -{ - return prio_chain_head->dev_t; -} - -struct block_device *get_header_bdev(void) -{ - return prio_chain_head->bdev; -} - -unsigned long get_headerblock(void) -{ - return prio_chain_head->blocks.first->start << - prio_chain_head->bmap_shift; -} - -int get_main_pool_phys_params(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - while (this) { - result = this->allocator->bio_allocator_ops->bmap(this); - if (result) - return result; - this = this->next; - } - - return 0; -} - -static int apply_header_reservation(void) -{ - int i; - - if (!header_pages_reserved) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "No header pages reserved at the moment."); - return 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); - - /* Apply header space reservation */ - toi_extent_state_goto_start(); - - for (i = 0; i < header_pages_reserved; i++) - if (go_next_page(1, 0)) - return -ENOSPC; - - /* The end of header pages will be the start of pageset 2 */ - toi_extent_state_save(2); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Finished applying header reservation."); - return 0; -} - -static int toi_bio_register_storage(void) -{ - int result = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Registering storage from %s.", - this_module->name); - result = this_module->bio_allocator_ops->register_storage(); - if (result) - break; - } - - return result; -} - -void toi_bio_free_unused_storage(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_bio_free_unused_storage_chain(this); - this = this->next; - } -} - -int toi_bio_allocate_storage(unsigned long request) -{ - struct toi_bdev_info *chain = prio_chain_head; - unsigned long to_get = request; - unsigned long extra_pages, needed; - int no_free = 0; - - if (!chain) { - int result = toi_bio_register_storage(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Registering storage."); - if (result) - return 0; - chain = prio_chain_head; - if (!chain) { - printk("TuxOnIce: No storage was registered.\n"); - return 0; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Request is %lu pages.", request); - extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) - + sizeof(int)), PAGE_SIZE); - needed = request + extra_pages + header_pages_reserved; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " - "for header => %lu.", - extra_pages, header_pages_reserved, needed); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.", - raw_pages_allocd); - - to_get = needed > raw_pages_allocd ? 
needed - raw_pages_allocd : 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); - - if (!to_get) - return apply_header_reservation(); - - while (to_get && chain) { - int num_group = devices_of_same_priority(chain); - int divisor = num_group - no_free; - int i; - unsigned long portion = DIV_ROUND_UP(to_get, divisor); - unsigned long got = 0; - unsigned long got_this_round = 0; - struct toi_bdev_info *top = chain; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Start of loop. To get is %lu. Divisor is %d.", - to_get, divisor); - no_free = 0; - - /* - * We're aiming to spread the allocated storage as evenly - * as possible, but we also want to get all the storage we - * can off this priority. - */ - for (i = 0; i < num_group; i++) { - struct toi_bio_allocator_ops *ops = - chain->allocator->bio_allocator_ops; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Asking for %lu pages from chain %p.", - portion, chain); - got = ops->allocate_storage(chain, portion); - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Got %lu pages from allocator %p.", - got, chain); - if (!got) - no_free++; - got_this_round += got; - chain = chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " - "total of %lu pages from %d allocators.", - got_this_round, divisor - no_free); - - raw_pages_allocd += got_this_round; - to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : - 0; - - /* - * If we got anything from chains of this priority and we - * still have storage to allocate, go over this priority - * again. - */ - if (got_this_round && to_get) - chain = top; - else - no_free = 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " - "get_main_pool_phys_params"); - /* Now let swap allocator bmap the pages */ - get_main_pool_phys_params(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. 
Reserving header."); - return apply_header_reservation(); -} - -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - cur_chain->pages_used = bkd->pages_used[i]; - cur_chain = cur_chain->next; - i++; - } -} - -int toi_bio_chains_debug_info(char *buffer, int size) -{ - /* Show what we actually used */ - struct toi_bdev_info *cur_chain = prio_chain_head; - int len = 0; - - while (cur_chain) { - len += scnprintf(buffer + len, size - len, " Used %lu pages " - "from %s.\n", cur_chain->pages_used, - cur_chain->name); - cur_chain = cur_chain->next; - } - - return len; -} - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain, - *cmp = prio_chain_head; - - ptr->save.chain = 1; - while (this != cmp) { - ptr->save.chain++; - cmp = cmp->next; - } - ptr->save.block = this->blocks.current_offset; - - /* Save the raw info internally for quicker access when updating pointers */ - ptr->bdev = this->bdev; - ptr->block = this->blocks.current_offset << this->bmap_shift; -} - -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - int i = ptr->save.chain - 1; - struct toi_bdev_info *this; - struct hibernate_extent *hib; - - /* Find chain by stored index */ - this = prio_chain_head; - while (i) { - this = this->next; - i--; - } - toi_writer_posn.current_chain = this; - - /* Restore block */ - this->blocks.current_offset = ptr->save.block; - - /* Find current offset from block number */ - hib = this->blocks.first; - - while (hib->start > ptr->save.block) { - hib = hib->next; - } - - this->blocks.last_touched = this->blocks.current_extent = hib; -} diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c deleted file mode 100644 index 87aa4c96e..000000000 --- a/kernel/power/tuxonice_bio_core.c +++ /dev/null @@ -1,1932 +0,0 @@ -/* - * kernel/power/tuxonice_bio.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains block io functions for TuxOnIce. These are - * used by the swapwriter and it is planned that they will also - * be used by the NFSwriter. 
- * - */ - -#include <linux/blkdev.h> -#include <linux/syscalls.h> -#include <linux/suspend.h> -#include <linux/ctype.h> -#include <linux/mount.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -#define MEMORY_ONLY 1 -#define THROTTLE_WAIT 2 - -/* #define MEASURE_MUTEX_CONTENTION */ -#ifndef MEASURE_MUTEX_CONTENTION -#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) -#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) -#else -unsigned long mutex_times[2][2][NR_CPUS]; -#define my_mutex_lock(index, the_lock) do { \ - int have_mutex; \ - have_mutex = mutex_trylock(the_lock); \ - if (!have_mutex) { \ - mutex_lock(the_lock); \ - mutex_times[index][0][smp_processor_id()]++; \ - } else { \ - mutex_times[index][1][smp_processor_id()]++; \ - } - -#define my_mutex_unlock(index, the_lock) \ - mutex_unlock(the_lock); \ -} while (0) -#endif - -static int page_idx, reset_idx; - -static int target_outstanding_io = 1024; -static int max_outstanding_writes, max_outstanding_reads; - -static struct page *bio_queue_head, *bio_queue_tail; -static atomic_t toi_bio_queue_size; -static DEFINE_SPINLOCK(bio_queue_lock); - -static int free_mem_throttle, throughput_throttle; -int more_readahead = 1; -static struct page *readahead_list_head, *readahead_list_tail; - -static struct page *waiting_on; - -static atomic_t toi_io_in_progress, toi_io_done; -static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); - -int current_stream; -/* Not static, so that the allocators can setup and complete - * writing the header */ -char *toi_writer_buffer; -int toi_writer_buffer_posn; - -static DEFINE_MUTEX(toi_bio_mutex); -static DEFINE_MUTEX(toi_bio_readahead_mutex); - -static struct task_struct *toi_queue_flusher; -static int toi_bio_queue_flush_pages(int dedicated_thread); - -struct toi_module_ops toi_blockwriter_ops; - -struct toi_incremental_image_pointer toi_inc_ptr[2][2]; - -#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ - atomic_read(&toi_bio_queue_size)) - -unsigned long raw_pages_allocd, header_pages_reserved; - -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead); - -/** - * set_free_mem_throttle - set the point where we pause to avoid oom. - * - * Initially, this value is zero, but when we first fail to allocate memory, - * we set it (plus a buffer) and thereafter throttle i/o once that limit is - * reached. - **/ -static void set_free_mem_throttle(void) -{ - int new_throttle = nr_free_buffer_pages() + 256; - - if (new_throttle > free_mem_throttle) - free_mem_throttle = new_throttle; -} - -#define NUM_REASONS 7 -static atomic_t reasons[NUM_REASONS]; -static char *reason_name[NUM_REASONS] = { - "readahead not ready", - "bio allocation", - "synchronous I/O", - "toi_bio_get_new_page", - "memory low", - "readahead buffer allocation", - "throughput_throttle", -}; - -/* User Specified Parameters. */ -unsigned long resume_firstblock; -dev_t resume_dev_t; -struct block_device *resume_block_device; -static atomic_t resume_bdev_open_count; - -struct block_device *header_block_device; - -/** - * toi_open_bdev: Open a bdev at resume time. - * - * index: The swap index. 
May be MAX_SWAPFILES for the resume_dev_t - * (the user can have resume= pointing at a swap partition/file that isn't - * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the - * header. It will be from a swap partition that was enabled when we hibernated, - * but we don't know it's real index until we read that first page. - * dev_t: The device major/minor. - * display_errs: Whether to try to do this quietly. - * - * We stored a dev_t in the image header. Open the matching device without - * requiring /dev/<whatever> in most cases and record the details needed - * to close it later and avoid duplicating work. - */ -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs) -{ - struct block_device *bdev; - dev_t device = default_device; - char buf[32]; - int retried = 0; - -retry: - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = 0; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (!device) { - device = default_device; - printk(KERN_DEBUG "Unable to resolve uuid. Falling back" - " to dev_t.\n"); - } else - printk(KERN_DEBUG "Resolved uuid to device %s.\n", - format_dev_t(buf, device)); - } - - if (!device) { - printk(KERN_ERR "TuxOnIce attempting to open a " - "blank dev_t!\n"); - dump_stack(); - return NULL; - } - bdev = toi_open_by_devnum(device); - - if (IS_ERR(bdev) || !bdev) { - if (!retried) { - retried = 1; - wait_for_device_probe(); - goto retry; - } - if (display_errs) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to get access to block device " - "\"%x\" (error %d).\n Maybe you need " - "to run mknod and/or lvmsetup in an " - "initrd/ramfs?", device, bdev); - return ERR_PTR(-EINVAL); - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "TuxOnIce got bdev %p for dev_t %x.", - bdev, device); - - return bdev; -} - -static void toi_bio_reserve_header_space(unsigned long request) -{ - header_pages_reserved = request; -} - -/** - * do_bio_wait - wait for some TuxOnIce I/O to complete - * @reason: The array index of the reason we're waiting. - * - * Wait for a particular page of I/O if we're after a particular page. - * If we're not after a particular page, wait instead for all in flight - * I/O to be completed or for us to have enough free memory to be able - * to submit more I/O. - * - * If we wait, we also update our statistics regarding why we waited. - **/ -static void do_bio_wait(int reason) -{ - struct page *was_waiting_on = waiting_on; - - /* On SMP, waiting_on can be reset, so we make a copy */ - if (was_waiting_on) { - wait_on_page_locked(was_waiting_on); - atomic_inc(&reasons[reason]); - } else { - atomic_inc(&reasons[reason]); - - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - nr_free_buffer_pages() > free_mem_throttle); - } -} - -/** - * throttle_if_needed - wait for I/O completion if throttle points are reached - * @flags: What to check and how to act. - * - * Check whether we need to wait for some I/O to complete. We always check - * whether we have enough memory available, but may also (depending upon - * @reason) check if the throughput throttle limit has been reached. - **/ -static int throttle_if_needed(int flags) -{ - int free_pages = nr_free_buffer_pages(); - - /* Getting low on memory and I/O is in progress? 
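A little further down, update_throughput_throttle() packs its reasoning into one expression. The derivation, assuming as its comment states that it is called four times per second (so jif_index counts quarter-second ticks):

        I/O per second       = done / (jif_index / 4) = 4 * done / jif_index
        a tenth of a second  = (4 * done / jif_index) / 10
                             = done * 2 / 5 / jif_index

which is exactly the expression in the function.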
*/ - while (unlikely(free_pages < free_mem_throttle) && - atomic_read(&toi_io_in_progress) && - !test_result_state(TOI_ABORTED)) { - if (!(flags & THROTTLE_WAIT)) - return -ENOMEM; - do_bio_wait(4); - free_pages = nr_free_buffer_pages(); - } - - while (!(flags & MEMORY_ONLY) && throughput_throttle && - TOTAL_OUTSTANDING_IO >= throughput_throttle && - !test_result_state(TOI_ABORTED)) { - int result = toi_bio_queue_flush_pages(0); - if (result) - return result; - atomic_inc(&reasons[6]); - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - TOTAL_OUTSTANDING_IO < throughput_throttle); - } - - return 0; -} - -/** - * update_throughput_throttle - update the raw throughput throttle - * @jif_index: The number of times this function has been called. - * - * This function is called four times per second by the core, and used to limit - * the amount of I/O we submit at once, spreading out our waiting through the - * whole job and letting userui get an opportunity to do its work. - * - * We don't start limiting I/O until 1/4s has gone so that we get a - * decent sample for our initial limit, and keep updating it because - * throughput may vary (on rotating media, eg) with our block number. - * - * We throttle to 1/10s worth of I/O. - **/ -static void update_throughput_throttle(int jif_index) -{ - int done = atomic_read(&toi_io_done); - throughput_throttle = done * 2 / 5 / jif_index; -} - -/** - * toi_finish_all_io - wait for all outstanding i/o to complete - * - * Flush any queued but unsubmitted I/O and wait for it all to complete. - **/ -static int toi_finish_all_io(void) -{ - int result = toi_bio_queue_flush_pages(0); - toi_bio_queue_flusher_should_finish = 1; - wake_up(&toi_io_queue_flusher); - wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO); - return result; -} - -/** - * toi_end_bio - bio completion function. - * @bio: bio that has completed. - * - * Function called by the block driver from interrupt context when I/O is - * completed. If we were writing the page, we want to free it and will have - * set bio->bi_private to the parameter we should use in telling the page - * allocation accounting code what the page was allocated for. If we're - * reading the page, it will be in the singly linked list made from - * page->private pointers. - **/ -static void toi_end_bio(struct bio *bio) -{ - struct page *page = bio->bi_io_vec[0].bv_page; - - BUG_ON(bio->bi_error); - - unlock_page(page); - bio_put(bio); - - if (waiting_on == page) - waiting_on = NULL; - - put_page(page); - - if (bio->bi_private) - toi__free_page((int) ((unsigned long) bio->bi_private) , page); - - bio_put(bio); - - atomic_dec(&toi_io_in_progress); - atomic_inc(&toi_io_done); - - wake_up(&num_in_progress_wait); -} - -/** - * submit - submit BIO request - * @writing: READ or WRITE. - * @dev: The block device we're using. - * @first_block: The first sector we're using. - * @page: The page being used for I/O. - * @free_group: If writing, the group that was used in allocating the page - * and which will be used in freeing the page from the completion - * routine. - * - * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the - * textbook - allocate and initialize the bio. If we're writing, make sure - * the page is marked as dirty. Then submit it and carry on." - * - * If we're just testing the speed of our own code, we fake having done all - * the hard work and all toi_end_bio immediately. 
- **/ -static int submit(int writing, struct block_device *dev, sector_t first_block, - struct page *page, int free_group) -{ - struct bio *bio = NULL; - int cur_outstanding_io, result; - - /* - * Shouldn't throttle if reading - can deadlock in the single - * threaded case as pages are only freed when we use the - * readahead. - */ - if (writing) { - result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); - if (result) - return result; - } - - while (!bio) { - bio = bio_alloc(TOI_ATOMIC_GFP, 1); - if (!bio) { - set_free_mem_throttle(); - do_bio_wait(1); - } - } - - bio->bi_bdev = dev; - bio->bi_iter.bi_sector = first_block; - bio->bi_private = (void *) ((unsigned long) free_group); - bio->bi_end_io = toi_end_bio; - bio_set_flag(bio, BIO_TOI); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", - (unsigned long long) first_block); - bio_put(bio); - return -EFAULT; - } - - bio_get(bio); - - cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); - if (writing) { - if (cur_outstanding_io > max_outstanding_writes) - max_outstanding_writes = cur_outstanding_io; - } else { - if (cur_outstanding_io > max_outstanding_reads) - max_outstanding_reads = cur_outstanding_io; - } - - /* Still read the header! */ - if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { - /* Fake having done the hard work */ - bio->bi_error = 0; - toi_end_bio(bio); - } else - submit_bio(writing | REQ_SYNC, bio); - - return 0; -} - -/** - * toi_do_io: Prepare to do some i/o on a page and submit or batch it. - * - * @writing: Whether reading or writing. - * @bdev: The block device which we're using. - * @block0: The first sector we're reading or writing. - * @page: The page on which I/O is being done. - * @readahead_index: If doing readahead, the index (reset this flag when done). - * @syncio: Whether the i/o is being done synchronously. - * - * Prepare and start a read or write operation. - * - * Note that we always work with our own page. If writing, we might be given a - * compression buffer that will immediately be used to start compressing the - * next page. For reading, we do readahead and therefore don't know the final - * address where the data needs to go. - **/ -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group) -{ - page->private = 0; - - /* Do here so we don't race against toi_bio_get_next_page_read */ - lock_page(page); - - if (is_readahead) { - if (readahead_list_head) - readahead_list_tail->private = (unsigned long) page; - else - readahead_list_head = page; - - readahead_list_tail = page; - } - - /* Done before submitting to avoid races. */ - if (syncio) - waiting_on = page; - - /* Submit the page */ - get_page(page); - - if (submit(writing, bdev, block0, page, free_group)) - return -EFAULT; - - if (syncio) - do_bio_wait(2); - - return 0; -} - -/** - * toi_bdev_page_io - simpler interface to do directly i/o on a single page - * @writing: Whether reading or writing. - * @bdev: Block device on which we're operating. - * @pos: Sector at which page to read or write starts. - * @page: Page to be read/written. - * - * A simple interface to submit a page of I/O and wait for its completion. - * The caller must free the page used. 
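A hedged usage sketch for the helper defined below (assuming same-file placement, since toi_bdev_page_io() is static; the caller allocates and frees the page, as the comment requires):

static int sketch_read_one_page(struct block_device *bdev, long pos)
{
        struct page *page = alloc_page(GFP_KERNEL);
        int ret;

        if (!page)
                return -ENOMEM;

        ret = toi_bdev_page_io(READ, bdev, pos, page);
        if (!ret) {
                /* ... consume page_address(page) here ... */
        }

        __free_page(page);
        return ret;
}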
- **/ -static int toi_bdev_page_io(int writing, struct block_device *bdev, - long pos, struct page *page) -{ - return toi_do_io(writing, bdev, pos, page, 0, 1, 0); -} - -/** - * toi_bio_memory_needed - report the amount of memory needed for block i/o - * - * We want to have at least enough memory so as to have target_outstanding_io - * or more transactions on the fly at once. If we can do more, fine. - **/ -static int toi_bio_memory_needed(void) -{ - return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + - sizeof(struct bio)); -} - -/** - * toi_bio_print_debug_stats - put out debugging info in the buffer provided - * @buffer: A buffer of size @size into which text should be placed. - * @size: The size of @buffer. - * - * Fill a buffer with debugging info. This is used for both our debug_info sysfs - * entry and for recording the same info in dmesg. - **/ -static int toi_bio_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - if (toiActiveAllocator != &toi_blockwriter_ops) { - len = scnprintf(buffer, size, - "- Block I/O inactive.\n"); - return len; - } - - len = scnprintf(buffer, size, "- Block I/O active.\n"); - - len += toi_bio_chains_debug_info(buffer + len, size - len); - - len += scnprintf(buffer + len, size - len, - "- Max outstanding reads %d. Max writes %d.\n", - max_outstanding_reads, max_outstanding_writes); - - len += scnprintf(buffer + len, size - len, - " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", - target_outstanding_io, - PAGE_SIZE, (unsigned int) sizeof(struct request), - (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); - -#ifdef MEASURE_MUTEX_CONTENTION - { - int i; - - len += scnprintf(buffer + len, size - len, - " Mutex contention while reading:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[0][0][i], mutex_times[0][1][i]); - - len += scnprintf(buffer + len, size - len, - " Mutex contention while writing:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[1][0][i], mutex_times[1][1][i]); - - } -#endif - - return len + scnprintf(buffer + len, size - len, - " Free mem throttle point reached %d.\n", free_mem_throttle); -} - -static int total_header_bytes; -static int unowned; - -void debug_broken_header(void) -{ - printk(KERN_DEBUG "Image header too big for size allocated!\n"); - print_toi_header_storage_for_modules(); - printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); - printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); - printk(KERN_DEBUG "Total unowned : %d.\n", unowned); - printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, - DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); - printk(KERN_DEBUG "Space needed now : %ld.\n", - get_header_storage_needed()); - dump_block_chains(); - abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); -} - -static int toi_bio_update_previous_inc_img_ptr(int stream) -{ - int result; - char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP); - struct page *page; - struct toi_incremental_image_pointer *prev, *this; - - prev = &toi_inc_ptr[stream][0]; - this = &toi_inc_ptr[stream][1]; - - if (!buffer) { - // We're at the start of writing a pageset. Memory should not be that scarce. 
- return -ENOMEM; - } - - page = virt_to_page(buffer); - result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0); - - if (result) - goto out; - - memcpy(buffer, (char *) this, sizeof(this->save)); - - result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12); - - // If the IO is successfully submitted (!result), the page will be freed - // asynchronously on completion. -out: - if (result) - toi__free_page(12, virt_to_page(buffer)); - return result; -} - -/** - * toi_rw_init_incremental - incremental image part of setting up to write new section - */ -static int toi_write_init_incremental(int stream) -{ - int result = 0; - - // Remember the location of this block so we can link to it. - toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Update the pointer at the start of the last pageset with the same stream number. - result = toi_bio_update_previous_inc_img_ptr(stream); - if (result) - return result; - - // Move the current to the previous slot. - memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1])); - - // Store a blank pointer at the start of this incremental pageset - memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1])); - result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - if (result) - return result; - - // Serialise extent chains if this is an incremental pageset - return toi_serialise_extent_chains(); -} - -/** - * toi_read_init_incremental - incremental image part of setting up to read new section - */ -static int toi_read_init_incremental(int stream) -{ - int result; - - // Set our position to the start of the next pageset - toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Read the start of the next incremental pageset (if any) - result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - - if (!result) - result = toi_load_extent_chains(); - - return result; -} - -/** - * toi_rw_init - prepare to read or write a stream in the image - * @writing: Whether reading or writing. - * @stream number: Section of the image being processed. - * - * Prepare to read or write a section ('stream') in the image. - **/ -static int toi_rw_init(int writing, int stream_number) -{ - if (stream_number) - toi_extent_state_restore(stream_number); - else - toi_extent_state_goto_start(); - - if (writing) { - reset_idx = 0; - if (!current_stream) - page_idx = 0; - } else { - reset_idx = 1; - } - - atomic_set(&toi_io_done, 0); - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE; - - current_stream = stream_number; - - more_readahead = 1; - - if (test_result_state(TOI_KEPT_IMAGE)) { - int result; - - if (writing) { - result = toi_write_init_incremental(stream_number); - } else { - result = toi_read_init_incremental(stream_number); - } - - if (result) - return result; - } - - return toi_writer_buffer ? 0 : -ENOMEM; -} - -/** - * toi_bio_queue_write - queue a page for writing - * @full_buffer: Pointer to a page to be queued - * - * Add a page to the queue to be submitted. If we're the queue flusher, - * we'll do this once we've dropped toi_bio_mutex, so other threads can - * continue to submit I/O while we're on the slow path doing the actual - * submission. 
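The queue that toi_bio_queue_write() below feeds is a singly linked list threaded through page->private and guarded by bio_queue_lock, a common way to queue pages without allocating list nodes. The linking step in isolation (a sketch; the real code operates on the file-scope bio_queue_head/bio_queue_tail under the spinlock):

static void sketch_enqueue_page(struct page **head, struct page **tail,
                                struct page *page)
{
        page->private = 0;                      /* new list terminator */
        if (*head)
                (*tail)->private = (unsigned long) page;
        else
                *head = page;
        *tail = page;
}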
- **/ -static void toi_bio_queue_write(char **full_buffer) -{ - struct page *page = virt_to_page(*full_buffer); - unsigned long flags; - - *full_buffer = NULL; - page->private = 0; - - spin_lock_irqsave(&bio_queue_lock, flags); - if (!bio_queue_head) - bio_queue_head = page; - else - bio_queue_tail->private = (unsigned long) page; - - bio_queue_tail = page; - atomic_inc(&toi_bio_queue_size); - - spin_unlock_irqrestore(&bio_queue_lock, flags); - wake_up(&toi_io_queue_flusher); -} - -/** - * toi_rw_cleanup - Cleanup after i/o. - * @writing: Whether we were reading or writing. - * - * Flush all I/O and clean everything up after reading or writing a - * section of the image. - **/ -static int toi_rw_cleanup(int writing) -{ - int i, result = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup."); - if (writing) { - if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED)) - toi_bio_queue_write(&toi_writer_buffer); - - while (bio_queue_head && !result) - result = toi_bio_queue_flush_pages(0); - - if (result) - return result; - - if (current_stream == 2) - toi_extent_state_save(1); - else if (current_stream == 1) - toi_extent_state_save(3); - } - - result = toi_finish_all_io(); - - while (readahead_list_head) { - void *next = (void *) readahead_list_head->private; - toi__free_page(12, readahead_list_head); - readahead_list_head = next; - } - - readahead_list_tail = NULL; - - if (!current_stream) - return result; - - for (i = 0; i < NUM_REASONS; i++) { - if (!atomic_read(&reasons[i])) - continue; - printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n", - reason_name[i], atomic_read(&reasons[i])); - atomic_set(&reasons[i], 0); - } - - current_stream = 0; - return result; -} - -/** - * toi_start_one_readahead - start one page of readahead - * @dedicated_thread: Is this a thread dedicated to doing readahead? - * - * Start one new page of readahead. If this is being called by a thread - * whose only just is to submit readahead, don't quit because we failed - * to allocate a page. - **/ -static int toi_start_one_readahead(int dedicated_thread) -{ - char *buffer = NULL; - int oom = 0, result; - - result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0); - if (result) { - printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result); - return result; - } - - mutex_lock(&toi_bio_readahead_mutex); - - while (!buffer) { - buffer = (char *) toi_get_zeroed_page(12, - TOI_ATOMIC_GFP); - if (!buffer) { - if (oom && !dedicated_thread) { - mutex_unlock(&toi_bio_readahead_mutex); - printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result); - return -ENOMEM; - } - - oom = 1; - set_free_mem_throttle(); - do_bio_wait(5); - } - } - - result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0); - if (result) { - printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result); - } - if (result == -ENOSPC) - toi__free_page(12, virt_to_page(buffer)); - mutex_unlock(&toi_bio_readahead_mutex); - if (result) { - if (result == -ENOSPC) - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Last readahead page submitted."); - else - printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n", - result); - } - return result; -} - -/** - * toi_start_new_readahead - start new readahead - * @dedicated_thread: Are we dedicated to this task? - * - * Start readahead of image pages. - * - * We can be called as a thread dedicated to this task (may be helpful on - * systems with lots of CPUs), in which case we don't exit until there's no - * more readahead. 
- * - * If this is not called by a dedicated thread, we top up our queue until - * there's no more readahead to submit, we've submitted the number given - * in target_outstanding_io or the number in progress exceeds the target - * outstanding I/O value. - * - * No mutex needed because this is only ever called by the first cpu. - **/ -static int toi_start_new_readahead(int dedicated_thread) -{ - int last_result, num_submitted = 0; - - /* Start a new readahead? */ - if (!more_readahead) - return 0; - - do { - last_result = toi_start_one_readahead(dedicated_thread); - - if (last_result) { - if (last_result == -ENOMEM || last_result == -ENOSPC) - return 0; - - printk(KERN_DEBUG - "Begin read chunk returned %d.\n", - last_result); - } else - num_submitted++; - - } while (more_readahead && !last_result && - (dedicated_thread || - (num_submitted < target_outstanding_io && - atomic_read(&toi_io_in_progress) < target_outstanding_io))); - - return last_result; -} - -/** - * bio_io_flusher - start the dedicated I/O flushing routine - * @writing: Whether we're writing the image. - **/ -static int bio_io_flusher(int writing) -{ - - if (writing) - return toi_bio_queue_flush_pages(1); - else - return toi_start_new_readahead(1); -} - -/** - * toi_bio_get_next_page_read - read a disk page, perhaps with readahead - * @no_readahead: Whether we can use readahead - * - * Read a page from disk, submitting readahead and cleaning up finished i/o - * while we wait for the page we're after. - **/ -static int toi_bio_get_next_page_read(int no_readahead) -{ - char *virt; - struct page *old_readahead_list_head; - - /* - * When reading the second page of the header, we have to - * delay submitting the read until after we've gotten the - * extents out of the first page. - */ - if (unlikely(no_readahead)) { - int result = toi_start_one_readahead(0); - if (result) { - printk(KERN_EMERG "No readahead and toi_start_one_readahead " - "returned non-zero.\n"); - return -EIO; - } - } - - if (unlikely(!readahead_list_head)) { - /* - * If the last page finishes exactly on the page - * boundary, we will be called one extra time and - * have no data to return. In this case, we should - * not BUG(), like we used to! - */ - if (!more_readahead) { - printk(KERN_EMERG "No more readahead.\n"); - return -ENOSPC; - } - if (unlikely(toi_start_one_readahead(0))) { - printk(KERN_EMERG "No readahead and " - "toi_start_one_readahead returned non-zero.\n"); - return -EIO; - } - } - - if (PageLocked(readahead_list_head)) { - waiting_on = readahead_list_head; - do_bio_wait(0); - } - - virt = page_address(readahead_list_head); - memcpy(toi_writer_buffer, virt, PAGE_SIZE); - - mutex_lock(&toi_bio_readahead_mutex); - old_readahead_list_head = readahead_list_head; - readahead_list_head = (struct page *) readahead_list_head->private; - mutex_unlock(&toi_bio_readahead_mutex); - toi__free_page(12, old_readahead_list_head); - return 0; -} - -/** - * toi_bio_queue_flush_pages - flush the queue of pages queued for writing - * @dedicated_thread: Whether we're a dedicated thread - * - * Flush the queue of pages ready to be written to disk. - * - * If we're a dedicated thread, stay in here until told to leave, - * sleeping in wait_event. - * - * The first thread is normally the only one to come in here. Another - * thread can enter this routine too, though, via throttle_if_needed. - * Since that's the case, we must be careful to only have one thread - * doing this work at a time. Otherwise we have a race and could save - * pages out of order. 
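The comment above stresses that only one thread may drain the write queue at a time. The function that follows enforces that with a static mutex taken via trylock, so a second caller (arriving through throttle_if_needed) backs off immediately instead of blocking. A user-space sketch of the gate, assuming pthreads:

#include <pthread.h>

static pthread_mutex_t busy = PTHREAD_MUTEX_INITIALIZER;

/* First caller drains the queue; concurrent callers return at once,
 * which is what keeps the written pages in order. */
static int flush_once(void (*drain)(void))
{
	if (pthread_mutex_trylock(&busy))
		return 0;	/* someone else is already flushing */
	drain();
	pthread_mutex_unlock(&busy);
	return 1;
}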
- * - * If an error occurs, free all remaining pages without submitting them - * for I/O. - **/ - -int toi_bio_queue_flush_pages(int dedicated_thread) -{ - unsigned long flags; - int result = 0; - static DEFINE_MUTEX(busy); - - if (!mutex_trylock(&busy)) - return 0; - -top: - spin_lock_irqsave(&bio_queue_lock, flags); - while (bio_queue_head) { - struct page *page = bio_queue_head; - bio_queue_head = (struct page *) page->private; - if (bio_queue_tail == page) - bio_queue_tail = NULL; - atomic_dec(&toi_bio_queue_size); - spin_unlock_irqrestore(&bio_queue_lock, flags); - - /* Don't generate more error messages if already had one */ - if (!result) - result = toi_bio_rw_page(WRITE, page, 0, 11); - /* - * If writing the page failed, don't drop out. - * Flush the rest of the queue too. - */ - if (result) - toi__free_page(11 , page); - spin_lock_irqsave(&bio_queue_lock, flags); - } - spin_unlock_irqrestore(&bio_queue_lock, flags); - - if (dedicated_thread) { - wait_event(toi_io_queue_flusher, bio_queue_head || - toi_bio_queue_flusher_should_finish); - if (likely(!toi_bio_queue_flusher_should_finish)) - goto top; - toi_bio_queue_flusher_should_finish = 0; - } - - mutex_unlock(&busy); - return result; -} - -/** - * toi_bio_get_new_page - get a new page for I/O - * @full_buffer: Pointer to a page to allocate. - **/ -static int toi_bio_get_new_page(char **full_buffer) -{ - int result = throttle_if_needed(THROTTLE_WAIT); - if (result) - return result; - - while (!*full_buffer) { - *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); - if (!*full_buffer) { - set_free_mem_throttle(); - do_bio_wait(3); - } - } - - return 0; -} - -/** - * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O - * @writing: Bool - whether writing (or reading). - * @buffer: The start of the buffer to write or fill. - * @buffer_size: The size of the buffer to write or fill. - * @no_readahead: Don't try to start readhead (when getting extents). - **/ -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead) -{ - int bytes_left = buffer_size, result = 0; - - while (bytes_left) { - char *source_start = buffer + buffer_size - bytes_left; - char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; - int capacity = PAGE_SIZE - toi_writer_buffer_posn; - char *to = writing ? dest_start : source_start; - char *from = writing ? source_start : dest_start; - - if (bytes_left <= capacity) { - memcpy(to, from, bytes_left); - toi_writer_buffer_posn += bytes_left; - return 0; - } - - /* Complete this page and start a new one */ - memcpy(to, from, capacity); - bytes_left -= capacity; - - if (!writing) { - /* - * Perform actual I/O: - * read readahead_list_head into toi_writer_buffer - */ - int result = toi_bio_get_next_page_read(no_readahead); - if (result && bytes_left) { - printk("toi_bio_get_next_page_read " - "returned %d. Expecting to read %d bytes.\n", result, bytes_left); - return result; - } - } else { - toi_bio_queue_write(&toi_writer_buffer); - result = toi_bio_get_new_page(&toi_writer_buffer); - if (result) { - printk(KERN_ERR "toi_bio_get_new_page returned " - "%d.\n", result); - return result; - } - } - - toi_writer_buffer_posn = 0; - toi_cond_pause(0, NULL); - } - - return 0; -} - -/** - * toi_bio_read_page - read a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). 
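toi_rw_buffer() above packs arbitrarily sized transfers into page-sized I/O by tracking a byte position inside one staging page. A self-contained sketch of the write direction, where flush_page() is a stand-in for handing the full page to toi_bio_queue_write() (the names and the 4096-byte page size are assumptions):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static char staging[PAGE_SIZE];
static int staging_posn;

/* Stand-in for queueing the completed page for I/O. */
static void flush_page(void)
{
	printf("page full, queueing for write\n");
}

static void buffered_write(const char *buffer, int buffer_size)
{
	int bytes_left = buffer_size;

	while (bytes_left) {
		const char *src = buffer + buffer_size - bytes_left;
		int capacity = PAGE_SIZE - staging_posn;

		if (bytes_left <= capacity) {
			memcpy(staging + staging_posn, src, bytes_left);
			staging_posn += bytes_left;
			return;
		}
		/* Complete this page and start a new one. */
		memcpy(staging + staging_posn, src, capacity);
		bytes_left -= capacity;
		flush_page();
		staging_posn = 0;
	}
}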
- * - * Read a (possibly compressed) page from the image, into buffer_page, - * returning its pfn and the buffer size. - **/ -static int toi_bio_read_page(unsigned long *pfn, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int result = 0; - int this_idx; - char *buffer_virt = TOI_MAP(buf_type, buffer_page); - - /* - * Only call start_new_readahead if we don't have a dedicated thread - * and we're the queue flusher. - */ - if (current == toi_queue_flusher && more_readahead && - !test_action_state(TOI_NO_READAHEAD)) { - int result2 = toi_start_new_readahead(0); - if (result2) { - printk(KERN_DEBUG "Queue flusher and " - "toi_start_one_readahead returned non-zero.\n"); - result = -EIO; - goto out; - } - } - - my_mutex_lock(0, &toi_bio_mutex); - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - * We can validly find there's nothing to read in a multithreaded - * situation. - */ - if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) || - toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) || - toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) { - result = -ENODATA; - goto out_unlock; - } - - if (reset_idx) { - page_idx = this_idx; - reset_idx = 0; - } else { - page_idx++; - if (!this_idx) - result = -ENODATA; - else if (page_idx != this_idx) - printk(KERN_ERR "Got page index %d, expected %d.\n", - this_idx, page_idx); - } - -out_unlock: - my_mutex_unlock(0, &toi_bio_mutex); -out: - TOI_UNMAP(buf_type, buffer_page); - return result; -} - -/** - * toi_bio_write_page - write a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used. - * - * Write a (possibly compressed) page to the image from the buffer, together - * with it's index and buffer size. - **/ -static int toi_bio_write_page(unsigned long pfn, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - char *buffer_virt; - int result = 0, result2 = 0; - - if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) - return 0; - - my_mutex_lock(1, &toi_bio_mutex); - - if (test_result_state(TOI_ABORTED)) { - my_mutex_unlock(1, &toi_bio_mutex); - return 0; - } - - buffer_virt = TOI_MAP(buf_type, buffer_page); - page_idx++; - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - */ - if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) || - toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) || - toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) { - printk(KERN_DEBUG "toi_rw_buffer returned non-zero to " - "toi_bio_write_page.\n"); - result = -EIO; - } - - TOI_UNMAP(buf_type, buffer_page); - my_mutex_unlock(1, &toi_bio_mutex); - - if (current == toi_queue_flusher) - result2 = toi_bio_queue_flush_pages(0); - - return result ? result : result2; -} - -/** - * _toi_rw_header_chunk - read or write a portion of the image header - * @writing: Whether reading or writing. - * @owner: The module for which we're writing. - * Used for confirming that modules - * don't use more header space than they asked for. - * @buffer: Address of the data to write. - * @buffer_size: Size of the data buffer. - * @no_readahead: Don't try to start readhead (when getting extents). - * - * Perform PAGE_SIZE I/O. Start readahead if needed. 
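toi_bio_read_page() and toi_bio_write_page() above move each saved page as four consecutive toi_rw_buffer() transfers. Spelled out as a struct purely for illustration; no such struct exists in the image, since the fields are written back to back and a compiler might pad this layout differently:

/* On-image record implied by toi_bio_write_page(), illustrative only. */
struct image_page_record {
	int page_idx;		/* sequence number; the read side checks
				 * these increase by one and reports gaps */
	unsigned long pfn;	/* where the data belongs on restore */
	unsigned int buf_size;	/* bytes of (possibly compressed) data */
	/* followed immediately by buf_size bytes of page data */
};

On the read side, a zero index signals end of data (-ENODATA), and an index that does not match the expected counter is logged as an ordering error.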
- **/ -static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int buffer_size, int no_readahead) -{ - int result = 0; - - if (owner) { - owner->header_used += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: %s : %d bytes (%d/%d) from offset %d.", - owner->name, - buffer_size, owner->header_used, - owner->header_requested, - toi_writer_buffer_posn); - if (owner->header_used > owner->header_requested && writing) { - printk(KERN_EMERG "TuxOnIce module %s is using more " - "header space (%u) than it requested (%u).\n", - owner->name, - owner->header_used, - owner->header_requested); - return buffer_size; - } - } else { - unowned += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: (No owner): %d bytes (%d total so far) from " - "offset %d.", buffer_size, unowned, - toi_writer_buffer_posn); - } - - if (!writing && !no_readahead && more_readahead) { - result = toi_start_new_readahead(0); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " - "returned %d.", result); - } - - if (!result) { - result = toi_rw_buffer(writing, buffer, buffer_size, - no_readahead); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " - "%d.", result); - } - - total_header_bytes += buffer_size; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " - "%d.", result); - return result; -} - -static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -static int toi_rw_header_chunk_noreadahead(int writing, - struct toi_module_ops *owner, char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -/** - * toi_bio_storage_needed - get the amount of storage needed for my fns - **/ -static int toi_bio_storage_needed(void) -{ - return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); -} - -/** - * toi_bio_save_config_info - save block I/O config to image header - * @buf: PAGE_SIZE'd buffer into which data should be saved. - **/ -static int toi_bio_save_config_info(char *buf) -{ - int *ints = (int *) buf; - ints[0] = target_outstanding_io; - return sizeof(int); -} - -/** - * toi_bio_load_config_info - restore block I/O config - * @buf: Data to be reloaded. - * @size: Size of the buffer saved. - **/ -static void toi_bio_load_config_info(char *buf, int size) -{ - int *ints = (int *) buf; - target_outstanding_io = ints[0]; -} - -void close_resume_dev_t(int force) -{ - if (!resume_block_device) - return; - - if (force) - atomic_set(&resume_bdev_open_count, 0); - else - atomic_dec(&resume_bdev_open_count); - - if (!atomic_read(&resume_bdev_open_count)) { - toi_close_bdev(resume_block_device); - resume_block_device = NULL; - } -} - -int open_resume_dev_t(int force, int quiet) -{ - if (force) { - close_resume_dev_t(1); - atomic_set(&resume_bdev_open_count, 1); - } else - atomic_inc(&resume_bdev_open_count); - - if (resume_block_device) - return 0; - - resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); - if (IS_ERR(resume_block_device)) { - if (!quiet) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to open device %x, where" - " the header should be found.", - resume_dev_t); - resume_block_device = NULL; - atomic_set(&resume_bdev_open_count, 0); - return 1; - } - - return 0; -} - -/** - * toi_bio_initialise - initialise bio code at start of some action - * @starting_cycle: Whether starting a hibernation cycle, or just reading or - * writing a sysfs value. 
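open_resume_dev_t() and close_resume_dev_t() above share one open of the resume device between users with a plain atomic counter, plus a force path that resets it. The shape of the pattern as a user-space sketch, with do_open()/do_close() as assumed stand-ins for toi_open_bdev()/toi_close_bdev():

#include <stdatomic.h>
#include <stddef.h>

static atomic_int open_count;
static void *device;

extern void *do_open(void);
extern void do_close(void *dev);

static int dev_get(void)
{
	atomic_fetch_add(&open_count, 1);
	if (device)
		return 0;	/* already open, just share it */
	device = do_open();
	if (!device) {
		atomic_store(&open_count, 0);
		return 1;
	}
	return 0;
}

static void dev_put(void)
{
	/* Last user out closes the device. */
	if (atomic_fetch_sub(&open_count, 1) == 1) {
		do_close(device);
		device = NULL;
	}
}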
- **/ -static int toi_bio_initialise(int starting_cycle) -{ - int result; - - if (!starting_cycle || !resume_dev_t) - return 0; - - max_outstanding_writes = 0; - max_outstanding_reads = 0; - current_stream = 0; - toi_queue_flusher = current; -#ifdef MEASURE_MUTEX_CONTENTION - { - int i, j, k; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - for_each_online_cpu(k) - mutex_times[i][j][k] = 0; - } -#endif - result = open_resume_dev_t(0, 1); - - if (result) - return result; - - return get_signature_page(); -} - -static unsigned long raw_to_real(unsigned long raw) -{ - unsigned long extra; - - extra = (raw * (sizeof(unsigned long) + sizeof(int)) + - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); - - return raw > extra ? raw - extra : 0; -} - -static unsigned long toi_bio_storage_available(void) -{ - unsigned long sum = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " - "available from %s.", this_module->name); - sum += this_module->bio_allocator_ops->storage_available(); - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " - "pages (%d header pages).", sum, header_pages_reserved); - - return sum > header_pages_reserved ? - raw_to_real(sum - header_pages_reserved) : 0; - -} - -static unsigned long toi_bio_storage_allocated(void) -{ - return raw_pages_allocd > header_pages_reserved ? - raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; -} - -/* - * If we have read part of the image, we might have filled memory with - * data that should be zeroed out. - */ -static void toi_bio_noresume_reset(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); - toi_rw_cleanup(READ); - free_all_bdev_info(); -} - -/** - * toi_bio_cleanup - cleanup after some action - * @finishing_cycle: Whether completing a cycle. - **/ -static void toi_bio_cleanup(int finishing_cycle) -{ - if (!finishing_cycle) - return; - - if (toi_writer_buffer) { - toi_free_page(11, (unsigned long) toi_writer_buffer); - toi_writer_buffer = NULL; - } - - forget_signature_page(); - - if (header_block_device && toi_sig_data && - toi_sig_data->header_dev_t != resume_dev_t) - toi_close_bdev(header_block_device); - - header_block_device = NULL; - - close_resume_dev_t(0); -} - -static int toi_bio_write_header_init(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); - toi_rw_init(WRITE, 0); - toi_writer_buffer_posn = 0; - - /* Info needed to bootstrap goes at the start of the header. - * First we save the positions and devinfo, including the number - * of header pages. Then we save the structs containing data needed - * for reading the header pages back. - * Note that even if header pages take more than one page, when we - * read back the info, we will have restored the location of the - * next header page by the time we go to use it. - */ - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); - result = toi_serialise_extent_chains(); - - if (result) - return result; - - /* - * Signature page hasn't been modified at this point. Write it in - * the header so we can restore it later. 
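raw_to_real() above discounts the metadata the image charges for each saved page, sizeof(unsigned long) + sizeof(int) bytes, from the raw page count, rounding the overhead up to whole pages. A worked example, assuming 4 KiB pages and 64-bit longs:

#include <stdio.h>

int main(void)
{
	unsigned long raw = 1000;	/* raw pages of storage (example) */
	unsigned long meta = sizeof(unsigned long) + sizeof(int); /* 12 */
	unsigned long page = 4096;
	unsigned long extra = (raw * meta + (page + meta + 1)) / (page + meta);

	/* 1000 pages cost 12000 bytes of metadata, just under three
	 * 4 KiB pages; rounded up, extra = 3 and 997 pages stay usable. */
	printf("extra = %lu, usable = %lu\n", extra, raw - extra);
	return 0;
}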
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page."); - return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops, - (char *) toi_cur_sig_page, - PAGE_SIZE); -} - -static int toi_bio_write_header_cleanup(void) -{ - int result = 0; - - if (toi_writer_buffer_posn) - toi_bio_queue_write(&toi_writer_buffer); - - result = toi_finish_all_io(); - - unowned = 0; - total_header_bytes = 0; - - /* Set signature to save we have an image */ - if (!result) - result = toi_bio_mark_have_image(); - - return result; -} - -/* - * toi_bio_read_header_init() - * - * Description: - * 1. Attempt to read the device specified with resume=. - * 2. Check the contents of the swap header for our signature. - * 3. Warn, ignore, reset and/or continue as appropriate. - * 4. If continuing, read the toi_swap configuration section - * of the header and set up block device info so we can read - * the rest of the header & image. - * - * Returns: - * May not return if user choose to reboot at a warning. - * -EINVAL if cannot resume at this time. Booting should continue - * normally. - */ - -static int toi_bio_read_header_init(void) -{ - int result = 0; - char buf[32]; - - toi_writer_buffer_posn = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init"); - - if (!toi_sig_data) { - printk(KERN_INFO "toi_bio_read_header_init called when we " - "haven't verified there is an image!\n"); - return -EINVAL; - } - - /* - * If the header is not on the resume_swap_dev_t, get the resume device - * first. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.", - toi_sig_data->header_dev_t); - if (toi_sig_data->have_uuid) { - struct fs_info seek; - dev_t device; - - strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16); - seek.dev_t = toi_sig_data->header_dev_t; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (device) { - printk("Using dev_t %s, returned by blk_lookup_fs_info.\n", - format_dev_t(buf, device)); - toi_sig_data->header_dev_t = device; - } - } - if (toi_sig_data->header_dev_t != resume_dev_t) { - header_block_device = toi_open_bdev(NULL, - toi_sig_data->header_dev_t, 1); - - if (IS_ERR(header_block_device)) - return PTR_ERR(header_block_device); - } else - header_block_device = resume_block_device; - - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - more_readahead = 1; - - /* - * Read toi_swap configuration. - * Headerblock size taken into account already. - */ - result = toi_bio_ops.bdev_page_io(READ, header_block_device, - toi_sig_data->first_header_block, - virt_to_page((unsigned long) toi_writer_buffer)); - if (result) - return result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains."); - result = toi_load_extent_chains(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page."); - toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - if (!toi_orig_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the current" - " image signature.\n"); - return -ENOMEM; - } - - return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops, - (char *) toi_orig_sig_page, - PAGE_SIZE); -} - -static int toi_bio_read_header_cleanup(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup."); - return toi_rw_cleanup(READ); -} - -/* Works only for digits and letters, but small and fast */ -#define TOLOWER(x) ((x) | 0x20) - -/* - * UUID must be 32 chars long. It may have dashes, but nothing - * else. 
- */ -char *uuid_from_commandline(char *commandline) -{ - int low = 0; - char *result = NULL, *output, *ptr; - - if (strncmp(commandline, "UUID=", 5)) - return NULL; - - result = kzalloc(17, GFP_KERNEL); - if (!result) { - printk("Failed to kzalloc UUID text memory.\n"); - return NULL; - } - - ptr = commandline + 5; - output = result; - - while (*ptr && (output - result) < 16) { - if (isxdigit(*ptr)) { - int value = isdigit(*ptr) ? *ptr - '0' : - TOLOWER(*ptr) - 'a' + 10; - if (low) { - *output += value; - output++; - } else { - *output = value << 4; - } - low = !low; - } else if (*ptr != '-') - break; - ptr++; - } - - if ((output - result) < 16 || *ptr) { - printk(KERN_DEBUG "Found resume=UUID=, but the value looks " - "invalid.\n"); - kfree(result); - result = NULL; - } - - return result; -} - -#define retry_if_fails(command) \ -do { \ - command; \ - if (!resume_dev_t && !waited_for_device_probe) { \ - wait_for_device_probe(); \ - command; \ - waited_for_device_probe = 1; \ - } \ -} while(0) - -/** - * try_to_open_resume_device: Try to parse and open resume= - * - * Any "swap:" has been stripped away and we just have the path to deal with. - * We attempt to do name_to_dev_t, open and stat the file. Having opened the - * file, get the struct block_device * to match. - */ -static int try_to_open_resume_device(char *commandline, int quiet) -{ - struct kstat stat; - int error = 0; - char *uuid = uuid_from_commandline(commandline); - int waited_for_device_probe = 0; - - resume_dev_t = MKDEV(0, 0); - - if (!strlen(commandline)) - retry_if_fails(toi_bio_scan_for_image(quiet)); - - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = resume_dev_t; - seek.last_mount_size = 0; - retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); - kfree(uuid); - } - - if (!resume_dev_t) - retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); - - if (!resume_dev_t) { - struct file *file = filp_open(commandline, - O_RDONLY|O_LARGEFILE, 0); - - if (!IS_ERR(file) && file) { - vfs_getattr(&file->f_path, &stat); - filp_close(file, NULL); - } else - error = vfs_stat(commandline, &stat); - if (!error) - resume_dev_t = stat.rdev; - } - - if (!resume_dev_t) { - if (quiet) - return 1; - - if (test_toi_state(TOI_TRYING_TO_RESUME)) - toi_early_boot_message(1, toi_translate_err_default, - "Failed to translate \"%s\" into a device id.\n", - commandline); - else - printk("TuxOnIce: Can't translate \"%s\" into a device " - "id yet.\n", commandline); - return 1; - } - - return open_resume_dev_t(1, quiet); -} - -/* - * Parse Image Location - * - * Attempt to parse a resume= parameter. - * Swap Writer accepts: - * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] - * - * Where: - * DEVNAME is convertable to a dev_t by name_to_dev_t - * FIRSTBLOCK is the location of the first block in the swap file - * (specifying for a swap partition is nonsensical but not prohibited). - * Data is validated by attempting to read a swap header from the - * location given. Failure will result in toi_swap refusing to - * save an image, and a reboot with correct parameters will be - * necessary. 
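For reference, uuid_from_commandline() above packs the 32 hex digits of a UUID= argument into 16 raw bytes, two nibbles per byte, skipping dashes; anything else makes it return NULL. A worked example (the UUID value is made up):

/*
 * in:  "UUID=6f5902ac-0a5b-4b92-8c3d-00d1a9b0e1f2"
 * out: { 0x6f, 0x59, 0x02, 0xac, 0x0a, 0x5b, 0x4b, 0x92,
 *        0x8c, 0x3d, 0x00, 0xd1, 0xa9, 0xb0, 0xe1, 0xf2 }
 *
 * "UUID=6f59" (too short) and "UUID=6f59zz..." (a character that is
 * neither hex nor a dash) both fail the final checks and yield NULL.
 */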
- */ -static int toi_bio_parse_sig_location(char *commandline, - int only_allocator, int quiet) -{ - char *thischar, *devstart, *colon = NULL; - int signature_found, result = -EINVAL, temp_result = 0; - - if (strncmp(commandline, "swap:", 5) && - strncmp(commandline, "file:", 5)) { - /* - * Failing swap:, we'll take a simple resume=/dev/hda2, or a - * blank value (scan) but fall through to other allocators - * if /dev/ or UUID= isn't matched. - */ - if (strncmp(commandline, "/dev/", 5) && - strncmp(commandline, "UUID=", 5) && - strlen(commandline)) - return 1; - } else - commandline += 5; - - devstart = commandline; - thischar = commandline; - while ((*thischar != ':') && (*thischar != '@') && - ((thischar - commandline) < 250) && (*thischar)) - thischar++; - - if (*thischar == ':') { - colon = thischar; - *colon = 0; - thischar++; - } - - while ((thischar - commandline) < 250 && *thischar) - thischar++; - - if (colon) { - unsigned long block; - temp_result = kstrtoul(colon + 1, 0, &block); - if (!temp_result) - resume_firstblock = (int) block; - } else - resume_firstblock = 0; - - clear_toi_state(TOI_CAN_HIBERNATE); - clear_toi_state(TOI_CAN_RESUME); - - if (!temp_result) - temp_result = try_to_open_resume_device(devstart, quiet); - - if (colon) - *colon = ':'; - - /* No error if we only scanned */ - if (temp_result) - return strlen(commandline) ? -EINVAL : 1; - - signature_found = toi_bio_image_exists(quiet); - - if (signature_found != -1) { - result = 0; - /* - * TODO: If only file storage, CAN_HIBERNATE should only be - * set if file allocator's target is valid. - */ - set_toi_state(TOI_CAN_HIBERNATE); - set_toi_state(TOI_CAN_RESUME); - } else - if (!quiet) - printk(KERN_ERR "TuxOnIce: Block I/O: No " - "signature found at %s.\n", devstart); - - return result; -} - -static void toi_bio_release_storage(void) -{ - header_pages_reserved = 0; - raw_pages_allocd = 0; - - free_all_bdev_info(); -} - -/* toi_swap_remove_image - * - */ -static int toi_bio_remove_image(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); - - result = toi_bio_restore_original_signature(); - - /* - * We don't do a sanity check here: we want to restore the swap - * whatever version of kernel made the hibernate image. - * - * We need to write swap, but swap may not be enabled so - * we write the device directly - * - * If we don't have an current_signature_page, we didn't - * read an image header, so don't change anything. 
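Putting the grammar above together, these are representative command-line forms the parser accepts (device names, block numbers and the UUID are illustrative):

resume=swap:/dev/sda2            a swap partition by device node
resume=swap:/dev/sda2:0x2000     a swap file: device plus first block
resume=file:/dev/loop0:0x4000    same shape via the file allocator
resume=UUID=6f5902ac-0a5b-4b92-8c3d-00d1a9b0e1f2   device found by UUID
resume=/dev/sda2                 bare device node, no allocator prefix
resume=                          blank: scan swap devices for a signature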
- */ - - toi_bio_release_storage(); - - return result; -} - -struct toi_bio_ops toi_bio_ops = { - .bdev_page_io = toi_bdev_page_io, - .register_storage = toi_register_storage_chain, - .free_storage = toi_bio_release_storage, -}; - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, - 0, 16384, 0, NULL), -}; - -struct toi_module_ops toi_blockwriter_ops = { - .type = WRITER_MODULE, - .name = "block i/o", - .directory = "block_io", - .module = THIS_MODULE, - .memory_needed = toi_bio_memory_needed, - .print_debug_info = toi_bio_print_debug_stats, - .storage_needed = toi_bio_storage_needed, - .save_config_info = toi_bio_save_config_info, - .load_config_info = toi_bio_load_config_info, - .initialise = toi_bio_initialise, - .cleanup = toi_bio_cleanup, - .post_atomic_restore = toi_bio_chains_post_atomic, - - .rw_init = toi_rw_init, - .rw_cleanup = toi_rw_cleanup, - .read_page = toi_bio_read_page, - .write_page = toi_bio_write_page, - .rw_header_chunk = toi_rw_header_chunk, - .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, - .io_flusher = bio_io_flusher, - .update_throughput_throttle = update_throughput_throttle, - .finish_all_io = toi_finish_all_io, - - .noresume_reset = toi_bio_noresume_reset, - .storage_available = toi_bio_storage_available, - .storage_allocated = toi_bio_storage_allocated, - .reserve_header_space = toi_bio_reserve_header_space, - .allocate_storage = toi_bio_allocate_storage, - .free_unused_storage = toi_bio_free_unused_storage, - .image_exists = toi_bio_image_exists, - .mark_resume_attempted = toi_bio_mark_resume_attempted, - .write_header_init = toi_bio_write_header_init, - .write_header_cleanup = toi_bio_write_header_cleanup, - .read_header_init = toi_bio_read_header_init, - .read_header_cleanup = toi_bio_read_header_cleanup, - .get_header_version = toi_bio_get_header_version, - .remove_image = toi_bio_remove_image, - .parse_sig_location = toi_bio_parse_sig_location, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/** - * toi_block_io_load - load time routine for block I/O module - * - * Register block i/o ops and sysfs entries. - **/ -static __init int toi_block_io_load(void) -{ - return toi_register_module(&toi_blockwriter_ops); -} - -late_initcall(toi_block_io_load); diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h deleted file mode 100644 index 5e1964a61..000000000 --- a/kernel/power/tuxonice_bio_internal.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * kernel/power/tuxonice_bio_internal.h - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
- */ - -/* Extent chains */ -void toi_extent_state_goto_start(void); -void toi_extent_state_save(int slot); -int go_next_page(int writing, int section_barrier); -void toi_extent_state_restore(int slot); -void free_all_bdev_info(void); -int devices_of_same_priority(struct toi_bdev_info *this); -int toi_register_storage_chain(struct toi_bdev_info *new); -int toi_serialise_extent_chains(void); -int toi_load_extent_chains(void); -int toi_bio_rw_page(int writing, struct page *page, int is_readahead, - int free_group); -int toi_bio_restore_original_signature(void); -int toi_bio_devinfo_storage_needed(void); -unsigned long get_headerblock(void); -dev_t get_header_dev_t(void); -struct block_device *get_header_bdev(void); -int toi_bio_allocate_storage(unsigned long request); -void toi_bio_free_unused_storage(void); - -/* Signature functions */ -#define HaveImage "HaveImage" -#define NoImage "TuxOnIce" -#define sig_size (sizeof(HaveImage)) - -struct sig_data { - char sig[sig_size]; - int have_image; - int resumed_before; - - char have_uuid; - char header_uuid[17]; - dev_t header_dev_t; - unsigned long first_header_block; - - /* Repeat the signature to be sure we have a header version */ - char sig2[sig_size]; - int header_version; -}; - -void forget_signature_page(void); -int toi_check_for_signature(void); -int toi_bio_image_exists(int quiet); -int get_signature_page(void); -int toi_bio_mark_resume_attempted(int); -extern char *toi_cur_sig_page; -extern char *toi_orig_sig_page; -int toi_bio_mark_have_image(void); -extern struct sig_data *toi_sig_data; -extern dev_t resume_dev_t; -extern struct block_device *resume_block_device; -extern struct block_device *header_block_device; -extern unsigned long resume_firstblock; - -struct block_device *open_bdev(dev_t device, int display_errs); -extern int current_stream; -extern int more_readahead; -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group); -int get_main_pool_phys_params(void); - -void toi_close_bdev(struct block_device *bdev); -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs); - -extern struct toi_module_ops toi_blockwriter_ops; -void dump_block_chains(void); -void debug_broken_header(void); -extern unsigned long raw_pages_allocd, header_pages_reserved; -int toi_bio_chains_debug_info(char *buffer, int size); -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); -int toi_bio_scan_for_image(int quiet); -int toi_bio_get_header_version(void); - -void close_resume_dev_t(int force); -int open_resume_dev_t(int force, int quiet); - -struct toi_incremental_image_pointer_saved_data { - unsigned long block; - int chain; -}; - -struct toi_incremental_image_pointer { - struct toi_incremental_image_pointer_saved_data save; - struct block_device *bdev; - unsigned long block; -}; - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr); -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr); diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c deleted file mode 100644 index f5418f092..000000000 --- a/kernel/power/tuxonice_bio_signature.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * kernel/power/tuxonice_bio_signature.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. 
- * - */ - -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -struct sig_data *toi_sig_data; - -/* Struct of swap header pages */ - -struct old_sig_data { - dev_t device; - unsigned long sector; - int resume_attempted; - int orig_sig_type; -}; - -union diskpage { - union swap_header swh; /* swh.magic is the only member used */ - struct sig_data sig_data; - struct old_sig_data old_sig_data; -}; - -union p_diskpage { - union diskpage *pointer; - char *ptr; - unsigned long address; -}; - -char *toi_cur_sig_page; -char *toi_orig_sig_page; -int have_image; -int have_old_image; - -int get_signature_page(void) -{ - if (!toi_cur_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Allocating current signature page."); - toi_cur_sig_page = (char *) toi_get_zeroed_page(38, - TOI_ATOMIC_GFP); - if (!toi_cur_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the " - "current image signature.\n"); - return -ENOMEM; - } - - toi_sig_data = (struct sig_data *) toi_cur_sig_page; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," - " sector %d.", - resume_block_device->bd_dev, resume_firstblock); - - return toi_bio_ops.bdev_page_io(READ, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -void forget_signature_page(void) -{ - if (toi_cur_sig_page) { - toi_sig_data = NULL; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" - " (%p).", toi_cur_sig_page); - toi_free_page(38, (unsigned long) toi_cur_sig_page); - toi_cur_sig_page = NULL; - } - - if (toi_orig_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" - " (%p).", toi_orig_sig_page); - toi_free_page(38, (unsigned long) toi_orig_sig_page); - toi_orig_sig_page = NULL; - } -} - -/* - * We need to ensure we use the signature page that's currently on disk, - * so as to not remove the image header. Post-atomic-restore, the orig sig - * page will be empty, so we can use that as our method of knowing that we - * need to load the on-disk signature and not use the non-image sig in - * memory. (We're going to powerdown after writing the change, so it's safe. 
- */ -int toi_bio_mark_resume_attempted(int flag) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", - flag); - if (!toi_orig_sig_page) { - forget_signature_page(); - get_signature_page(); - } - toi_sig_data->resumed_before = flag; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int toi_bio_mark_have_image(void) -{ - int result = 0; - char buf[32]; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); - memcpy(toi_sig_data->sig, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->have_image = 1; - toi_sig_data->resumed_before = 0; - toi_sig_data->header_dev_t = get_header_dev_t(); - toi_sig_data->have_uuid = 0; - - fs_info = fs_info_from_block_dev(get_header_bdev()); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!result) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - toi_sig_data->have_uuid = 1; - } else - toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " - "dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - - toi_sig_data->first_header_block = get_headerblock(); - have_image = 1; - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block " - "is %d.", toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - - memcpy(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->header_version = TOI_HEADER_VERSION; - - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int remove_old_signature(void) -{ - union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; - char *orig_sig; - char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - int result; - struct block_device *header_bdev; - struct old_sig_data *old_sig_data = - &swap_header_page.pointer->old_sig_data; - - header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); - result = toi_bio_ops.bdev_page_io(READ, header_bdev, - old_sig_data->sector, virt_to_page(header_start)); - - if (result) - goto out; - - /* - * TODO: Get the original contents of the first bytes of the swap - * header page. - */ - if (!old_sig_data->orig_sig_type) - orig_sig = "SWAP-SPACE"; - else - orig_sig = "SWAPSPACE2"; - - memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); - memcpy(swap_header_page.ptr, header_start, 10); - - result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(swap_header_page.ptr)); - -out: - toi_close_bdev(header_bdev); - have_old_image = 0; - toi_free_page(38, (unsigned long) header_start); - return result; -} - -/* - * toi_bio_restore_original_signature - restore the original signature - * - * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. - * It will have the original signature page contents, stored in the image - * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain - * the contents that were loaded when we started the cycle. - */ -int toi_bio_restore_original_signature(void) -{ - char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; - - if (have_old_image) - return remove_old_signature(); - - if (!use) { - printk("toi_bio_restore_original_signature: No signature " - "page loaded.\n"); - return 0; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); - have_image = 0; - toi_sig_data->have_image = 0; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(use)); -} - -/* - * check_for_signature - See whether we have an image. - * - * Returns 0 if no image, 1 if there is one, -1 if indeterminate. - */ -int toi_check_for_signature(void) -{ - union p_diskpage swap_header_page; - int type; - const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; - const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; - char *swap_header; - - if (!toi_cur_sig_page) { - int result = get_signature_page(); - - if (result) - return result; - } - - /* - * Start by looking for the binary header. - */ - if (!memcmp(tuxonice_signature, toi_cur_sig_page, - sizeof(tuxonice_signature))) { - have_image = toi_sig_data->have_image; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " - "Have image is %d.", have_image); - if (have_image) - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " - "%x. First block is %d.", - toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - return toi_sig_data->have_image; - } - - /* - * Failing that, try old file allocator headers. - */ - - if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { - have_image = 1; - return 1; - } - - have_image = 0; - - if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) - return 0; - - /* - * Nope? How about swap? - */ - swap_header_page = (union p_diskpage) toi_cur_sig_page; - swap_header = swap_header_page.pointer->swh.magic.magic; - - /* Normal swapspace? */ - for (type = 0; type < 2; type++) - if (!memcmp(normal_sigs[type], swap_header, - strlen(normal_sigs[type]))) - return 0; - - /* Swsusp or uswsusp? */ - for (type = 0; type < 3; type++) - if (!memcmp(swsusp_sigs[type], swap_header, - strlen(swsusp_sigs[type]))) - return 2; - - /* Old TuxOnIce version? */ - if (!memcmp(tuxonice_signature, swap_header, - sizeof(tuxonice_signature) - 1)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " - "signature."); - have_old_image = 1; - return 3; - } - - return -1; -} - -/* - * Image_exists - * - * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). - */ -int toi_bio_image_exists(int quiet) -{ - int result; - char *msg = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); - - if (!resume_dev_t) { - if (!quiet) - printk(KERN_INFO "Not even trying to read header " - "because resume_dev_t is not set.\n"); - return -1; - } - - if (open_resume_dev_t(0, quiet)) - return -1; - - result = toi_check_for_signature(); - - clear_toi_state(TOI_RESUMED_BEFORE); - if (toi_sig_data->resumed_before) - set_toi_state(TOI_RESUMED_BEFORE); - - if (quiet || result == -ENOMEM) - return result; - - if (result == -1) - msg = "TuxOnIce: Unable to find a signature." 
- " Could you have moved a swap file?\n"; - else if (!result) - msg = "TuxOnIce: No image found.\n"; - else if (result == 1) - msg = "TuxOnIce: Image found.\n"; - else if (result == 2) - msg = "TuxOnIce: uswsusp or swsusp image found.\n"; - else if (result == 3) - msg = "TuxOnIce: Old implementation's signature found.\n"; - - printk(KERN_INFO "%s", msg); - - return result; -} - -int toi_bio_scan_for_image(int quiet) -{ - struct block_device *bdev; - char default_name[255] = ""; - - if (!quiet) - printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " - "signature...\n"); - for (bdev = next_bdev_of_type(NULL, "swap"); bdev; - bdev = next_bdev_of_type(bdev, "swap")) { - int result; - char name[255] = ""; - sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - if (!quiet) - printk(KERN_DEBUG "- Trying %s.\n", name); - resume_block_device = bdev; - resume_dev_t = bdev->bd_dev; - - result = toi_check_for_signature(); - - resume_block_device = NULL; - resume_dev_t = MKDEV(0, 0); - - if (!default_name[0]) - strcpy(default_name, name); - - if (result == 1) { - /* Got one! */ - strcpy(resume_file, name); - next_bdev_of_type(bdev, NULL); - if (!quiet) - printk(KERN_DEBUG " ==> Image found on %s.\n", - resume_file); - return 1; - } - forget_signature_page(); - } - - if (!quiet) - printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); - strcpy(resume_file, default_name); - return 0; -} - -int toi_bio_get_header_version(void) -{ - return (memcmp(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature))) ? - 0 : toi_sig_data->header_version; - -} diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c deleted file mode 100644 index 22bf07a43..000000000 --- a/kernel/power/tuxonice_builtin.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include <linux/kernel.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/bio.h> -#include <linux/root_dev.h> -#include <linux/freezer.h> -#include <linux/reboot.h> -#include <linux/writeback.h> -#include <linux/tty.h> -#include <linux/crypto.h> -#include <linux/cpu.h> -#include <linux/ctype.h> -#include <linux/kthread.h> -#include "tuxonice_io.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_netlink.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_ui.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_power_off.h" -#include "tuxonice_alloc.h" - -unsigned long toi_bootflags_mask; - -/* - * Highmem related functions (x86 only). - */ - -#ifdef CONFIG_HIGHMEM - -/** - * copyback_high: Restore highmem pages. - * - * Highmem data and pbe lists are/can be stored in highmem. - * The format is slightly different to the lowmem pbe lists - * used for the assembly code: the last pbe in each page is - * a struct page * instead of struct pbe *, pointing to the - * next page where pbes are stored (or NULL if happens to be - * the end of the list). Since we don't want to generate - * unnecessary deltas against swsusp code, we use a cast - * instead of a union. 
- **/ - -static void copyback_high(void) -{ - struct page *pbe_page = (struct page *) restore_highmem_pblist; - struct pbe *this_pbe, *first_pbe; - unsigned long *origpage, *copypage; - int pbe_index = 1; - - if (!pbe_page) - return; - - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - - while (this_pbe) { - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; - - origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); - copypage = kmap_atomic((struct page *) this_pbe->address); - - while (loop >= 0) { - *(origpage + loop) = *(copypage + loop); - loop--; - } - - kunmap_atomic(origpage); - kunmap_atomic(copypage); - - if (!this_pbe->next) - break; - - if (pbe_index < PBES_PER_PAGE) { - this_pbe++; - pbe_index++; - } else { - pbe_page = (struct page *) this_pbe->next; - kunmap_atomic(first_pbe); - if (!pbe_page) - return; - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - pbe_index = 1; - } - } - kunmap_atomic(first_pbe); -} - -#else /* CONFIG_HIGHMEM */ -static void copyback_high(void) { } -#endif - -char toi_wait_for_keypress_dev_console(int timeout) -{ - int fd, this_timeout = 255, orig_kthread = 0; - char key = '\0'; - struct termios t, t_backup; - - /* We should be guaranteed /dev/console exists after populate_rootfs() - * in init/main.c. - */ - fd = sys_open("/dev/console", O_RDONLY, 0); - if (fd < 0) { - printk(KERN_INFO "Couldn't open /dev/console.\n"); - return key; - } - - if (sys_ioctl(fd, TCGETS, (long)&t) < 0) - goto out_close; - - memcpy(&t_backup, &t, sizeof(t)); - - t.c_lflag &= ~(ISIG|ICANON|ECHO); - t.c_cc[VMIN] = 0; - -new_timeout: - if (timeout > 0) { - this_timeout = timeout < 26 ? timeout : 25; - timeout -= this_timeout; - this_timeout *= 10; - } - - t.c_cc[VTIME] = this_timeout; - - if (sys_ioctl(fd, TCSETS, (long)&t) < 0) - goto out_restore; - - if (current->flags & PF_KTHREAD) { - orig_kthread = (current->flags & PF_KTHREAD); - current->flags &= ~PF_KTHREAD; - } - - while (1) { - if (sys_read(fd, &key, 1) <= 0) { - if (timeout) - goto new_timeout; - key = '\0'; - break; - } - key = tolower(key); - if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { - if (key == 'c') { - set_toi_state(TOI_CONTINUE_REQ); - break; - } else if (key == ' ') - break; - } else - break; - } - if (orig_kthread) { - current->flags |= PF_KTHREAD; - } - -out_restore: - sys_ioctl(fd, TCSETS, (long)&t_backup); -out_close: - sys_close(fd); - - return key; -} - -struct toi_boot_kernel_data toi_bkd __nosavedata - __attribute__((aligned(PAGE_SIZE))) = { - MY_BOOT_KERNEL_DATA_VERSION, - 0, -#ifdef CONFIG_TOI_REPLACE_SWSUSP - (1 << TOI_REPLACE_SWSUSP) | -#endif - (1 << TOI_NO_FLUSHER_THREAD) | - (1 << TOI_PAGESET2_FULL), -}; - -struct block_device *toi_open_by_devnum(dev_t dev) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); - return err ? ERR_PTR(err) : bdev; -} - -/** - * toi_close_bdev: Close a swap bdev. - * - * int: The swap entry number to close. 
- */ -void toi_close_bdev(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); -} - -int toi_wait = CONFIG_TOI_DEFAULT_WAIT; -struct toi_core_fns *toi_core_fns; -unsigned long toi_result; -struct pagedir pagedir1 = {1}; -struct toi_cbw **toi_first_cbw; -int toi_next_cbw; - -unsigned long toi_get_nonconflicting_page(void) -{ - return toi_core_fns->get_nonconflicting_page(); -} - -int toi_post_context_save(void) -{ - return toi_core_fns->post_context_save(); -} - -int try_tuxonice_hibernate(void) -{ - if (!toi_core_fns) - return -ENODEV; - - return toi_core_fns->try_hibernate(); -} - -static int num_resume_calls; -#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL -static int ignore_late_initcall = 1; -#else -static int ignore_late_initcall; -#endif - -int toi_translate_err_default = TOI_CONTINUE_REQ; - -void try_tuxonice_resume(void) -{ - if (!hibernation_available()) - return; - - /* Don't let it wrap around eventually */ - if (num_resume_calls < 2) - num_resume_calls++; - - if (num_resume_calls == 1 && ignore_late_initcall) { - printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); - return; - } - - if (toi_core_fns) - toi_core_fns->try_resume(); - else - printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); -} - -int toi_lowlevel_builtin(void) -{ - int error = 0; - - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "Error %d hibernating\n", error); - - /* Restore control flow appears here */ - if (!toi_in_hibernate) { - copyback_high(); - set_toi_state(TOI_NOW_RESUMING); - } - - restore_processor_state(); - return error; -} - -unsigned long toi_compress_bytes_in; -unsigned long toi_compress_bytes_out; - -int toi_in_suspend(void) -{ - return in_suspend; -} - -unsigned long toi_state = ((1 << TOI_BOOT_TIME) | - (1 << TOI_IGNORE_LOGLEVEL) | - (1 << TOI_IO_STOPPED)); - -/* The number of hibernates we have started (some may have been cancelled) */ -unsigned int nr_hibernates; -int toi_running; -__nosavedata int toi_in_hibernate; -__nosavedata struct pbe *restore_highmem_pblist; - -int toi_trace_allocs; - -void toi_read_lock_tasklist(void) -{ - read_lock(&tasklist_lock); -} - -void toi_read_unlock_tasklist(void) -{ - read_unlock(&tasklist_lock); -} - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -int (*toi_flag_zram_disks) (void); - -int toi_do_flag_zram_disks(void) -{ - return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0; -} - -#endif - -/* toi_generate_free_page_map - * - * Description: This routine generates a bitmap of free pages from the - * lists used by the memory manager. We then use the bitmap - * to quickly calculate which pages to save and in which - * pagesets. 
- */ -void toi_generate_free_page_map(void) -{ - int order, cpu, t; - unsigned long flags, i; - struct zone *zone; - struct list_head *curr; - unsigned long pfn; - struct page *page; - - for_each_populated_zone(zone) { - - if (!zone->spanned_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - - for (i = 0; i < zone->spanned_pages; i++) { - pfn = zone->zone_start_pfn + i; - - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - - ClearPageNosaveFree(page); - } - - for_each_migratetype_order(order, t) { - list_for_each(curr, - &zone->free_area[order].free_list[t]) { - unsigned long j; - - pfn = page_to_pfn(list_entry(curr, struct page, - lru)); - for (j = 0; j < (1UL << order); j++) - SetPageNosaveFree(pfn_to_page(pfn + j)); - } - } - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - struct page *page; - int t; - - for (t = 0; t < MIGRATE_PCPTYPES; t++) - list_for_each_entry(page, &pcp->lists[t], lru) - SetPageNosaveFree(page); - } - - spin_unlock_irqrestore(&zone->lock, flags); - } -} - -/* toi_size_of_free_region - * - * Description: Return the number of pages that are free, beginning with and - * including this one. - */ -int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn) -{ - unsigned long this_pfn = start_pfn, - end_pfn = zone_end_pfn(zone); - - while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) - this_pfn++; - - return this_pfn - start_pfn; -} - -static int __init toi_wait_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) { - if (value < -1 || value > 255) - printk(KERN_INFO "TuxOnIce_wait outside range -1 to " - "255.\n"); - else - toi_wait = value; - } - - return 1; -} -__setup("toi_wait", toi_wait_setup); - -static int __init toi_translate_retry_setup(char *str) -{ - toi_translate_err_default = 0; - return 1; -} -__setup("toi_translate_retry", toi_translate_retry_setup); - -static int __init toi_debug_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_LOGALL); - toi_bootflags_mask |= (1 << TOI_LOGALL); - toi_bkd.toi_debug_state = 255; - toi_bkd.toi_default_console_level = 7; - return 1; -} -__setup("toi_debug_setup", toi_debug_setup); - -static int __init toi_pause_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_PAUSE); - toi_bootflags_mask |= (1 << TOI_PAUSE); - return 1; -} -__setup("toi_pause", toi_pause_setup); - -#ifdef CONFIG_PM_DEBUG -static int __init toi_trace_allocs_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - toi_trace_allocs = value; - - return 1; -} -__setup("toi_trace_allocs", toi_trace_allocs_setup); -#endif - -static int __init toi_ignore_late_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - ignore_late_initcall = value; - - return 1; -} -__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); - -static int __init toi_force_no_multithreaded_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); - toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); - - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); - - return 1; -} -__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); - -#ifdef CONFIG_KGDB -static int __init toi_post_resume_breakpoint_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); - toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); - if 
(sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); - - return 1; -} -__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); -#endif - -static int __init toi_disable_readahead_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); - toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); - - return 1; -} -__setup("toi_no_readahead", toi_disable_readahead_setup); diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h deleted file mode 100644 index 9539818e0..000000000 --- a/kernel/power/tuxonice_builtin.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include <asm/setup.h> - -extern struct toi_core_fns *toi_core_fns; -extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; -extern unsigned int nr_hibernates; -extern int toi_in_hibernate; - -extern __nosavedata struct pbe *restore_highmem_pblist; - -int toi_lowlevel_builtin(void); - -#ifdef CONFIG_HIGHMEM -extern __nosavedata struct zone_data *toi_nosave_zone_list; -extern __nosavedata unsigned long toi_nosave_max_pfn; -#endif - -extern unsigned long toi_get_nonconflicting_page(void); -extern int toi_post_context_save(void); - -extern char toi_wait_for_keypress_dev_console(int timeout); -extern struct block_device *toi_open_by_devnum(dev_t dev); -extern void toi_close_bdev(struct block_device *bdev); -extern int toi_wait; -extern int toi_translate_err_default; -extern int toi_force_no_multithreaded; -extern void toi_read_lock_tasklist(void); -extern void toi_read_unlock_tasklist(void); -extern int toi_in_suspend(void); -extern void toi_generate_free_page_map(void); -extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn); - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -extern int toi_do_flag_zram_disks(void); -#else -#define toi_do_flag_zram_disks() (0) -#endif diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c deleted file mode 100644 index 1c4e10c72..000000000 --- a/kernel/power/tuxonice_checksum.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
- */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" - -static struct toi_module_ops toi_checksum_ops; - -/* Constant at the mo, but I might allow tuning later */ -static char toi_checksum_name[32] = "md4"; -/* Bytes per checksum */ -#define CHECKSUM_SIZE (16) - -#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) - -struct cpu_context { - struct crypto_hash *transform; - struct hash_desc desc; - struct scatterlist sg[2]; - char *buf; -}; - -static DEFINE_PER_CPU(struct cpu_context, contexts); -static int pages_allocated; -static unsigned long page_list; - -static int toi_num_resaved; - -static unsigned long this_checksum, next_page; -static int checksum_count; - -static inline int checksum_pages_needed(void) -{ - return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); -} - -/* ---- Local buffer management ---- */ - -/* - * toi_checksum_cleanup - * - * Frees memory allocated for our labours. - */ -static void toi_checksum_cleanup(int ending_cycle) -{ - int cpu; - - if (ending_cycle) { - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_hash(this->transform); - this->transform = NULL; - this->desc.tfm = NULL; - } - - if (this->buf) { - toi_free_page(27, (unsigned long) this->buf); - this->buf = NULL; - } - } - } -} - -/* - * toi_crypto_initialise - * - * Prepare to do some work by allocating buffers and transforms. - * Returns: Int: Zero. Even if we can't set up checksum, we still - * seek to hibernate. - */ -static int toi_checksum_initialise(int starting_cycle) -{ - int cpu; - - if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) - return 0; - - if (!*toi_checksum_name) { - printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - struct page *page; - - this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s checksum algorithm: %ld.\n", - toi_checksum_name, (long) this->transform); - this->transform = NULL; - return 1; - } - - this->desc.tfm = this->transform; - this->desc.flags = 0; - - page = toi_alloc_page(27, GFP_KERNEL); - if (!page) - return 1; - this->buf = page_address(page); - sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); - } - return 0; -} - -/* - * toi_checksum_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_checksum_print_debug_stats(char *buffer, int size) -{ - int len; - - if (!toi_checksum_ops.enabled) - return scnprintf(buffer, size, - "- Checksumming disabled.\n"); - - len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", - toi_checksum_name); - len += scnprintf(buffer + len, size - len, - " %d pages resaved in atomic copy.\n", toi_num_resaved); - return len; -} - -static int toi_checksum_memory_needed(void) -{ - return toi_checksum_ops.enabled ? 
- checksum_pages_needed() << PAGE_SHIFT : 0; -} - -static int toi_checksum_storage_needed(void) -{ - if (toi_checksum_ops.enabled) - return strlen(toi_checksum_name) + sizeof(int) + 1; - else - return 0; -} - -/* - * toi_checksum_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_checksum_save_config_info(char *buffer) -{ - int namelen = strlen(toi_checksum_name) + 1; - int total_len; - - *((unsigned int *) buffer) = namelen; - strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); - total_len = sizeof(unsigned int) + namelen; - return total_len; -} - -/* toi_checksum_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for dechecksuming the image at - * resume time. - */ -static void toi_checksum_load_config_info(char *buffer, int size) -{ - int namelen; - - namelen = *((unsigned int *) (buffer)); - strncpy(toi_checksum_name, buffer + sizeof(unsigned int), - namelen); - return; -} - -/* - * Free Checksum Memory - */ - -void free_checksum_pages(void) -{ - while (pages_allocated) { - unsigned long next = *((unsigned long *) page_list); - ClearPageNosave(virt_to_page(page_list)); - toi_free_page(15, (unsigned long) page_list); - page_list = next; - pages_allocated--; - } -} - -/* - * Allocate Checksum Memory - */ - -int allocate_checksum_pages(void) -{ - int pages_needed = checksum_pages_needed(); - - if (!toi_checksum_ops.enabled) - return 0; - - while (pages_allocated < pages_needed) { - unsigned long *new_page = - (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); - if (!new_page) { - printk(KERN_ERR "Unable to allocate checksum pages.\n"); - return -ENOMEM; - } - SetPageNosave(virt_to_page(new_page)); - (*new_page) = page_list; - page_list = (unsigned long) new_page; - pages_allocated++; - } - - next_page = (unsigned long) page_list; - checksum_count = 0; - - return 0; -} - -char *tuxonice_get_next_checksum(void) -{ - if (!toi_checksum_ops.enabled) - return NULL; - - if (checksum_count % CHECKSUMS_PER_PAGE) - this_checksum += CHECKSUM_SIZE; - else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - checksum_count++; - return (char *) this_checksum; -} - -int tuxonice_calc_checksum(struct page *page, char *checksum_locn) -{ - char *pa; - int result, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!toi_checksum_ops.enabled) - return 0; - - pa = kmap(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap(page); - result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - checksum_locn); - if (result) - printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " - "returned %d.\n", result); - return result; -} -/* - * Calculate checksums - */ - -void check_checksums(void) -{ - int index = 0, cpu = smp_processor_id(); - char current_checksum[CHECKSUM_SIZE]; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - unsigned long pfn; - - if (!toi_checksum_ops.enabled) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); - return; - } - - next_page = (unsigned long) page_list; - - toi_num_resaved = 0; - this_checksum = 0; - - toi_trace_index++; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != 
BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) { - int ret, resave_needed = false; - char *pa; - struct page *page = pfn_to_page(pfn); - - if (index < checksum_count) { - if (index % CHECKSUMS_PER_PAGE) { - this_checksum += CHECKSUM_SIZE; - } else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - /* Done when IRQs disabled so must be atomic */ - pa = kmap_atomic(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap_atomic(pa); - ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - current_checksum); - - if (ret) { - printk(KERN_INFO "Digest failed. Returned %d.\n", ret); - return; - } - - resave_needed = memcmp(current_checksum, (char *) this_checksum, - CHECKSUM_SIZE); - } else { - resave_needed = true; - } - - if (resave_needed) { - TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed); - SetPageResave(pfn_to_page(pfn)); - toi_num_resaved++; - if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) - set_abort_result(TOI_RESAVE_NEEDED); - } - - index++; - } - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, - NULL), - SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, - TOI_ABORT_ON_RESAVE_NEEDED, 0) -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_checksum_ops = { - .type = MISC_MODULE, - .name = "checksumming", - .directory = "checksum", - .module = THIS_MODULE, - .initialise = toi_checksum_initialise, - .cleanup = toi_checksum_cleanup, - .print_debug_info = toi_checksum_print_debug_stats, - .save_config_info = toi_checksum_save_config_info, - .load_config_info = toi_checksum_load_config_info, - .memory_needed = toi_checksum_memory_needed, - .storage_needed = toi_checksum_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -int toi_checksum_init(void) -{ - int result = toi_register_module(&toi_checksum_ops); - return result; -} - -void toi_checksum_exit(void) -{ - toi_unregister_module(&toi_checksum_ops); -} diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h deleted file mode 100644 index c8196fbb0..000000000 --- a/kernel/power/tuxonice_checksum.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
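- *
- * (Call order, inferred from the .c file rather than stated there:
- * allocate_checksum_pages() while the image is being prepared, then
- * tuxonice_get_next_checksum() + tuxonice_calc_checksum() for each
- * pageset 2 page as it is saved, check_checksums() afterwards to mark
- * pages needing a resave, and free_checksum_pages() at cleanup.)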
- */ - -#if defined(CONFIG_TOI_CHECKSUM) -extern int toi_checksum_init(void); -extern void toi_checksum_exit(void); -void check_checksums(void); -int allocate_checksum_pages(void); -void free_checksum_pages(void); -char *tuxonice_get_next_checksum(void); -int tuxonice_calc_checksum(struct page *page, char *checksum_locn); -#else -static inline int toi_checksum_init(void) { return 0; } -static inline void toi_checksum_exit(void) { } -static inline void check_checksums(void) { }; -static inline int allocate_checksum_pages(void) { return 0; }; -static inline void free_checksum_pages(void) { }; -static inline char *tuxonice_get_next_checksum(void) { return NULL; }; -static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn) - { return 0; } -#endif - diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c deleted file mode 100644 index 2873f93c6..000000000 --- a/kernel/power/tuxonice_cluster.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines for cluster hibernation support. - * - * Based on ip autoconfiguration code in net/ipv4/ipconfig.c. - * - * How does it work? - * - * There is no 'master' node that tells everyone else what to do. All nodes - * send messages to the broadcast address/port, maintain a list of peers - * and figure out when to progress to the next step in hibernating or resuming. - * This makes us more fault tolerant when it comes to nodes coming and going - * (which may be more of an issue if we're hibernating when power supplies - * are being unreliable). - * - * At boot time, we start a ktuxonice thread that handles communication with - * other nodes. This node maintains a state machine that controls our progress - * through hibernating and resuming, keeping us in step with other nodes. Nodes - * are identified by their hw address. - * - * On startup, the node sends CLUSTER_PING on the configured interface's - * broadcast address, port $toi_cluster_port (see below) and begins to listen - * for other broadcast messages. CLUSTER_PING messages are repeated at - * intervals of 5 minutes, with a random offset to spread traffic out. - * - * A hibernation cycle is initiated from any node via - * - * echo > /sys/power/tuxonice/do_hibernate - * - * and (possibily) the hibernate script. At each step of the process, the node - * completes its work, and waits for all other nodes to signal completion of - * their work (or timeout) before progressing to the next step. - * - * Request/state Action before reply Possible reply Next state - * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP - * HIBERNATE|NACK INIT_0 - * - * PREP prepare_image PREP|ACK IMAGE_WRITE - * PREP|NACK INIT_0 - * ABORT RUNNING - * - * IO write image IO|ACK power off - * ABORT POST_RESUME - * - * (Boot time) check for image IMAGE|ACK RESUME_PREP - * (Note 1) - * IMAGE|NACK (Note 2) - * - * PREP prepare read image PREP|ACK IMAGE_READ - * PREP|NACK (As NACK_IMAGE) - * - * IO read image IO|ACK POST_RESUME - * - * POST_RESUME thaw, post-script RUNNING - * - * INIT_0 init 0 - * - * Other messages: - * - * - PING: Request for all other live nodes to send a PONG. Used at startup to - * announce presence, when a node is suspected dead and periodically, in case - * segments of the network are [un]plugged. - * - * - PONG: Response to a PING. - * - * - ABORT: Request to cancel writing an image. 
- * - * - BYE: Notification that this node is shutting down. - * - * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that - * nodes which are slower to start up can get state synchronised. If a node - * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send - * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it - * must invalidate its image (if any) and boot normally. - * - * Note 2: May occur when one node lost power or powered off while others - * hibernated. This node waits for others to complete resuming (ACK_READ) - * before completing its boot, so that it appears as a fail node restarting. - * - * If any node has an image, then it also has a list of nodes that hibernated - * in synchronisation with it. The node will wait for other nodes to appear - * or timeout before beginning its restoration. - * - * If a node has no image, it needs to wait, in case other nodes which do have - * an image are going to resume, but are taking longer to announce their - * presence. For this reason, the user can specify a timeout value and a number - * of nodes detected before we just continue. (We might want to assume in a - * cluster of, say, 15 nodes, if 8 others have booted without finding an image, - * the remaining nodes will too. This might help in situations where some nodes - * are much slower to boot, or more subject to hardware failures or such like). - */ - -#include <linux/suspend.h> -#include <linux/if.h> -#include <linux/rtnetlink.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/in.h> -#include <linux/if_arp.h> -#include <linux/kthread.h> -#include <linux/wait.h> -#include <linux/netdevice.h> -#include <net/ip.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" - -#if 1 -#define PRINTK(a, b...) do { printk(a, ##b); } while (0) -#else -#define PRINTK(a, b...) do { } while (0) -#endif - -static int loopback_mode; -static int num_local_nodes = 1; -#define MAX_LOCAL_NODES 8 -#define SADDR (loopback_mode ? 
b->sid : h->saddr) - -#define MYNAME "TuxOnIce Clustering" - -enum cluster_message { - MSG_ACK = 1, - MSG_NACK = 2, - MSG_PING = 4, - MSG_ABORT = 8, - MSG_BYE = 16, - MSG_HIBERNATE = 32, - MSG_IMAGE = 64, - MSG_IO = 128, - MSG_RUNNING = 256 -}; - -static char *str_message(int message) -{ - switch (message) { - case 4: - return "Ping"; - case 8: - return "Abort"; - case 9: - return "Abort acked"; - case 10: - return "Abort nacked"; - case 16: - return "Bye"; - case 17: - return "Bye acked"; - case 18: - return "Bye nacked"; - case 32: - return "Hibernate request"; - case 33: - return "Hibernate ack"; - case 34: - return "Hibernate nack"; - case 64: - return "Image exists?"; - case 65: - return "Image does exist"; - case 66: - return "No image here"; - case 128: - return "I/O"; - case 129: - return "I/O okay"; - case 130: - return "I/O failed"; - case 256: - return "Running"; - default: - printk(KERN_ERR "Unrecognised message %d.\n", message); - return "Unrecognised message (see dmesg)"; - } -} - -#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) -#define MSG_STATE_MASK (~MSG_ACK_MASK) - -struct node_info { - struct list_head member_list; - wait_queue_head_t member_events; - spinlock_t member_list_lock; - spinlock_t receive_lock; - int peer_count, ignored_peer_count; - struct toi_sysfs_data sysfs_data; - enum cluster_message current_message; -}; - -struct node_info node_array[MAX_LOCAL_NODES]; - -struct cluster_member { - __be32 addr; - enum cluster_message message; - struct list_head list; - int ignore; -}; - -#define toi_cluster_port_send 3501 -#define toi_cluster_port_recv 3502 - -static struct net_device *net_dev; -static struct toi_module_ops toi_cluster_ops; - -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -static struct packet_type toi_cluster_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = toi_recv, -}; - -struct toi_pkt { /* BOOTP packet format */ - struct iphdr iph; /* IP header */ - struct udphdr udph; /* UDP header */ - u8 htype; /* HW address type */ - u8 hlen; /* HW address length */ - __be32 xid; /* Transaction ID */ - __be16 secs; /* Seconds since we started */ - __be16 flags; /* Just what it says */ - u8 hw_addr[16]; /* Sender's HW address */ - u16 message; /* Message */ - unsigned long sid; /* Source ID for loopback testing */ -}; - -static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; - -static int added_pack; - -static int others_have_image; - -/* Key used to allow multiple clusters on the same lan */ -static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY; -static char pre_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; -static char post_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; - -/* List of cluster members */ -static unsigned long continue_delay = 5 * HZ; -static unsigned long cluster_message_timeout = 3 * HZ; - -/* === Membership list === */ - -static void print_member_info(int index) -{ - struct cluster_member *this; - - printk(KERN_INFO "==> Dumping node %d.\n", index); - - list_for_each_entry(this, &node_array[index].member_list, list) - printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", - NIPQUAD(this->addr), - str_message(this->message), - this->ignore ? 
"(Ignored)" : ""); - printk(KERN_INFO "== Done ==\n"); -} - -static struct cluster_member *__find_member(int index, __be32 addr) -{ - struct cluster_member *this; - - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->addr != addr) - continue; - - return this; - } - - return NULL; -} - -static void set_ignore(int index, __be32 addr, struct cluster_member *this) -{ - if (this->ignore) { - PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", - index, NIPQUAD(addr)); - return; - } - - PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", - index, NIPQUAD(addr)); - this->ignore = 1; - node_array[index].ignored_peer_count++; -} - -static int __add_update_member(int index, __be32 addr, int message) -{ - struct cluster_member *this; - - this = __find_member(index, addr); - if (this) { - if (this->message != message) { - this->message = message; - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - PRINTK("Node %d sees node %d.%d.%d.%d now sending " - "%s.\n", index, NIPQUAD(addr), - str_message(message)); - wake_up(&node_array[index].member_events); - } - return 0; - } - - this = (struct cluster_member *) toi_kzalloc(36, - sizeof(struct cluster_member), GFP_KERNEL); - - if (!this) - return -1; - - this->addr = addr; - this->message = message; - this->ignore = 0; - INIT_LIST_HEAD(&this->list); - - node_array[index].peer_count++; - - PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, - NIPQUAD(addr), str_message(message)); - - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - list_add_tail(&this->list, &node_array[index].member_list); - return 1; -} - -static int add_update_member(int index, __be32 addr, int message) -{ - int result; - unsigned long flags; - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - result = __add_update_member(index, addr, message); - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - - print_member_info(index); - - wake_up(&node_array[index].member_events); - - return result; -} - -static void del_member(int index, __be32 addr) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - this = __find_member(index, addr); - - if (this) { - list_del_init(&this->list); - toi_kfree(36, this, sizeof(*this)); - node_array[index].peer_count--; - } - - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -/* === Message transmission === */ - -static void toi_send_if(int message, unsigned long my_id); - -/* - * Process received TOI packet. - */ -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct toi_pkt *b; - struct iphdr *h; - int len, result, index; - unsigned long addr, message, ack; - - /* Perform verifications before taking the lock. 
*/ - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop; - - if (dev != net_dev) - goto drop; - - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - return NET_RX_DROP; - - if (!pskb_may_pull(skb, - sizeof(struct iphdr) + - sizeof(struct udphdr))) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) - goto drop; - - /* Fragments are not supported */ - if (h->frag_off & htons(IP_OFFSET | IP_MF)) { - if (net_ratelimit()) - printk(KERN_ERR "TuxOnIce: Ignoring fragmented " - "cluster message.\n"); - goto drop; - } - - if (skb->len < ntohs(h->tot_len)) - goto drop; - - if (ip_fast_csum((char *) h, h->ihl)) - goto drop; - - if (b->udph.source != htons(toi_cluster_port_send) || - b->udph.dest != htons(toi_cluster_port_recv)) - goto drop; - - if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) - goto drop; - - len = ntohs(b->udph.len) - sizeof(struct udphdr); - - /* Ok the front looks good, make sure we can get at the rest. */ - if (!pskb_may_pull(skb, skb->len)) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - addr = SADDR; - PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", - str_message(b->message), NIPQUAD(addr)); - - message = b->message & MSG_STATE_MASK; - ack = b->message & MSG_ACK_MASK; - - for (index = 0; index < num_local_nodes; index++) { - int new_message = node_array[index].current_message, - old_message = new_message; - - if (index == SADDR || !old_message) { - PRINTK("Ignoring node %d (offline or self).\n", index); - continue; - } - - /* One message at a time, please. */ - spin_lock(&node_array[index].receive_lock); - - result = add_update_member(index, SADDR, b->message); - if (result == -1) { - printk(KERN_INFO "Failed to add new cluster member " - NIPQUAD_FMT ".\n", - NIPQUAD(addr)); - goto drop_unlock; - } - - switch (b->message & MSG_STATE_MASK) { - case MSG_PING: - break; - case MSG_ABORT: - break; - case MSG_BYE: - break; - case MSG_HIBERNATE: - /* Can I hibernate? */ - new_message = MSG_HIBERNATE | - ((index & 1) ? MSG_NACK : MSG_ACK); - break; - case MSG_IMAGE: - /* Can I resume? */ - new_message = MSG_IMAGE | - ((index & 1) ? MSG_NACK : MSG_ACK); - if (new_message != old_message) - printk(KERN_ERR "Setting whether I can resume " - "to %d.\n", new_message); - break; - case MSG_IO: - new_message = MSG_IO | MSG_ACK; - break; - case MSG_RUNNING: - break; - default: - if (net_ratelimit()) - printk(KERN_ERR "Unrecognised TuxOnIce cluster" - " message %d from " NIPQUAD_FMT ".\n", - b->message, NIPQUAD(addr)); - }; - - if (old_message != new_message) { - node_array[index].current_message = new_message; - printk(KERN_INFO ">>> Sending new message for node " - "%d.\n", index); - toi_send_if(new_message, index); - } else if (!ack) { - printk(KERN_INFO ">>> Resending message for node %d.\n", - index); - toi_send_if(new_message, index); - } -drop_unlock: - spin_unlock(&node_array[index].receive_lock); - }; - -drop: - /* Throw the packet out. */ - kfree_skb(skb); - - return 0; -} - -/* - * Send cluster message to single interface. 
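- *
- * (Sketch of the wire format, not in the original: the packet built
- * below is the BOOTP-like struct toi_pkt -- an IPv4 header addressed
- * to INADDR_BROADCAST, a UDP header from toi_cluster_port_send to
- * toi_cluster_port_recv with no UDP checksum (as BOOTP permits), then
- * the sender's hardware address, the message word and the loopback
- * source id.)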
- */ -static void toi_send_if(int message, unsigned long my_id) -{ - struct sk_buff *skb; - struct toi_pkt *b; - int hh_len = LL_RESERVED_SPACE(net_dev); - struct iphdr *h; - - /* Allocate packet */ - skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); - if (!skb) - return; - skb_reserve(skb, hh_len); - b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); - memset(b, 0, sizeof(struct toi_pkt)); - - /* Construct IP header */ - skb_reset_network_header(skb); - h = ip_hdr(skb); - h->version = 4; - h->ihl = 5; - h->tot_len = htons(sizeof(struct toi_pkt)); - h->frag_off = htons(IP_DF); - h->ttl = 64; - h->protocol = IPPROTO_UDP; - h->daddr = htonl(INADDR_BROADCAST); - h->check = ip_fast_csum((unsigned char *) h, h->ihl); - - /* Construct UDP header */ - b->udph.source = htons(toi_cluster_port_send); - b->udph.dest = htons(toi_cluster_port_recv); - b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); - /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ - - /* Construct message */ - b->message = message; - b->sid = my_id; - b->htype = net_dev->type; /* can cause undefined behavior */ - b->hlen = net_dev->addr_len; - memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); - b->secs = htons(3); /* 3 seconds */ - - /* Chain packet down the line... */ - skb->dev = net_dev; - skb->protocol = htons(ETH_P_IP); - if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), - net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || - dev_queue_xmit(skb) < 0) - printk(KERN_INFO "E"); -} - -/* ========================================= */ - -/* kTOICluster */ - -static atomic_t num_cluster_threads; -static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); - -static int kTOICluster(void *data) -{ - unsigned long my_id; - - my_id = atomic_add_return(1, &num_cluster_threads) - 1; - node_array[my_id].current_message = (unsigned long) data; - - PRINTK("kTOICluster daemon %lu starting.\n", my_id); - - current->flags |= PF_NOFREEZE; - - while (node_array[my_id].current_message) { - toi_send_if(node_array[my_id].current_message, my_id); - sleep_on_timeout(&clusterd_events, - cluster_message_timeout); - PRINTK("Link state %lu is %d.\n", my_id, - node_array[my_id].current_message); - } - - toi_send_if(MSG_BYE, my_id); - atomic_dec(&num_cluster_threads); - wake_up(&clusterd_events); - - PRINTK("kTOICluster daemon %lu exiting.\n", my_id); - __set_current_state(TASK_RUNNING); - return 0; -} - -static void kill_clusterd(void) -{ - int i; - - for (i = 0; i < num_local_nodes; i++) { - if (node_array[i].current_message) { - PRINTK("Seeking to kill clusterd %d.\n", i); - node_array[i].current_message = 0; - } - } - wait_event(clusterd_events, - !atomic_read(&num_cluster_threads)); - PRINTK("All cluster daemons have exited.\n"); -} - -static int peers_not_in_message(int index, int message, int precise) -{ - struct cluster_member *this; - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->ignore) - continue; - - PRINTK("Peer %d.%d.%d.%d sending %s. " - "Seeking %s.\n", - NIPQUAD(this->addr), - str_message(this->message), str_message(message)); - if ((precise ? 
this->message : - this->message & MSG_STATE_MASK) != - message) - result++; - } - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - PRINTK("%d peers in sought message.\n", result); - return result; -} - -static void reset_ignored(int index) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) - this->ignore = 0; - node_array[index].ignored_peer_count = 0; - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -static int peers_in_message(int index, int message, int precise) -{ - return node_array[index].peer_count - - node_array[index].ignored_peer_count - - peers_not_in_message(index, message, precise); -} - -static int time_to_continue(int index, unsigned long start, int message) -{ - int first = peers_not_in_message(index, message, 0); - int second = peers_in_message(index, message, 1); - - PRINTK("First part returns %d, second returns %d.\n", first, second); - - if (!first && !second) { - PRINTK("All peers answered message %d.\n", - message); - return 1; - } - - if (time_after(jiffies, start + continue_delay)) { - PRINTK("Timeout reached.\n"); - return 1; - } - - PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, - start + continue_delay); - return 0; -} - -void toi_initiate_cluster_hibernate(void) -{ - int result; - unsigned long start; - - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - if (result) - return; - - toi_send_if(MSG_HIBERNATE, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_HIBERNATE)); - - if (test_action_state(TOI_FREEZER_TEST)) { - toi_send_if(MSG_ABORT, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_RUNNING)); - - do_toi_step(STEP_QUIET_CLEANUP); - return; - } - - toi_send_if(MSG_IO, 0); - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - if (result) - return; - - /* This code runs at resume time too! */ - if (toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); -} - -/* toi_cluster_print_debug_stats - * - * Description: Print information to be recorded for debugging purposes into a - * buffer. - * Arguments: buffer: Pointer to a buffer into which the debug info will be - * printed. - * size: Size of the buffer. - * Returns: Number of characters written to the buffer. - */ -static int toi_cluster_print_debug_stats(char *buffer, int size) -{ - int len; - - if (strlen(toi_cluster_iface)) - len = scnprintf(buffer, size, - "- Cluster interface is '%s'.\n", - toi_cluster_iface); - else - len = scnprintf(buffer, size, - "- Cluster support is disabled.\n"); - return len; -} - -/* cluster_memory_needed - * - * Description: Tell the caller how much memory we need to operate during - * hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_cluster_memory_needed(void) -{ - return 0; -} - -static int toi_cluster_storage_needed(void) -{ - return 1 + strlen(toi_cluster_iface); -} - -/* toi_cluster_save_config_info - * - * Description: Save informaton needed when reloading the image at resume time. - * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. - * Returns: Number of bytes used for saving our data. 
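- *
- * (Descriptive note, not in the original: the body below strcpy()s the
- * NUL-terminated interface name but returns
- * strlen(toi_cluster_iface + 1) -- the length of the string starting
- * at its second character -- where storage_needed() budgets
- * strlen() + 1, so strlen(...) + 1 appears to have been intended.)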
- */ -static int toi_cluster_save_config_info(char *buffer) -{ - strcpy(buffer, toi_cluster_iface); - return strlen(toi_cluster_iface + 1); -} - -/* toi_cluster_load_config_info - * - * Description: Reload information needed for declustering the image at - * resume time. - * Arguments: Buffer: Pointer to the start of the data. - * Size: Number of bytes that were saved. - */ -static void toi_cluster_load_config_info(char *buffer, int size) -{ - strncpy(toi_cluster_iface, buffer, size); - return; -} - -static void cluster_startup(void) -{ - int have_image = do_check_can_resume(), i; - unsigned long start = jiffies, initial_message; - struct task_struct *p; - - initial_message = MSG_IMAGE; - - have_image = 1; - - for (i = 0; i < num_local_nodes; i++) { - PRINTK("Starting ktoiclusterd %d.\n", i); - p = kthread_create(kTOICluster, (void *) initial_message, - "ktoiclusterd/%d", i); - if (IS_ERR(p)) { - printk(KERN_ERR "Failed to start ktoiclusterd.\n"); - return; - } - - wake_up_process(p); - } - - /* Wait for delay or someone else sending first message */ - wait_event(node_array[0].member_events, time_to_continue(0, start, - MSG_IMAGE)); - - others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1); - - printk(KERN_INFO "Continuing. I %shave an image. Peers with image:" - " %d.\n", have_image ? "" : "don't ", others_have_image); - - if (have_image) { - int result; - - /* Start to resume */ - printk(KERN_INFO " === Starting to resume === \n"); - node_array[0].current_message = MSG_IO; - toi_send_if(MSG_IO, 0); - - /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */ - result = 0; - - if (!result) { - /* - * Atomic restore - we'll come back in the hibernation - * path. - */ - - /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */ - result = 0; - - /* do_toi_step(STEP_QUIET_CLEANUP); */ - } - - node_array[0].current_message |= MSG_NACK; - - /* For debugging - disable for real life? */ - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_IO)); - } - - if (others_have_image) { - /* Wait for them to resume */ - printk(KERN_INFO "Waiting for other nodes to resume.\n"); - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_RUNNING)); - if (peers_not_in_message(0, MSG_RUNNING, 0)) - printk(KERN_INFO "Timed out while waiting for other " - "nodes to resume.\n"); - } - - /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE - * as appropriate. - * - * If we don't have an image: - * - Wait until someone else says they have one, or conditions are met - * for continuing to boot (n machines or t seconds). - * - If anyone has an image, wait for them to resume before continuing - * to boot. - * - * If we have an image: - * - Wait until conditions are met before continuing to resume (n - * machines or t seconds). Send RESUME_PREP and freeze processes. - * NACK_PREP if freezing fails (shouldn't) and follow logic for - * us having no image above. On success, wait for [N]ACK_PREP from - * other machines. Read image (including atomic restore) until done. - * Wait for ACK_READ from others (should never fail). Thaw processes - * and do post-resume. (The section after the atomic restore is done - * via the code for hibernating). - */ - - node_array[0].current_message = MSG_RUNNING; -} - -/* toi_cluster_open_iface - * - * Description: Prepare to use an interface. 
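- *
- * (Summary of the steps below, not in the original: walk init_net's
- * device list under rtnl_lock() looking for toi_cluster_iface,
- * register toi_cluster_packet_type with dev_add_pack(), switch on
- * loopback mode -- eight simulated local nodes -- when the match is
- * the loopback device, then run cluster_startup().)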
- */ - -static int toi_cluster_open_iface(void) -{ - struct net_device *dev; - - rtnl_lock(); - - for_each_netdev(&init_net, dev) { - if (/* dev == &init_net.loopback_dev || */ - strcmp(dev->name, toi_cluster_iface)) - continue; - - net_dev = dev; - break; - } - - rtnl_unlock(); - - if (!net_dev) { - printk(KERN_ERR MYNAME ": Device %s not found.\n", - toi_cluster_iface); - return -ENODEV; - } - - dev_add_pack(&toi_cluster_packet_type); - added_pack = 1; - - loopback_mode = (net_dev == init_net.loopback_dev); - num_local_nodes = loopback_mode ? 8 : 1; - - PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", - loopback_mode ? "on" : "off", num_local_nodes); - - cluster_startup(); - return 0; -} - -/* toi_cluster_close_iface - * - * Description: Stop using an interface. - */ - -static int toi_cluster_close_iface(void) -{ - kill_clusterd(); - if (added_pack) { - dev_remove_pack(&toi_cluster_packet_type); - added_pack = 0; - } - return 0; -} - -static void write_side_effect(void) -{ - if (toi_cluster_ops.enabled) { - toi_cluster_open_iface(); - set_toi_state(TOI_CLUSTER_MODE); - } else { - toi_cluster_close_iface(); - clear_toi_state(TOI_CLUSTER_MODE); - } -} - -static void node_write_side_effect(void) -{ -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, - NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, - write_side_effect), - SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), - SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, - 256, 0, NULL), - SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, - 256, 0, STRING), - SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, - 0) -}; - -/* - * Ops structure. 
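- *
- * (Not in the original: registering this FILTER_MODULE wires the
- * cluster code into the TuxOnIce module pipeline; writes to its
- * 'enabled' sysfs entry toggle TOI_CLUSTER_MODE via
- * write_side_effect() above.)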
- */ - -static struct toi_module_ops toi_cluster_ops = { - .type = FILTER_MODULE, - .name = "Cluster", - .directory = "cluster", - .module = THIS_MODULE, - .memory_needed = toi_cluster_memory_needed, - .print_debug_info = toi_cluster_print_debug_stats, - .save_config_info = toi_cluster_save_config_info, - .load_config_info = toi_cluster_load_config_info, - .storage_needed = toi_cluster_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -#ifdef MODULE -#define INIT static __init -#define EXIT static __exit -#else -#define INIT -#define EXIT -#endif - -INIT int toi_cluster_init(void) -{ - int temp = toi_register_module(&toi_cluster_ops), i; - struct kobject *kobj = toi_cluster_ops.dir_kobj; - - for (i = 0; i < MAX_LOCAL_NODES; i++) { - node_array[i].current_message = 0; - INIT_LIST_HEAD(&node_array[i].member_list); - init_waitqueue_head(&node_array[i].member_events); - spin_lock_init(&node_array[i].member_list_lock); - spin_lock_init(&node_array[i].receive_lock); - - /* Set up sysfs entry */ - node_array[i].sysfs_data.attr.name = toi_kzalloc(8, - sizeof(node_array[i].sysfs_data.attr.name), - GFP_KERNEL); - sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d", - i); - node_array[i].sysfs_data.attr.mode = SYSFS_RW; - node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER; - node_array[i].sysfs_data.flags = 0; - node_array[i].sysfs_data.data.integer.variable = - (int *) &node_array[i].current_message; - node_array[i].sysfs_data.data.integer.minimum = 0; - node_array[i].sysfs_data.data.integer.maximum = INT_MAX; - node_array[i].sysfs_data.write_side_effect = - node_write_side_effect; - toi_register_sysfs_file(kobj, &node_array[i].sysfs_data); - } - - toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0); - - if (toi_cluster_ops.enabled) - toi_cluster_open_iface(); - - return temp; -} - -EXIT void toi_cluster_exit(void) -{ - int i; - toi_cluster_close_iface(); - - for (i = 0; i < MAX_LOCAL_NODES; i++) - toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, - &node_array[i].sysfs_data); - toi_unregister_module(&toi_cluster_ops); -} - -static int __init toi_cluster_iface_setup(char *iface) -{ - toi_cluster_ops.enabled = (*iface && - strcmp(iface, "off")); - - if (toi_cluster_ops.enabled) - strncpy(toi_cluster_iface, iface, strlen(iface)); -} - -__setup("toi_cluster=", toi_cluster_iface_setup); diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h deleted file mode 100644 index 84356b304..000000000 --- a/kernel/power/tuxonice_cluster.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#ifdef CONFIG_TOI_CLUSTER -extern int toi_cluster_init(void); -extern void toi_cluster_exit(void); -extern void toi_initiate_cluster_hibernate(void); -#else -static inline int toi_cluster_init(void) { return 0; } -static inline void toi_cluster_exit(void) { } -static inline void toi_initiate_cluster_hibernate(void) { } -#endif - diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c deleted file mode 100644 index 84b85226d..000000000 --- a/kernel/power/tuxonice_compress.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * kernel/power/compression.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - * This file contains data compression routines for TuxOnIce, - * using cryptoapi. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -static int toi_expected_compression; - -static struct toi_module_ops toi_compression_ops; -static struct toi_module_ops *next_driver; - -static char toi_compressor_name[32] = "lzo"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - u8 *page_buffer; - struct crypto_comp *transform; - unsigned int len; - u8 *buffer_start; - u8 *output_buffer; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. - */ -static int toi_compress_crypto_prepare(void) -{ - int cpu; - - if (!*toi_compressor_name) { - printk(KERN_INFO "TuxOnIce: Compression enabled but no " - "compressor name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s compression transform.\n", - toi_compressor_name); - this->transform = NULL; - return 1; - } - - this->page_buffer = - (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); - - if (!this->page_buffer) { - printk(KERN_ERR - "Failed to allocate a page buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - - this->output_buffer = - (char *) vmalloc_32(OUT_BUF_SIZE); - - if (!this->output_buffer) { - printk(KERN_ERR - "Failed to allocate a output buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - } - - return 0; -} - -static int toi_compress_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_comp(this->transform); - this->transform = NULL; - } - - if (this->page_buffer) - toi_free_page(16, (unsigned long) this->page_buffer); - - this->page_buffer = NULL; - - if (this->output_buffer) - vfree(this->output_buffer); - - this->output_buffer = NULL; - } - - return 0; -} - -/* - * toi_compress_init - */ - -static int toi_compress_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_compress_bytes_in = 0; - toi_compress_bytes_out = 0; - - next_driver = toi_get_next_filter(&toi_compression_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_compress_rw_init() - */ - -static int toi_compress_rw_init(int rw, int stream_number) -{ - if (toi_compress_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise compression " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "compressing the image.\n"); - toi_compression_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_compress_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be compressed. - * - * Returns: 0 on success. 
Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. - */ -static int toi_compress_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - - if (ctx->transform) { - - ctx->buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_comp_compress(ctx->transform, - ctx->buffer_start, buf_size, - ctx->output_buffer, &ctx->len); - - TOI_UNMAP(buf_type, buffer_page); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d bytes", - cpu, index, ctx->len); - - if (!ret && ctx->len < buf_size) { /* some compression */ - output_buffer = ctx->output_buffer; - output_len = ctx->len; - out_buf_type = TOI_VIRT; - } - - } - - mutex_lock(&stats_lock); - - toi_compress_bytes_in += buf_size; - toi_compress_bytes_out += output_len; - - mutex_unlock(&stats_lock); - - if (!ret) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_compress_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules and decompress it until the input buffer - * is filled. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_compress_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - unsigned int outlen = PAGE_SIZE; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->transform) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't decompress - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); - - buffer_start = kmap(buffer_page); - - /* Error or uncompressed data */ - if (ret || len == PAGE_SIZE) { - memcpy(buffer_start, ctx->page_buffer, len); - goto out; - } - - ret = crypto_comp_decompress( - ctx->transform, - ctx->page_buffer, - len, buffer_start, &outlen); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d=>%d (%d).", - cpu, *index, len, outlen, ret); - - if (ret) - abort_hibernate(TOI_FAILED_IO, - "Compress_read returned %d.\n", ret); - else if (outlen != PAGE_SIZE) { - abort_hibernate(TOI_FAILED_IO, - "Decompression yielded %d bytes instead of %ld.\n", - outlen, PAGE_SIZE); - printk(KERN_ERR "Decompression yielded %d bytes instead of " - "%ld.\n", outlen, PAGE_SIZE); - ret = -EIO; - *buf_size = outlen; - } -out: - TOI_UNMAP(buf_type, buffer_page); - return ret; -} - -/* - * toi_compress_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_compress_print_debug_stats(char *buffer, int size) -{ - unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, - pages_out = toi_compress_bytes_out >> PAGE_SHIFT; - int len; - - /* Output the compression ratio achieved. 
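- * (Worked example, not in the original: with 1000 pages in and 400
- * pages out, the line below prints (1000 - 400) * 100 / 1000 = 60
- * percent compression.  The byte counters are reduced to pages with
- * >> PAGE_SHIFT first, so the printed ratio is approximate.)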
*/ - if (*toi_compressor_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_compressor_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (pages_in) - len += scnprintf(buffer+len, size - len, " Compressed " - "%lu bytes into %lu (%ld percent compression).\n", - toi_compress_bytes_in, - toi_compress_bytes_out, - (pages_in - pages_out) * 100 / pages_in); - return len; -} - -/* - * toi_compress_compression_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_compress_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_compress_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_compressor_name) + 1; -} - -/* - * toi_compress_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_compress_save_config_info(char *buffer) -{ - int len = strlen(toi_compressor_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_compress_bytes_in; - offset += sizeof(unsigned long); - *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = toi_expected_compression; - offset += sizeof(int); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_compressor_name, len); - return offset + len; -} - -/* toi_compress_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for decompressing the image at - * resume time. - */ -static void toi_compress_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_compress_bytes_in = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); - offset += sizeof(unsigned long); - toi_expected_compression = *((int *) (buffer + offset)); - offset += sizeof(int); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_compressor_name, buffer + offset, len); -} - -static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->compress_bytes_in = toi_compress_bytes_in; - bkd->compress_bytes_out = toi_compress_bytes_out; -} - -static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_compress_bytes_in = bkd->compress_bytes_in; - toi_compress_bytes_out = bkd->compress_bytes_out; -} - -/* - * toi_expected_compression_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 if the module is disabled. Otherwise the value set by the - * user via our sysfs entry. - */ - -static int toi_compress_expected_ratio(void) -{ - if (!toi_compression_ops.enabled) - return 100; - else - return 100 - toi_expected_compression; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, - 0, 99, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), -}; - -/* - * Ops structure. 
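- *
- * (Not in the original: as a FILTER_MODULE, write_page above
- * compresses and hands pages on to next_driver in the pipeline, while
- * read_page must stay synchronous -- data cannot be decompressed
- * before it has been read.)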
- */ -static struct toi_module_ops toi_compression_ops = { - .type = FILTER_MODULE, - .name = "compression", - .directory = "compression", - .module = THIS_MODULE, - .initialise = toi_compress_init, - .memory_needed = toi_compress_memory_needed, - .print_debug_info = toi_compress_print_debug_stats, - .save_config_info = toi_compress_save_config_info, - .load_config_info = toi_compress_load_config_info, - .storage_needed = toi_compress_storage_needed, - .expected_compression = toi_compress_expected_ratio, - - .pre_atomic_restore = toi_compress_pre_atomic_restore, - .post_atomic_restore = toi_compress_post_atomic_restore, - - .rw_init = toi_compress_rw_init, - .rw_cleanup = toi_compress_rw_cleanup, - - .write_page = toi_compress_write_page, - .read_page = toi_compress_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_compress_load(void) -{ - return toi_register_module(&toi_compression_ops); -} - -late_initcall(toi_compress_load); diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c deleted file mode 100644 index eb627915e..000000000 --- a/kernel/power/tuxonice_copy_before_write.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * kernel/power/tuxonice_copy_before_write.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines (apart from the fault handling code) to deal with allocating memory - * for copying pages before they are modified, restoring the contents and getting - * the contents written to disk. - */ - -#include <linux/percpu-defs.h> -#include <linux/sched.h> -#include <linux/tuxonice.h> -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states); -#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw)) -#define toi_cbw_pool_size 100 - -static void _toi_free_cbw_data(struct toi_cbw_state *state) -{ - struct toi_cbw *page_ptr, *ptr, *next; - - page_ptr = ptr = state->first; - - while(ptr) { - next = ptr->next; - - if (ptr->virt) { - toi__free_page(40, virt_to_page(ptr->virt)); - } - if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) { - /* Must be on a new page - free the previous one. 
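- * (Not in the original: toi_cbw structs are packed CBWS_PER_PAGE to a
- * page-aligned page, so page_ptr tracks the first entry of the page
- * being walked; when ptr's PAGE_MASKed address no longer matches,
- * the finished page is freed and page_ptr advances to the new one.)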
*/ - toi__free_page(40, virt_to_page(page_ptr)); - page_ptr = ptr; - } - ptr = next; - } - - if (page_ptr) { - toi__free_page(40, virt_to_page(page_ptr)); - } - - state->first = state->next = state->last = NULL; - state->size = 0; -} - -void toi_free_cbw_data(void) -{ - int i; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - if (!state->first) - continue; - - state->enabled = 0; - - while (state->active) { - schedule(); - } - - _toi_free_cbw_data(state); - } -} - -static int _toi_allocate_cbw_data(struct toi_cbw_state *state) -{ - while(state->size < toi_cbw_pool_size) { - int i; - struct toi_cbw *ptr; - - ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL); - - if (!ptr) { - return -ENOMEM; - } - - if (!state->first) { - state->first = state->next = state->last = ptr; - } - - for (i = 0; i < CBWS_PER_PAGE; i++) { - struct toi_cbw *cbw = &ptr[i]; - - cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL); - if (!cbw->virt) { - state->size += i; - printk("Out of memory allocating CBW pages.\n"); - return -ENOMEM; - } - - if (cbw == state->first) - continue; - - state->last->next = cbw; - state->last = cbw; - } - - state->size += CBWS_PER_PAGE; - } - - state->enabled = 1; - - return 0; -} - - -int toi_allocate_cbw_data(void) -{ - int i, result; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - result = _toi_allocate_cbw_data(state); - - if (result) - return result; - } - - return 0; -} - -void toi_cbw_restore(void) -{ - if (!toi_keeping_image) - return; - -} - -void toi_cbw_write(void) -{ - if (!toi_keeping_image) - return; - -} - -/** - * toi_cbw_test_read - Test copy before write on one page - * - * Allocate copy before write buffers, then make one page only copy-before-write - * and attempt to write to it. We should then be able to retrieve the original - * version from the cbw buffer and the modified version from the page itself. - */ -static int toi_cbw_test_read(const char *buffer, int count) -{ - unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL); - char *original = "Original contents"; - char *modified = "Modified material"; - struct page *page = virt_to_page(virt); - int i, len = 0, found = 0, pfn = page_to_pfn(page); - - if (!page) { - printk("toi_cbw_test_read: Unable to allocate a page for testing.\n"); - return -ENOMEM; - } - - memcpy((char *) virt, original, strlen(original)); - - if (toi_allocate_cbw_data()) { - printk("toi_cbw_test_read: Unable to allocate cbw data.\n"); - return -ENOMEM; - } - - toi_reset_dirtiness_one(pfn, 0); - - SetPageTOI_CBW(page); - - memcpy((char *) virt, modified, strlen(modified)); - - if (strncmp((char *) virt, modified, strlen(modified))) { - len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n"); - } - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - struct toi_cbw *ptr = state->first, *last_ptr = ptr; - - if (!found) { - while (ptr) { - if (ptr->pfn == pfn) { - found = 1; - if (strncmp(ptr->virt, original, strlen(original))) { - len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n"); - } else { - len += sprintf((char *) buffer + len, "Test passed. 
Buffer changed and original contents preserved.\n"); - } - break; - } - - last_ptr = ptr; - ptr = ptr->next; - } - } - - if (!last_ptr) - len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i); - } - - if (!found) - len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n"); - - toi_free_cbw_data(); - - return len; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read, - NULL, SYSFS_NEEDS_SM_FOR_READ, NULL), -}; - -static struct toi_module_ops toi_cbw_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "copy_before_write debugging", - .directory = "cbw", - .module = THIS_MODULE, - .early = 1, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_cbw_init(void) -{ - int result = toi_register_module(&toi_cbw_ops); - return result; -} diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c deleted file mode 100644 index 522c836ad..000000000 --- a/kernel/power/tuxonice_extent.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * kernel/power/tuxonice_extent.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * These functions encapsulate the manipulation of storage metadata. - */ - -#include <linux/suspend.h> -#include "tuxonice_modules.h" -#include "tuxonice_extent.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" - -/** - * toi_get_extent - return a free extent - * - * May fail, returning NULL instead. - **/ -static struct hibernate_extent *toi_get_extent(void) -{ - return (struct hibernate_extent *) toi_kzalloc(2, - sizeof(struct hibernate_extent), TOI_ATOMIC_GFP); -} - -/** - * toi_put_extent_chain - free a chain of extents starting from value 'from' - * @chain: Chain to free. - * - * Note that 'from' is an extent value, and may be part way through an extent. - * In this case, the extent should be truncated (if necessary) and following - * extents freed. - **/ -void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from) -{ - struct hibernate_extent *this; - - this = chain->first; - - while (this) { - struct hibernate_extent *next = this->next; - - // Delete the whole extent? - if (this->start >= from) { - chain->size -= (this->end - this->start + 1); - if (chain->first == this) - chain->first = next; - if (chain->last_touched == this) - chain->last_touched = NULL; - if (chain->current_extent == this) - chain->current_extent = NULL; - toi_kfree(2, this, sizeof(*this)); - chain->num_extents--; - } else if (this->end >= from) { - // Delete part of the extent - chain->size -= (this->end - from + 1); - this->start = from; - } - this = next; - } -} - -/** - * toi_put_extent_chain - free a whole chain of extents - * @chain: Chain to free. - **/ -void toi_put_extent_chain(struct hibernate_extent_chain *chain) -{ - toi_put_extent_chain_from(chain, 0); -} - -/** - * toi_add_to_extent_chain - add an extent to an existing chain - * @chain: Chain to which the extend should be added - * @start: Start of the extent (first physical block) - * @end: End of the extent (last physical block) - * - * The chain information is updated if the insertion is successful. 
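- *
- * (Illustrative, not in the original: adding 200-299 to a chain whose
- * last extent is 100-199 takes the cur_ext->end == start - 1 path,
- * extending it to 100-299; if an extent 300-399 follows, the merge
- * below collapses the pair into 100-399, freeing the redundant
- * hibernate_extent and decrementing num_extents.)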
- **/ -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end) -{ - struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Adding extent %lu-%lu to chain %p.\n", start, end, chain); - - /* Find the right place in the chain */ - if (chain->last_touched && chain->last_touched->start < start) - cur_ext = chain->last_touched; - else if (chain->first && chain->first->start < start) - cur_ext = chain->first; - - if (cur_ext) { - while (cur_ext->next && cur_ext->next->start < start) - cur_ext = cur_ext->next; - - if (cur_ext->end == (start - 1)) { - struct hibernate_extent *next_ext = cur_ext->next; - cur_ext->end = end; - - /* Merge with the following one? */ - if (next_ext && cur_ext->end + 1 == next_ext->start) { - cur_ext->end = next_ext->end; - cur_ext->next = next_ext->next; - toi_kfree(2, next_ext, sizeof(*next_ext)); - chain->num_extents--; - } - - chain->last_touched = cur_ext; - chain->size += (end - start + 1); - - return 0; - } - } - - new_ext = toi_get_extent(); - if (!new_ext) { - printk(KERN_INFO "Error unable to append a new extent to the " - "chain.\n"); - return -ENOMEM; - } - - chain->num_extents++; - chain->size += (end - start + 1); - new_ext->start = start; - new_ext->end = end; - - chain->last_touched = new_ext; - - if (cur_ext) { - new_ext->next = cur_ext->next; - cur_ext->next = new_ext; - } else { - if (chain->first) - new_ext->next = chain->first; - chain->first = new_ext; - } - - return 0; -} diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h deleted file mode 100644 index aeccf1f5e..000000000 --- a/kernel/power/tuxonice_extent.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_extent.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations related to extents. Extents are - * TuxOnIce's method of storing some of the metadata for the image. - * See tuxonice_extent.c for more info. - * - */ - -#include "tuxonice_modules.h" - -#ifndef EXTENT_H -#define EXTENT_H - -struct hibernate_extent { - unsigned long start, end; - struct hibernate_extent *next; -}; - -struct hibernate_extent_chain { - unsigned long size; /* size of the chain ie sum (max-min+1) */ - int num_extents; - struct hibernate_extent *first, *last_touched; - struct hibernate_extent *current_extent; - unsigned long current_offset; -}; - -/* Simplify iterating through all the values in an extent chain */ -#define toi_extent_for_each(extent_chain, extentpointer, value) \ -if ((extent_chain)->first) \ - for ((extentpointer) = (extent_chain)->first, (value) = \ - (extentpointer)->start; \ - ((extentpointer) && ((extentpointer)->next || (value) <= \ - (extentpointer)->end)); \ - (((value) == (extentpointer)->end) ? \ - ((extentpointer) = (extentpointer)->next, (value) = \ - ((extentpointer) ? (extentpointer)->start : 0)) : \ - (value)++)) - -extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from); -#endif diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c deleted file mode 100644 index baf191211..000000000 --- a/kernel/power/tuxonice_file.c +++ /dev/null @@ -1,484 +0,0 @@ -/* - * kernel/power/tuxonice_file.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of a simple file as a - * backing store. 
It is based upon the swapallocator, and shares the - * same basic working. Here, though, we have nothing to do with - * swapspace, and only one device to worry about. - * - * The user can just - * - * echo TuxOnIce > /path/to/my_file - * - * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file - * - * and - * - * echo /path/to/my_file > /sys/power/tuxonice/file/target - * - * then put what they find in /sys/power/tuxonice/resume - * as their resume= parameter in lilo.conf (and rerun lilo if using it). - * - * Having done this, they're ready to hibernate and resume. - * - * TODO: - * - File resizing. - */ - -#include <linux/blkdev.h> -#include <linux/mount.h> -#include <linux/fs.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" - -#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) - -static struct toi_module_ops toi_fileops; - -static struct file *target_file; -static struct block_device *toi_file_target_bdev; -static unsigned long pages_available, pages_allocated; -static char toi_file_target[256]; -static struct inode *target_inode; -static int file_target_priority; -static int used_devt; -static int target_claim; -static dev_t toi_file_dev_t; -static int sig_page_index; - -/* For test_toi_file_target */ -static struct toi_bdev_info *file_chain; - -static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) -{ - int j; - sector_t last = 0; - - for (j = 0; j < dev_info->blocks_per_page; j++) { - sector_t this = bmap(target_inode, - page_num * dev_info->blocks_per_page + j); - - if (!this || (last && (last + 1) != this)) - break; - - last = this; - } - - return j == dev_info->blocks_per_page; -} - -static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) -{ - unsigned long result = 0; - struct block_device *bdev = dev_info->bdev; - int i; - - switch (target_inode->i_mode & S_IFMT) { - case S_IFSOCK: - case S_IFCHR: - case S_IFIFO: /* Socket, Char, Fifo */ - return -1; - case S_IFREG: /* Regular file: current size - holes + free - space on part */ - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { - if (has_contiguous_blocks(dev_info, i)) - result++; - } - break; - case S_IFBLK: /* Block device */ - if (!bdev->bd_disk) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "bdev->bd_disk null."); - return 0; - } - - result = (bdev->bd_part ? 
- bdev->bd_part->nr_sects : - get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); - } - - - return result; -} - -static int toi_file_register_storage(void) -{ - struct toi_bdev_info *devinfo; - int result = 0; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); - if (!strlen(toi_file_target)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " - "No target filename set."); - return 0; - } - - target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); - toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", - toi_file_target, target_file); - - if (IS_ERR(target_file) || !target_file) { - target_file = NULL; - toi_file_dev_t = name_to_dev_t(toi_file_target); - if (!toi_file_dev_t) { - struct kstat stat; - int error = vfs_stat(toi_file_target, &stat); - printk(KERN_INFO "Open file %s returned %p and " - "name_to_devt failed.\n", - toi_file_target, target_file); - if (error) { - printk(KERN_INFO "Stating the file also failed." - " Nothing more we can do.\n"); - return 0; - } else - toi_file_dev_t = stat.rdev; - } - - toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); - if (IS_ERR(toi_file_target_bdev)) { - printk(KERN_INFO "Got a dev_num (%lx) but failed to " - "open it.\n", - (unsigned long) toi_file_dev_t); - toi_file_target_bdev = NULL; - return 0; - } - used_devt = 1; - target_inode = toi_file_target_bdev->bd_inode; - } else - target_inode = target_file->f_mapping->host; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); - if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || - S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { - printk(KERN_INFO "File support works with regular files," - " character files and block devices.\n"); - /* Cleanup routine will undo the above */ - return 0; - } - - if (!used_devt) { - if (S_ISBLK(target_inode->i_mode)) { - toi_file_target_bdev = I_BDEV(target_inode); - if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | - FMODE_READ, NULL)) - target_claim = 1; - } else - toi_file_target_bdev = target_inode->i_sb->s_bdev; - if (!toi_file_target_bdev) { - printk(KERN_INFO "%s is not a valid file allocator " - "target.\n", toi_file_target); - return 0; - } - toi_file_dev_t = toi_file_target_bdev->bd_dev; - } - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); - return -ENOMEM; - } - - devinfo->bdev = toi_file_target_bdev; - devinfo->allocator = &toi_fileops; - devinfo->allocator_index = 0; - - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - /* Unlike swap code, only complain if fs_info_from_block_dev returned - * -ENOMEM. The 'file' might be a full partition, so might validly not - * have an identifiable type, UUID etc. - */ - if (result) - printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", - result); - devinfo->dev_t = toi_file_dev_t; - devinfo->prio = file_target_priority; - devinfo->bmap_shift = target_inode->i_blkbits - 9; - devinfo->blocks_per_page = - (1 << (PAGE_SHIFT - target_inode->i_blkbits)); - sprintf(devinfo->name, "file %s", toi_file_target); - file_chain = devinfo; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " - "shift is %d. 
Blocks per page %d.", - devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, - devinfo->blocks_per_page); - - /* Keep one aside for the signature */ - pages_available = get_usable_pages(devinfo) - 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " - "pages.", pages_available); - - toi_bio_ops.register_storage(devinfo); - return 0; -} - -static unsigned long toi_file_storage_available(void) -{ - return pages_available; -} - -static int toi_file_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long available = pages_available - pages_allocated; - unsigned long to_add = min(available, request); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " - "is %lu. Allocating %lu pages from file.", - pages_available, pages_allocated, to_add); - pages_allocated += to_add; - - return to_add; -} - -/** - * __populate_block_list - add an extent to the chain - * @min: Start of the extent (first physical block = sector) - * @max: End of the extent (last physical block = sector) - * - * If TOI_TEST_BIO is set, print a debug message, outputting the min and max - * fs block numbers. - **/ -static int __populate_block_list(struct toi_bdev_info *chain, int min, int max) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.", - min << chain->bmap_shift, - ((max + 1) << chain->bmap_shift) - 1); - - return toi_add_to_extent_chain(&chain->blocks, min, max); -} - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0; - unsigned long pages_mapped = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks."); - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - if (!target_is_normal_file()) { - result = (pages_available > 0) ? - __populate_block_list(chain, chain->blocks_per_page, - (pages_allocated + 1) * - chain->blocks_per_page - 1) : 0; - return result; - } - - /* - * FIXME: We are assuming the first page is contiguous. Is that - * assumption always right? - */ - - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) { - sector_t new_sector; - - if (!has_contiguous_blocks(chain, i)) - continue; - - if (!have_sig_page) { - have_sig_page = 1; - sig_page_index = i; - continue; - } - - pages_mapped++; - - /* Ignore first page - it has the header */ - if (pages_mapped == 1) - continue; - - new_sector = bmap(target_inode, (i * chain->blocks_per_page)); - - /* - * I'd love to be able to fill in holes and resize - * files, but not yet... 
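What the loop above is doing is run-length encoding the file's logical-to-physical block map into extents. A self-contained model of the same accumulation, with a plain array standing in for bmap() (all values illustrative):

#include <stdio.h>

/* Model: physical sector for each logical page, 0 = hole (as with bmap). */
static long pmap[] = { 100, 108, 116, 300, 308, 0, 500 };

int main(void)
{
	long extent_min = -1, extent_max = -1;
	const int blocks_per_page = 8;	/* 4 KiB page / 512 B sector */

	for (unsigned i = 0; i < sizeof(pmap) / sizeof(pmap[0]); i++) {
		long s = pmap[i];
		if (!s)
			continue;		/* skip holes */
		if (s == extent_max + 1) {	/* contiguous with current run */
			extent_max += blocks_per_page;
		} else {
			if (extent_min > -1)
				printf("extent %ld-%ld\n", extent_min, extent_max);
			extent_min = s;
			extent_max = s + blocks_per_page - 1;
		}
	}
	if (extent_min > -1)
		printf("extent %ld-%ld\n", extent_min, extent_max);
	return 0;
}

With the map above it prints extents 100-123, 300-315 and 500-507; holes (a zero from bmap) simply end up outside every extent.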
- */ - - if (new_sector == extent_max + 1) - extent_max += chain->blocks_per_page; - else { - if (extent_min > -1) { - result = __populate_block_list(chain, - extent_min, extent_max); - if (result) - return result; - } - - extent_min = new_sector; - extent_max = extent_min + - chain->blocks_per_page - 1; - } - - if (pages_mapped == pages_allocated) - break; - } - - if (extent_min > -1) { - result = __populate_block_list(chain, extent_min, extent_max); - if (result) - return result; - } - - return 0; -} - -static void toi_file_free_storage(struct toi_bdev_info *chain) -{ - pages_allocated = 0; - file_chain = NULL; -} - -/** - * toi_file_print_debug_stats - print debug info - * @buffer: Buffer to populate - * @size: Size of the buffer - **/ -static int toi_file_print_debug_stats(char *buffer, int size) -{ - int len = scnprintf(buffer, size, "- File Allocator active.\n"); - - len += scnprintf(buffer+len, size-len, " Storage available for " - "image: %lu pages.\n", pages_available); - - return len; -} - -static void toi_file_cleanup(int finishing_cycle) -{ - if (toi_file_target_bdev) { - if (target_claim) { - blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ); - target_claim = 0; - } - - if (used_devt) { - blkdev_put(toi_file_target_bdev, - FMODE_READ | FMODE_NDELAY); - used_devt = 0; - } - toi_file_target_bdev = NULL; - target_inode = NULL; - } - - if (target_file) { - filp_close(target_file, NULL); - target_file = NULL; - } - - pages_available = 0; -} - -/** - * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target - * - * Test whether the target file is valid for hibernating. - **/ -static void test_toi_file_target(void) -{ - int result = toi_file_register_storage(); - sector_t sector; - char buf[50]; - struct fs_info *fs_info; - - if (result || !file_chain) - return; - - /* This doesn't mean we're in business. Is any storage available? 
*/ - if (!pages_available) - goto out; - - toi_file_allocate_storage(file_chain, 1); - result = get_main_pool_phys_params(file_chain); - if (result) - goto out; - - - sector = bmap(target_inode, sig_page_index * - file_chain->blocks_per_page) << file_chain->bmap_shift; - - /* Use the uuid, or the dev_t if that fails */ - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (!fs_info || IS_ERR(fs_info)) { - bdevname(toi_file_target_bdev, buf); - sprintf(resume_file, "/dev/%s:%llu", buf, - (unsigned long long) sector); - } else { - int i; - hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); - - /* Remove the spaces */ - for (i = 1; i < 16; i++) { - buf[2 * i] = buf[3 * i]; - buf[2 * i + 1] = buf[3 * i + 1]; - } - buf[32] = 0; - sprintf(resume_file, "UUID=%s:0x%llx", buf, - (unsigned long long) sector); - free_fs_info(fs_info); - } - - toi_attempt_to_parse_resume_device(0); -out: - toi_file_free_storage(file_chain); - toi_bio_ops.free_storage(); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, - SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), - SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), - SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, - 4096, 0, NULL), -}; - -static struct toi_bio_allocator_ops toi_bio_fileops = { - .register_storage = toi_file_register_storage, - .storage_available = toi_file_storage_available, - .allocate_storage = toi_file_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_file_free_storage, -}; - -static struct toi_module_ops toi_fileops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "file storage", - .directory = "file", - .module = THIS_MODULE, - .print_debug_info = toi_file_print_debug_stats, - .cleanup = toi_file_cleanup, - .bio_allocator_ops = &toi_bio_fileops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_file_load(void) -{ - return toi_register_module(&toi_fileops); -} - -late_initcall(toi_file_load); diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c deleted file mode 100644 index 16cf14cbc..000000000 --- a/kernel/power/tuxonice_highlevel.c +++ /dev/null @@ -1,1413 +0,0 @@ -/* - * kernel/power/tuxonice_highlevel.c - */ -/** \mainpage TuxOnIce. - * - * TuxOnIce provides support for saving and restoring an image of - * system memory to an arbitrary storage device, either on the local computer, - * or across some network. The support is entirely OS based, so TuxOnIce - * works without requiring BIOS, APM or ACPI support. The vast majority of the - * code is also architecture independant, so it should be very easy to port - * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem - * and preemption. Initramfses and initrds are also supported. - * - * TuxOnIce uses a modular design, in which the method of storing the image is - * completely abstracted from the core code, as are transformations on the data - * such as compression and/or encryption (multiple 'modules' can be used to - * provide arbitrary combinations of functionality). The user interface is also - * modular, so that arbitrarily simple or complex interfaces can be used to - * provide anything from debugging information through to eye candy. - * - * \section Copyright - * - * TuxOnIce is released under the GPLv2. 
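Stepping back to test_toi_file_target above: the space-stripping loop there depends on hex_dump_to_buffer having produced one "xx " triple per byte. The compaction can be checked in isolation; a self-contained sketch (the spaced form is built by hand here rather than by the kernel helper):

#include <stdio.h>

int main(void)
{
	/* 16 bytes of UUID as 32 hex digits (illustrative input). */
	char hex[] = "00112233445566778899aabbccddeeff";
	char spaced[64];
	int i;

	/* Build the "xx xx xx ..." spaced form first... */
	for (i = 0; i < 16; i++)
		sprintf(spaced + 3 * i, "%.2s ", hex + 2 * i);

	/* ...then compact it exactly as the loop above does: each pair
	 * moves from offset 3*i to offset 2*i. */
	for (i = 1; i < 16; i++) {
		spaced[2 * i] = spaced[3 * i];
		spaced[2 * i + 1] = spaced[3 * i + 1];
	}
	spaced[32] = 0;
	printf("UUID=%s\n", spaced);	/* 32 hex digits, no spaces */
	return 0;
}

The result is the 32-digit string used to build the UUID=...:offset form of the resume parameter.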
- * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR> - * - * \section Credits - * - * Nigel would like to thank the following people for their work: - * - * Bernard Blackham <bernard@blackham.com.au><BR> - * Web page & Wiki administration, some coding. A person without whom - * TuxOnIce would not be where it is. - * - * Michael Frank <mhf@linuxmail.org><BR> - * Extensive testing and help with improving stability. I was constantly - * amazed by the quality and quantity of Michael's help. - * - * Pavel Machek <pavel@ucw.cz><BR> - * Modifications, defectiveness pointing, being with Gabor at the very - * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and - * 2.5.17. Even though Pavel and I disagree on the direction suspend to - * disk should take, I appreciate the valuable work he did in helping Gabor - * get the concept working. - * - * ..and of course the myriads of TuxOnIce users who have helped diagnose - * and fix bugs, made suggestions on how to improve the code, proofread - * documentation, and donated time and money. - * - * Thanks also to corporate sponsors: - * - * <B>Redhat.</B>Sometime employer from May 2006 (my fault, not Redhat's!). - * - * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who - * allowed him to work on TuxOnIce and PM related issues on company time. - * - * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct - * 2003 to Jan 2004. - * - * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing - * maintenance of SMP and Highmem support. - * - * <B>OSDL.</B> Provided access to various hardware configurations, make - * occasional small donations to the project. - */ - -#include <linux/suspend.h> -#include <linux/module.h> -#include <linux/freezer.h> -#include <generated/utsrelease.h> -#include <linux/cpu.h> -#include <linux/console.h> -#include <linux/writeback.h> -#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */ -#include <linux/bio.h> -#include <linux/kgdb.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_storage.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_cluster.h" - -/*! Pageset metadata. 
*/ -struct pagedir pagedir2 = {2}; - -static mm_segment_t oldfs; -static DEFINE_MUTEX(tuxonice_in_use); -static int block_dump_save; - -int toi_trace_index; - -/* Binary signature if an image is present */ -char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; - -unsigned long boot_kernel_data_buffer; - -static char *result_strings[] = { - "Hibernation was aborted", - "The user requested that we cancel the hibernation", - "No storage was available", - "Insufficient storage was available", - "Freezing filesystems and/or tasks failed", - "A pre-existing image was used", - "We would free memory, but image size limit doesn't allow this", - "Unable to free enough memory to hibernate", - "Unable to obtain the Power Management Semaphore", - "A device suspend/resume returned an error", - "A system device suspend/resume returned an error", - "The extra pages allowance is too small", - "We were unable to successfully prepare an image", - "TuxOnIce module initialisation failed", - "TuxOnIce module cleanup failed", - "I/O errors were encountered", - "Ran out of memory", - "An error was encountered while reading the image", - "Platform preparation failed", - "CPU Hotplugging failed", - "Architecture specific preparation failed", - "Pages needed resaving, but we were told to abort if this happens", - "We can't hibernate at the moment (invalid resume= or filewriter " - "target?)", - "A hibernation preparation notifier chain member cancelled the " - "hibernation", - "Pre-snapshot preparation failed", - "Pre-restore preparation failed", - "Failed to disable usermode helpers", - "Can't resume from alternate image", - "Header reservation too small", - "Device Power Management Preparation failed", -}; - -/** - * toi_finish_anything - cleanup after doing anything - * @hibernate_or_resume: Whether finishing a cycle or attempt at - * resuming. - * - * This is our basic clean-up routine, matching start_anything below. We - * call cleanup routines, drop module references and restore process fs and - * cpus allowed masks, together with the global block_dump variable's value. - **/ -void toi_finish_anything(int hibernate_or_resume) -{ - toi_running = 0; - toi_cleanup_modules(hibernate_or_resume); - toi_put_modules(); - if (hibernate_or_resume) { - block_dump = block_dump_save; - set_cpus_allowed_ptr(current, cpu_all_mask); - toi_alloc_print_debug_stats(); - atomic_inc(&snapshot_device_available); - unlock_system_sleep(); - } - - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); -} - -/** - * toi_start_anything - basic initialisation for TuxOnIce - * @toi_or_resume: Whether starting a cycle or attempt at resuming. - * - * Our basic initialisation routine. Take references on modules, use the - * kernel segment, recheck resume= if no active allocator is set, initialise - * modules, save and reset block_dump and ensure we're running on CPU0. 
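toi_start_anything and toi_finish_anything below form a strict bracket: everything acquired in the first is released by the second (or by the error labels) in reverse order. A minimal userspace model of that goto-based unwinding discipline, with illustrative names:

#include <stdio.h>

static int take_a(void) { puts("take a"); return 0; }
static int take_b(void) { puts("take b"); return -1; }	/* fails here */
static void drop_a(void) { puts("drop a"); }

/* Acquire in order, release in reverse on failure: the same shape
 * as the early_init_err/prehibernate_err labels below. */
static int start_anything(void)
{
	if (take_a())
		return -1;
	if (take_b())
		goto err_b;
	return 0;
err_b:
	drop_a();
	return -1;
}

int main(void)
{
	printf("start_anything() = %d\n", start_anything());
	return 0;
}

The real function has more stages, but each error label undoes exactly the stages acquired before it, in reverse order.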
- **/ -int toi_start_anything(int hibernate_or_resume) -{ - mutex_lock(&tuxonice_in_use); - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - toi_trace_index = 0; - - if (hibernate_or_resume) { - lock_system_sleep(); - - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - goto snapshotdevice_unavailable; - } - - if (hibernate_or_resume == SYSFS_HIBERNATE) - toi_print_modules(); - - if (toi_get_modules()) { - printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); - goto prehibernate_err; - } - - if (hibernate_or_resume) { - block_dump_save = block_dump; - block_dump = 0; - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - } - - if (toi_initialise_modules_early(hibernate_or_resume)) - goto early_init_err; - - if (!toiActiveAllocator) - toi_attempt_to_parse_resume_device(!hibernate_or_resume); - - if (!toi_initialise_modules_late(hibernate_or_resume)) { - toi_running = 1; /* For the swsusp code we use :< */ - return 0; - } - - toi_cleanup_modules(hibernate_or_resume); -early_init_err: - if (hibernate_or_resume) { - block_dump_save = block_dump; - set_cpus_allowed_ptr(current, cpu_all_mask); - } - toi_put_modules(); -prehibernate_err: - if (hibernate_or_resume) - atomic_inc(&snapshot_device_available); -snapshotdevice_unavailable: - if (hibernate_or_resume) - mutex_unlock(&pm_mutex); - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); - return -EBUSY; -} - -/* - * Nosave page tracking. - * - * Here rather than in prepare_image because we want to do it once only at the - * start of a cycle. - */ - -/** - * mark_nosave_pages - set up our Nosave bitmap - * - * Build a bitmap of Nosave pages from the list. The bitmap allows faster - * use when preparing the image. - **/ -static void mark_nosave_pages(void) -{ - struct nosave_region *region; - - list_for_each_entry(region, &nosave_regions, list) { - unsigned long pfn; - - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - SetPageNosave(pfn_to_page(pfn)); - } - } -} - -/** - * allocate_bitmaps - allocate bitmaps used to record page states - * - * Allocate the bitmaps we use to record the various TuxOnIce related - * page states. - **/ -static int allocate_bitmaps(void) -{ - if (toi_alloc_bitmap(&pageset1_map) || - toi_alloc_bitmap(&pageset1_copy_map) || - toi_alloc_bitmap(&pageset2_map) || - toi_alloc_bitmap(&io_map) || - toi_alloc_bitmap(&nosave_map) || - toi_alloc_bitmap(&free_map) || - toi_alloc_bitmap(&compare_map) || - toi_alloc_bitmap(&page_resave_map)) - return 1; - - return 0; -} - -/** - * free_bitmaps - free the bitmaps used to record page states - * - * Free the bitmaps allocated above. It is not an error to call - * memory_bm_free on a bitmap that isn't currently allocated. - **/ -static void free_bitmaps(void) -{ - toi_free_bitmap(&pageset1_map); - toi_free_bitmap(&pageset1_copy_map); - toi_free_bitmap(&pageset2_map); - toi_free_bitmap(&io_map); - toi_free_bitmap(&nosave_map); - toi_free_bitmap(&free_map); - toi_free_bitmap(&compare_map); - toi_free_bitmap(&page_resave_map); -} - -/** - * io_MB_per_second - return the number of MB/s read or written - * @write: Whether to return the speed at which we wrote. - * - * Calculate the number of megabytes per second that were read or written. - **/ -static int io_MB_per_second(int write) -{ - return (toi_bkd.toi_io_time[write][1]) ? - MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / - toi_bkd.toi_io_time[write][1] : 0; -} - -#define SNPRINTF(a...) 
do { len += scnprintf(((char *) buffer) + len, \ - count - len - 1, ## a); } while (0) - -/** - * get_debug_info - fill a buffer with debugging information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_debug_info(const char *buffer, int count) -{ - int len = 0, i, first_result = 1; - - SNPRINTF("TuxOnIce debugging info:\n"); - SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); - SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); - SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__); - SNPRINTF("- Attempt number : %d\n", nr_hibernates); - SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", - toi_result, - toi_bkd.toi_action, - toi_bkd.toi_debug_state, - toi_bkd.toi_default_console_level, - image_size_limit, - toi_poweroff_method); - SNPRINTF("- Overall expected compression percentage: %d.\n", - 100 - toi_expected_compression_ratio()); - len += toi_print_module_debug_info(((char *) buffer) + len, - count - len - 1); - if (toi_bkd.toi_io_time[0][1]) { - if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { - SNPRINTF("- I/O speed: Write %ld KB/s", - (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld KB/s", - (KB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } else { - SNPRINTF("- I/O speed: Write %ld MB/s", - (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld MB/s", - (MB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } - SNPRINTF(".\n"); - } else - SNPRINTF("- No I/O speed stats available.\n"); - SNPRINTF("- Extra pages : %lu used/%lu.\n", - extra_pd1_pages_used, extra_pd1_pages_allowance); - - for (i = 0; i < TOI_NUM_RESULT_STATES; i++) - if (test_result_state(i)) { - SNPRINTF("%s: %s.\n", first_result ? - "- Result " : - " ", - result_strings[i]); - first_result = 0; - } - if (first_result) - SNPRINTF("- Result : %s.\n", nr_hibernates ? - "Succeeded" : - "No hibernation attempts so far"); - return len; -} - -#ifdef CONFIG_TOI_INCREMENTAL -/** - * get_toi_page_state - fill a buffer with page state information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_page_state(const char *buffer, int count) -{ - int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0; - int len = 0; - struct zone *zone; - int allocated_bitmaps = 0; - - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - - if (!free_map) { - BUG_ON(toi_alloc_bitmap(&free_map)); - allocated_bitmaps = 1; - } - - toi_generate_free_page_map(); - - for_each_populated_zone(zone) { - unsigned long loop; - - total += zone->spanned_pages; - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - /* - * If the page gets allocated, it will be need - * saving in an image. - * Don't bother with explicitly removing any - * RO protection applied below. 
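As an aside on io_MB_per_second above: it turns an (amount, jiffies) pair into a rate, and the guard against a zero elapsed time is what makes the debug read safe before any I/O has happened. In miniature (HZ fixed at 250 and 4 KiB pages assumed purely for illustration):

#include <stdio.h>

#define HZ 250			/* illustrative; the kernel value is config-dependent */
#define MB(x) ((x) >> 8)	/* 4 KiB pages to MiB: 256 pages per MiB */

/* io_time[0] = amount in pages, io_time[1] = elapsed jiffies. */
static long io_time[2] = { 51200, 500 };	/* 200 MiB in 2 seconds */

static long mb_per_second(void)
{
	return io_time[1] ? MB(io_time[0]) * HZ / io_time[1] : 0;
}

int main(void)
{
	printf("%ld MB/s\n", mb_per_second());	/* 200 / 2 = 100 MB/s */
	return 0;
}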
- * We'll SetPageTOI_Dirty(page) if/when it - * gets allocated. - */ - free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageTOI_Untracked(page)) { - untracked++; - } else if (PageTOI_RO(page)) { - ro++; - } else if (PageTOI_Dirty(page)) { - dirty++; - } else { - printk("Page %ld state 'other'.\n", pfn); - other++; - } - } - } - - if (allocated_bitmaps) { - toi_free_bitmap(&free_map); - } - - set_cpus_allowed_ptr(current, cpu_all_mask); - - SNPRINTF("TuxOnIce page breakdown:\n"); - SNPRINTF("- Free : %d\n", free); - SNPRINTF("- Untracked : %d\n", untracked); - SNPRINTF("- Read only : %d\n", ro); - SNPRINTF("- Dirty : %d\n", dirty); - SNPRINTF("- Other : %d\n", other); - SNPRINTF("- Invalid : %d\n", invalid); - SNPRINTF("- Total : %d\n", total); - return len; -} -#endif - -/** - * do_cleanup - cleanup after attempting to hibernate or resume - * @get_debug_info: Whether to allocate and return debugging info. - * - * Cleanup after attempting to hibernate or resume, possibly getting - * debugging info as we do so. - **/ -static void do_cleanup(int get_debug_info, int restarting) -{ - int i = 0; - char *buffer = NULL; - - trap_non_toi_io = 0; - - if (get_debug_info) - toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); - - free_checksum_pages(); - - toi_cbw_restore(); - toi_free_cbw_data(); - - if (get_debug_info) - buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); - - if (buffer) - i = get_toi_debug_info(buffer, PAGE_SIZE); - - toi_free_extra_pagedir_memory(); - - pagedir1.size = 0; - pagedir2.size = 0; - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - if (boot_kernel_data_buffer) { - if (!test_toi_state(TOI_BOOT_KERNEL)) - toi_free_page(37, boot_kernel_data_buffer); - boot_kernel_data_buffer = 0; - } - - if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) { - unlock_device_hotplug(); - clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - } - - clear_toi_state(TOI_BOOT_KERNEL); - if (current->flags & PF_SUSPEND_TASK) - thaw_processes(); - - if (!restarting) - toi_stop_other_threads(); - - if (toi_keeping_image && - !test_result_state(TOI_ABORTED)) { - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - "TuxOnIce: Not invalidating the image due " - "to Keep Image or Incremental Image being enabled."); - set_result_state(TOI_KEPT_IMAGE); - - /* - * For an incremental image, free unused storage so - * swap (if any) can be used for normal system operation, - * if so desired. - */ - - toiActiveAllocator->free_unused_storage(); - } else - if (toiActiveAllocator) - toiActiveAllocator->remove_image(); - - free_bitmaps(); - usermodehelper_enable(); - - if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { - pm_notifier_call_chain(PM_POST_HIBERNATION); - clear_toi_state(TOI_NOTIFIERS_PREPARE); - } - - if (buffer && i) { - /* Printk can only handle 1023 bytes, including - * its level mangling. */ - for (i = 0; i < 3; i++) - printk(KERN_ERR "%s", buffer + (1023 * i)); - toi_free_page(20, (unsigned long) buffer); - } - - if (!restarting) - toi_cleanup_console(); - - free_attention_list(); - - if (!restarting) - toi_deactivate_storage(0); - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * check_still_keeping_image - we kept an image; check whether to reuse it. - * - * We enter this routine when we have kept an image. If the user has said they - * want to still keep it, all we need to do is powerdown. 
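The zone walk just shown skips whole free regions in one step by bumping the loop counter, and the off-by-one is absorbed by the loop increment. A ten-page model of that skip arithmetic (the array stands in for toi_size_of_free_region):

#include <stdio.h>

/* free_len[pfn] = length of the free region starting at pfn, else 0. */
static int free_len[10] = { 0, 3, 0, 0, 0, 0, 2, 0, 0, 0 };

int main(void)
{
	int free = 0, used = 0;

	for (int pfn = 0; pfn < 10; pfn++) {
		int chunk = free_len[pfn];
		if (chunk) {
			free += chunk;
			pfn += chunk - 1;	/* -1: the loop increment adds 1 */
			continue;
		}
		used++;
	}
	printf("free %d, used %d\n", free, used);	/* free 5, used 5 */
	return 0;
}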
If powering down - * means hibernating to ram and the power doesn't run out, we'll return 1. - * If we do power off properly or the battery runs out, we'll resume via the - * normal paths. - * - * If the user has said they want to remove the previously kept image, we - * remove it, and return 0. We'll then store a new image. - **/ -static int check_still_keeping_image(void) -{ - if (toi_keeping_image) { - if (!test_action_state(TOI_INCREMENTAL_IMAGE)) { - printk(KERN_INFO "Image already stored: powering down " - "immediately."); - do_toi_step(STEP_HIBERNATE_POWERDOWN); - return 1; - } - /** - * Incremental image - need to write new part. - * We detect that we're writing an incremental image by looking - * at test_result_state(TOI_KEPT_IMAGE) - **/ - return 0; - } - - printk(KERN_INFO "Invalidating previous image.\n"); - toiActiveAllocator->remove_image(); - - return 0; -} - -/** - * toi_init - prepare to hibernate to disk - * - * Initialise variables & data structures, in preparation for - * hibernating to disk. - **/ -static int toi_init(int restarting) -{ - int result, i, j; - - toi_result = 0; - - printk(KERN_INFO "Initiating a hibernation cycle.\n"); - - nr_hibernates++; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - toi_bkd.toi_io_time[i][j] = 0; - - if (!test_toi_state(TOI_CAN_HIBERNATE) || - allocate_bitmaps()) - return 1; - - mark_nosave_pages(); - - if (!restarting) - toi_prepare_console(); - - result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (result) { - set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); - return 1; - } - set_toi_state(TOI_NOTIFIERS_PREPARE); - - if (!restarting) { - printk(KERN_ERR "Starting other threads."); - toi_start_other_threads(); - } - - result = usermodehelper_disable(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to disable usermode " - "helpers\n"); - set_result_state(TOI_USERMODE_HELPERS_ERR); - return 1; - } - - boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); - if (!boot_kernel_data_buffer) { - printk(KERN_ERR "TuxOnIce: Failed to allocate " - "boot_kernel_data_buffer.\n"); - set_result_state(TOI_OUT_OF_MEMORY); - return 1; - } - - toi_allocate_cbw_data(); - - return 0; -} - -/** - * can_hibernate - perform basic 'Can we hibernate?' tests - * - * Perform basic tests that must pass if we're going to be able to hibernate: - * Can we get the pm_mutex? Is resume= valid (we need to know where to write - * the image header). - **/ -static int can_hibernate(void) -{ - if (!test_toi_state(TOI_CAN_HIBERNATE)) - toi_attempt_to_parse_resume_device(0); - - if (!test_toi_state(TOI_CAN_HIBERNATE)) { - printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" - "This may be because you haven't put something along " - "the lines of\n\nresume=swap:/dev/hda1\n\n" - "in lilo.conf or equivalent. (Where /dev/hda1 is your " - "swap partition).\n"); - set_abort_result(TOI_CANT_SUSPEND); - return 0; - } - - if (strlen(alt_resume_param)) { - attempt_to_parse_alt_resume_param(); - - if (!strlen(alt_resume_param)) { - printk(KERN_INFO "Alternate resume parameter now " - "invalid. Aborting.\n"); - set_abort_result(TOI_CANT_USE_ALT_RESUME); - return 0; - } - } - - return 1; -} - -/** - * do_post_image_write - having written an image, figure out what to do next - * - * After writing an image, we might load an alternate image or power down. - * Powering down might involve hibernating to ram, in which case we also - * need to handle reloading pageset2. 
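Throughout this file, status travels through bit flags (set_result_state, test_result_state) rather than return values alone, so toi_result can later be reported whole via the last_result sysfs file. A minimal model of that pattern (bit names illustrative):

#include <stdio.h>

enum { R_ABORTED, R_NOSTORAGE, R_OOM };	/* result bits */

static unsigned long result;

#define set_result_state(b)  (result |= 1UL << (b))
#define test_result_state(b) (result & (1UL << (b)))

int main(void)
{
	set_result_state(R_NOSTORAGE);
	if (test_result_state(R_NOSTORAGE))
		puts("no storage was available");
	printf("last_result = %lu\n", result);	/* whole word, all flags */
	return 0;
}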
- **/ -static int do_post_image_write(void) -{ - /* If switching images fails, do normal powerdown */ - if (alt_resume_param[0]) - do_toi_step(STEP_RESUME_ALT_IMAGE); - - toi_power_down(); - - barrier(); - mb(); - return 0; -} - -/** - * __save_image - do the hard work of saving the image - * - * High level routine for getting the image saved. The key assumptions made - * are that processes have been frozen and sufficient memory is available. - * - * We also exit through here at resume time, coming back from toi_hibernate - * after the atomic restore. This is the reason for the toi_in_hibernate - * test. - **/ -static int __save_image(void) -{ - int temp_result, did_copy = 0; - - toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - " - Final values: %d and %d.", - pagedir1.size, pagedir2.size); - - toi_cond_pause(1, "About to write pagedir2."); - - temp_result = write_pageset(&pagedir2); - - if (temp_result == -1 || test_result_state(TOI_ABORTED)) - return 1; - - toi_cond_pause(1, "About to copy pageset 1."); - - if (test_result_state(TOI_ABORTED)) - return 1; - - toi_deactivate_storage(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - toi_in_hibernate = 1; - - if (toi_go_atomic(PMSG_FREEZE, 1)) - goto Failed; - - temp_result = toi_hibernate(); - -#ifdef CONFIG_KGDB - if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) - kgdb_breakpoint(); -#endif - - if (!temp_result) - did_copy = 1; - - /* We return here at resume time too! */ - toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); - -Failed: - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - /* Resume time? */ - if (!toi_in_hibernate) { - copyback_post(); - return 0; - } - - /* Nope. Hibernating. So, see if we can save the image... */ - - if (temp_result || test_result_state(TOI_ABORTED)) { - if (did_copy) - goto abort_reloading_pagedir_two; - else - return 1; - } - - toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, - NULL); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write pageset1."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); - - temp_result = write_pageset(&pagedir1); - - /* We didn't overwrite any memory, so no reread needs to be done. */ - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - return 1; - - if (temp_result == 1 || test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write header."); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - temp_result = write_image_header(); - - if (!temp_result && !test_result_state(TOI_ABORTED)) - return 0; - -abort_reloading_pagedir_two: - temp_result = read_pageset2(1); - - /* If that failed, we're sunk. Panic! */ - if (temp_result) - panic("Attempt to reload pagedir 2 while aborting " - "a hibernate failed."); - - return 1; -} - -static void map_ps2_pages(int enable) -{ - unsigned long pfn = 0; - - memory_bm_position_reset(pageset2_map); - pfn = memory_bm_next_pfn(pageset2_map, 0); - - while (pfn != BM_END_OF_MAP) { - struct page *page = pfn_to_page(pfn); - kernel_map_pages(page, 1, enable); - pfn = memory_bm_next_pfn(pageset2_map, 0); - } -} - -/** - * do_save_image - save the image and handle the result - * - * Save the prepared image. If we fail or we're in the path returning - * from the atomic restore, cleanup. 
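The control flow of __save_image above is the subtlest part of the file: after toi_hibernate(), execution reaches the same point twice, once while saving and once when the restored kernel resumes, and toi_in_hibernate is how the two passes are told apart. setjmp/longjmp gives a faithful userspace analogy (a sketch, not the kernel mechanism):

#include <stdio.h>
#include <setjmp.h>

static jmp_buf snapshot;
static int in_hibernate;

int main(void)
{
	in_hibernate = 1;
	if (setjmp(snapshot) == 0) {
		/* First return: the "saving" path. */
		puts("image saved; powering down");
		/* Simulate booting the saved image later: */
		in_hibernate = 0;
		longjmp(snapshot, 1);
	}
	/* Second return: the "resuming" path, as in __save_image. */
	if (!in_hibernate)
		puts("back from atomic restore");
	return 0;
}

toi_in_hibernate plays the role of the flag here: the saving pass sees it set, while the pass that returns from the restore sees it clear.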
- **/ -static int do_save_image(void) -{ - int result; - map_ps2_pages(0); - result = __save_image(); - map_ps2_pages(1); - return result; -} - -/** - * do_prepare_image - try to prepare an image - * - * Seek to initialise and prepare an image to be saved. On failure, - * cleanup. - **/ -static int do_prepare_image(void) -{ - int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - - if (!restarting && toi_activate_storage(0)) - return 1; - - /* - * If kept image and still keeping image and hibernating to RAM, (non - * incremental image case) we will return 1 after hibernating and - * resuming (provided the power doesn't run out. In that case, we skip - * directly to cleaning up and exiting. - */ - - if (!can_hibernate() || - (test_result_state(TOI_KEPT_IMAGE) && - check_still_keeping_image())) - return 1; - - if (toi_init(restarting) || toi_prepare_image() || - test_result_state(TOI_ABORTED)) - return 1; - - trap_non_toi_io = 1; - - return 0; -} - -/** - * do_check_can_resume - find out whether an image has been stored - * - * Read whether an image exists. We use the same routine as the - * image_exists sysfs entry, and just look to see whether the - * first character in the resulting buffer is a '1'. - **/ -int do_check_can_resume(void) -{ - int result = -1; - - if (toi_activate_storage(0)) - return -1; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(1); - - if (toiActiveAllocator) - result = toiActiveAllocator->image_exists(1); - - toi_deactivate_storage(0); - return result; -} - -/** - * do_load_atomic_copy - load the first part of an image, if it exists - * - * Check whether we have an image. If one exists, do sanity checking - * (possibly invalidating the image or even rebooting if the user - * requests that) before loading it into memory in preparation for the - * atomic restore. - * - * If and only if we have an image loaded and ready to restore, we return 1. - **/ -static int do_load_atomic_copy(void) -{ - int read_image_result = 0; - - if (sizeof(swp_entry_t) != sizeof(long)) { - printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size" - " of long. Please report this!\n"); - return 1; - } - - if (!resume_file[0]) - printk(KERN_WARNING "TuxOnIce: " - "You need to use a resume= command line parameter to " - "tell TuxOnIce where to look for an image.\n"); - - toi_activate_storage(0); - - if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) && - !toi_attempt_to_parse_resume_device(0)) { - /* - * Without a usable storage device we can do nothing - - * even if noresume is given - */ - - if (!toiNumAllocators) - printk(KERN_ALERT "TuxOnIce: " - "No storage allocators have been registered.\n"); - else - printk(KERN_ALERT "TuxOnIce: " - "Missing or invalid storage location " - "(resume= parameter). Please correct and " - "rerun lilo (or equivalent) before " - "hibernating.\n"); - toi_deactivate_storage(0); - return 1; - } - - if (allocate_bitmaps()) - return 1; - - read_image_result = read_pageset1(); /* non fatal error ignored */ - - if (test_toi_state(TOI_NORESUME_SPECIFIED)) - clear_toi_state(TOI_NORESUME_SPECIFIED); - - toi_deactivate_storage(0); - - if (read_image_result) - return 1; - - return 0; -} - -/** - * prepare_restore_load_alt_image - save & restore alt image variables - * - * Save and restore the pageset1 maps, when loading an alternate image. 
- **/ -static void prepare_restore_load_alt_image(int prepare) -{ - static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; - - if (prepare) { - pageset1_map_save = pageset1_map; - pageset1_map = NULL; - pageset1_copy_map_save = pageset1_copy_map; - pageset1_copy_map = NULL; - set_toi_state(TOI_LOADING_ALT_IMAGE); - toi_reset_alt_image_pageset2_pfn(); - } else { - toi_free_bitmap(&pageset1_map); - pageset1_map = pageset1_map_save; - toi_free_bitmap(&pageset1_copy_map); - pageset1_copy_map = pageset1_copy_map_save; - clear_toi_state(TOI_NOW_RESUMING); - clear_toi_state(TOI_LOADING_ALT_IMAGE); - } -} - -/** - * do_toi_step - perform a step in hibernating or resuming - * - * Perform a step in hibernating or resuming an image. This abstraction - * is in preparation for implementing cluster support, and perhaps replacing - * uswsusp too (haven't looked whether that's possible yet). - **/ -int do_toi_step(int step) -{ - switch (step) { - case STEP_HIBERNATE_PREPARE_IMAGE: - return do_prepare_image(); - case STEP_HIBERNATE_SAVE_IMAGE: - return do_save_image(); - case STEP_HIBERNATE_POWERDOWN: - return do_post_image_write(); - case STEP_RESUME_CAN_RESUME: - return do_check_can_resume(); - case STEP_RESUME_LOAD_PS1: - return do_load_atomic_copy(); - case STEP_RESUME_DO_RESTORE: - /* - * If we succeed, this doesn't return. - * Instead, we return from do_save_image() in the - * hibernated kernel. - */ - return toi_atomic_restore(); - case STEP_RESUME_ALT_IMAGE: - printk(KERN_INFO "Trying to resume alternate image.\n"); - toi_in_hibernate = 0; - save_restore_alt_param(SAVE, NOQUIET); - prepare_restore_load_alt_image(1); - if (!do_check_can_resume()) { - printk(KERN_INFO "Nothing to resume from.\n"); - goto out; - } - if (!do_load_atomic_copy()) - toi_atomic_restore(); - - printk(KERN_INFO "Failed to load image.\n"); -out: - prepare_restore_load_alt_image(0); - save_restore_alt_param(RESTORE, NOQUIET); - break; - case STEP_CLEANUP: - do_cleanup(1, 0); - break; - case STEP_QUIET_CLEANUP: - do_cleanup(0, 0); - break; - } - - return 0; -} - -/* -- Functions for kickstarting a hibernate or resume --- */ - -/** - * toi_try_resume - try to do the steps in resuming - * - * Check if we have an image and if so try to resume. Clear the status - * flags too. - **/ -void toi_try_resume(void) -{ - set_toi_state(TOI_TRYING_TO_RESUME); - resume_attempted = 1; - - current->flags |= PF_MEMALLOC; - toi_start_other_threads(); - - if (do_toi_step(STEP_RESUME_CAN_RESUME) && - !do_toi_step(STEP_RESUME_LOAD_PS1)) - do_toi_step(STEP_RESUME_DO_RESTORE); - - toi_stop_other_threads(); - do_cleanup(0, 0); - - current->flags &= ~PF_MEMALLOC; - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume - * - * Wrapper for when __toi_try_resume is called from swsusp resume path, - * rather than from echo > /sys/power/tuxonice/do_resume. - **/ -static void toi_sys_power_disk_try_resume(void) -{ - resume_attempted = 1; - - /* - * There's a comment in kernel/power/disk.c that indicates - * we should be able to use mutex_lock_nested below. That - * doesn't seem to cut it, though, so let's just turn lockdep - * off for now. 
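Returning to do_toi_step above: exposing each phase behind one dispatcher is what lets hibernate, resume and the alternate-image path share their plumbing, with callers chaining steps and stopping on error. The shape, reduced to a sketch with illustrative step names:

#include <stdio.h>

enum step { STEP_PREPARE, STEP_SAVE, STEP_POWERDOWN };

static int do_prepare(void)   { puts("prepare");   return 0; }
static int do_save(void)      { puts("save");      return 0; }
static int do_powerdown(void) { puts("powerdown"); return 0; }

/* One entry point per phase; a zero return means the step succeeded. */
static int do_step(enum step s)
{
	switch (s) {
	case STEP_PREPARE:   return do_prepare();
	case STEP_SAVE:      return do_save();
	case STEP_POWERDOWN: return do_powerdown();
	}
	return -1;
}

int main(void)
{
	if (!do_step(STEP_PREPARE) && !do_step(STEP_SAVE))
		do_step(STEP_POWERDOWN);
	return 0;
}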
- */ - lockdep_off(); - - if (toi_start_anything(SYSFS_RESUMING)) - goto out; - - toi_try_resume(); - - /* - * For initramfs, we have to clear the boot time - * flag after trying to resume - */ - clear_toi_state(TOI_BOOT_TIME); - - toi_finish_anything(SYSFS_RESUMING); -out: - lockdep_on(); -} - -/** - * toi_try_hibernate - try to start a hibernation cycle - * - * Start a hibernation cycle, coming in from either - * echo > /sys/power/tuxonice/do_suspend - * - * or - * - * echo disk > /sys/power/state - * - * In the later case, we come in without pm_sem taken; in the - * former, it has been taken. - **/ -int toi_try_hibernate(void) -{ - int result = 0, sys_power_disk = 0, retries = 0; - - if (!mutex_is_locked(&tuxonice_in_use)) { - /* Came in via /sys/power/disk */ - if (toi_start_anything(SYSFS_HIBERNATING)) - return -EBUSY; - sys_power_disk = 1; - } - - current->flags |= PF_MEMALLOC; - - if (test_toi_state(TOI_CLUSTER_MODE)) { - toi_initiate_cluster_hibernate(); - goto out; - } - -prepare: - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - - if (result) - goto out; - - if (test_action_state(TOI_FREEZER_TEST)) - goto out_restore_gfp_mask; - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - - if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) { - if (retries < 2) { - do_cleanup(0, 1); - retries++; - clear_result_state(TOI_ABORTED); - extra_pd1_pages_allowance = extra_pd1_pages_used + 500; - printk(KERN_INFO "Automatically adjusting the extra" - " pages allowance to %ld and restarting.\n", - extra_pd1_pages_allowance); - pm_restore_gfp_mask(); - goto prepare; - } - - printk(KERN_INFO "Adjusted extra pages allowance twice and " - "still couldn't hibernate successfully. Giving up."); - } - - /* This code runs at resume time too! */ - if (!result && toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); - -out_restore_gfp_mask: - pm_restore_gfp_mask(); -out: - do_cleanup(1, 0); - current->flags &= ~PF_MEMALLOC; - - if (sys_power_disk) - toi_finish_anything(SYSFS_HIBERNATING); - - return result; -} - -/* - * channel_no: If !0, -c <channel_no> is added to args (userui). - */ -int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug) -{ - int retval; - static char *envp[] = { - "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; - static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL - }; - char *channel = NULL; - int arg = 0, size; - char test_read[255]; - char *orig_posn = command; - - if (!strlen(orig_posn)) - return 1; - - if (channel_no) { - channel = toi_kzalloc(4, 6, GFP_KERNEL); - if (!channel) { - printk(KERN_INFO "Failed to allocate memory in " - "preparing to launch userspace program.\n"); - return 1; - } - } - - /* Up to 6 args supported */ - while (arg < 6) { - sscanf(orig_posn, "%s", test_read); - size = strlen(test_read); - if (!(size)) - break; - argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP); - strcpy(argv[arg], test_read); - orig_posn += size + 1; - *test_read = 0; - arg++; - } - - if (channel_no) { - sprintf(channel, "-c%d", channel_no); - argv[arg] = channel; - } else - arg--; - - if (debug) { - argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP); - strcpy(argv[arg], "--debug"); - } - - retval = call_usermodehelper(argv[0], argv, envp, wait); - - /* - * If the program reports an error, retval = 256. Don't complain - * about that here. 
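The argument scanner in toi_launch_userspace_program above is fragile: sscanf with a bare %s has no width limit, and orig_posn += size + 1 assumes exactly one space between arguments. A bounded tokenizer along these lines avoids both problems (a sketch of an alternative, not the kernel's code; the command path is illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char command[] = "/sbin/userui  -c1";	/* note the double space */
	char *argv[8] = { 0 };
	int argc = 0;

	/* strtok-style splitting copes with runs of whitespace and
	 * never reads past the buffer. */
	for (char *tok = strtok(command, " \t");
	     tok && argc < 7;
	     tok = strtok(NULL, " \t"))
		argv[argc++] = tok;

	for (int i = 0; i < argc; i++)
		printf("argv[%d] = %s\n", i, argv[i]);
	return 0;
}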
- */ - if (retval && retval != 256) - printk(KERN_ERR "Failed to launch userspace program '%s': " - "Error %d\n", command, retval); - - { - int i; - for (i = 0; i < arg; i++) - if (argv[i] && argv[i] != channel) - toi_kfree(5, argv[i], sizeof(*argv[i])); - } - - toi_kfree(4, channel, sizeof(*channel)); - - return retval; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_LONG("extra_pages_allowance", SYSFS_RW, - &extra_pd1_pages_allowance, 0, LONG_MAX, 0), - SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, - image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), - SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_resume_device2), - SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_alt_resume_param), - SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, - NULL), - SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, - TOI_IGNORE_ROOTFS, 0), - SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, - INT_MAX, 0), - SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), - SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_MULTITHREADED_IO, 0), - SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_FLUSHER_THREAD, 0), - SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAGESET2_FULL, 0), - SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), - SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, - TOI_REPLACE_SWSUSP, 0), - SYSFS_STRING("resume_commandline", SYSFS_RW, - toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, - NULL), - SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), - SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, - TOI_FREEZER_TEST, 0), - SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), - SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, - TOI_TEST_FILTER_SPEED, 0), - SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PAGESET2, 0), - SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PS2_IF_UNNEEDED, 0), - SYSFS_STRING("binary_signature", SYSFS_READONLY, - tuxonice_signature, 9, 0, NULL), - SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0, - NULL), -#ifdef CONFIG_KGDB - SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, - TOI_POST_RESUME_BREAKPOINT, 0), -#endif - SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_READAHEAD, 0), - SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action, - TOI_TRACE_DEBUG_ON, 0), -#ifdef CONFIG_TOI_KEEP_IMAGE - SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, - 0), -#endif -#ifdef CONFIG_TOI_INCREMENTAL - SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0, - NULL), - SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action, - TOI_INCREMENTAL_IMAGE, 1), -#endif -}; - -static struct toi_core_fns my_fns = { - .get_nonconflicting_page = __toi_get_nonconflicting_page, - .post_context_save = __toi_post_context_save, - .try_hibernate = toi_try_hibernate, - .try_resume = toi_sys_power_disk_try_resume, -}; - -/** - * core_load - initialisation of TuxOnIce core - * - * Initialise the core, beginning with sysfs. 
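The sysfs_params table below is the registration idiom used throughout these modules: a static array of descriptors plus one loop, so adding a tunable is a one-line change. In miniature (descriptor layout and names illustrative):

#include <stdio.h>

struct param { const char *name; int *value; };

static int image_size_limit = -2;
static int max_workers;

/* One descriptor per sysfs file; the loop below stands in for the
 * per-entry toi_register_sysfs_file() calls. */
static struct param params[] = {
	{ "image_size_limit", &image_size_limit },
	{ "max_workers",      &max_workers },
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(params) / sizeof(params[0]); i++)
		printf("registered %s (=%d)\n", params[i].name, *params[i].value);
	return 0;
}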
Checksum and so on are part of - * the core, but have their own initialisation routines because they either - * aren't compiled in all the time or have their own subdirectories. - **/ -static __init int core_load(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION - " (http://tuxonice.net)\n"); - - if (!hibernation_available()) { - printk(KERN_INFO "TuxOnIce disabled due to request for hibernation" - " to be disabled in this kernel.\n"); - return 1; - } - - if (toi_sysfs_init()) - return 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - toi_core_fns = &my_fns; - - if (toi_alloc_init()) - return 1; - if (toi_checksum_init()) - return 1; - if (toi_usm_init()) - return 1; - if (toi_ui_init()) - return 1; - if (toi_poweroff_init()) - return 1; - if (toi_cluster_init()) - return 1; - if (toi_cbw_init()) - return 1; - - return 0; -} - -late_initcall(core_load); diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c deleted file mode 100644 index a8c5f3660..000000000 --- a/kernel/power/tuxonice_incremental.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * kernel/power/tuxonice_incremental.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines related to storing incremental images - that - * is, retaining an image after an initial cycle and then storing incremental - * changes on subsequent hibernations. - * - * Based in part on... - * - * Debug helper to dump the current kernel pagetables of the system - * so that we can see what the various memory ranges are set to. - * - * (C) Copyright 2008 Intel Corporation - * - * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/mm.h> -#include <linux/tuxonice.h> -#include <linux/sched.h> -#include <asm/pgtable.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/page.h> -#include "tuxonice_pageflags.h" -#include "tuxonice_builtin.h" -#include "power.h" - -int toi_do_incremental_initcall; - -extern void kdb_init(int level); -extern noinline void kgdb_breakpoint(void); - -#undef pr_debug -#if 0 -#define pr_debug(a, b...) do { printk(a, ##b); } while(0) -#else -#define pr_debug(a, b...) do { } while(0) -#endif - -/* Multipliers for offsets within the PTEs */ -#define PTE_LEVEL_MULT (PAGE_SIZE) -#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) -#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) - -/* - * This function gets called on a break in a continuous series - * of PTE entries; the next one is different so we need to - * print what we collected so far. 
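The walker that follows descends pgd to pud to pmd to pte, treating large or non-present entries as leaves and skipping "none" entries. Its shape is the classic recursive-descent table walk; schematically, with a two-level toy table in place of the x86 structures (all names here are stand-ins):

#include <stdio.h>

#define ENTRIES 4

/* Two-level toy table: top[i] points to a leaf table or is NULL,
 * mirroring the pgd_none()/pgd_present() checks in miniature. */
static int leaf_a[ENTRIES] = { 1, 2, 3, 4 };
static int leaf_b[ENTRIES] = { 5, 6, 7, 8 };
static int *top[ENTRIES] = { leaf_a, NULL, leaf_b, NULL };

static void note(int v) { printf("leaf %d\n", v); }

int main(void)
{
	for (int i = 0; i < ENTRIES; i++) {
		if (!top[i])
			continue;		/* "none" entry: nothing to descend into */
		for (int j = 0; j < ENTRIES; j++)
			note(top[i][j]);	/* visit every next-level entry */
	}
	return 0;
}

The real walker adds two more levels and the large/not-present leaf checks, but the descend-or-skip decision at each entry is the same.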
- */ -static void note_page(void *addr) -{ - static struct page *lastpage; - struct page *page; - - page = virt_to_page(addr); - - if (page != lastpage) { - unsigned int level; - pte_t *pte = lookup_address((unsigned long) addr, &level); - struct page *pt_page2 = pte_page(*pte); - //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2)); - SetPageTOI_Untracked(pt_page2); - lastpage = page; - } -} - -static void walk_pte_level(pmd_t addr) -{ - int i; - pte_t *start; - - start = (pte_t *) pmd_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PTE; i++) { - note_page(start); - start++; - } -} - -#if PTRS_PER_PMD > 1 - -static void walk_pmd_level(pud_t addr) -{ - int i; - pmd_t *start; - - start = (pmd_t *) pud_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PMD; i++) { - if (!pmd_none(*start)) { - if (pmd_large(*start) || !pmd_present(*start)) - note_page(start); - else - walk_pte_level(*start); - } else - note_page(start); - start++; - } -} - -#else -#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a))) -#define pud_large(a) pmd_large(__pmd(pud_val(a))) -#define pud_none(a) pmd_none(__pmd(pud_val(a))) -#endif - -#if PTRS_PER_PUD > 1 - -static void walk_pud_level(pgd_t addr) -{ - int i; - pud_t *start; - - start = (pud_t *) pgd_page_vaddr(addr); - - for (i = 0; i < PTRS_PER_PUD; i++) { - if (!pud_none(*start)) { - if (pud_large(*start) || !pud_present(*start)) - note_page(start); - else - walk_pmd_level(*start); - } else - note_page(start); - - start++; - } -} - -#else -#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a))) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) -#endif - -/* - * Not static in the original at the time of writing, so needs renaming here. - */ -static void toi_ptdump_walk_pgd_level(pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_level4_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif - int i; - if (pgd) { - start = pgd; - } - - for (i = 0; i < PTRS_PER_PGD; i++) { - if (!pgd_none(*start)) { - if (pgd_large(*start) || !pgd_present(*start)) - note_page(start); - else - walk_pud_level(*start); - } else - note_page(start); - - start++; - } - - /* Flush out the last page */ - note_page(start); -} - -#ifdef CONFIG_PARAVIRT -extern struct pv_info pv_info; - -static void toi_set_paravirt_ops_untracked(void) { - int i; - - unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)), - pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end)); - //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end); - for (i = pvpfn; i <= pvpfn_end; i++) { - SetPageTOI_Untracked(pfn_to_page(i)); - } -} -#else -#define toi_set_paravirt_ops_untracked() { do { } while(0) } -#endif - -extern void toi_mark_per_cpus_pages_untracked(void); - -void toi_untrack_stack(unsigned long *stack) -{ - int i; - struct page *stack_page = virt_to_page(stack); - - for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) { - pr_debug("Untrack stack page %p.\n", page_address(stack_page + i)); - SetPageTOI_Untracked(stack_page + i); - } -} -void toi_untrack_process(struct task_struct *p) -{ - SetPageTOI_Untracked(virt_to_page(p)); - pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p))); - - toi_untrack_stack(p->stack); -} - -void toi_generate_untracked_map(void) -{ - struct task_struct *p, *t; - struct page *page; - pte_t *pte; - int i; - unsigned int level; - static int been_here = 0; - - if (been_here) - return; - - been_here = 1; - - /* Pagetable 
pages */
-	toi_ptdump_walk_pgd_level(NULL);
-
-	/* Printk buffer - not normally needed but can be helpful for debugging. */
-	//toi_set_logbuf_untracked();
-
-	/* Paravirt ops */
-	toi_set_paravirt_ops_untracked();
-
-	/* Task structs and stacks */
-	for_each_process_thread(p, t) {
-	toi_untrack_process(p);
-	//toi_untrack_stack((unsigned long *) t->thread.sp);
-	}
-
-	for (i = 0; i < NR_CPUS; i++) {
-	struct task_struct *idle = idle_task(i);
-
-	if (idle) {
-	pr_debug("Untrack idle process for CPU %d.\n", i);
-	toi_untrack_process(idle);
-	}
-
-	/* IRQ stack */
-	pr_debug("Untrack IRQ stack for CPU %d.\n", i);
-	toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
-	}
-
-	/* Per CPU data */
-	//pr_debug("Untracking per CPU variable pages.\n");
-	toi_mark_per_cpus_pages_untracked();
-
-	/* Init stack - for bringing up secondary CPUs */
-	page = virt_to_page(init_stack);
-	for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
-	SetPageTOI_Untracked(page + i);
-	}
-
-	pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
-	SetPageTOI_Untracked(pte_page(*pte));
-	SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
-}
-
-/**
- * toi_reset_dirtiness_one
- */
-
-void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
-{
-	struct page *page = pfn_to_page(pfn);
-
-	/**
-	 * Don't worry about whether the Dirty flag is
-	 * already set. If this is our first call, it
-	 * won't be.
-	 */
-
-	preempt_disable();
-
-	ClearPageTOI_Dirty(page);
-	SetPageTOI_RO(page);
-	if (verbose)
-	printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page));
-
-	set_memory_ro((unsigned long) page_address(page), 1);
-
-	preempt_enable();
-}
-
-/**
- * TuxOnIce's incremental image support works by marking all memory apart from
- * the page tables read-only, then, in the page faults that result, enabling
- * writing where appropriate and flagging the page as dirty. Free pages are also
- * marked as dirty and not protected so that if allocated, they will be included
- * in the image without further processing.
- *
- * toi_reset_dirtiness is called when an image exists and incremental images are
- * enabled, and each time we resume thereafter. It is not invoked on a fresh boot.
- *
- * This routine should be called from a single-cpu-running context to avoid races in setting
- * page dirty/read only flags.
- *
- * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it!
- *
- * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
- **/
-
-int toi_reset_dirtiness(int verbose)
-{
-	struct zone *zone;
-	unsigned long loop;
-	int allocated_map = 0;
-
-	toi_generate_untracked_map();
-
-	if (!free_map) {
-	if (!toi_alloc_bitmap(&free_map))
-	return -ENOMEM;
-	allocated_map = 1;
-	}
-
-	toi_generate_free_page_map();
-
-	pr_debug(KERN_EMERG "Reset dirtiness.\n");
-	for_each_populated_zone(zone) {
-	// 64 bit only. No need to worry about highmem.
-	for (loop = 0; loop < zone->spanned_pages; loop++) {
-	unsigned long pfn = zone->zone_start_pfn + loop;
-	struct page *page;
-	int chunk_size;
-
-	if (!pfn_valid(pfn)) {
-	continue;
-	}
-
-	chunk_size = toi_size_of_free_region(zone, pfn);
-	if (chunk_size) {
-	loop += chunk_size - 1;
-	continue;
-	}
-
-	page = pfn_to_page(pfn);
-
-	if (PageNosave(page) || !saveable_page(zone, pfn)) {
-	continue;
-	}
-
-	if (PageTOI_Untracked(page)) {
-	continue;
-	}
-
-	/**
-	 * Do we need to (re)protect the page?
- * If it is already protected (PageTOI_RO), there is - * nothing to do - skip the following. - * If it is marked as dirty (PageTOI_Dirty), it was - * either free and has been allocated or has been - * written to and marked dirty. Reset the dirty flag - * and (re)apply the protection. - */ - if (!PageTOI_RO(page)) { - toi_reset_dirtiness_one(pfn, verbose); - } - } - } - - pr_debug(KERN_EMERG "Done resetting dirtiness.\n"); - - if (allocated_map) { - toi_free_bitmap(&free_map); - } - return 0; -} - -static int toi_reset_dirtiness_initcall(void) -{ - if (toi_do_incremental_initcall) { - pr_info("TuxOnIce: Enabling dirty page tracking.\n"); - toi_reset_dirtiness(0); - } - return 1; -} -extern void toi_generate_untracked_map(void); - -// Leave early_initcall for pages to register untracked sections. -early_initcall(toi_reset_dirtiness_initcall); - -static int __init toi_incremental_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value) && value) - toi_do_incremental_initcall = value; - - return 1; -} -__setup("toi_incremental_initcall", toi_incremental_initcall_setup); diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c deleted file mode 100644 index 3c62c2682..000000000 --- a/kernel/power/tuxonice_io.c +++ /dev/null @@ -1,1932 +0,0 @@ -/* - * kernel/power/tuxonice_io.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. - * - */ - -#include <linux/suspend.h> -#include <linux/version.h> -#include <linux/utsname.h> -#include <linux/mount.h> -#include <linux/highmem.h> -#include <linux/kthread.h> -#include <linux/cpu.h> -#include <linux/fs_struct.h> -#include <linux/bio.h> -#include <linux/fs_uuid.h> -#include <linux/kmod.h> -#include <asm/tlbflush.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_storage.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_extent.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_builtin.h" -#include "tuxonice_checksum.h" -#include "tuxonice_alloc.h" -char alt_resume_param[256]; - -/* Version read from image header at resume */ -static int toi_image_header_version; - -#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ - if (likely(toi_image_header_version >= VERS)) \ - if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ - (char *) &VAR, sizeof(VAR))) { \ - abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ - ERR_ACT; \ - } \ -} while(0) \ - -/* Variables shared between threads and updated under the mutex */ -static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; -static int io_index, io_nextupdate, io_pc, io_pc_step; -static DEFINE_MUTEX(io_mutex); -static DEFINE_PER_CPU(struct page *, last_sought); -static DEFINE_PER_CPU(struct page *, last_high_page); -static DEFINE_PER_CPU(char *, checksum_locn); -static DEFINE_PER_CPU(struct pbe *, last_low_page); -static atomic_t io_count; -atomic_t toi_io_workers; - -static int using_flusher; - -DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); - -int toi_bio_queue_flusher_should_finish; - -int toi_max_workers; - -static char *image_version_error = "The image header version is newer than " \ - "this kernel 
supports."; - -struct toi_module_ops *first_filter; - -static atomic_t toi_num_other_threads; -static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); -enum toi_worker_commands { - TOI_IO_WORKER_STOP, - TOI_IO_WORKER_RUN, - TOI_IO_WORKER_EXIT -}; -static enum toi_worker_commands toi_worker_command; - -/** - * toi_attempt_to_parse_resume_device - determine if we can hibernate - * - * Can we hibernate, using the current resume= parameter? - **/ -int toi_attempt_to_parse_resume_device(int quiet) -{ - struct list_head *Allocator; - struct toi_module_ops *thisAllocator; - int result, returning = 0; - - if (toi_activate_storage(0)) - return 0; - - toiActiveAllocator = NULL; - clear_toi_state(TOI_RESUME_DEVICE_OK); - clear_toi_state(TOI_CAN_RESUME); - clear_result_state(TOI_ABORTED); - - if (!toiNumAllocators) { - if (!quiet) - printk(KERN_INFO "TuxOnIce: No storage allocators have " - "been registered. Hibernating will be " - "disabled.\n"); - goto cleanup; - } - - list_for_each(Allocator, &toiAllocators) { - thisAllocator = list_entry(Allocator, struct toi_module_ops, - type_list); - - /* - * Not sure why you'd want to disable an allocator, but - * we should honour the flag if we're providing it - */ - if (!thisAllocator->enabled) - continue; - - result = thisAllocator->parse_sig_location( - resume_file, (toiNumAllocators == 1), - quiet); - - switch (result) { - case -EINVAL: - /* For this allocator, but not a valid - * configuration. Error already printed. */ - goto cleanup; - - case 0: - /* For this allocator and valid. */ - toiActiveAllocator = thisAllocator; - - set_toi_state(TOI_RESUME_DEVICE_OK); - set_toi_state(TOI_CAN_RESUME); - returning = 1; - goto cleanup; - } - } - if (!quiet) - printk(KERN_INFO "TuxOnIce: No matching enabled allocator " - "found. Resuming disabled.\n"); -cleanup: - toi_deactivate_storage(0); - return returning; -} - -void attempt_to_parse_resume_device2(void) -{ - toi_prepare_usm(); - toi_attempt_to_parse_resume_device(0); - toi_cleanup_usm(); -} - -void save_restore_alt_param(int replace, int quiet) -{ - static char resume_param_save[255]; - static unsigned long toi_state_save; - - if (replace) { - toi_state_save = toi_state; - strcpy(resume_param_save, resume_file); - strcpy(resume_file, alt_resume_param); - } else { - strcpy(resume_file, resume_param_save); - toi_state = toi_state_save; - } - toi_attempt_to_parse_resume_device(quiet); -} - -void attempt_to_parse_alt_resume_param(void) -{ - int ok = 0; - - /* Temporarily set resume_param to the poweroff value */ - if (!strlen(alt_resume_param)) - return; - - printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); - save_restore_alt_param(SAVE, NOQUIET); - if (test_toi_state(TOI_CAN_RESUME)) - ok = 1; - - printk(KERN_INFO "=== Done ===\n"); - save_restore_alt_param(RESTORE, QUIET); - - /* If not ok, clear the string */ - if (ok) - return; - - printk(KERN_INFO "Can't resume from that location; clearing " - "alt_resume_param.\n"); - alt_resume_param[0] = '\0'; -} - -/** - * noresume_reset_modules - reset data structures in case of non resuming - * - * When we read the start of an image, modules (and especially the - * active allocator) might need to reset data structures if we - * decide to remove the image rather than resuming from it. 
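In miniature, the first-match scan in toi_attempt_to_parse_resume_device() looks like the sketch below. struct allocator and its fields are hypothetical simplifications of struct toi_module_ops, not the kernel's own types, and the list walk is flattened into an array.

#include <errno.h>
#include <stddef.h>

/* Hypothetical stand-in for struct toi_module_ops. */
struct allocator {
	const char *name;
	int enabled;
	/* 0: claims the device; -EINVAL: claims it but misconfigured;
	 * anything else: not this allocator's signature. */
	int (*parse_sig_location)(const char *resume_file);
};

/* First enabled allocator that recognises resume_file wins;
 * a recognised-but-invalid configuration aborts the scan. */
static struct allocator *select_allocator(struct allocator *a, size_t n,
					  const char *resume_file)
{
	for (size_t i = 0; i < n; i++) {
		if (!a[i].enabled)
			continue;
		switch (a[i].parse_sig_location(resume_file)) {
		case -EINVAL:
			return NULL;
		case 0:
			return &a[i];
		}
	}
	return NULL;
}

Returning NULL on -EINVAL mirrors the early goto cleanup above: the device was recognised as this allocator's, so trying further allocators would be wrong.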
- **/
-static void noresume_reset_modules(void)
-{
-	struct toi_module_ops *this_filter;
-
-	list_for_each_entry(this_filter, &toi_filters, type_list)
-	if (this_filter->noresume_reset)
-	this_filter->noresume_reset();
-
-	if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
-	toiActiveAllocator->noresume_reset();
-}
-
-/**
- * fill_toi_header - fill the hibernate header structure
- * @sh: Header data structure to be filled.
- **/
-static int fill_toi_header(struct toi_header *sh)
-{
-	int i, error;
-
-	error = init_header((struct swsusp_info *) sh);
-	if (error)
-	return error;
-
-	sh->pagedir = pagedir1;
-	sh->pageset_2_size = pagedir2.size;
-	sh->param0 = toi_result;
-	sh->param1 = toi_bkd.toi_action;
-	sh->param2 = toi_bkd.toi_debug_state;
-	sh->param3 = toi_bkd.toi_default_console_level;
-	sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
-	for (i = 0; i < 4; i++)
-	sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
-	sh->bkd = boot_kernel_data_buffer;
-	return 0;
-}
-
-/**
- * rw_init_modules - initialize modules
- * @rw: Whether we are reading or writing an image.
- * @which: Section of the image being processed.
- *
- * Iterate over modules, preparing the ones that will be used to read or write
- * data.
- **/
-static int rw_init_modules(int rw, int which)
-{
-	struct toi_module_ops *this_module;
-	/* Initialise page transformers */
-	list_for_each_entry(this_module, &toi_filters, type_list) {
-	if (!this_module->enabled)
-	continue;
-	if (this_module->rw_init && this_module->rw_init(rw, which)) {
-	abort_hibernate(TOI_FAILED_MODULE_INIT,
-	"Failed to initialize the %s filter.",
-	this_module->name);
-	return 1;
-	}
-	}
-
-	/* Initialise allocator */
-	if (toiActiveAllocator->rw_init(rw, which)) {
-	abort_hibernate(TOI_FAILED_MODULE_INIT,
-	"Failed to initialise the allocator.");
-	return 1;
-	}
-
-	/* Initialise other modules */
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-	if (!this_module->enabled ||
-	this_module->type == FILTER_MODULE ||
-	this_module->type == WRITER_MODULE)
-	continue;
-	if (this_module->rw_init && this_module->rw_init(rw, which)) {
-	set_abort_result(TOI_FAILED_MODULE_INIT);
-	printk(KERN_INFO "Setting aborted flag due to module "
-	"init failure.\n");
-	return 1;
-	}
-	}
-
-	return 0;
-}
-
-/**
- * rw_cleanup_modules - cleanup modules
- * @rw: Whether we are reading or writing an image.
- *
- * Cleanup components after reading or writing a set of pages.
- * Only the allocator may fail.
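The init/cleanup pair documented here follows a fail-fast/clean-everything split: initialisation stops at the first failing module, while cleanup visits every module and ORs the failures together. A compact user-space sketch of that contract, with a hypothetical struct mod_ops standing in for the kernel's list-threaded struct toi_module_ops:

#include <stddef.h>

struct mod_ops {
	const char *name;
	int enabled;
	int (*rw_init)(int rw, int which);	/* optional */
	int (*rw_cleanup)(int rw);		/* optional */
};

/* Initialise every enabled module; stop at the first failure. */
static int init_all(struct mod_ops *mods, size_t n, int rw, int which)
{
	for (size_t i = 0; i < n; i++) {
		if (!mods[i].enabled || !mods[i].rw_init)
			continue;
		if (mods[i].rw_init(rw, which))
			return 1;	/* caller aborts the cycle */
	}
	return 0;
}

/* Cleanup never stops early: every module gets its rw_cleanup call
 * and the failures are OR-ed together, as in rw_cleanup_modules(). */
static int cleanup_all(struct mod_ops *mods, size_t n, int rw)
{
	int result = 0;

	for (size_t i = 0; i < n; i++)
		if (mods[i].enabled && mods[i].rw_cleanup)
			result |= mods[i].rw_cleanup(rw);
	return result;
}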
- **/ -static int rw_cleanup_modules(int rw) -{ - struct toi_module_ops *this_module; - int result = 0; - - /* Cleanup other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - /* Flush data and cleanup */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - result |= toiActiveAllocator->rw_cleanup(rw); - - return result; -} - -static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) -{ - int index, min, max; - struct page *high_page = NULL, - **my_last_high_page = raw_cpu_ptr(&last_high_page), - **my_last_sought = raw_cpu_ptr(&last_sought); - struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page); - void *compare; - - if (is_high) { - if (*my_last_sought && *my_last_high_page && - *my_last_sought < orig_page) - high_page = *my_last_high_page; - else - high_page = (struct page *) restore_highmem_pblist; - this = (struct pbe *) kmap(high_page); - compare = orig_page; - } else { - if (*my_last_sought && *my_last_low_page && - *my_last_sought < orig_page) - this = *my_last_low_page; - else - this = restore_pblist; - compare = page_address(orig_page); - } - - *my_last_sought = orig_page; - - /* Locate page containing pbe */ - while (this[PBES_PER_PAGE - 1].next && - this[PBES_PER_PAGE - 1].orig_address < compare) { - if (is_high) { - struct page *next_high_page = (struct page *) - this[PBES_PER_PAGE - 1].next; - kunmap(high_page); - this = kmap(next_high_page); - high_page = next_high_page; - } else - this = this[PBES_PER_PAGE - 1].next; - } - - /* Do a binary search within the page */ - min = 0; - max = PBES_PER_PAGE; - index = PBES_PER_PAGE / 2; - while (max - min) { - if (!this[index].orig_address || - this[index].orig_address > compare) - max = index; - else if (this[index].orig_address == compare) { - if (is_high) { - struct page *page = this[index].address; - *my_last_high_page = high_page; - kunmap(high_page); - return page; - } - *my_last_low_page = this; - return virt_to_page(this[index].address); - } else - min = index; - index = ((max + min) / 2); - }; - - if (is_high) - kunmap(high_page); - - abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" - " orig page %p. This[min].orig_address=%p.\n", orig_page, - this[index].orig_address); - return NULL; -} - -/** - * write_next_page - write the next page in a pageset - * @data_pfn: The pfn where the next data to write is located. - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn number to write in the image (where the data belongs). - * - * Get the pfn of the next page to write, map the page if necessary and do the - * write. - **/ -static int write_next_page(unsigned long *data_pfn, int *my_io_index, - unsigned long *write_pfn) -{ - struct page *page; - char **my_checksum_locn = raw_cpu_ptr(&checksum_locn); - int result = 0, was_present; - - *data_pfn = memory_bm_next_pfn(io_map, 0); - - /* Another thread could have beaten us to it. 
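The lookup in copy_page_from_orig_page() walks the chain of pbe pages and then binary-searches within one page-sized array ordered by original address. Restated as a conventional lo/hi binary search over a hypothetical flattened entry type (the page chaining and kmap handling are elided):

#include <stdint.h>
#include <stddef.h>

/* Hypothetical flattened pbe: where a saved page's data should go. */
struct pbe_lite {
	uintptr_t orig;	/* original address, the sort key (0 = unused slot) */
	void *copy;	/* address of the restored copy */
};

static void *find_copy(const struct pbe_lite *p, int nr, uintptr_t orig)
{
	int lo = 0, hi = nr - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (!p[mid].orig || p[mid].orig > orig)
			hi = mid - 1;
		else if (p[mid].orig < orig)
			lo = mid + 1;
		else
			return p[mid].copy;
	}
	return NULL;	/* the caller treats this as a fatal error */
}

A zero key marks an unused slot, which the search above treats the same way the original's min/max loop does: as being past the last valid entry.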
*/ - if (*data_pfn == BM_END_OF_MAP) { - if (atomic_read(&io_count)) { - printk(KERN_INFO "Ran out of pfns but io_count is " - "still %d.\n", atomic_read(&io_count)); - BUG(); - } - mutex_unlock(&io_mutex); - return -ENODATA; - } - - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - memory_bm_clear_bit(io_map, 0, *data_pfn); - page = pfn_to_page(*data_pfn); - - was_present = kernel_page_present(page); - if (!was_present) - kernel_map_pages(page, 1, 1); - - if (io_pageset == 1) - *write_pfn = memory_bm_next_pfn(pageset1_map, 0); - else { - *write_pfn = *data_pfn; - *my_checksum_locn = tuxonice_get_next_checksum(); - } - - TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index); - - mutex_unlock(&io_mutex); - - if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) - return 1; - - result = first_filter->write_page(*write_pfn, TOI_PAGE, page, - PAGE_SIZE); - - if (!was_present) - kernel_map_pages(page, 1, 0); - - return result; -} - -/** - * read_next_page - read the next page in a pageset - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn in which the data belongs. - * - * Read a page of the image into our buffer. It can happen (here and in the - * write routine) that threads don't get run until after other CPUs have done - * all the work. This was the cause of the long standing issue with - * occasionally getting -ENODATA errors at the end of reading the image. We - * therefore need to check there's actually a page to read before trying to - * retrieve one. - **/ - -static int read_next_page(int *my_io_index, unsigned long *write_pfn, - struct page *buffer) -{ - unsigned int buf_size = PAGE_SIZE; - unsigned long left = atomic_read(&io_count); - - if (!left) - return -ENODATA; - - /* Start off assuming the page we read isn't resaved */ - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - mutex_unlock(&io_mutex); - - /* - * Are we aborting? If so, don't submit any more I/O as - * resetting the resume_attempted flag (from ui.c) will - * clear the bdev flags, making this thread oops. - */ - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - atomic_dec(&toi_io_workers); - if (!atomic_read(&toi_io_workers)) { - /* - * So we can be sure we'll have memory for - * marking that we haven't resumed. - */ - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - - /* - * See toi_bio_read_page in tuxonice_bio.c: - * read the next page in the image. - */ - return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); -} - -static void use_read_page(unsigned long write_pfn, struct page *buffer) -{ - struct page *final_page = pfn_to_page(write_pfn), - *copy_page = final_page; - char *virt, *buffer_virt; - int was_present, cpu = smp_processor_id(); - unsigned long idx = 0; - - if (io_pageset == 1 && (!pageset1_copy_map || - !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) { - int is_high = PageHighMem(final_page); - copy_page = copy_page_from_orig_page(is_high ? 
(void *) write_pfn : final_page, is_high); - } - - if (!memory_bm_test_bit(io_map, cpu, write_pfn)) { - int test = !memory_bm_test_bit(io_map, cpu, write_pfn); - toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test); - mutex_lock(&io_mutex); - idx = atomic_add_return(1, &io_count); - mutex_unlock(&io_mutex); - return; - } - - virt = kmap(copy_page); - buffer_virt = kmap(buffer); - was_present = kernel_page_present(copy_page); - if (!was_present) - kernel_map_pages(copy_page, 1, 1); - memcpy(virt, buffer_virt, PAGE_SIZE); - if (!was_present) - kernel_map_pages(copy_page, 1, 0); - kunmap(copy_page); - kunmap(buffer); - memory_bm_clear_bit(io_map, cpu, write_pfn); - TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset); -} - -static unsigned long status_update(int writing, unsigned long done, - unsigned long ticks) -{ - int cs_index = writing ? 0 : 1; - unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; - unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); - unsigned long pgs_per_s, estimate = 0, pages_left; - - if (msec) { - pages_left = io_barmax - done; - pgs_per_s = 1000 * done / msec; - if (pgs_per_s) - estimate = DIV_ROUND_UP(pages_left, pgs_per_s); - } - - if (estimate && ticks > HZ / 2) - return toi_update_status(done, io_barmax, - " %d/%d MB (%lu sec left)", - MB(done+1), MB(io_barmax), estimate); - - return toi_update_status(done, io_barmax, " %d/%d MB", - MB(done+1), MB(io_barmax)); -} - -/** - * worker_rw_loop - main loop to read/write pages - * - * The main I/O loop for reading or writing pages. The io_map bitmap is used to - * track the pages to read/write. - * If we are reading, the pages are loaded to their final (mapped) pfn. - * Data is non zero iff this is a thread started via start_other_threads. - * In that case, we stay in here until told to quit. - **/ -static int worker_rw_loop(void *data) -{ - unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, - jif_index = 1, start_time = jiffies, thread_num; - int result = 0, my_io_index = 0, last_worker; - struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); - cpumask_var_t orig_mask; - - if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { - printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); - result = -ENOMEM; - goto out; - } - - cpumask_copy(orig_mask, tsk_cpus_allowed(current)); - - current->flags |= PF_NOFREEZE; - -top: - mutex_lock(&io_mutex); - thread_num = atomic_read(&toi_io_workers); - - cpumask_copy(tsk_cpus_allowed(current), orig_mask); - schedule(); - - atomic_inc(&toi_io_workers); - - while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && - !(io_write && test_result_state(TOI_ABORTED)) && - toi_worker_command == TOI_IO_WORKER_RUN) { - if (!thread_num && jiffies > next_jiffies) { - next_jiffies += HZ / 4; - if (toiActiveAllocator->update_throughput_throttle) - toiActiveAllocator->update_throughput_throttle( - jif_index); - jif_index++; - } - - /* - * What page to use? If reading, don't know yet which page's - * data will be read, so always use the buffer. If writing, - * use the copy (Pageset1) or original page (Pageset2), but - * always write the pfn of the original page. - */ - if (io_write) - result = write_next_page(&data_pfn, &my_io_index, - &write_pfn); - else /* Reading */ - result = read_next_page(&my_io_index, &write_pfn, - buffer); - - if (result) { - mutex_lock(&io_mutex); - /* Nothing to do? 
*/ - if (result == -ENODATA) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Thread %d has no more work.", - smp_processor_id()); - break; - } - - io_result = result; - - if (io_write) { - printk(KERN_INFO "Write chunk returned %d.\n", - result); - abort_hibernate(TOI_FAILED_IO, - "Failed to write a chunk of the " - "image."); - break; - } - - if (io_pageset == 1) { - printk(KERN_ERR "\nBreaking out of I/O loop " - "because of result code %d.\n", result); - break; - } - panic("Read chunk returned (%d)", result); - } - - /* - * Discard reads of resaved pages while reading ps2 - * and unwanted pages while rereading ps2 when aborting. - */ - if (!io_write) { - if (!PageResave(pfn_to_page(write_pfn))) - use_read_page(write_pfn, buffer); - else { - mutex_lock(&io_mutex); - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Resaved %ld.", write_pfn); - atomic_inc(&io_count); - mutex_unlock(&io_mutex); - } - } - - if (!thread_num) { - if(my_io_index + io_base > io_nextupdate) - io_nextupdate = status_update(io_write, - my_io_index + io_base, - jiffies - start_time); - - if (my_io_index > io_pc) { - printk(KERN_CONT "...%d%%", 20 * io_pc_step); - io_pc_step++; - io_pc = io_finish_at * io_pc_step / 5; - } - } - - toi_cond_pause(0, NULL); - - /* - * Subtle: If there's less I/O still to be done than threads - * running, quit. This stops us doing I/O beyond the end of - * the image when reading. - * - * Possible race condition. Two threads could do the test at - * the same time; one should exit and one should continue. - * Therefore we take the mutex before comparing and exiting. - */ - - mutex_lock(&io_mutex); - } - - last_worker = atomic_dec_and_test(&toi_io_workers); - toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); - mutex_unlock(&io_mutex); - - if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { - /* Were we the last thread and we're using a flusher thread? */ - if (last_worker && using_flusher) { - toiActiveAllocator->finish_all_io(); - } - /* First, if we're doing I/O, wait for it to finish */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); - /* Then wait to be told what to do next */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); - if (toi_worker_command == TOI_IO_WORKER_RUN) - goto top; - } - - if (thread_num) - atomic_dec(&toi_num_other_threads); - -out: - toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); - toi__free_page(28, buffer); - free_cpumask_var(orig_mask); - - return result; -} - -int toi_start_other_threads(void) -{ - int cpu; - struct task_struct *p; - int to_start = (toi_max_workers ? 
toi_max_workers : num_online_cpus()) - 1; - unsigned long num_started = 0; - - if (test_action_state(TOI_NO_MULTITHREADED_IO)) - return 0; - - toi_worker_command = TOI_IO_WORKER_STOP; - - for_each_online_cpu(cpu) { - if (num_started == to_start) - break; - - if (cpu == smp_processor_id()) - continue; - - p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, - cpu_to_node(cpu), "ktoi_io/%d", cpu); - if (IS_ERR(p)) { - printk(KERN_ERR "ktoi_io for %i failed\n", cpu); - continue; - } - kthread_bind(p, cpu); - p->flags |= PF_MEMALLOC; - wake_up_process(p); - num_started++; - atomic_inc(&toi_num_other_threads); - } - - toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); - return num_started; -} - -void toi_stop_other_threads(void) -{ - toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); - toi_worker_command = TOI_IO_WORKER_EXIT; - wake_up(&toi_worker_wait_queue); -} - -/** - * do_rw_loop - main highlevel function for reading or writing pages - * - * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. - **/ -static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, - int base, int barmax, int pageset) -{ - int index = 0, cpu, result = 0, workers_started; - unsigned long pfn, next; - - first_filter = toi_get_next_filter(NULL); - - if (!finish_at) - return 0; - - io_write = write; - io_finish_at = finish_at; - io_base = base; - io_barmax = barmax; - io_pageset = pageset; - io_index = 0; - io_pc = io_finish_at / 5; - io_pc_step = 1; - io_result = 0; - io_nextupdate = base + 1; - toi_bio_queue_flusher_should_finish = 0; - - for_each_online_cpu(cpu) { - per_cpu(last_sought, cpu) = NULL; - per_cpu(last_low_page, cpu) = NULL; - per_cpu(last_high_page, cpu) = NULL; - } - - /* Ensure all bits clear */ - memory_bm_clear(io_map); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - - BUG_ON(next != BM_END_OF_MAP); - - /* Set the bits for the pages to write */ - memory_bm_position_reset(pageflags); - - pfn = memory_bm_next_pfn(pageflags, 0); - toi_trace_index++; - - while (pfn != BM_END_OF_MAP && index < finish_at) { - TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at); - memory_bm_set_bit(io_map, 0, pfn); - pfn = memory_bm_next_pfn(pageflags, 0); - index++; - } - - BUG_ON(next != BM_END_OF_MAP || index < finish_at); - - memory_bm_position_reset(io_map); - toi_trace_index++; - - atomic_set(&io_count, finish_at); - - memory_bm_position_reset(pageset1_map); - - mutex_lock(&io_mutex); - - clear_toi_state(TOI_IO_STOPPED); - - using_flusher = (atomic_read(&toi_num_other_threads) && - toiActiveAllocator->io_flusher && - !test_action_state(TOI_NO_FLUSHER_THREAD)); - - workers_started = atomic_read(&toi_num_other_threads); - - memory_bm_position_reset(io_map); - memory_bm_position_reset(pageset1_copy_map); - - toi_worker_command = TOI_IO_WORKER_RUN; - wake_up(&toi_worker_wait_queue); - - mutex_unlock(&io_mutex); - - if (using_flusher) - result = toiActiveAllocator->io_flusher(write); - else - worker_rw_loop(NULL); - - while (atomic_read(&toi_io_workers)) - schedule(); - - printk(KERN_CONT "\n"); - - toi_worker_command = TOI_IO_WORKER_STOP; - wake_up(&toi_worker_wait_queue); - - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - if (!atomic_read(&toi_io_workers)) { - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - set_toi_state(TOI_IO_STOPPED); - - if (!io_result && !result && !test_result_state(TOI_ABORTED)) { - unsigned long 
next; - - toi_update_status(io_base + io_finish_at, io_barmax, - " %d/%d MB ", - MB(io_base + io_finish_at), MB(io_barmax)); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - if (next != BM_END_OF_MAP) { - printk(KERN_INFO "Finished I/O loop but still work to " - "do?\nFinish at = %d. io_count = %d.\n", - finish_at, atomic_read(&io_count)); - printk(KERN_INFO "I/O bitmap still records work to do." - "%ld.\n", next); - BUG(); - do { - cpu_relax(); - } while (0); - } - } - - return io_result ? io_result : result; -} - -/** - * write_pageset - write a pageset to disk. - * @pagedir: Which pagedir to write. - * - * Returns: - * Zero on success or -1 on failure. - **/ -int write_pageset(struct pagedir *pagedir) -{ - int finish_at, base = 0; - int barmax = pagedir1.size + pagedir2.size; - long error = 0; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - /* - * Even if there is nothing to read or write, the allocator - * may need the init/cleanup for it's housekeeping. (eg: - * Pageset1 may start where pageset2 ends when writing). - */ - finish_at = pagedir->size; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Writing kernel & process data..."); - base = pagedir2.size; - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - pageflags = pageset1_map; - else - pageflags = pageset1_copy_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(WRITE, pagedir->id)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise modules for writing."); - error = 1; - } - - if (!error) - error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(WRITE) && !error) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after writing."); - error = 1; - } - - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[0][0] += finish_at, - toi_bkd.toi_io_time[0][1] += (end_time - start_time); - } - - return error; -} - -/** - * read_pageset - highlevel function to read a pageset from disk - * @pagedir: pageset to read - * @overwrittenpagesonly: Whether to read the whole pageset or - * only part of it. - * - * Returns: - * Zero on success or -1 on failure. 
- **/ -static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) -{ - int result = 0, base = 0; - int finish_at = pagedir->size; - int barmax = pagedir1.size + pagedir2.size; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Reading kernel & process data..."); - pageflags = pageset1_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); - if (overwrittenpagesonly) { - barmax = min(pagedir1.size, pagedir2.size); - finish_at = min(pagedir1.size, pagedir2.size); - } else - base = pagedir1.size; - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(READ, pagedir->id)) { - toiActiveAllocator->remove_image(); - result = 1; - } else - result = do_rw_loop(READ, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(READ) && !result) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after reading."); - result = 1; - } - - /* Statistics */ - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[1][0] += finish_at, - toi_bkd.toi_io_time[1][1] += (end_time - start_time); - } - - return result; -} - -/** - * write_module_configs - store the modules configuration - * - * The configuration for each module is stored in the image header. - * Returns: Int - * Zero on success, Error value otherwise. - **/ -static int write_module_configs(void) -{ - struct toi_module_ops *this_module; - char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); - int len, index = 1; - struct toi_module_header toi_module_header; - - if (!buffer) { - printk(KERN_INFO "Failed to allocate a buffer for saving " - "module configuration info.\n"); - return -ENOMEM; - } - - /* - * We have to know which data goes with which module, so we at - * least write a length of zero for a module. Note that we are - * also assuming every module's config data takes <= PAGE_SIZE. - */ - - /* For each module (in registration order) */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->storage_needed || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - - /* Get the data from the module */ - len = 0; - if (this_module->save_config_info) - len = this_module->save_config_info(buffer); - - /* Save the details of the module */ - toi_module_header.enabled = this_module->enabled; - toi_module_header.type = this_module->type; - toi_module_header.index = index++; - strncpy(toi_module_header.name, this_module->name, - sizeof(toi_module_header.name)); - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - /* Save the size of the data and any data returned */ - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &len, sizeof(int)); - if (len) - toiActiveAllocator->rw_header_chunk( - WRITE, this_module, buffer, len); - } - - /* Write a blank header to terminate the list */ - toi_module_header.name[0] = '\0'; - toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_module_header, sizeof(toi_module_header)); - - toi_free_page(22, (unsigned long) buffer); - return 0; -} - -/** - * read_one_module_config - read and configure one module - * - * Read the configuration for one module, and configure the module - * to match if it is loaded. - * - * Returns: Int - * Zero on success, Error value otherwise. 
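The header stream write_module_configs() emits is a sequence of (module header, length, payload) records closed by a blank-named header. A user-space sketch of that layout; struct rec_head is a hypothetical stand-in for struct toi_module_header, and the stream here is a plain FILE rather than the allocator's rw_header_chunk:

#include <stdio.h>

struct rec_head {
	char name[32];
	int enabled;
};

static int write_record(FILE *f, const char *name, int enabled,
			const void *data, int len)
{
	struct rec_head h = { .enabled = enabled };

	snprintf(h.name, sizeof(h.name), "%s", name ? name : "");
	if (fwrite(&h, sizeof(h), 1, f) != 1 ||
	    fwrite(&len, sizeof(len), 1, f) != 1)
		return -1;
	return (len && fwrite(data, len, 1, f) != 1) ? -1 : 0;
}

/* The list ends with a record whose name is empty - the same
 * terminator convention the code above uses. */
static int write_terminator(FILE *f)
{
	return write_record(f, "", 0, NULL, 0);
}

Carrying an explicit length even for empty configurations is what lets a reader skip records belonging to modules it does not recognise.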
- **/ -static int read_one_module_config(struct toi_module_header *header) -{ - struct toi_module_ops *this_module; - int result, len; - char *buffer; - - /* Find the module */ - this_module = toi_find_module_given_name(header->name); - - if (!this_module) { - if (header->enabled) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "It looks like we need module %s for reading " - "the image but it hasn't been registered.\n", - header->name); - if (!(test_toi_state(TOI_CONTINUE_REQ))) - return -EINVAL; - } else - printk(KERN_INFO "Module %s configuration data found, " - "but the module hasn't registered. Looks like " - "it was disabled, so we're ignoring its data.", - header->name); - } - - /* Get the length of the data (if any) */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, - sizeof(int)); - if (result) { - printk(KERN_ERR "Failed to read the length of the module %s's" - " configuration data.\n", - header->name); - return -EINVAL; - } - - /* Read any data and pass to the module (if we found one) */ - if (!len) - return 0; - - buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); - - if (!buffer) { - printk(KERN_ERR "Failed to allocate a buffer for reloading " - "module configuration info.\n"); - return -ENOMEM; - } - - toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); - - if (!this_module) - goto out; - - if (!this_module->save_config_info) - printk(KERN_ERR "Huh? Module %s appears to have a " - "save_config_info, but not a load_config_info " - "function!\n", this_module->name); - else - this_module->load_config_info(buffer, len); - - /* - * Now move this module to the tail of its lists. This will put it in - * order. Any new modules will end up at the top of the lists. They - * should have been set to disabled when loaded (people will - * normally not edit an initrd to load a new module and then hibernate - * without using it!). - */ - - toi_move_module_tail(this_module); - - this_module->enabled = header->enabled; - -out: - toi_free_page(23, (unsigned long) buffer); - return 0; -} - -/** - * read_module_configs - reload module configurations from the image header. - * - * Returns: Int - * Zero on success or an error code. - **/ -static int read_module_configs(void) -{ - int result = 0; - struct toi_module_header toi_module_header; - struct toi_module_ops *this_module; - - /* All modules are initially disabled. That way, if we have a module - * loaded now that wasn't loaded when we hibernated, it won't be used - * in trying to read the data. - */ - list_for_each_entry(this_module, &toi_modules, module_list) - this_module->enabled = 0; - - /* Get the first module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - if (result) { - printk(KERN_ERR "Failed to read the next module header.\n"); - return -EINVAL; - } - - /* For each module (in registration order) */ - while (toi_module_header.name[0]) { - result = read_one_module_config(&toi_module_header); - - if (result) - return -EINVAL; - - /* Get the next module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - if (result) { - printk(KERN_ERR "Failed to read the next module " - "header.\n"); - return -EINVAL; - } - } - - return 0; -} - -static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) -{ - return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 
0 : 1; -} - -int fs_info_space_needed(void) -{ - const struct super_block *sb; - int result = sizeof(int); - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - result += 16 + sizeof(dev_t) + sizeof(int) + - fs->last_mount_size; - free_fs_info(fs); - } - return result; -} - -static int fs_info_num_to_save(void) -{ - const struct super_block *sb; - int to_save = 0; - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - to_save++; - free_fs_info(fs); - } - - return to_save; -} - -static int fs_info_save(void) -{ - const struct super_block *sb; - int to_save = fs_info_num_to_save(); - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" - " to save."); - return -EIO; - } - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) { - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - &fs->uuid[0], 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->dev_t, sizeof(dev_t))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write dev_t."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->last_mount_size, sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write last mount length."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - fs->last_mount, fs->last_mount_size)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - } - free_fs_info(fs); - } - return 0; -} - -static int fs_info_load_and_check_one(void) -{ - char uuid[16], *last_mount; - int result = 0, ln; - dev_t dev_t; - struct block_device *dev; - struct fs_info *fs_info, seek; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); - return -EIO; - } - - read_if_version(3, dev_t, "uuid dev_t field", return -EIO); - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount size."); - return -EIO; - } - - last_mount = kzalloc(ln, GFP_KERNEL); - - if (!last_mount) - return -ENOMEM; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount timestamp."); - result = -EIO; - goto out_lmt; - } - - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = dev_t; - seek.last_mount_size = ln; - seek.last_mount = last_mount; - dev_t = blk_lookup_fs_info(&seek); - if (!dev_t) - goto out_lmt; - - dev = toi_open_by_devnum(dev_t); - - fs_info = fs_info_from_block_dev(dev); - if (fs_info && !IS_ERR(fs_info)) { - if (ln != fs_info->last_mount_size) { - printk(KERN_EMERG "Found matching uuid but last mount " - "time lengths differ?! 
" - "(%d vs %d).\n", ln, - fs_info->last_mount_size); - result = -EINVAL; - } else { - char buf[BDEVNAME_SIZE]; - result = !!memcmp(fs_info->last_mount, last_mount, ln); - if (result) - printk(KERN_EMERG "Last mount time for %s has " - "changed!\n", bdevname(dev, buf)); - } - } - toi_close_bdev(dev); - free_fs_info(fs_info); -out_lmt: - kfree(last_mount); - return result; -} - -static int fs_info_load_and_check(void) -{ - int to_do, result = 0; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " - "to load."); - return -EIO; - } - - while(to_do--) - result |= fs_info_load_and_check_one(); - - return result; -} - -/** - * write_image_header - write the image header after write the image proper - * - * Returns: Int - * Zero on success, error value otherwise. - **/ -int write_image_header(void) -{ - int ret; - int total = pagedir1.size + pagedir2.size+2; - char *header_buffer = NULL; - - /* Now prepare to write the header */ - ret = toiActiveAllocator->write_header_init(); - if (ret) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Active allocator's write_header_init" - " function failed."); - goto write_image_header_abort; - } - - /* Get a buffer */ - header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP); - if (!header_buffer) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Out of memory when trying to get page for header!"); - goto write_image_header_abort; - } - - /* Write hibernate header */ - if (fill_toi_header((struct toi_header *) header_buffer)) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to fill header information!"); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - header_buffer, sizeof(struct toi_header))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to write header info."); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_max_workers, sizeof(toi_max_workers))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to number of workers to use."); - goto write_image_header_abort; - } - - /* Write filesystem info */ - if (fs_info_save()) - goto write_image_header_abort; - - /* Write module configurations */ - ret = write_module_configs(); - if (ret) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write module configs."); - goto write_image_header_abort; - } - - if (memory_bm_write(pageset1_map, - toiActiveAllocator->rw_header_chunk)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write bitmaps."); - goto write_image_header_abort; - } - - /* Flush data and let allocator cleanup */ - if (toiActiveAllocator->write_header_cleanup()) { - abort_hibernate(TOI_FAILED_IO, - "Failed to cleanup writing header."); - goto write_image_header_abort_no_cleanup; - } - - if (test_result_state(TOI_ABORTED)) - goto write_image_header_abort_no_cleanup; - - toi_update_status(total, total, NULL); - -out: - if (header_buffer) - toi_free_page(24, (unsigned long) header_buffer); - return ret; - -write_image_header_abort: - toiActiveAllocator->write_header_cleanup(); -write_image_header_abort_no_cleanup: - ret = -1; - goto out; -} - -/** - * sanity_check - check the header - * @sh: the header which was saved at hibernate time. - * - * Perform a few checks, seeking to ensure that the kernel being - * booted matches the one hibernated. They need to match so we can - * be _sure_ things will work. It is not absolutely impossible for - * resuming from a different kernel to work, just not assured. 
- **/ -static char *sanity_check(struct toi_header *sh) -{ - char *reason = check_image_kernel((struct swsusp_info *) sh); - - if (reason) - return reason; - - if (!test_action_state(TOI_IGNORE_ROOTFS)) { - const struct super_block *sb; - list_for_each_entry(sb, &super_blocks, s_list) { - if ((!(sb->s_flags & MS_RDONLY)) && - (sb->s_type->fs_flags & FS_REQUIRES_DEV)) - return "Device backed fs has been mounted " - "rw prior to resume or initrd/ramfs " - "is mounted rw."; - } - } - - return NULL; -} - -static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); - -#define FREEZE_IN_PROGRESS (~0) - -static int freeze_result; - -static void do_freeze(struct work_struct *dummy) -{ - freeze_result = freeze_processes(); - wake_up(&freeze_wait); - trap_non_toi_io = 1; -} - -static DECLARE_WORK(freeze_work, do_freeze); - -/** - * __read_pageset1 - test for the existence of an image and attempt to load it - * - * Returns: Int - * Zero if image found and pageset1 successfully loaded. - * Error if no image found or loaded. - **/ -static int __read_pageset1(void) -{ - int i, result = 0; - char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), - *sanity_error = NULL; - struct toi_header *toi_header; - - if (!header_buffer) { - printk(KERN_INFO "Unable to allocate a page for reading the " - "signature.\n"); - return -ENOMEM; - } - - /* Check for an image */ - result = toiActiveAllocator->image_exists(1); - if (result == 3) { - result = -ENODATA; - toi_early_boot_message(1, 0, "The signature from an older " - "version of TuxOnIce has been detected."); - goto out_remove_image; - } - - if (result != 1) { - result = -ENODATA; - noresume_reset_modules(); - printk(KERN_INFO "TuxOnIce: No image found.\n"); - goto out; - } - - /* - * Prepare the active allocator for reading the image header. The - * activate allocator might read its own configuration. - * - * NB: This call may never return because there might be a signature - * for a different image such that we warn the user and they choose - * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the - * location of the image might be unavailable if it was stored on a - * network connection). - */ - - result = toiActiveAllocator->read_header_init(); - if (result) { - printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " - "image header.\n"); - goto out_remove_image; - } - - /* Check for noresume command line option */ - if (test_toi_state(TOI_NORESUME_SPECIFIED)) { - printk(KERN_INFO "TuxOnIce: Noresume on command line. 
Removed " - "image.\n"); - goto out_remove_image; - } - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) { - toi_early_boot_message(1, 0, NULL); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Tried to resume before: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - clear_toi_state(TOI_CONTINUE_REQ); - - toi_image_header_version = toiActiveAllocator->get_header_version(); - - if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { - toi_early_boot_message(1, 0, image_version_error); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Header version too new: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - /* Read hibernate header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - header_buffer, sizeof(struct toi_header)); - if (result < 0) { - printk(KERN_ERR "TuxOnIce: Failed to read the image " - "signature.\n"); - goto out_remove_image; - } - - toi_header = (struct toi_header *) header_buffer; - - /* - * NB: This call may also result in a reboot rather than returning. - */ - - sanity_error = sanity_check(toi_header); - if (sanity_error) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - sanity_error); - printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); - goto out_remove_image; - } - - /* - * We have an image and it looks like it will load okay. - * - * Get metadata from header. Don't override commandline parameters. - * - * We don't need to save the image size limit because it's not used - * during resume and will be restored with the image anyway. - */ - - memcpy((char *) &pagedir1, - (char *) &toi_header->pagedir, sizeof(pagedir1)); - toi_result = toi_header->param0; - if (!toi_bkd.toi_debug_state) { - toi_bkd.toi_action = - (toi_header->param1 & ~toi_bootflags_mask) | - (toi_bkd.toi_action & toi_bootflags_mask); - toi_bkd.toi_debug_state = toi_header->param2; - toi_bkd.toi_default_console_level = toi_header->param3; - } - clear_toi_state(TOI_IGNORE_LOGLEVEL); - pagedir2.size = toi_header->pageset_2_size; - for (i = 0; i < 4; i++) - toi_bkd.toi_io_time[i/2][i%2] = - toi_header->io_time[i/2][i%2]; - - set_toi_state(TOI_BOOT_KERNEL); - boot_kernel_data_buffer = toi_header->bkd; - - read_if_version(1, toi_max_workers, "TuxOnIce max workers", - goto out_remove_image); - - /* Read filesystem info */ - if (fs_info_load_and_check()) { - printk(KERN_EMERG "TuxOnIce: File system mount time checks " - "failed. Refusing to corrupt your filesystems!\n"); - goto out_remove_image; - } - - /* Read module configurations */ - result = read_module_configs(); - if (result) { - pagedir1.size = 0; - pagedir2.size = 0; - printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " - "configurations.\n"); - clear_action_state(TOI_KEEP_IMAGE); - goto out_remove_image; - } - - toi_prepare_console(); - - set_toi_state(TOI_NOW_RESUMING); - - result = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (result) - goto out_notifier_call_chain;; - - if (usermodehelper_disable()) - goto out_enable_usermodehelper; - - current->flags |= PF_NOFREEZE; - freeze_result = FREEZE_IN_PROGRESS; - - schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); - - toi_cond_pause(1, "About to read original pageset1 locations."); - - /* - * See _toi_rw_header_chunk in tuxonice_bio.c: - * Initialize pageset1_map by reading the map from the image. 
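__read_pageset1() recovers from failure through the label ladder visible in its tail (out_thaw, out_enable_usermodehelper, out_notifier_call_chain, out_remove_image), so that each exit path releases exactly what was acquired before the failure. The same goto-unwind pattern in a self-contained example; the file name is hypothetical:

#include <stdio.h>
#include <stdlib.h>

static int do_resume_like_sequence(void)
{
	char *header = malloc(4096);
	FILE *image = NULL;
	int result = -1;

	if (!header)
		return -1;

	image = fopen("/tmp/example-image", "rb");	/* hypothetical path */
	if (!image)
		goto out_free;

	if (fread(header, 4096, 1, image) != 1)
		goto out_close;

	result = 0;	/* everything after this point succeeded */

	/* Each label undoes one acquisition, in reverse order. */
out_close:
	fclose(image);
out_free:
	free(header);
	return result;
}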
- */
-	if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk))
-	goto out_thaw;
-
-	/*
- * See toi_rw_cleanup in tuxonice_bio.c:
- * Clean up after reading the header.
- */
-	result = toiActiveAllocator->read_header_cleanup();
-	if (result) {
-	printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the "
-	"image header.\n");
-	goto out_thaw;
-	}
-
-	toi_cond_pause(1, "About to read pagedir.");
-
-	/*
- * Get the addresses of pages into which we will load the kernel to
- * be copied back and check if they conflict with the ones we are using.
- */
-	if (toi_get_pageset1_load_addresses()) {
-	printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
-	"pageset1.\n");
-	goto out_thaw;
-	}
-
-	/* Read the original kernel back */
-	toi_cond_pause(1, "About to read pageset 1.");
-
-	/* Given the pagemap, read back the data from disk */
-	if (read_pageset(&pagedir1, 0)) {
-	toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1.");
-	result = -EIO;
-	goto out_thaw;
-	}
-
-	toi_cond_pause(1, "About to restore original kernel.");
-	result = 0;
-
-	if (!toi_keeping_image &&
-	toiActiveAllocator->mark_resume_attempted)
-	toiActiveAllocator->mark_resume_attempted(1);
-
-	wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
-out:
-	current->flags &= ~PF_NOFREEZE;
-	toi_free_page(25, (unsigned long) header_buffer);
-	return result;
-
-out_thaw:
-	wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
-	trap_non_toi_io = 0;
-	thaw_processes();
-out_enable_usermodehelper:
-	usermodehelper_enable();
-out_notifier_call_chain:
-	pm_notifier_call_chain(PM_POST_RESTORE);
-	toi_cleanup_console();
-out_remove_image:
-	result = -EINVAL;
-	if (!toi_keeping_image)
-	toiActiveAllocator->remove_image();
-	toiActiveAllocator->read_header_cleanup();
-	noresume_reset_modules();
-	goto out;
-}
-
-/**
- * read_pageset1 - highlevel function to read the saved pages
- *
- * Attempt to read the header and pageset1 of a hibernate image.
- * Handle the outcome, complaining where appropriate.
- **/
-int read_pageset1(void)
-{
-	int error;
-
-	error = __read_pageset1();
-
-	if (error && error != -ENODATA && error != -EINVAL &&
-	!test_result_state(TOI_ABORTED))
-	abort_hibernate(TOI_IMAGE_ERROR,
-	"TuxOnIce: Error %d resuming\n", error);
-
-	return error;
-}
-
-/**
- * get_have_image_data - check the image header
- **/
-static char *get_have_image_data(void)
-{
-	char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
-	struct toi_header *toi_header;
-
-	if (!output_buffer) {
-	printk(KERN_INFO "Output buffer null.\n");
-	return NULL;
-	}
-
-	/* Check for an image */
-	if (!toiActiveAllocator->image_exists(1) ||
-	toiActiveAllocator->read_header_init() ||
-	toiActiveAllocator->rw_header_chunk(READ, NULL,
-	output_buffer, sizeof(struct toi_header))) {
-	sprintf(output_buffer, "0\n");
-	/*
- * From an initrd/ramfs, catting have_image and
- * getting a result of 0 is sufficient.
- */
-	clear_toi_state(TOI_BOOT_TIME);
-	goto out;
-	}
-
-	toi_header = (struct toi_header *) output_buffer;
-
-	sprintf(output_buffer, "1\n%s\n%s\n",
-	toi_header->uts.machine,
-	toi_header->uts.version);
-
-	/* Check whether we've resumed before */
-	if (test_toi_state(TOI_RESUMED_BEFORE))
-	strcat(output_buffer, "Resumed before.\n");
-
-out:
-	noresume_reset_modules();
-	return output_buffer;
-}
-
-/**
- * read_pageset2 - read second part of the image
- * @overwrittenpagesonly: Read only pages which would have been
- * overwritten by pageset1?
- * - * Read in part or all of pageset2 of an image, depending upon - * whether we are hibernating and have only overwritten a portion - * with pageset1 pages, or are resuming and need to read them - * all. - * - * Returns: Int - * Zero if no error, otherwise the error value. - **/ -int read_pageset2(int overwrittenpagesonly) -{ - int result = 0; - - if (!pagedir2.size) - return 0; - - result = read_pageset(&pagedir2, overwrittenpagesonly); - - toi_cond_pause(1, "Pagedir 2 read."); - - return result; -} - -/** - * image_exists_read - has an image been found? - * @page: Output buffer - * - * Store 0 or 1 in page, depending on whether an image is found. - * Incoming buffer is PAGE_SIZE and result is guaranteed - * to be far less than that, so we don't worry about - * overflow. - **/ -int image_exists_read(const char *page, int count) -{ - int len = 0; - char *result; - - if (toi_activate_storage(0)) - return count; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(0); - - if (!toiActiveAllocator) { - len = sprintf((char *) page, "-1\n"); - } else { - result = get_have_image_data(); - if (result) { - len = sprintf((char *) page, "%s", result); - toi_free_page(26, (unsigned long) result); - } - } - - toi_deactivate_storage(0); - - return len; -} - -/** - * image_exists_write - invalidate an image if one exists - **/ -int image_exists_write(const char *buffer, int count) -{ - if (toi_activate_storage(0)) - return count; - - if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) - toiActiveAllocator->remove_image(); - - toi_deactivate_storage(0); - - clear_result_state(TOI_KEPT_IMAGE); - - return count; -} diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h deleted file mode 100644 index 683eab7a0..000000000 --- a/kernel/power/tuxonice_io.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * kernel/power/tuxonice_io.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. 
- * - */ - -#include <linux/utsname.h> -#include "tuxonice_pagedir.h" - -/* Non-module data saved in our image header */ -struct toi_header { - /* - * Mirror struct swsusp_info, but without - * the page aligned attribute - */ - struct new_utsname uts; - u32 version_code; - unsigned long num_physpages; - int cpus; - unsigned long image_pages; - unsigned long pages; - unsigned long size; - - /* Our own data */ - unsigned long orig_mem_free; - int page_size; - int pageset_2_size; - int param0; - int param1; - int param2; - int param3; - int progress0; - int progress1; - int progress2; - int progress3; - int io_time[2][2]; - struct pagedir pagedir; - dev_t root_fs; - unsigned long bkd; /* Boot kernel data locn */ -}; - -extern int write_pageset(struct pagedir *pagedir); -extern int write_image_header(void); -extern int read_pageset1(void); -extern int read_pageset2(int overwrittenpagesonly); - -extern int toi_attempt_to_parse_resume_device(int quiet); -extern void attempt_to_parse_resume_device2(void); -extern void attempt_to_parse_alt_resume_param(void); -int image_exists_read(const char *page, int count); -int image_exists_write(const char *buffer, int count); -extern void save_restore_alt_param(int replace, int quiet); -extern atomic_t toi_io_workers; - -/* Args to save_restore_alt_param */ -#define RESTORE 0 -#define SAVE 1 - -#define NOQUIET 0 -#define QUIET 1 - -extern wait_queue_head_t toi_io_queue_flusher; -extern int toi_bio_queue_flusher_should_finish; - -int fs_info_space_needed(void); - -extern int toi_max_workers; diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c deleted file mode 100644 index a203c8fb9..000000000 --- a/kernel/power/tuxonice_modules.c +++ /dev/null @@ -1,520 +0,0 @@ -/* - * kernel/power/tuxonice_modules.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - */ - -#include <linux/suspend.h> -#include <linux/module.h> -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" - -LIST_HEAD(toi_filters); -LIST_HEAD(toiAllocators); - -LIST_HEAD(toi_modules); - -struct toi_module_ops *toiActiveAllocator; - -static int toi_num_filters; -int toiNumAllocators, toi_num_modules; - -/* - * toi_header_storage_for_modules - * - * Returns the amount of space needed to store configuration - * data needed by the modules prior to copying back the original - * kernel. We can exclude data for pageset2 because it will be - * available anyway once the kernel is copied back. 
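The accounting described here, and implemented in the function that follows, charges each enabled module its own storage plus one module header and one length word, then adds a single empty header as the terminator. In miniature, with hypothetical types in place of struct toi_module_ops and struct toi_module_header:

#include <stddef.h>

struct mod_hdr { char name[32]; int enabled, type, index; };

struct module {
	int enabled;
	int (*storage_needed)(void);	/* optional */
};

static long header_storage(const struct module *mods, size_t n)
{
	long bytes = 0;

	for (size_t i = 0; i < n; i++)
		if (mods[i].enabled && mods[i].storage_needed)
			bytes += mods[i].storage_needed()
				 + sizeof(struct mod_hdr) + sizeof(int);

	/* One more for the empty terminator. */
	return bytes + sizeof(struct mod_hdr);
}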
- */ -long toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - } - } - - /* One more for the empty terminator */ - return bytes + sizeof(struct toi_module_header); -} - -void print_toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - printk(KERN_DEBUG "Header storage:\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", - this_module->name, - this_module->header_used, this); - } - } - - printk(KERN_DEBUG "+ empty terminator : %zu.\n", - sizeof(struct toi_module_header)); - printk(KERN_DEBUG " ====\n"); - printk(KERN_DEBUG " %zu\n", - bytes + sizeof(struct toi_module_header)); -} - -/* - * toi_memory_for_modules - * - * Returns the amount of memory requested by modules for - * doing their work during the cycle. - */ - -long toi_memory_for_modules(int print_parts) -{ - long bytes = 0, result; - struct toi_module_ops *this_module; - - if (print_parts) - printk(KERN_INFO "Memory for modules:\n===================\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - int this; - if (!this_module->enabled) - continue; - if (this_module->memory_needed) { - this = this_module->memory_needed(); - if (print_parts) - printk(KERN_INFO "%10d bytes (%5ld pages) for " - "module '%s'.\n", this, - DIV_ROUND_UP(this, PAGE_SIZE), - this_module->name); - bytes += this; - } - } - - result = DIV_ROUND_UP(bytes, PAGE_SIZE); - if (print_parts) - printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); - - return result; -} - -/* - * toi_expected_compression_ratio - * - * Returns the compression ratio expected when saving the image. 
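The body that follows composes per-module ratios multiplicatively. A self-contained illustration with made-up module ratios:

/*
 * Illustrative: two enabled filters reporting expected_compression()
 * of 60 and 90 compose to 100 * 60/100 * 90/100 = 54, i.e. the image
 * is expected to shrink to 54% of its raw size.
 */
#include <stdio.h>

int main(void)
{
        int ratio = 100;
        int expected[] = { 60, 90 };    /* hypothetical module ratios */
        unsigned i;

        for (i = 0; i < 2; i++)
                ratio = ratio * expected[i] / 100;
        printf("expected ratio: %d%%\n", ratio);        /* prints 54 */
        return 0;
}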
- */ - -int toi_expected_compression_ratio(void) -{ - int ratio = 100; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->expected_compression) - ratio = ratio * this_module->expected_compression() - / 100; - } - - return ratio; -} - -/* toi_find_module_given_dir - * Functionality : Return a module (if found), given a pointer - * to its directory name - */ - -static struct toi_module_ops *toi_find_module_given_dir(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->directory)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* toi_find_module_given_name - * Functionality : Return a module (if found), given a pointer - * to its name - */ - -struct toi_module_ops *toi_find_module_given_name(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->name)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* - * toi_print_module_debug_info - * Functionality : Get debugging info from modules into a buffer. - */ -int toi_print_module_debug_info(char *buffer, int buffer_size) -{ - struct toi_module_ops *this_module; - int len = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->print_debug_info) { - int result; - result = this_module->print_debug_info(buffer + len, - buffer_size - len); - len += result; - } - } - - /* Ensure null terminated */ - buffer[buffer_size] = 0; - - return len; -} - -/* - * toi_register_module - * - * Register a module. - */ -int toi_register_module(struct toi_module_ops *module) -{ - int i; - struct kobject *kobj; - - if (!hibernation_available()) - return -ENODEV; - - module->enabled = 1; - - if (toi_find_module_given_name(module->name)) { - printk(KERN_INFO "TuxOnIce: Trying to load module %s," - " which is already registered.\n", - module->name); - return -EBUSY; - } - - switch (module->type) { - case FILTER_MODULE: - list_add_tail(&module->type_list, &toi_filters); - toi_num_filters++; - break; - case WRITER_MODULE: - list_add_tail(&module->type_list, &toiAllocators); - toiNumAllocators++; - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return -EINVAL; - } - list_add_tail(&module->module_list, &toi_modules); - toi_num_modules++; - - if ((!module->directory && !module->shared_directory) || - !module->sysfs_data || !module->num_sysfs_entries) - return 0; - - /* - * Modules may share a directory, but those with shared_dir - * set must be loaded (via symbol dependencies) after parents - * and unloaded beforehand. 
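To make the shared-directory rule concrete, a hypothetical registration follows; every name in it is illustrative, and the parent module owning the directory must already be registered, or the lookup in the code below fails with -ENODEV.

/*
 * Hypothetical filter sharing its parent's sysfs directory. All names
 * are assumptions for illustration.
 */
static struct toi_module_ops example_filter_ops = {
        .type             = FILTER_MODULE,
        .name             = "example_filter",
        .shared_directory = "compression",  /* the parent's ->directory */
        .module           = THIS_MODULE,
};

/* toi_register_module(&example_filter_ops) then hangs this module's
 * sysfs entries off the parent's kobject instead of creating its own
 * directory. */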
- */ - if (module->shared_directory) { - struct toi_module_ops *shared = - toi_find_module_given_dir(module->shared_directory); - if (!shared) { - printk(KERN_ERR "TuxOnIce: Module %s wants to share " - "%s's directory but %s isn't loaded.\n", - module->name, module->shared_directory, - module->shared_directory); - toi_unregister_module(module); - return -ENODEV; - } - kobj = shared->dir_kobj; - } else { - if (!strncmp(module->directory, "[ROOT]", 6)) - kobj = tuxonice_kobj; - else - kobj = make_toi_sysdir(module->directory); - } - module->dir_kobj = kobj; - for (i = 0; i < module->num_sysfs_entries; i++) { - int result = toi_register_sysfs_file(kobj, - &module->sysfs_data[i]); - if (result) - return result; - } - return 0; -} - -/* - * toi_unregister_module - * - * Remove a module. - */ -void toi_unregister_module(struct toi_module_ops *module) -{ - int i; - - if (module->dir_kobj) - for (i = 0; i < module->num_sysfs_entries; i++) - toi_unregister_sysfs_file(module->dir_kobj, - &module->sysfs_data[i]); - - if (!module->shared_directory && module->directory && - strncmp(module->directory, "[ROOT]", 6)) - remove_toi_sysdir(module->dir_kobj); - - switch (module->type) { - case FILTER_MODULE: - list_del(&module->type_list); - toi_num_filters--; - break; - case WRITER_MODULE: - list_del(&module->type_list); - toiNumAllocators--; - if (toiActiveAllocator == module) { - toiActiveAllocator = NULL; - clear_toi_state(TOI_CAN_RESUME); - clear_toi_state(TOI_CAN_HIBERNATE); - } - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - list_del(&module->module_list); - toi_num_modules--; -} - -/* - * toi_move_module_tail - * - * Rearrange modules when reloading the config. - */ -void toi_move_module_tail(struct toi_module_ops *module) -{ - switch (module->type) { - case FILTER_MODULE: - if (toi_num_filters > 1) - list_move_tail(&module->type_list, &toi_filters); - break; - case WRITER_MODULE: - if (toiNumAllocators > 1) - list_move_tail(&module->type_list, &toiAllocators); - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - if ((toi_num_filters + toiNumAllocators) > 1) - list_move_tail(&module->module_list, &toi_modules); -} - -/* - * toi_initialise_modules - * - * Get ready to do some work! - */ -int toi_initialise_modules(int starting_cycle, int early) -{ - struct toi_module_ops *this_module; - int result; - - list_for_each_entry(this_module, &toi_modules, module_list) { - this_module->header_requested = 0; - this_module->header_used = 0; - if (!this_module->enabled) - continue; - if (this_module->early != early) - continue; - if (this_module->initialise) { - result = this_module->initialise(starting_cycle); - if (result) { - toi_cleanup_modules(starting_cycle); - return result; - } - this_module->initialised = 1; - } - } - - return 0; -} - -/* - * toi_cleanup_modules - * - * Tell modules the work is done. 
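The initialise/cleanup hooks above are driven in two phases (early, then late). A rough sketch of a caller, using the toi_initialise_modules_early/late convenience macros declared later in tuxonice_modules.h; the wrapper function is hypothetical and the cycle work is elided:

/* Hypothetical cycle driver; a failed initialise pass cleans up after
 * itself (see the body above), so the caller only unwinds on success. */
static int example_run_cycle(void)
{
        int ret = toi_initialise_modules_early(1);

        if (!ret)
                ret = toi_initialise_modules_late(1);
        if (ret)
                return ret;

        /* ... the hibernation cycle's work would happen here ... */

        toi_cleanup_modules(1);
        return 0;
}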
- */ -void toi_cleanup_modules(int finishing_cycle) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->initialised) - continue; - if (this_module->cleanup) - this_module->cleanup(finishing_cycle); - this_module->initialised = 0; - } -} - -/* - * toi_pre_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->pre_atomic_restore) - this_module->pre_atomic_restore(bkd); - } -} - -/* - * toi_post_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->post_atomic_restore) - this_module->post_atomic_restore(bkd); - } -} - -/* - * toi_get_next_filter - * - * Get the next filter in the pipeline. - */ -struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) -{ - struct toi_module_ops *last_filter = NULL, *this_filter = NULL; - - list_for_each_entry(this_filter, &toi_filters, type_list) { - if (!this_filter->enabled) - continue; - if ((last_filter == filter_sought) || (!filter_sought)) - return this_filter; - last_filter = this_filter; - } - - return toiActiveAllocator; -} - -/** - * toi_show_modules: Printk what support is loaded. - */ -void toi_print_modules(void) -{ - struct toi_module_ops *this_module; - int prev = 0; - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->type == MISC_HIDDEN_MODULE) - continue; - printk("%s %s%s%s", prev ? "," : "", - this_module->enabled ? "" : "[", - this_module->name, - this_module->enabled ? "" : "]"); - prev = 1; - } - - printk(".\n"); -} - -/* toi_get_modules - * - * Take a reference to modules so they can't go away under us. - */ - -int toi_get_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - struct toi_module_ops *this_module2; - - if (try_module_get(this_module->module)) - continue; - - /* Failed! Reverse gets and return error */ - list_for_each_entry(this_module2, &toi_modules, - module_list) { - if (this_module == this_module2) - return -EINVAL; - module_put(this_module2->module); - } - } - return 0; -} - -/* toi_put_modules - * - * Release our references to modules we used. - */ - -void toi_put_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) - module_put(this_module->module); -} diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h deleted file mode 100644 index 44f10abb9..000000000 --- a/kernel/power/tuxonice_modules.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * kernel/power/tuxonice_modules.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations for modules. Modules are additions to - * TuxOnIce that provide facilities such as image compression or - * encryption, backends for storage of the image and user interfaces. 
- * - */ - -#ifndef TOI_MODULES_H -#define TOI_MODULES_H - -/* This is the maximum size we store in the image header for a module name */ -#define TOI_MAX_MODULE_NAME_LENGTH 30 - -struct toi_boot_kernel_data; - -/* Per-module metadata */ -struct toi_module_header { - char name[TOI_MAX_MODULE_NAME_LENGTH]; - int enabled; - int type; - int index; - int data_length; - unsigned long signature; -}; - -enum { - FILTER_MODULE, - WRITER_MODULE, - BIO_ALLOCATOR_MODULE, - MISC_MODULE, - MISC_HIDDEN_MODULE, -}; - -enum { - TOI_ASYNC, - TOI_SYNC -}; - -enum { - TOI_VIRT, - TOI_PAGE, -}; - -#define TOI_MAP(type, addr) \ - (type == TOI_PAGE ? kmap(addr) : addr) - -#define TOI_UNMAP(type, addr) \ - do { \ - if (type == TOI_PAGE) \ - kunmap(addr); \ - } while(0) - -struct toi_module_ops { - /* Functions common to all modules */ - int type; - char *name; - char *directory; - char *shared_directory; - struct kobject *dir_kobj; - struct module *module; - int enabled, early, initialised; - struct list_head module_list; - - /* List of filters or allocators */ - struct list_head list, type_list; - - /* - * Requirements for memory and storage in - * the image header.. - */ - int (*memory_needed) (void); - int (*storage_needed) (void); - - int header_requested, header_used; - - int (*expected_compression) (void); - - /* - * Debug info - */ - int (*print_debug_info) (char *buffer, int size); - int (*save_config_info) (char *buffer); - void (*load_config_info) (char *buffer, int len); - - /* - * Initialise & cleanup - general routines called - * at the start and end of a cycle. - */ - int (*initialise) (int starting_cycle); - void (*cleanup) (int finishing_cycle); - - void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); - void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); - - /* - * Calls for allocating storage (allocators only). - * - * Header space is requested separately and cannot fail, but the - * reservation is only applied when main storage is allocated. - * The header space reservation is thus always set prior to - * requesting the allocation of storage - and prior to querying - * how much storage is available. - */ - - unsigned long (*storage_available) (void); - void (*reserve_header_space) (unsigned long space_requested); - int (*register_storage) (void); - int (*allocate_storage) (unsigned long space_requested); - unsigned long (*storage_allocated) (void); - void (*free_unused_storage) (void); - - /* - * Routines used in image I/O. 
- */ - int (*rw_init) (int rw, int stream_number); - int (*rw_cleanup) (int rw); - int (*write_page) (unsigned long index, int buf_type, void *buf, - unsigned int buf_size); - int (*read_page) (unsigned long *index, int buf_type, void *buf, - unsigned int *buf_size); - int (*io_flusher) (int rw); - - /* Reset module if image exists but reading aborted */ - void (*noresume_reset) (void); - - /* Read and write the metadata */ - int (*write_header_init) (void); - int (*write_header_cleanup) (void); - - int (*read_header_init) (void); - int (*read_header_cleanup) (void); - - /* To be called after read_header_init */ - int (*get_header_version) (void); - - int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, - char *buffer_start, int buffer_size); - - int (*rw_header_chunk_noreadahead) (int rw, - struct toi_module_ops *owner, char *buffer_start, - int buffer_size); - - /* Attempt to parse an image location */ - int (*parse_sig_location) (char *buffer, int only_writer, int quiet); - - /* Throttle I/O according to throughput */ - void (*update_throughput_throttle) (int jif_index); - - /* Flush outstanding I/O */ - int (*finish_all_io) (void); - - /* Determine whether image exists that we can restore */ - int (*image_exists) (int quiet); - - /* Mark the image as having tried to resume */ - int (*mark_resume_attempted) (int); - - /* Destroy image if one exists */ - int (*remove_image) (void); - - /* Sysfs Data */ - struct toi_sysfs_data *sysfs_data; - int num_sysfs_entries; - - /* Block I/O allocator */ - struct toi_bio_allocator_ops *bio_allocator_ops; -}; - -extern int toi_num_modules, toiNumAllocators; - -extern struct toi_module_ops *toiActiveAllocator; -extern struct list_head toi_filters, toiAllocators, toi_modules; - -extern void toi_prepare_console_modules(void); -extern void toi_cleanup_console_modules(void); - -extern struct toi_module_ops *toi_find_module_given_name(char *name); -extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); - -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_move_module_tail(struct toi_module_ops *module); - -extern long toi_header_storage_for_modules(void); -extern long toi_memory_for_modules(int print_parts); -extern void print_toi_header_storage_for_modules(void); -extern int toi_expected_compression_ratio(void); - -extern int toi_print_module_debug_info(char *buffer, int buffer_size); -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_unregister_module(struct toi_module_ops *module); - -extern int toi_initialise_modules(int starting_cycle, int early); -#define toi_initialise_modules_early(starting) \ - toi_initialise_modules(starting, 1) -#define toi_initialise_modules_late(starting) \ - toi_initialise_modules(starting, 0) -extern void toi_cleanup_modules(int finishing_cycle); - -extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); -extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); - -extern void toi_print_modules(void); - -int toi_get_modules(void); -void toi_put_modules(void); -#endif diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c deleted file mode 100644 index 78bd31b05..000000000 --- a/kernel/power/tuxonice_netlink.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
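A rough sketch of how an allocator's I/O hooks are meant to be driven, based on the ops table above; the wrapper function, its arguments, and the stream number are hypothetical, and error handling for write_page() is elided.

/*
 * Hypothetical caller showing the expected writer call order. READ/WRITE
 * are the usual kernel constants; TOI_VIRT comes from the enum above.
 */
static int example_write_stream(struct toi_module_ops *ops,
                                unsigned long *indexes, void **bufs, int n)
{
        int i, ret;

        ret = ops->rw_init(WRITE, 1);   /* illustrative stream number */
        if (ret)
                return ret;
        for (i = 0; i < n; i++)
                ops->write_page(indexes[i], TOI_VIRT, bufs[i], PAGE_SIZE);
        return ops->rw_cleanup(WRITE);
}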
- * - * Functions for communicating with a userspace helper via netlink. - */ - -#include <linux/suspend.h> -#include <linux/sched.h> -#include <linux/kmod.h> -#include "tuxonice_netlink.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct user_helper_data *uhd_list; - -/* - * Refill our pool of SKBs for use in emergencies (eg, when eating memory and - * none can be allocated). - */ -static void toi_fill_skb_pool(struct user_helper_data *uhd) -{ - while (uhd->pool_level < uhd->pool_limit) { - struct sk_buff *new_skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (!new_skb) - break; - - new_skb->next = uhd->emerg_skbs; - uhd->emerg_skbs = new_skb; - uhd->pool_level++; - } -} - -/* - * Try to allocate a single skb. If we can't get one, try to use one from - * our pool. - */ -static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) -{ - struct sk_buff *skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (skb) - return skb; - - skb = uhd->emerg_skbs; - if (skb) { - uhd->pool_level--; - uhd->emerg_skbs = skb->next; - skb->next = NULL; - } - - return skb; -} - -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *dest; - struct task_struct *t; - - if (uhd->pid == -1) - return; - - if (uhd->debug) - printk(KERN_ERR "toi_send_netlink_message: Send " - "message type %d.\n", type); - - skb = toi_get_skb(uhd); - if (!skb) { - printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); - return; - } - - nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); - uhd->sock_seq++; - - dest = NLMSG_DATA(nlh); - if (params && len > 0) - memcpy(dest, params, len); - - netlink_unicast(uhd->nl, skb, uhd->pid, 0); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - if (uhd->pid > -1) - printk(KERN_INFO "Hmm. Can't find the userspace task" - " %d.\n", uhd->pid); - return; - } - wake_up_process(t); - toi_read_unlock_tasklist(); - - yield(); -} - -static void send_whether_debugging(struct user_helper_data *uhd) -{ - static u8 is_debugging = 1; - - toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, - &is_debugging, sizeof(u8)); -} - -/* - * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we - * are hibernating. - */ -static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) -{ - struct task_struct *t; - - if (uhd->debug) - printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", - pid); - return -EINVAL; - } - - t->flags |= PF_NOFREEZE; - - toi_read_unlock_tasklist(); - uhd->pid = pid; - - toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); - - return 0; -} - -/* - * Called when the userspace process has informed us that it's ready to roll. - */ -static int nl_ready(struct user_helper_data *uhd, u32 version) -{ - if (version != uhd->interface_version) { - printk(KERN_INFO "%s userspace process using invalid interface" - " version (%d - kernel wants %d). 
Trying to " - "continue without it.\n", - uhd->name, version, uhd->interface_version); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - - complete(&uhd->wait_for_process); - - return 0; -} - -void toi_netlink_close_complete(struct user_helper_data *uhd) -{ - if (uhd->nl) { - netlink_kernel_release(uhd->nl); - uhd->nl = NULL; - } - - while (uhd->emerg_skbs) { - struct sk_buff *next = uhd->emerg_skbs->next; - kfree_skb(uhd->emerg_skbs); - uhd->emerg_skbs = next; - } - - uhd->pid = -1; -} - -static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, - struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type = nlh->nlmsg_type; - int *data; - int err; - - if (uhd->debug) - printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", - type); - - /* Let the more specific handler go first. It returns - * 1 for valid messages that it doesn't know. */ - err = uhd->rcv_msg(skb, nlh); - if (err != 1) - return err; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { - printk(KERN_INFO "Received extra nofreeze me requests.\n"); - return -EBUSY; - } - - data = NLMSG_DATA(nlh); - - switch (type) { - case NETLINK_MSG_NOFREEZE_ME: - return nl_set_nofreeze(uhd, nlh->nlmsg_pid); - case NETLINK_MSG_GET_DEBUGGING: - send_whether_debugging(uhd); - return 0; - case NETLINK_MSG_READY: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { - printk(KERN_INFO "Invalid ready mesage.\n"); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - return nl_ready(uhd, (u32) *data); - case NETLINK_MSG_CLEANUP: - toi_netlink_close_complete(uhd); - return 0; - } - - return -EINVAL; -} - -static void toi_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - struct user_helper_data *uhd = uhd_list; - - while (uhd && uhd->netlink_id != skb->sk->sk_protocol) - uhd = uhd->next; - - if (!uhd) - return; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - - err = toi_nl_gen_rcv_msg(uhd, skb, nlh); - if (err) - netlink_ack(skb, nlh, err); - else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } -} - -static int netlink_prepare(struct user_helper_data *uhd) -{ - struct netlink_kernel_cfg cfg = { - .groups = 0, - .input = toi_user_rcv_skb, - }; - - uhd->next = uhd_list; - uhd_list = uhd; - - uhd->sock_seq = 0x42c0ffee; - uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); - if (!uhd->nl) { - printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", - uhd->name); - return -ENOMEM; - } - - toi_fill_skb_pool(uhd); - - return 0; -} - -void toi_netlink_close(struct user_helper_data *uhd) -{ - struct task_struct *t; - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (t) - t->flags &= ~PF_NOFREEZE; - toi_read_unlock_tasklist(); - - toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); -} -int toi_netlink_setup(struct user_helper_data *uhd) -{ - /* In case userui didn't cleanup properly on us */ - toi_netlink_close_complete(uhd); - - if (netlink_prepare(uhd) < 0) { - printk(KERN_INFO "Netlink prepare failed.\n"); - return 1; - } - - if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, - UMH_WAIT_EXEC, uhd->debug) < 0) { - printk(KERN_INFO "Launch userspace program failed.\n"); - toi_netlink_close_complete(uhd); - return 1; 
- } - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); - - if (uhd->pid == -1) { - printk(KERN_INFO "%s: Failed to contact userspace process.\n", - uhd->name); - toi_netlink_close_complete(uhd); - return 1; - } - - return 0; -} diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h deleted file mode 100644 index 6613c8eaa..000000000 --- a/kernel/power/tuxonice_netlink.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for functions for communicating with a userspace helper - * via netlink. - */ - -#include <linux/netlink.h> -#include <net/sock.h> - -#define NETLINK_MSG_BASE 0x10 - -#define NETLINK_MSG_READY 0x10 -#define NETLINK_MSG_NOFREEZE_ME 0x16 -#define NETLINK_MSG_GET_DEBUGGING 0x19 -#define NETLINK_MSG_CLEANUP 0x24 -#define NETLINK_MSG_NOFREEZE_ACK 0x27 -#define NETLINK_MSG_IS_DEBUGGING 0x28 - -struct user_helper_data { - int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); - void (*not_ready) (void); - struct sock *nl; - u32 sock_seq; - pid_t pid; - char *comm; - char program[256]; - int pool_level; - int pool_limit; - struct sk_buff *emerg_skbs; - int skb_size; - int netlink_id; - char *name; - struct user_helper_data *next; - struct completion wait_for_process; - u32 interface_version; - int must_init; - int debug; -}; - -#ifdef CONFIG_NET -int toi_netlink_setup(struct user_helper_data *uhd); -void toi_netlink_close(struct user_helper_data *uhd); -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len); -void toi_netlink_close_complete(struct user_helper_data *uhd); -#else -static inline int toi_netlink_setup(struct user_helper_data *uhd) -{ - return 0; -} - -static inline void toi_netlink_close(struct user_helper_data *uhd) { }; -static inline void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) { }; -static inline void toi_netlink_close_complete(struct user_helper_data *uhd) - { }; -#endif diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c deleted file mode 100644 index d469f3d2d..000000000 --- a/kernel/power/tuxonice_pagedir.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for handling pagesets. - * Note that pbes aren't actually stored as such. They're stored as - * bitmaps and extents. 
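Tying the helper interface above together: a hypothetical kernel-side user_helper_data showing what toi_netlink_setup() expects to be fed. Every value here is an assumption for illustration.

/*
 * Hypothetical helper description. rcv_msg must return 1 for message
 * types it does not handle, so the generic handler above can take over.
 */
static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        return 1;       /* nothing module-specific to handle */
}

static struct user_helper_data example_uhd = {
        .name              = "example",
        .program           = "/usr/sbin/example-helper", /* illustrative path */
        .netlink_id        = 29,        /* illustrative protocol number */
        .pool_limit        = 6,
        .skb_size          = 4096,
        .interface_version = 1,
        .rcv_msg           = example_rcv_msg,
};

/* toi_netlink_setup(&example_uhd) launches the program and waits up to
 * two seconds for a NETLINK_MSG_READY carrying interface_version. */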
- */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/hardirq.h> -#include <linux/sched.h> -#include <linux/cpu.h> -#include <asm/tlbflush.h> - -#include "tuxonice_pageflags.h" -#include "tuxonice_ui.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" - -static int ptoi_pfn; -static struct pbe *this_low_pbe; -static struct pbe **last_low_pbe_ptr; - -void toi_reset_alt_image_pageset2_pfn(void) -{ - memory_bm_position_reset(pageset2_map); -} - -static struct page *first_conflicting_page; - -/* - * free_conflicting_pages - */ - -static void free_conflicting_pages(void) -{ - while (first_conflicting_page) { - struct page *next = - *((struct page **) kmap(first_conflicting_page)); - kunmap(first_conflicting_page); - toi__free_page(29, first_conflicting_page); - first_conflicting_page = next; - } -} - -/* __toi_get_nonconflicting_page - * - * Description: Gets order zero pages that won't be overwritten - * while copying the original pages. - */ - -struct page *___toi_get_nonconflicting_page(int can_be_highmem) -{ - struct page *page; - gfp_t flags = TOI_ATOMIC_GFP; - if (can_be_highmem) - flags |= __GFP_HIGHMEM; - - - if (test_toi_state(TOI_LOADING_ALT_IMAGE) && - pageset2_map && ptoi_pfn) { - do { - ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0); - if (ptoi_pfn != BM_END_OF_MAP) { - page = pfn_to_page(ptoi_pfn); - if (!PagePageset1(page) && - (can_be_highmem || !PageHighMem(page))) - return page; - } - } while (ptoi_pfn); - } - - do { - page = toi_alloc_page(29, flags | __GFP_ZERO); - if (!page) { - printk(KERN_INFO "Failed to get nonconflicting " - "page.\n"); - return NULL; - } - if (PagePageset1(page)) { - struct page **next = (struct page **) kmap(page); - *next = first_conflicting_page; - first_conflicting_page = page; - kunmap(page); - } - } while (PagePageset1(page)); - - return page; -} - -unsigned long __toi_get_nonconflicting_page(void) -{ - struct page *page = ___toi_get_nonconflicting_page(0); - return page ? (unsigned long) page_address(page) : 0; -} - -static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, - int highmem) -{ - if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) - + 2 * sizeof(struct pbe)) > PAGE_SIZE) { - struct page *new_page = - ___toi_get_nonconflicting_page(highmem); - if (!new_page) - return ERR_PTR(-ENOMEM); - this_pbe = (struct pbe *) kmap(new_page); - memset(this_pbe, 0, PAGE_SIZE); - *page_ptr = new_page; - } else - this_pbe++; - - return this_pbe; -} - -/** - * get_pageset1_load_addresses - generate pbes for conflicting pages - * - * We check here that pagedir & pages it points to won't collide - * with pages where we're going to restore from the loaded pages - * later. - * - * Returns: - * Zero on success, one if couldn't find enough pages (shouldn't - * happen). 
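One detail of the pbe allocation path above rewards a worked example: get_next_pbe() starts a fresh page whenever fewer than two pbe slots remain before the page boundary. A userspace sketch of that check, assuming an illustrative 24-byte pbe layout:

#include <stdio.h>

#define PAGE_SZ 4096UL

struct pbe_stub {       /* illustrative stand-in, 24 bytes on LP64 */
        void *address, *orig_address;
        struct pbe_stub *next;
};

int main(void)
{
        /* offset of the last slot that still fits within the page */
        unsigned long off = PAGE_SZ - sizeof(struct pbe_stub);
        int need_new_page =
                (off & (PAGE_SZ - 1)) + 2 * sizeof(struct pbe_stub) > PAGE_SZ;

        printf("offset %lu -> fresh pbe page needed: %d\n",
               off, need_new_page);     /* prints 1 */
        return 0;
}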
- **/ -int toi_get_pageset1_load_addresses(void) -{ - int pfn, highallocd = 0, lowallocd = 0; - int low_needed = pagedir1.size - get_highmem_size(pagedir1); - int high_needed = get_highmem_size(pagedir1); - int low_pages_for_highmem = 0; - gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; - struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, - *low_pbe_page, *last_low_pbe_page = NULL; - struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, - *this_high_pbe = NULL; - unsigned long orig_low_pfn, orig_high_pfn; - int high_pbes_done = 0, low_pbes_done = 0; - int low_direct = 0, high_direct = 0, result = 0, i; - int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; - - toi_trace_index++; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - last_low_pbe_ptr = &restore_pblist; - - /* First, allocate pages for the start of our pbe lists. */ - if (high_needed) { - high_pbe_page = ___toi_get_nonconflicting_page(1); - if (!high_pbe_page) { - result = -ENOMEM; - goto out; - } - this_high_pbe = (struct pbe *) kmap(high_pbe_page); - memset(this_high_pbe, 0, PAGE_SIZE); - } - - low_pbe_page = ___toi_get_nonconflicting_page(0); - if (!low_pbe_page) { - result = -ENOMEM; - goto out; - } - this_low_pbe = (struct pbe *) page_address(low_pbe_page); - - /* - * Next, allocate the number of pages we need. - */ - - i = low_needed + high_needed; - - do { - int is_high; - - if (i == low_needed) - flags &= ~__GFP_HIGHMEM; - - page = toi_alloc_page(30, flags); - BUG_ON(!page); - - SetPagePageset1Copy(page); - is_high = PageHighMem(page); - - if (PagePageset1(page)) { - if (is_high) - high_direct++; - else - low_direct++; - } else { - if (is_high) - highallocd++; - else - lowallocd++; - } - } while (--i); - - high_needed -= high_direct; - low_needed -= low_direct; - - /* - * Do we need to use some lowmem pages for the copies of highmem - * pages? - */ - if (high_needed > highallocd) { - low_pages_for_highmem = high_needed - highallocd; - high_needed -= low_pages_for_highmem; - low_needed += low_pages_for_highmem; - } - - /* - * Now generate our pbes (which will be used for the atomic restore), - * and free unneeded pages. - */ - memory_bm_position_reset(pageset1_copy_map); - for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) { - int is_high; - page = pfn_to_page(pfn); - is_high = PageHighMem(page); - - if (PagePageset1(page)) - continue; - - /* Nope. We're going to use this page. Add a pbe. 
*/ - if (is_high || low_pages_for_highmem) { - struct page *orig_page; - high_pbes_done++; - if (!is_high) - low_pages_for_highmem--; - do { - orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_high_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_high_pfn); - } while (!PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_high_pbe->orig_address = (void *) orig_high_pfn; - this_high_pbe->address = page; - this_high_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", - high_page, high_offset, page, orig_high_pfn, orig_page); - if (last_high_pbe_page != high_pbe_page) { - *last_high_pbe_ptr = - (struct pbe *) high_pbe_page; - if (last_high_pbe_page) { - kunmap(last_high_pbe_page); - high_page++; - high_offset = 0; - } else - high_offset++; - last_high_pbe_page = high_pbe_page; - } else { - *last_high_pbe_ptr = this_high_pbe; - high_offset++; - } - last_high_pbe_ptr = &this_high_pbe->next; - this_high_pbe = get_next_pbe(&high_pbe_page, - this_high_pbe, 1); - if (IS_ERR(this_high_pbe)) { - printk(KERN_INFO - "This high pbe is an error.\n"); - return -ENOMEM; - } - } else { - struct page *orig_page; - low_pbes_done++; - do { - orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_low_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_low_pfn); - } while (PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_low_pbe->orig_address = page_address(orig_page); - this_low_pbe->address = page_address(page); - this_low_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", - low_page, low_offset, this_low_pbe->orig_address, - orig_low_pfn, this_low_pbe->address); - TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address); - *last_low_pbe_ptr = this_low_pbe; - last_low_pbe_ptr = &this_low_pbe->next; - this_low_pbe = get_next_pbe(&low_pbe_page, - this_low_pbe, 0); - if (low_pbe_page != last_low_pbe_page) { - if (last_low_pbe_page) { - low_page++; - low_offset = 0; - } else { - low_offset++; - } - last_low_pbe_page = low_pbe_page; - } else - low_offset++; - if (IS_ERR(this_low_pbe)) { - printk(KERN_INFO "this_low_pbe is an error.\n"); - return -ENOMEM; - } - } - } - - if (high_pbe_page) - kunmap(high_pbe_page); - - if (last_high_pbe_page != high_pbe_page) { - if (last_high_pbe_page) - kunmap(last_high_pbe_page); - toi__free_page(29, high_pbe_page); - } - - free_conflicting_pages(); - -out: - return result; -} - -int add_boot_kernel_data_pbe(void) -{ - this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); - if (!this_low_pbe->address) { - printk(KERN_INFO "Failed to get bkd atomic restore buffer."); - return -ENOMEM; - } - - toi_bkd.size = sizeof(toi_bkd); - memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); - - *last_low_pbe_ptr = this_low_pbe; - this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; - this_low_pbe->next = NULL; - return 0; -} diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h deleted file mode 100644 index 046535918..000000000 --- a/kernel/power/tuxonice_pagedir.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for routines for handling pagesets. 
- */ - -#ifndef KERNEL_POWER_PAGEDIR_H -#define KERNEL_POWER_PAGEDIR_H - -/* Pagedir - * - * Contains the metadata for a set of pages saved in the image. - */ - -struct pagedir { - int id; - unsigned long size; -#ifdef CONFIG_HIGHMEM - unsigned long size_high; -#endif -}; - -#ifdef CONFIG_HIGHMEM -#define get_highmem_size(pagedir) (pagedir.size_high) -#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) -#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) -#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) -#else -#define get_highmem_size(pagedir) (0) -#define set_highmem_size(pagedir, sz) do { } while (0) -#define inc_highmem_size(pagedir) do { } while (0) -#define get_lowmem_size(pagedir) (pagedir.size) -#endif - -extern struct pagedir pagedir1, pagedir2; - -extern void toi_copy_pageset1(void); - -extern int toi_get_pageset1_load_addresses(void); - -extern unsigned long __toi_get_nonconflicting_page(void); -struct page *___toi_get_nonconflicting_page(int can_be_highmem); - -extern void toi_reset_alt_image_pageset2_pfn(void); -extern int add_boot_kernel_data_pbe(void); -#endif diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c deleted file mode 100644 index 0fe92edd7..000000000 --- a/kernel/power/tuxonice_pageflags.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for serialising and relocating pageflags in which we - * store our image metadata. - */ - -#include "tuxonice_pageflags.h" -#include "power.h" - -int toi_pageflags_space_needed(void) -{ - return memory_bm_space_needed(pageset1_map); -} diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h deleted file mode 100644 index ddeeaf1e7..000000000 --- a/kernel/power/tuxonice_pageflags.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
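A small self-contained illustration of the CONFIG_HIGHMEM accessors above, using stand-in definitions and made-up sizes:

#include <stdio.h>

struct pagedir_stub { int id; unsigned long size, size_high; };

/* local copies of the accessor arithmetic, for illustration */
#define get_highmem_size(pd)    ((pd).size_high)
#define get_lowmem_size(pd)     ((pd).size - (pd).size_high)

int main(void)
{
        struct pagedir_stub pd = { 1, 1000, 200 }; /* illustrative sizes */

        printf("high %lu, low %lu\n",
               get_highmem_size(pd), get_lowmem_size(pd)); /* 200, 800 */
        return 0;
}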
- */ - -#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H -#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H - -struct memory_bitmap; -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -void memory_bm_clear(struct memory_bitmap *bm); - -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index); -void memory_bm_position_reset(struct memory_bitmap *bm); -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -int toi_alloc_bitmap(struct memory_bitmap **bm); -void toi_free_bitmap(struct memory_bitmap **bm); -void memory_bm_clear(struct memory_bitmap *bm); -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); - -struct toi_module_ops; -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_space_needed(struct memory_bitmap *bm); - -extern struct memory_bitmap *pageset1_map; -extern struct memory_bitmap *pageset1_copy_map; -extern struct memory_bitmap *pageset2_map; -extern struct memory_bitmap *page_resave_map; -extern struct memory_bitmap *io_map; -extern struct memory_bitmap *nosave_map; -extern struct memory_bitmap *free_map; -extern struct memory_bitmap *compare_map; - -#define PagePageset1(page) \ - (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1(page) \ - (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1(page) \ - (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset1Copy(page) \ - (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1Copy(page) \ - (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1Copy(page) \ - (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset2(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset2(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset2(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageWasRW(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPageWasRW(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageWasRW(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageResave(page) (page_resave_map ? 
\ - memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageResave(page) \ - (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageResave(page) \ - (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosave(page) (nosave_map ? \ - memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosave(page) \ - (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosave(page) \ - (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosaveFree(page) (free_map ? \ - memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosaveFree(page) \ - (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosaveFree(page) \ - (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page))) - -#define PageCompareChanged(page) (compare_map ? \ - memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageCompareChanged(page) \ - (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageCompareChanged(page) \ - (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page))) - -extern void save_pageflags(struct memory_bitmap *pagemap); -extern int load_pageflags(struct memory_bitmap *pagemap); -extern int toi_pageflags_space_needed(void); -#endif diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c deleted file mode 100644 index 7c78773cf..000000000 --- a/kernel/power/tuxonice_power_off.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for powering down. - */ - -#include <linux/device.h> -#include <linux/suspend.h> -#include <linux/mm.h> -#include <linux/pm.h> -#include <linux/reboot.h> -#include <linux/cpu.h> -#include <linux/console.h> -#include <linux/fs.h> -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" - -unsigned long toi_poweroff_method; /* 0 - Kernel power off */ - -static int wake_delay; -static char lid_state_file[256], wake_alarm_dir[256]; -static struct file *lid_file, *alarm_file, *epoch_file; -static int post_wake_state = -1; - -static int did_suspend_to_both; - -/* - * __toi_power_down - * Functionality : Powers down or reboots the computer once the image - * has been written to disk. - * Key Assumptions : Able to reboot/power down via code called or that - * the warning emitted if the calls fail will be visible - * to the user (ie printk resumes devices). - */ - -static void __toi_power_down(int method) -{ - int error; - - toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : - "Powering down."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (test_action_state(TOI_REBOOT)) - kernel_restart(NULL); - - switch (method) { - case 0: - break; - case 3: - /* - * Re-read the overwritten part of pageset2 to make post-resume - * faster. - */ - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. 
" - "Try rebooting."); - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (!error) { - pm_restore_gfp_mask(); - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - pm_restrict_gfp_mask(); - if (!error) - did_suspend_to_both = 1; - } - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - - /* Success - we're now post-resume-from-ram */ - if (did_suspend_to_both) - return; - - /* Failed to suspend to ram - do normal power off */ - break; - case 4: - /* - * If succeeds, doesn't return. If fails, do a simple - * powerdown. - */ - hibernation_platform_enter(); - break; - case 5: - /* Historic entry only now */ - break; - } - - if (method && method != 5) - toi_cond_pause(1, - "Falling back to alternate power off method."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (pm_power_off) - kernel_power_off(); - kernel_halt(); - toi_cond_pause(1, "Powerdown failed."); - while (1) - cpu_relax(); - -out: - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. Try rebooting."); - return; -} - -#define CLOSE_FILE(file) \ - if (file) { \ - filp_close(file, NULL); file = NULL; \ - } - -static void powerdown_cleanup(int toi_or_resume) -{ - if (!toi_or_resume) - return; - - CLOSE_FILE(lid_file); - CLOSE_FILE(alarm_file); - CLOSE_FILE(epoch_file); -} - -static void open_file(char *format, char *arg, struct file **var, int mode, - char *desc) -{ - char buf[256]; - - if (strlen(arg)) { - sprintf(buf, format, arg); - *var = filp_open(buf, mode, 0); - if (IS_ERR(*var) || !*var) { - printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", - desc, buf, *var); - *var = NULL; - } - } -} - -static int powerdown_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - did_suspend_to_both = 0; - - open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, - O_RDONLY, "lid"); - - if (strlen(wake_alarm_dir)) { - open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, - &alarm_file, O_WRONLY, "alarm"); - - open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, - &epoch_file, O_RDONLY, "epoch"); - } - - return 0; -} - -static int lid_closed(void) -{ - char array[25]; - ssize_t size; - loff_t pos = 0; - - if (!lid_file) - return 0; - - size = vfs_read(lid_file, (char __user *) array, 25, &pos); - if ((int) size < 1) { - printk(KERN_INFO "Failed to read lid state file (%d).\n", - (int) size); - return 0; - } - - if (!strcmp(array, "state: closed\n")) - return 1; - - return 0; -} - -static void write_alarm_file(int value) -{ - ssize_t size; - char buf[40]; - loff_t pos = 0; - - if (!alarm_file) - return; - - sprintf(buf, "%d\n", value); - - size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); - - if (size < 0) - printk(KERN_INFO "Error %d writing alarm value %s.\n", - (int) size, buf); -} - -/** - * toi_check_resleep: See whether to powerdown again after waking. - * - * After waking, check whether we should powerdown again in a (usually - * different) way. We only do this if the lid switch is still closed. - */ -void toi_check_resleep(void) -{ - /* We only return if we suspended to ram and woke. 
*/ - if (lid_closed() && post_wake_state >= 0) - __toi_power_down(post_wake_state); -} - -void toi_power_down(void) -{ - if (alarm_file && wake_delay) { - char array[25]; - loff_t pos = 0; - size_t size = vfs_read(epoch_file, (char __user *) array, 25, - &pos); - - if (((int) size) < 1) - printk(KERN_INFO "Failed to read epoch file (%d).\n", - (int) size); - else { - unsigned long since_epoch; - if (!kstrtoul(array, 0, &since_epoch)) { - /* Clear any wakeup time. */ - write_alarm_file(0); - - /* Set new wakeup time. */ - write_alarm_file(since_epoch + wake_delay); - } - } - } - - __toi_power_down(toi_poweroff_method); - - toi_check_resleep(); -} - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_ACPI) - SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), - SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), - SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), - SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, - NULL), - SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), - SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, - 0, 0, 0, NULL) -#endif -}; - -static struct toi_module_ops powerdown_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "poweroff", - .initialise = powerdown_init, - .cleanup = powerdown_cleanup, - .directory = "[ROOT]", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_poweroff_init(void) -{ - return toi_register_module(&powerdown_ops); -} - -void toi_poweroff_exit(void) -{ - toi_unregister_module(&powerdown_ops); -} diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h deleted file mode 100644 index 6e1d8bb39..000000000 --- a/kernel/power/tuxonice_power_off.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for the powering down. - */ - -int toi_pm_state_finish(void); -void toi_power_down(void); -extern unsigned long toi_poweroff_method; -int toi_poweroff_init(void); -void toi_poweroff_exit(void); -void toi_check_resleep(void); - -extern int platform_begin(int platform_mode); -extern int platform_pre_snapshot(int platform_mode); -extern void platform_leave(int platform_mode); -extern void platform_end(int platform_mode); -extern void platform_finish(int platform_mode); -extern int platform_pre_restore(int platform_mode); -extern void platform_restore_cleanup(int platform_mode); diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c deleted file mode 100644 index a10d62080..000000000 --- a/kernel/power/tuxonice_prepare_image.c +++ /dev/null @@ -1,1080 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * We need to eat memory until we can: - * 1. Perform the save without changing anything (RAM_NEEDED < #pages) - * 2. Fit it all in available space (toiActiveAllocator->available_space() >= - * main_storage_needed()) - * 3. Reload the pagedir and pageset1 to places that don't collide with their - * final destinations, not knowing to what extent the resumed kernel will - * overlap with the one loaded at boot time. 
I think the resumed kernel - * should overlap completely, but I don't want to rely on this as it is - * an unproven assumption. We therefore assume there will be no overlap at - * all (worse case). - * 4. Meet the user's requested limit (if any) on the size of the image. - * The limit is in MB, so pages/256 (assuming 4K pages). - * - */ - -#include <linux/highmem.h> -#include <linux/freezer.h> -#include <linux/hardirq.h> -#include <linux/mmzone.h> -#include <linux/console.h> -#include <linux/tuxonice.h> - -#include "tuxonice_pageflags.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_checksum.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_builtin.h" - -static unsigned long num_nosave, main_storage_allocated, storage_limit, - header_storage_needed; -unsigned long extra_pd1_pages_allowance = - CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; -long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; -static int no_ps2_needed; - -struct attention_list { - struct task_struct *task; - struct attention_list *next; -}; - -static struct attention_list *attention_list; - -#define PAGESET1 0 -#define PAGESET2 1 - -void free_attention_list(void) -{ - struct attention_list *last = NULL; - - while (attention_list) { - last = attention_list; - attention_list = attention_list->next; - toi_kfree(6, last, sizeof(*last)); - } -} - -static int build_attention_list(void) -{ - int i, task_count = 0; - struct task_struct *p; - struct attention_list *next; - - /* - * Count all userspace process (with task->mm) marked PF_NOFREEZE. - */ - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) - task_count++; - toi_read_unlock_tasklist(); - - /* - * Allocate attention list structs. - */ - for (i = 0; i < task_count; i++) { - struct attention_list *this = - toi_kzalloc(6, sizeof(struct attention_list), - TOI_WAIT_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate slab for " - "attention list.\n"); - free_attention_list(); - return 1; - } - this->next = NULL; - if (attention_list) - this->next = attention_list; - attention_list = this; - } - - next = attention_list; - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) { - next->task = p; - next = next->next; - } - toi_read_unlock_tasklist(); - return 0; -} - -static void pageset2_full(void) -{ - struct zone *zone; - struct page *page; - unsigned long flags; - int i; - - toi_trace_index++; - - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lru_lock, flags); - for_each_lru(i) { - if (!zone_page_state(zone, NR_LRU_BASE + i)) - continue; - - list_for_each_entry(page, &zone->lruvec.lists[i], lru) { - struct address_space *mapping; - - mapping = page_mapping(page); - if (!mapping || !mapping->host || - !(mapping->host->i_flags & S_ATOMIC_COPY)) { - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified."); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full."); - SetPagePageset2(page); - } - } - } - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - } -} - -/* - * toi_mark_task_as_pageset - * Functionality : Marks all the saveable pages belonging to a given process - * as belonging to a particular pageset. 
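The MB-to-pages conversion mentioned in the file header above (and applied later in any_to_free() as image_size_limit << 8) is easy to sanity-check in isolation; this assumes 4K pages:

#include <stdio.h>

int main(void)
{
        long image_size_limit_mb = 100;         /* illustrative limit */
        /* 1 MB / 4 KB = 256 pages per MB, hence the << 8 */
        unsigned long soft_limit_pages = image_size_limit_mb << 8;

        printf("%ld MB -> %lu pages\n",
               image_size_limit_mb, soft_limit_pages);  /* 25600 */
        return 0;
}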
- */ - -static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) -{ - struct vm_area_struct *vma; - struct mm_struct *mm; - - mm = t->active_mm; - - if (!mm || !mm->mmap) - return; - - toi_trace_index++; - - if (!irqs_disabled()) - down_read(&mm->mmap_sem); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long posn; - - if (!vma->vm_start || - vma->vm_flags & VM_PFNMAP) - continue; - - for (posn = vma->vm_start; posn < vma->vm_end; - posn += PAGE_SIZE) { - struct page *page = follow_page(vma, posn, 0); - struct address_space *mapping; - - if (!page || !pfn_valid(page_to_pfn(page))) - continue; - - mapping = page_mapping(page); - if (mapping && mapping->host && - mapping->host->i_flags & S_ATOMIC_COPY && pageset2) - continue; - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2); - continue; - } - - if (pageset2) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1"); - SetPagePageset2(page); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2"); - ClearPagePageset2(page); - SetPagePageset1(page); - } - } - } - - if (!irqs_disabled()) - up_read(&mm->mmap_sem); -} - -static void mark_tasks(int pageset) -{ - struct task_struct *p; - - toi_read_lock_tasklist(); - for_each_process(p) { - if (!p->mm) - continue; - - if (p->flags & PF_KTHREAD) - continue; - - toi_mark_task_as_pageset(p, pageset); - } - toi_read_unlock_tasklist(); - -} - -/* mark_pages_for_pageset2 - * - * Description: Mark unshared pages in processes not needed for hibernate as - * being able to be written out in a separate pagedir. - * HighMem pages are simply marked as pageset2. They won't be - * needed during hibernate. - */ - -static void toi_mark_pages_for_pageset2(void) -{ - struct attention_list *this = attention_list; - - memory_bm_clear(pageset2_map); - - if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) - return; - - if (test_action_state(TOI_PAGESET2_FULL)) - pageset2_full(); - else - mark_tasks(PAGESET2); - - /* - * Because the tasks in attention_list are ones related to hibernating, - * we know that they won't go away under us. - */ - - while (this) { - if (!test_result_state(TOI_ABORTED)) - toi_mark_task_as_pageset(this->task, PAGESET1); - this = this->next; - } -} - -/* - * The atomic copy of pageset1 is stored in pageset2 pages. - * But if pageset1 is larger (normally only just after boot), - * we need to allocate extra pages to store the atomic copy. - * The following data struct and functions are used to handle - * the allocation and freeing of that memory. - */ - -static unsigned long extra_pages_allocated; - -struct extras { - struct page *page; - int order; - struct extras *next; -}; - -static struct extras *extras_list; - -/* toi_free_extra_pagedir_memory - * - * Description: Free previously allocated extra pagedir memory. - */ -void toi_free_extra_pagedir_memory(void) -{ - /* Free allocated pages */ - while (extras_list) { - struct extras *this = extras_list; - int i; - - extras_list = this->next; - - for (i = 0; i < (1 << this->order); i++) - ClearPageNosave(this->page + i); - - toi_free_pages(9, this->page, this->order); - toi_kfree(7, this, sizeof(*this)); - } - - extra_pages_allocated = 0; -} - -/* toi_allocate_extra_pagedir_memory - * - * Description: Allocate memory for making the atomic copy of pagedir1 in the - * case where it is bigger than pagedir2. - * Arguments: int num_to_alloc: Number of extra pages needed. - * Result: int. 
Number of extra pages we now have allocated. - */ -static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) -{ - int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; - gfp_t flags = TOI_ATOMIC_GFP; - - if (num_to_alloc < 1) - return 0; - - order = fls(num_to_alloc); - if (order >= MAX_ORDER) - order = MAX_ORDER - 1; - - while (num_to_alloc) { - struct page *newpage; - unsigned long virt; - struct extras *extras_entry; - - while ((1 << order) > num_to_alloc) - order--; - - extras_entry = (struct extras *) toi_kzalloc(7, - sizeof(struct extras), TOI_ATOMIC_GFP); - - if (!extras_entry) - return extra_pages_allocated; - - virt = toi_get_free_pages(9, flags, order); - while (!virt && order) { - order--; - virt = toi_get_free_pages(9, flags, order); - } - - if (!virt) { - toi_kfree(7, extras_entry, sizeof(*extras_entry)); - return extra_pages_allocated; - } - - newpage = virt_to_page(virt); - - extras_entry->page = newpage; - extras_entry->order = order; - extras_entry->next = extras_list; - - extras_list = extras_entry; - - for (j = 0; j < (1 << order); j++) { - SetPageNosave(newpage + j); - SetPagePageset1Copy(newpage + j); - } - - extra_pages_allocated += (1 << order); - num_to_alloc -= (1 << order); - } - - return extra_pages_allocated; -} - -/* - * real_nr_free_pages: Count pcp pages for a zone type or all zones - * (-1 for all, otherwise zone_idx() result desired). - */ -unsigned long real_nr_free_pages(unsigned long zone_idx_mask) -{ - struct zone *zone; - int result = 0, cpu; - - /* PCP lists */ - for_each_populated_zone(zone) { - if (!(zone_idx_mask & (1 << zone_idx(zone)))) - continue; - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - result += pcp->count; - } - - result += zone_page_state(zone, NR_FREE_PAGES); - } - return result; -} - -/* - * Discover how much extra memory will be required by the drivers - * when they're asked to hibernate. We can then ensure that amount - * of memory is available when we really want it. - */ -static void get_extra_pd1_allowance(void) -{ - unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; - - toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); - - if (toi_go_atomic(PMSG_FREEZE, 1)) - return; - - final = real_nr_free_pages(all_zones_mask); - toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); - - extra_pd1_pages_allowance = (orig_num_free > final) ? - orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : - MIN_EXTRA_PAGES_ALLOWANCE; -} - -/* - * Amount of storage needed, possibly taking into account the - * expected compression ratio and possibly also ignoring our - * allowance for extra pages. - */ -static unsigned long main_storage_needed(int use_ecr, - int ignore_extra_pd1_allow) -{ - return (pagedir1.size + pagedir2.size + - (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * - (use_ecr ? toi_expected_compression_ratio() : 100) / 100; -} - -/* - * Storage needed for the image header, in bytes until the return. - */ -unsigned long get_header_storage_needed(void) -{ - unsigned long bytes = sizeof(struct toi_header) + - toi_header_storage_for_modules() + - toi_pageflags_space_needed() + - fs_info_space_needed(); - - return DIV_ROUND_UP(bytes, PAGE_SIZE); -} - -/* - * When freeing memory, pages from either pageset might be freed. 
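toi_allocate_extra_pagedir_memory() above satisfies a request with power-of-two blocks: start from the order given by fls(), shrink the order whenever a block would overshoot what is still needed, and also shrink on allocation failure. A standalone sketch of the splitting arithmetic (the fls() stand-in uses the GCC builtin __builtin_clzl, and the MAX_ORDER value is illustrative; actual allocation is elided):

#include <stdio.h>

#define MAX_ORDER 11

static int fls_long(unsigned long x)
{
	return x ? (int)(8 * sizeof(long)) - __builtin_clzl(x) : 0;
}

int main(void)
{
	unsigned long num_to_alloc = 1000, allocated = 0;
	int order = fls_long(num_to_alloc);

	if (order >= MAX_ORDER)
		order = MAX_ORDER - 1;

	while (num_to_alloc) {
		/* shrink the block until it no longer overshoots */
		while ((1UL << order) > num_to_alloc)
			order--;

		printf("allocate a 2^%d-page block\n", order);
		allocated += 1UL << order;
		num_to_alloc -= 1UL << order;
	}
	printf("total: %lu pages\n", allocated);   /* 1000 */
	return 0;
}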
- * - * When seeking to free memory to be able to hibernate, for every ps1 page - * freed, we need 2 less pages for the atomic copy because there is one less - * page to copy and one more page into which data can be copied. - * - * Freeing ps2 pages saves us nothing directly. No more memory is available - * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but - * that's too much work to figure out. - * - * => ps1_to_free functions - * - * Of course if we just want to reduce the image size, because of storage - * limitations or an image size limit either ps will do. - * - * => any_to_free function - */ - -static unsigned long lowpages_usable_for_highmem_copy(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return available > needed ? available - needed : 0; -} - -static unsigned long highpages_ps1_to_free(void) -{ - unsigned long need = get_highmem_size(pagedir1), - available = get_highmem_size(pagedir2) + - real_nr_free_high_pages() + - lowpages_usable_for_highmem_copy(); - - return need > available ? DIV_ROUND_UP(need - available, 2) : 0; -} - -static unsigned long lowpages_ps1_to_free(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0; -} - -static unsigned long current_image_size(void) -{ - return pagedir1.size + pagedir2.size + header_storage_needed; -} - -static unsigned long storage_still_required(void) -{ - unsigned long needed = main_storage_needed(1, 1); - return needed > storage_limit ? needed - storage_limit : 0; -} - -static unsigned long ram_still_required(void) -{ - unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + - 2 * extra_pd1_pages_allowance, - available = real_nr_free_low_pages() + extra_pages_allocated; - return needed > available ? needed - available : 0; -} - -unsigned long any_to_free(int use_image_size_limit) -{ - int use_soft_limit = use_image_size_limit && image_size_limit > 0; - unsigned long current_size = current_image_size(), - soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, - to_free = use_soft_limit ? (current_size > soft_limit ? - current_size - soft_limit : 0) : 0, - storage_limit = storage_still_required(), - ram_limit = ram_still_required(), - first_max = max(to_free, storage_limit); - - return max(first_max, ram_limit); -} - -static int need_pageset2(void) -{ - return (real_nr_free_low_pages() + extra_pages_allocated - - 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - - toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; -} - -/* amount_needed - * - * Calculates the amount by which the image size needs to be reduced to meet - * our constraints. 
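The helpers above share two idioms worth spelling out: the saturating "needed > available ? needed - available : 0" form, which avoids a huge bogus result when unsigned subtraction would wrap, and DIV_ROUND_UP(deficit, 2) in the ps1 cases, because each freed pageset1 page counts twice (one page fewer to copy, one more page to copy into). A worked sketch with illustrative numbers:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long ps1_to_free(unsigned long need, unsigned long available)
{
	/* never compute need - available when available >= need */
	return need > available ? DIV_ROUND_UP(need - available, 2) : 0;
}

int main(void)
{
	printf("%lu\n", ps1_to_free(1000, 300)); /* DIV_ROUND_UP(700, 2) = 350 */
	printf("%lu\n", ps1_to_free(200, 300));  /* 0, not a wrapped huge value */
	return 0;
}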
- */ -static unsigned long amount_needed(int use_image_size_limit) -{ - return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), - any_to_free(use_image_size_limit)); -} - -static int image_not_ready(int use_image_size_limit) -{ - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Amount still needed (%lu) > 0:%u," - " Storage allocd: %lu < %lu: %u.\n", - amount_needed(use_image_size_limit), - (amount_needed(use_image_size_limit) > 0), - main_storage_allocated, - main_storage_needed(1, 1), - main_storage_allocated < main_storage_needed(1, 1)); - - toi_cond_pause(0, NULL); - - return (amount_needed(use_image_size_limit) > 0) || - main_storage_allocated < main_storage_needed(1, 1); -} - -static void display_failure_reason(int tries_exceeded) -{ - unsigned long storage_required = storage_still_required(), - ram_required = ram_still_required(), - high_ps1 = highpages_ps1_to_free(), - low_ps1 = lowpages_ps1_to_free(); - - printk(KERN_INFO "Failed to prepare the image because...\n"); - - if (!storage_limit) { - printk(KERN_INFO "- You need some storage available to be " - "able to hibernate.\n"); - return; - } - - if (tries_exceeded) - printk(KERN_INFO "- The maximum number of iterations was " - "reached without successfully preparing the " - "image.\n"); - - if (storage_required) { - printk(KERN_INFO " - We need at least %lu pages of storage " - "(ignoring the header), but only have %lu.\n", - main_storage_needed(1, 1), - main_storage_allocated); - set_abort_result(TOI_INSUFFICIENT_STORAGE); - } - - if (ram_required) { - printk(KERN_INFO " - We need %lu more free pages of low " - "memory.\n", ram_required); - printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); - printk(KERN_INFO " + Reqd. by modules : %8lu\n", - toi_memory_for_modules(0)); - printk(KERN_INFO " + 2 * extra allow : %8lu\n", - 2 * extra_pd1_pages_allowance); - printk(KERN_INFO " - Currently free : %8lu\n", - real_nr_free_low_pages()); - printk(KERN_INFO " - Pages allocd : %8lu\n", - extra_pages_allocated); - printk(KERN_INFO " : ========\n"); - printk(KERN_INFO " Still needed : %8lu\n", - ram_required); - - /* Print breakdown of memory needed for modules */ - toi_memory_for_modules(1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (high_ps1) { - printk(KERN_INFO "- We need to free %lu highmem pageset 1 " - "pages.\n", high_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (low_ps1) { - printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " - "pages.\n", low_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } -} - -static void display_stats(int always, int sub_extra_pd1_allow) -{ - char buffer[255]; - snprintf(buffer, 254, - "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " - "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). " - "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", - - /* Free */ - real_nr_free_pages(all_zones_mask), - real_nr_free_low_pages(), - - /* Sets */ - pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), - pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), - - /* Nosave */ - num_nosave, extra_pages_allocated, - num_nosave - extra_pages_allocated, - - /* Storage */ - main_storage_allocated, - storage_limit, - main_storage_needed(1, sub_extra_pd1_allow), - main_storage_needed(1, 1), - - /* Needed */ - lowpages_ps1_to_free(), highpages_ps1_to_free(), - any_to_free(1), - MIN_FREE_RAM, toi_memory_for_modules(0), - extra_pd1_pages_allowance, - image_size_limit, - - need_pageset2() ? 
"yes" : "no"); - - if (always) - printk("%s", buffer); - else - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); -} - -/* flag_image_pages - * - * This routine generates our lists of pages to be stored in each - * pageset. Since we store the data using extents, and adding new - * extents might allocate a new extent page, this routine may well - * be called more than once. - */ -static void flag_image_pages(int atomic_copy) -{ - int num_free = 0, num_unmodified = 0; - unsigned long loop; - struct zone *zone; - - pagedir1.size = 0; - pagedir2.size = 0; - - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - num_nosave = 0; - toi_trace_index++; - - memory_bm_clear(pageset1_map); - - toi_generate_free_page_map(); - - /* - * Pages not to be saved are marked Nosave irrespective of being - * reserved. - */ - for_each_populated_zone(zone) { - int highmem = is_highmem(zone); - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - TOI_TRACE_DEBUG(pfn, "_Flag Invalid"); - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - unsigned long y; - for (y = pfn; y < pfn + chunk_size; y++) { - page = pfn_to_page(y); - TOI_TRACE_DEBUG(y, "_Flag Free"); - ClearPagePageset1(page); - ClearPagePageset2(page); - } - num_free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page)) { - char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave"; - TOI_TRACE_DEBUG(pfn, "_Flag %s", desc); - num_nosave++; - continue; - } - - page = highmem ? saveable_highmem_page(zone, pfn) : - saveable_page(zone, pfn); - - if (!page) { - TOI_TRACE_DEBUG(pfn, "_Flag Nosave2"); - num_nosave++; - continue; - } - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(pfn, "_Unmodified"); - num_unmodified++; - continue; - } - - if (PagePageset2(page)) { - pagedir2.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS2"); - if (PageHighMem(page)) - inc_highmem_size(pagedir2); - else - SetPagePageset1Copy(page); - if (PageResave(page)) { - SetPagePageset1(page); - ClearPagePageset1Copy(page); - pagedir1.size++; - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } else { - pagedir1.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS1"); - SetPagePageset1(page); - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } - } - - if (!atomic_copy) - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, - "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" - " + Unmodified (%d) + NumFree (%d) = %d.\n", - pagedir1.size, pagedir2.size, num_nosave, num_unmodified, - num_free, pagedir1.size + pagedir2.size + num_nosave + num_free); -} - -void toi_recalculate_image_contents(int atomic_copy) -{ - memory_bm_clear(pageset1_map); - if (!atomic_copy) { - unsigned long pfn; - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); - pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) - ClearPagePageset1Copy(pfn_to_page(pfn)); - /* Need to call this before getting pageset1_size! 
*/
- toi_mark_pages_for_pageset2();
- }
- memory_bm_position_reset(pageset2_map);
- flag_image_pages(atomic_copy);
-
- if (!atomic_copy) {
- storage_limit = toiActiveAllocator->storage_available();
- display_stats(0, 0);
- }
-}
-
-int try_allocate_extra_memory(void)
-{
- unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance -
- get_lowmem_size(pagedir2);
- if (wanted > extra_pages_allocated) {
- unsigned long got = toi_allocate_extra_pagedir_memory(wanted);
- if (got < wanted) {
- toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
- "Want %lu extra pages for pageset1, got %lu.\n",
- wanted, got);
- return 1;
- }
- }
- return 0;
-}
-
-/* update_image
- *
- * Allocate [more] memory and storage for the image.
- */
-static void update_image(int ps2_recalc)
-{
- int old_header_req;
- unsigned long seek;
-
- if (try_allocate_extra_memory())
- return;
-
- if (ps2_recalc)
- goto recalc;
-
- thaw_kernel_threads();
-
- /*
- * Allocate remaining storage space, if possible, up to the
- * maximum we know we'll need. It's okay to allocate the
- * maximum if the writer is the swapwriter, but
- * we don't want to grab all available space on an NFS share.
- * We therefore ignore the expected compression ratio here,
- * thereby trying to allocate the maximum image size we could
- * need (assuming compression doesn't expand the image), but
- * don't complain if we can't get the full amount we're after.
- */
-
- do {
- int result;
-
- old_header_req = header_storage_needed;
- toiActiveAllocator->reserve_header_space(header_storage_needed);
-
- /* How much storage is free with the reservation applied? */
- storage_limit = toiActiveAllocator->storage_available();
- seek = min(storage_limit, main_storage_needed(0, 0));
-
- result = toiActiveAllocator->allocate_storage(seek);
- if (result)
- printk("Failed to allocate storage (%d).\n", result);
-
- main_storage_allocated =
- toiActiveAllocator->storage_allocated();
-
- /* Need more header because more storage allocated? */
- header_storage_needed = get_header_storage_needed();
-
- } while (header_storage_needed > old_header_req);
-
- if (freeze_kernel_threads())
- set_abort_result(TOI_FREEZING_FAILED);
-
-recalc:
- toi_recalculate_image_contents(0);
-}
-
-/* attempt_to_freeze
- *
- * Try to freeze processes.
- */
-
-static int attempt_to_freeze(void)
-{
- int result;
-
- /* Stop processes before checking again */
- toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
- "filesystems.");
- result = freeze_processes();
-
- if (result)
- set_abort_result(TOI_FREEZING_FAILED);
-
- result = freeze_kernel_threads();
-
- if (result)
- set_abort_result(TOI_FREEZING_FAILED);
-
- return result;
-}
-
-/* eat_memory
- *
- * Try to free some memory, either to meet hard or soft constraints on the image
- * characteristics.
- *
- * Hard constraints:
- * - Pageset1 must be < half of memory;
- * - We must have enough memory free at resume time to have pageset1
- * be able to be loaded in pages that don't conflict with where it has to
- * be restored.
- * Soft constraints:
- * - User specified image size limit.
- */
-static void eat_memory(void)
-{
- unsigned long amount_wanted = 0;
- int did_eat_memory = 0;
-
- /*
- * Note that if we have enough storage space and enough free memory, we
- * may exit without eating anything. We give up when the last 10
- * iterations ate no extra pages because we're not going to get much
- * more anyway, but the few pages we get will take a lot of time.
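The do/while in update_image() above is a fixed-point iteration: reserving header space changes how much main storage can be allocated, and allocating more storage can in turn grow the header requirement, so the loop repeats until the header estimate stops growing. A toy model of why that converges (header_pages_for() is a made-up stand-in for get_header_storage_needed()):

#include <stdio.h>

static unsigned long header_pages_for(unsigned long allocated)
{
	/* illustrative: one header page per 512 data pages, plus one */
	return allocated / 512 + 1;
}

int main(void)
{
	unsigned long allocated = 10000, header = 1, old_header;

	do {
		old_header = header;
		/* reserve 'header', allocate data, re-derive the header need */
		header = header_pages_for(allocated + header);
	} while (header > old_header);

	printf("stable header estimate: %lu pages\n", header);   /* 20 */
	return 0;
}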
- * - * We freeze processes before beginning, and then unfreeze them if we - * need to eat memory until we think we have enough. If our attempts - * to freeze fail, we give up and abort. - */ - - amount_wanted = amount_needed(1); - - switch (image_size_limit) { - case -1: /* Don't eat any memory */ - if (amount_wanted > 0) { - set_abort_result(TOI_WOULD_EAT_MEMORY); - return; - } - break; - case -2: /* Free caches only */ - drop_pagecache(); - toi_recalculate_image_contents(0); - amount_wanted = amount_needed(1); - break; - default: - break; - } - - if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) && - image_size_limit != -1) { - unsigned long request = amount_wanted; - unsigned long high_req = max(highpages_ps1_to_free(), - any_to_free(1)); - unsigned long low_req = lowpages_ps1_to_free(); - unsigned long got = 0; - - toi_prepare_status(CLEAR_BAR, - "Seeking to free %ldMB of memory.", - MB(amount_wanted)); - - thaw_kernel_threads(); - - /* - * Ask for too many because shrink_memory_mask doesn't - * currently return enough most of the time. - */ - - if (low_req) - got = shrink_memory_mask(low_req, GFP_KERNEL); - if (high_req) - shrink_memory_mask(high_req - got, GFP_HIGHUSER); - - did_eat_memory = 1; - - toi_recalculate_image_contents(0); - - amount_wanted = amount_needed(1); - - printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &" - " %ld pages from anywhere, got %ld.\n", - high_req, low_req, - request - amount_wanted); - - toi_cond_pause(0, NULL); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - } - - if (did_eat_memory) - toi_recalculate_image_contents(0); -} - -/* toi_prepare_image - * - * Entry point to the whole image preparation section. - * - * We do four things: - * - Freeze processes; - * - Ensure image size constraints are met; - * - Complete all the preparation for saving the image, - * including allocation of storage. The only memory - * that should be needed when we're finished is that - * for actually storing the image (and we know how - * much is needed for that because the modules tell - * us). - * - Make sure that all dirty buffers are written out. - */ -#define MAX_TRIES 2 -int toi_prepare_image(void) -{ - int result = 1, tries = 1; - - main_storage_allocated = 0; - no_ps2_needed = 0; - - if (attempt_to_freeze()) - return 1; - - lock_device_hotplug(); - set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - - if (!extra_pd1_pages_allowance) - get_extra_pd1_allowance(); - - storage_limit = toiActiveAllocator->storage_available(); - - if (!storage_limit) { - printk(KERN_INFO "No storage available. Didn't try to prepare " - "an image.\n"); - display_failure_reason(0); - set_abort_result(TOI_NOSTORAGE_AVAILABLE); - return 1; - } - - if (build_attention_list()) { - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - return 1; - } - - toi_recalculate_image_contents(0); - - do { - toi_prepare_status(CLEAR_BAR, - "Preparing Image. Try %d.", tries); - - eat_memory(); - - if (test_result_state(TOI_ABORTED)) - break; - - update_image(0); - - tries++; - - } while (image_not_ready(1) && tries <= MAX_TRIES && - !test_result_state(TOI_ABORTED)); - - result = image_not_ready(0); - - /* TODO: Handle case where need to remove existing image and resave - * instead of adding to incremental image. 
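toi_prepare_image() above drives everything from a bounded retry loop: keep eating memory and updating allocations while the image is still not ready, a try budget remains, and no abort has been flagged. The control-flow skeleton, reduced to standalone C with stub predicates (names and numbers illustrative):

#include <stdbool.h>
#include <stdio.h>

#define MAX_TRIES 2

static bool aborted;
static int shortfall = 3;            /* pretend pages still to be freed */

static bool image_not_ready(void) { return shortfall > 0; }
static void try_to_fix(void)      { shortfall--; }

int main(void)
{
	int tries = 1;

	do {
		printf("Preparing image. Try %d.\n", tries);
		try_to_fix();
		if (aborted)
			break;
		tries++;
	} while (image_not_ready() && tries <= MAX_TRIES && !aborted);

	/* non-zero means the constraints were never met within budget */
	return image_not_ready() ? 1 : 0;
}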
*/ - - if (!test_result_state(TOI_ABORTED)) { - if (result) { - display_stats(1, 0); - display_failure_reason(tries > MAX_TRIES); - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - } else { - /* Pageset 2 needed? */ - if (!need_pageset2() && - test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { - no_ps2_needed = 1; - toi_recalculate_image_contents(0); - update_image(1); - } - - toi_cond_pause(1, "Image preparation complete."); - } - } - - return result ? result : allocate_checksum_pages(); -} diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h deleted file mode 100644 index c1508975c..000000000 --- a/kernel/power/tuxonice_prepare_image.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include <asm/sections.h> - -extern int toi_prepare_image(void); -extern void toi_recalculate_image_contents(int storage_available); -extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); -extern long image_size_limit; -extern void toi_free_extra_pagedir_memory(void); -extern unsigned long extra_pd1_pages_allowance; -extern void free_attention_list(void); - -#define MIN_FREE_RAM 100 -#define MIN_EXTRA_PAGES_ALLOWANCE 500 - -#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) -#ifdef CONFIG_HIGHMEM -#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ - (1 << ZONE_HIGHMEM))) -#else -#define real_nr_free_high_pages() (0) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) - -/* For eat_memory function */ -#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) -#endif - -unsigned long get_header_storage_needed(void); -unsigned long any_to_free(int use_image_size_limit); -int try_allocate_extra_memory(void); diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c deleted file mode 100644 index 5bc56d3a1..000000000 --- a/kernel/power/tuxonice_prune.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * kernel/power/tuxonice_prune.c - * - * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file implements a TuxOnIce module that seeks to prune the - * amount of data written to disk. It builds a table of hashes - * of the uncompressed data, and writes the pfn of the previous page - * with the same contents instead of repeating the data when a match - * is found. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> -#include <crypto/hash.h> - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -/* - * We never write a page bigger than PAGE_SIZE, so use a large number - * to indicate that data is a PFN. 
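PRUNE_DATA_IS_PFN, defined just below, is an in-band type tag: a real page payload can never exceed PAGE_SIZE bytes, so any larger "length" is impossible for data and can safely mean "this value is a PFN reference instead". The same idea in miniature:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DATA_IS_REF (PAGE_SIZE + 100)   /* impossible as a real length */

static void handle(unsigned long len, unsigned long payload)
{
	if (len == DATA_IS_REF)
		printf("reference to earlier page %lu\n", payload);
	else
		printf("literal data, %lu bytes\n", len);
}

int main(void)
{
	handle(4096, 0);           /* a full page of literal data */
	handle(DATA_IS_REF, 42);   /* "same contents as pfn 42" */
	return 0;
}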
- */
-#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100)
-
-static unsigned long toi_pruned_pages;
-
-static struct toi_module_ops toi_prune_ops;
-static struct toi_module_ops *next_driver;
-
-static char toi_prune_hash_algo_name[32] = "sha1";
-
-static DEFINE_MUTEX(stats_lock);
-
-struct cpu_context {
- struct shash_desc desc;
- char *digest;
-};
-
-#define OUT_BUF_SIZE (2 * PAGE_SIZE)
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-
-/*
- * toi_prune_crypto_prepare
- *
- * Prepare to do some work by allocating buffers and transforms.
- */
-static int toi_prune_crypto_prepare(void)
-{
- int cpu, ret, digestsize = 0;
-
- if (!*toi_prune_hash_algo_name) {
- printk(KERN_INFO "TuxOnIce: Pruning enabled but no "
- "hash algorithm set.\n");
- return 1;
- }
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0);
- if (IS_ERR(this->desc.tfm)) {
- printk(KERN_INFO "TuxOnIce: Failed to allocate the "
- "%s prune hash algorithm.\n",
- toi_prune_hash_algo_name);
- this->desc.tfm = NULL;
- return 1;
- }
-
- if (!digestsize)
- digestsize = crypto_shash_digestsize(this->desc.tfm);
-
- this->digest = kmalloc(digestsize, GFP_KERNEL);
- if (!this->digest) {
- printk(KERN_INFO "TuxOnIce: Failed to allocate space "
- "for digest output.\n");
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- return 1;
- }
-
- this->desc.flags = 0;
-
- ret = crypto_shash_init(&this->desc);
- if (ret < 0) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise the "
- "%s prune hash algorithm.\n",
- toi_prune_hash_algo_name);
- kfree(this->digest);
- this->digest = NULL;
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- return 1;
- }
- }
-
- return 0;
-}
-
-static int toi_prune_rw_cleanup(int writing)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- if (this->desc.tfm) {
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- }
-
- if (this->digest) {
- kfree(this->digest);
- this->digest = NULL;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_prune_init
- */
-
-static int toi_prune_init(int toi_or_resume)
-{
- if (!toi_or_resume)
- return 0;
-
- toi_pruned_pages = 0;
-
- next_driver = toi_get_next_filter(&toi_prune_ops);
-
- return next_driver ? 0 : -ECHILD;
-}
-
-/*
- * toi_prune_rw_init()
- */
-
-static int toi_prune_rw_init(int rw, int stream_number)
-{
- if (toi_prune_crypto_prepare()) {
- printk(KERN_ERR "Failed to initialise prune "
- "algorithm.\n");
- if (rw == READ) {
- printk(KERN_INFO "Unable to read the image.\n");
- return -ENODEV;
- } else {
- printk(KERN_INFO "Continuing without "
- "pruning the image.\n");
- toi_prune_ops.enabled = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_prune_write_page()
- *
- * Hash a page of data, update the pruning statistics, and pass the
- * page on to the next module in the pipeline.
- *
- * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing
- * data to be checked.
- *
- * Returns: 0 on success. Otherwise the error is that returned by later
- * modules, or -ECHILD if we have a broken pipeline.
- */
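For reference, a hedged sketch of the 4.x-era shash calling convention the code above depends on. Note one difference from the per-cpu struct above: a shash descriptor is normally allocated with crypto_shash_descsize() extra bytes of transform context after struct shash_desc, and the output buffer must hold crypto_shash_digestsize() bytes. This illustrates the API pattern, not the module's own code:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int hash_buffer(const char *algo, const void *data, unsigned int len,
		       u8 *out)
{
	struct crypto_shash *tfm = crypto_alloc_shash(algo, 0, 0);
	struct shash_desc *desc;
	int ret;

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* descriptor = header + per-transform context */
	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;
	desc->flags = 0;

	/* init + update + final in one call */
	ret = crypto_shash_digest(desc, data, len, out);

	kfree(desc);
	crypto_free_shash(tfm);
	return ret;
}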
-static int toi_prune_write_page(unsigned long index, int buf_type,
- void *buffer_page, unsigned int buf_size)
-{
- int ret = 0, cpu = smp_processor_id(), write_data = 1;
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
- u8* output_buffer = buffer_page;
- int output_len = buf_size;
- int out_buf_type = buf_type;
- void *buffer_start;
-
- if (ctx->desc.tfm) {
-
- buffer_start = TOI_MAP(buf_type, buffer_page);
-
- ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, ctx->digest);
- if (ret) {
- printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret);
- } else {
- mutex_lock(&stats_lock);
-
- toi_pruned_pages++;
-
- mutex_unlock(&stats_lock);
-
- }
-
- TOI_UNMAP(buf_type, buffer_page);
- }
-
- if (write_data)
- ret = next_driver->write_page(index, out_buf_type,
- output_buffer, output_len);
- else
- ret = next_driver->write_page(index, out_buf_type,
- output_buffer, output_len);
-
- return ret;
-}
-
-/*
- * toi_prune_read_page()
- * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
- *
- * Retrieve data from later modules or from a previously loaded page and
- * fill the input buffer.
- * Returns: Zero if successful. Error condition from me or from downstream
- * on failure.
- */
-static int toi_prune_read_page(unsigned long *index, int buf_type,
- void *buffer_page, unsigned int *buf_size)
-{
- int ret, cpu = smp_processor_id();
- unsigned int len;
- char *buffer_start;
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
- if (!ctx->desc.tfm)
- return next_driver->read_page(index, TOI_PAGE, buffer_page,
- buf_size);
-
- /*
- * All our reads must be synchronous - we can't handle
- * data that hasn't been read yet.
- */
-
- ret = next_driver->read_page(index, buf_type, buffer_page, &len);
-
- if (len == PRUNE_DATA_IS_PFN) {
- buffer_start = kmap(buffer_page);
- }
-
- return ret;
-}
-
-/*
- * toi_prune_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_prune_print_debug_stats(char *buffer, int size)
-{
- int len;
-
- /* Output the number of pages pruned. */
- if (*toi_prune_hash_algo_name)
- len = scnprintf(buffer, size, "- Prune hash algorithm is '%s'.\n",
- toi_prune_hash_algo_name);
- else
- len = scnprintf(buffer, size, "- Prune hash algorithm is not set.\n");
-
- if (toi_pruned_pages)
- len += scnprintf(buffer+len, size - len, " Pruned "
- "%lu pages.\n",
- toi_pruned_pages);
- return len;
-}
-
-/*
- * toi_prune_memory_needed
- *
- * Tell the caller how much memory we need to operate during hibernate/resume.
- * Returns: Maximum number of bytes of memory required for
- * operation.
- */
-static int toi_prune_memory_needed(void)
-{
- return 2 * PAGE_SIZE;
-}
-
-static int toi_prune_storage_needed(void)
-{
- return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
- strlen(toi_prune_hash_algo_name) + 1;
-}
-
-/*
- * toi_prune_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save information needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
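The pruning idea, per the file header: hash every page and, when a digest has been seen before, write a reference to the earlier page instead of repeating the data. A toy userspace version of the bookkeeping, using a linear table for clarity (a real implementation needs a proper hash table and must compare page contents on digest collisions):

#include <stdio.h>

#define MAX_PAGES 128

static unsigned long digests[MAX_PAGES];
static int n_seen;

/* Returns the index of an earlier page with the same digest, or -1. */
static int lookup_or_insert(unsigned long digest)
{
	int i;

	for (i = 0; i < n_seen; i++)
		if (digests[i] == digest)
			return i;       /* duplicate: emit a reference */
	digests[n_seen++] = digest;
	return -1;                      /* first sighting: emit the data */
}

int main(void)
{
	unsigned long pages[] = { 7, 9, 7, 7, 3 };
	int i, pruned = 0;

	for (i = 0; i < 5; i++)
		if (lookup_or_insert(pages[i]) >= 0)
			pruned++;
	printf("pruned %d of 5 pages\n", pruned);   /* pruned 2 of 5 */
	return 0;
}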
- */ -static int toi_prune_save_config_info(char *buffer) -{ - int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_pruned_pages; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_prune_hash_algo_name, len); - return offset + len; -} - -/* toi_prune_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for passing back to the - * resumed kernel. - */ -static void toi_prune_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_pruned_pages = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_prune_hash_algo_name, buffer + offset, len); -} - -static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->pruned_pages = toi_pruned_pages; -} - -static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_pruned_pages = bkd->pruned_pages; -} - -/* - * toi_expected_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 - we have no idea how many pages will be pruned. - */ - -static int toi_prune_expected_ratio(void) -{ - return 100; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_prune_ops = { - .type = FILTER_MODULE, - .name = "prune", - .directory = "prune", - .module = THIS_MODULE, - .initialise = toi_prune_init, - .memory_needed = toi_prune_memory_needed, - .print_debug_info = toi_prune_print_debug_stats, - .save_config_info = toi_prune_save_config_info, - .load_config_info = toi_prune_load_config_info, - .storage_needed = toi_prune_storage_needed, - .expected_compression = toi_prune_expected_ratio, - - .pre_atomic_restore = toi_prune_pre_atomic_restore, - .post_atomic_restore = toi_prune_post_atomic_restore, - - .rw_init = toi_prune_rw_init, - .rw_cleanup = toi_prune_rw_cleanup, - - .write_page = toi_prune_write_page, - .read_page = toi_prune_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_prune_load(void) -{ - return toi_register_module(&toi_prune_ops); -} - -late_initcall(toi_prune_load); diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c deleted file mode 100644 index d8539c275..000000000 --- a/kernel/power/tuxonice_storage.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * kernel/power/tuxonice_storage.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for talking to a userspace program that manages storage. 
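toi_register_module(), used by the late_initcall above and by every other TuxOnIce module in this diff, follows the usual kernel pattern: a statically initialised ops structure full of callbacks is handed to a registry, and the framework later walks the registry instead of knowing modules by name. A reduced userspace sketch of the pattern (all names illustrative):

#include <stdio.h>

struct module_ops {
	const char *name;
	int (*initialise)(int starting);
	int (*memory_needed)(void);
};

#define MAX_MODULES 8
static struct module_ops *registry[MAX_MODULES];
static int n_modules;

static int register_module(struct module_ops *ops)
{
	if (n_modules >= MAX_MODULES)
		return -1;
	registry[n_modules++] = ops;
	return 0;
}

static int demo_init(int starting) { return 0; }
static int demo_mem(void)          { return 2 * 4096; }

static struct module_ops demo_ops = {
	.name          = "demo",
	.initialise    = demo_init,
	.memory_needed = demo_mem,
};

int main(void)
{
	register_module(&demo_ops);
	printf("%s needs %d bytes\n", registry[0]->name,
	       registry[0]->memory_needed());
	return 0;
}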
- * - * The kernel side: - * - starts the userspace program; - * - sends messages telling it when to open and close the connection; - * - tells it when to quit; - * - * The user space side: - * - passes messages regarding status; - * - */ - -#include <linux/suspend.h> -#include <linux/freezer.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_netlink.h" -#include "tuxonice_storage.h" -#include "tuxonice_ui.h" - -static struct user_helper_data usm_helper_data; -static struct toi_module_ops usm_ops; -static int message_received, usm_prepare_count; -static int storage_manager_last_action, storage_manager_action; - -static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USM_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) - return -EBUSY; - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USM_MSG_SUCCESS: - case USM_MSG_FAILED: - message_received = type; - complete(&usm_helper_data.wait_for_process); - break; - default: - printk(KERN_INFO "Storage manager doesn't recognise " - "message %d.\n", type); - } - - return 1; -} - -#ifdef CONFIG_NET -static int activations; - -int toi_activate_storage(int force) -{ - int tries = 1; - - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations++; - - if (activations > 1 && !force) - return 0; - - while ((!message_received || message_received == USM_MSG_FAILED) && - tries < 2) { - toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " - "%d.\n", tries); - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_CONNECT, - NULL, 0); - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&usm_helper_data.wait_for_process, - 2*HZ); - - tries++; - } - - return 0; -} - -int toi_deactivate_storage(int force) -{ - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations--; - - if (activations && !force) - return 0; - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_DISCONNECT, - NULL, 0); - - wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); - - if (!message_received || message_received == USM_MSG_FAILED) { - printk(KERN_INFO "Returning failure disconnecting storage.\n"); - return 1; - } - - return 0; -} -#endif - -static void storage_manager_simulate(void) -{ - printk(KERN_INFO "--- Storage manager simulate ---\n"); - toi_prepare_usm(); - schedule(); - printk(KERN_INFO "--- Activate storage 1 ---\n"); - toi_activate_storage(1); - schedule(); - printk(KERN_INFO "--- Deactivate storage 1 ---\n"); - toi_deactivate_storage(1); - schedule(); - printk(KERN_INFO "--- Cleanup usm ---\n"); - toi_cleanup_usm(); - schedule(); - printk(KERN_INFO "--- Storage manager simulate ends ---\n"); -} - -static int usm_storage_needed(void) -{ - return sizeof(int) + strlen(usm_helper_data.program) + 1; -} - -static int usm_save_config_info(char *buf) -{ - int len = strlen(usm_helper_data.program); - memcpy(buf, usm_helper_data.program, len + 1); - 
return sizeof(int) + len + 1; -} - -static void usm_load_config_info(char *buf, int size) -{ - /* Don't load the saved path if one has already been set */ - if (usm_helper_data.program[0]) - return; - - memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); -} - -static int usm_memory_needed(void) -{ - /* ball park figure of 32 pages */ - return 32 * PAGE_SIZE; -} - -/* toi_prepare_usm - */ -int toi_prepare_usm(void) -{ - usm_prepare_count++; - - if (usm_prepare_count > 1 || !usm_ops.enabled) - return 0; - - usm_helper_data.pid = -1; - - if (!*usm_helper_data.program) - return 0; - - toi_netlink_setup(&usm_helper_data); - - if (usm_helper_data.pid == -1) - printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" - " start it.\n"); - - toi_activate_storage(0); - - return usm_helper_data.pid != -1; -} - -void toi_cleanup_usm(void) -{ - usm_prepare_count--; - - if (usm_helper_data.pid > -1 && !usm_prepare_count) { - toi_deactivate_storage(0); - toi_netlink_close(&usm_helper_data); - } -} - -static void storage_manager_activate(void) -{ - if (storage_manager_action == storage_manager_last_action) - return; - - if (storage_manager_action) - toi_prepare_usm(); - else - toi_cleanup_usm(); - - storage_manager_last_action = storage_manager_action; -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), - SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, - NULL), - SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, - 0, storage_manager_activate) -}; - -static struct toi_module_ops usm_ops = { - .type = MISC_MODULE, - .name = "usm", - .directory = "storage_manager", - .module = THIS_MODULE, - .storage_needed = usm_storage_needed, - .save_config_info = usm_save_config_info, - .load_config_info = usm_load_config_info, - .memory_needed = usm_memory_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* toi_usm_sysfs_init - * Description: Boot time initialisation for user interface. - */ -int toi_usm_init(void) -{ - usm_helper_data.nl = NULL; - usm_helper_data.program[0] = '\0'; - usm_helper_data.pid = -1; - usm_helper_data.skb_size = 0; - usm_helper_data.pool_limit = 6; - usm_helper_data.netlink_id = NETLINK_TOI_USM; - usm_helper_data.name = "userspace storage manager"; - usm_helper_data.rcv_msg = usm_user_rcv_msg; - usm_helper_data.interface_version = 2; - usm_helper_data.must_init = 0; - init_completion(&usm_helper_data.wait_for_process); - - return toi_register_module(&usm_ops); -} - -void toi_usm_exit(void) -{ - toi_netlink_close_complete(&usm_helper_data); - toi_unregister_module(&usm_ops); -} diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h deleted file mode 100644 index 0189c8888..000000000 --- a/kernel/power/tuxonice_storage.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_storage.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
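usm_save_config_info()/usm_load_config_info() above exchange a length-prefixed string record: an int byte count followed by the NUL-terminated helper path. (Note the saver above reserves room for the int but never actually stores it, while the loader reads it back.) A self-consistent encoder/decoder pair for that record layout, in plain C with an illustrative path:

#include <stdio.h>
#include <string.h>

static int save_string(char *buf, const char *str)
{
	int len = strlen(str) + 1;              /* include the NUL */

	memcpy(buf, &len, sizeof(int));         /* length prefix */
	memcpy(buf + sizeof(int), str, len);    /* payload */
	return sizeof(int) + len;
}

static void load_string(const char *buf, char *out, int out_size)
{
	int len;

	memcpy(&len, buf, sizeof(int));
	if (len > out_size)
		len = out_size;                 /* never overrun the target */
	memcpy(out, buf + sizeof(int), len);
	out[out_size - 1] = '\0';               /* force termination */
}

int main(void)
{
	char buf[256], out[64];

	save_string(buf, "/usr/sbin/toi-storage-manager");
	load_string(buf, out, sizeof(out));
	printf("%s\n", out);
	return 0;
}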
- */ - -#ifdef CONFIG_NET -int toi_prepare_usm(void); -void toi_cleanup_usm(void); - -int toi_activate_storage(int force); -int toi_deactivate_storage(int force); -extern int toi_usm_init(void); -extern void toi_usm_exit(void); -#else -static inline int toi_usm_init(void) { return 0; } -static inline void toi_usm_exit(void) { } - -static inline int toi_activate_storage(int force) -{ - return 0; -} - -static inline int toi_deactivate_storage(int force) -{ - return 0; -} - -static inline int toi_prepare_usm(void) { return 0; } -static inline void toi_cleanup_usm(void) { } -#endif - -enum { - USM_MSG_BASE = 0x10, - - /* Kernel -> Userspace */ - USM_MSG_CONNECT = 0x30, - USM_MSG_DISCONNECT = 0x31, - USM_MSG_SUCCESS = 0x40, - USM_MSG_FAILED = 0x41, - - USM_MSG_MAX, -}; diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c deleted file mode 100644 index 9f555c932..000000000 --- a/kernel/power/tuxonice_swap.c +++ /dev/null @@ -1,474 +0,0 @@ -/* - * kernel/power/tuxonice_swap.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of swap space as a - * backing store. - */ - -#include <linux/suspend.h> -#include <linux/blkdev.h> -#include <linux/swapops.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_extent.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct toi_module_ops toi_swapops; - -/* For swapfile automatically swapon/off'd. */ -static char swapfilename[255] = ""; -static int toi_swapon_status; - -/* Swap Pages */ -static unsigned long swap_allocated; - -static struct sysinfo swapinfo; - -static int is_ram_backed(struct swap_info_struct *si) -{ - if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || - !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) - return 1; - - return 0; -} - -/** - * enable_swapfile: Swapon the user specified swapfile prior to hibernating. - * - * Activate the given swapfile if it wasn't already enabled. Remember whether - * we really did swapon it for swapoffing later. - */ -static void enable_swapfile(void) -{ - int activateswapresult = -EINVAL; - - if (swapfilename[0]) { - /* Attempt to swap on with maximum priority */ - activateswapresult = sys_swapon(swapfilename, 0xFFFF); - if (activateswapresult && activateswapresult != -EBUSY) - printk(KERN_ERR "TuxOnIce: The swapfile/partition " - "specified by /sys/power/tuxonice/swap/swapfile" - " (%s) could not be turned on (error %d). " - "Attempting to continue.\n", - swapfilename, activateswapresult); - if (!activateswapresult) - toi_swapon_status = 1; - } -} - -/** - * disable_swapfile: Swapoff any file swaponed at the start of the cycle. - * - * If we did successfully swapon a file at the start of the cycle, swapoff - * it now (finishing up). 
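tuxonice_storage.h above shows the standard config-stub idiom: when CONFIG_NET is disabled, every entry point collapses to a static inline no-op with the same signature, so call sites need no #ifdef of their own. Reduced to a minimal example:

#include <stdio.h>

#define HAVE_FEATURE 0   /* with 1, a real definition would be linked in */

#if HAVE_FEATURE
int feature_activate(int force);
#else
static inline int feature_activate(int force) { return 0; }
#endif

int main(void)
{
	/* the call site is identical whether the feature exists or not */
	return feature_activate(1);
}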
- */ -static void disable_swapfile(void) -{ - if (!toi_swapon_status) - return; - - sys_swapoff(swapfilename); - toi_swapon_status = 0; -} - -static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, - unsigned long start, unsigned long end) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " - "chain %p.", start << chain->bmap_shift, - end << chain->bmap_shift, chain); - - return toi_add_to_extent_chain(&chain->blocks, start, end); -} - - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long address, extent_min = 0, extent_max = 0; - int empty = 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " - "chain %d.", chain->allocator_index); - - if (!chain->allocations.first) - return 0; - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - toi_extent_for_each(&chain->allocations, extentpointer, address) { - swp_entry_t swap_address = (swp_entry_t) { address }; - struct block_device *bdev; - sector_t new_sector = map_swap_entry(swap_address, &bdev); - - if (empty) { - empty = 0; - extent_min = extent_max = new_sector; - continue; - } - - if (new_sector == extent_max + 1) { - extent_max++; - continue; - } - - if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block " - "chains.\n"); - return -ENOMEM; - } - - extent_min = new_sector; - extent_max = new_sector; - } - - if (!empty && - add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block chains.\n"); - return -ENOMEM; - } - - return 0; -} - -/* - * Like si_swapinfo, except that we don't include ram backed swap (compcache!) - * and don't need to use the spinlocks (userspace is stopped when this - * function is called). - */ -void si_swapinfo_no_compcache(void) -{ - unsigned int i; - - si_swapinfo(&swapinfo); - swapinfo.freeswap = 0; - swapinfo.totalswap = 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { - swapinfo.totalswap += si->inuse_pages; - swapinfo.freeswap += si->pages - si->inuse_pages; - } - } -} -/* - * We can't just remember the value from allocation time, because other - * processes might have allocated swap in the mean time. 
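get_main_pool_phys_params() above coalesces the per-page sector numbers returned by map_swap_entry() into [min,max] extents, starting a new extent whenever the next sector is not exactly previous+1. The run-merging loop, extracted into a standalone program:

#include <stdio.h>

int main(void)
{
	unsigned long sectors[] = { 8, 9, 10, 20, 21, 40 };
	int i, n = sizeof(sectors) / sizeof(sectors[0]);
	unsigned long extent_min = 0, extent_max = 0;
	int empty = 1;

	for (i = 0; i < n; i++) {
		unsigned long s = sectors[i];

		if (empty) {
			empty = 0;
			extent_min = extent_max = s;
			continue;
		}
		if (s == extent_max + 1) {      /* extends the current run */
			extent_max++;
			continue;
		}
		/* run broken: flush it and start a new one */
		printf("extent %lu-%lu\n", extent_min, extent_max);
		extent_min = extent_max = s;
	}
	if (!empty)                             /* flush the final run */
		printf("extent %lu-%lu\n", extent_min, extent_max);

	/* output: 8-10, 20-21, 40-40 */
	return 0;
}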
- */ -static unsigned long toi_swap_storage_available(void) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); - si_swapinfo_no_compcache(); - return swapinfo.freeswap + swap_allocated; -} - -static int toi_swap_initialise(int starting_cycle) -{ - if (!starting_cycle) - return 0; - - enable_swapfile(); - return 0; -} - -static void toi_swap_cleanup(int ending_cycle) -{ - if (!ending_cycle) - return; - - disable_swapfile(); -} - -static void toi_swap_free_storage(struct toi_bdev_info *chain) -{ - /* Free swap entries */ - struct hibernate_extent *extentpointer; - unsigned long extentvalue; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", - chain); - - swap_allocated -= chain->allocations.size; - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) - swap_free((swp_entry_t) { extentvalue }); - - toi_put_extent_chain(&chain->allocations); -} - -static void free_swap_range(unsigned long min, unsigned long max) -{ - int j; - - for (j = min; j <= max; j++) - swap_free((swp_entry_t) { j }); - swap_allocated -= (max - min + 1); -} - -/* - * Allocation of a single swap type. Swap priorities are handled at the higher - * level. - */ -static int toi_swap_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long gotten = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" - " allocate %lu pages from device %d.", request, - chain->allocator_index); - - while (gotten < request) { - swp_entry_t start, end; - if (0) { - /* Broken at the moment for SSDs */ - get_swap_range_of_type(chain->allocator_index, &start, &end, - request - gotten + 1); - } else { - start = end = get_swap_page_of_type(chain->allocator_index); - } - if (start.val) { - int added = end.val - start.val + 1; - if (toi_add_to_extent_chain(&chain->allocations, - start.val, end.val)) { - printk(KERN_INFO "Failed to allocate extent for " - "%lu-%lu.\n", start.val, end.val); - free_swap_range(start.val, end.val); - break; - } - gotten += added; - swap_allocated += added; - } else - break; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); - return gotten; -} - -static int toi_swap_register_storage(void) -{ - int i, result = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - struct toi_bdev_info *devinfo; - unsigned char *p; - unsigned char buf[256]; - struct fs_info *fs_info; - - if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) - continue; - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), - GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate devinfo struct for swap " - "device %d.\n", i); - return -ENOMEM; - } - - devinfo->bdev = si->bdev; - devinfo->allocator = &toi_swapops; - devinfo->allocator_index = i; - - fs_info = fs_info_from_block_dev(si->bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!fs_info) - printk("fs_info from block dev returned %d.\n", result); - devinfo->dev_t = si->bdev->bd_dev; - devinfo->prio = si->prio; - devinfo->bmap_shift = 3; - devinfo->blocks_per_page = 1; - - p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); - sprintf(devinfo->name, "swap on %s", p); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" - " Device %d (%lx), prio %d.", i, - (unsigned long) devinfo->dev_t, devinfo->prio); 
- toi_bio_ops.register_storage(devinfo); - } - - return 0; -} - -static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long extentvalue; - unsigned long i = 0, first_freed = 0; - - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) { - i++; - if (i > used) { - swap_free((swp_entry_t) { extentvalue }); - if (!first_freed) - first_freed = extentvalue; - } - } - - return first_freed; -} - -/* - * workspace_size - * - * Description: - * Returns the number of bytes of RAM needed for this - * code to do its work. (Used when calculating whether - * we have enough memory to be able to hibernate & resume). - * - */ -static int toi_swap_memory_needed(void) -{ - return 1; -} - -/* - * Print debug info - * - * Description: - */ -static int toi_swap_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); - if (swapfilename[0]) - len += scnprintf(buffer+len, size-len, - " Attempting to automatically swapon: %s.\n", - swapfilename); - - si_swapinfo_no_compcache(); - - len += scnprintf(buffer+len, size-len, - " Swap available for image: %lu pages.\n", - swapinfo.freeswap + swap_allocated); - - return len; -} - -static int header_locations_read_sysfs(const char *page, int count) -{ - int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; - struct inode *swapf = NULL; - int zone; - char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); - char *path, *output = (char *) page; - int path_len; - - if (!page) - return 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - - if (!si || !(si->flags & SWP_WRITEOK)) - continue; - - if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { - haveswap = 1; - if (!printedpartitionsmessage) { - len += sprintf(output + len, - "For swap partitions, simply use the " - "format: resume=swap:/dev/hda1.\n"); - printedpartitionsmessage = 1; - } - } else { - path_len = 0; - - path = d_path(&si->swap_file->f_path, path_page, - PAGE_SIZE); - path_len = snprintf(path_page, PAGE_SIZE, "%s", path); - - haveswap = 1; - swapf = si->swap_file->f_mapping->host; - zone = bmap(swapf, 0); - if (!zone) { - len += sprintf(output + len, - "Swapfile %s has been corrupted. 
Reuse" - " mkswap on it and try again.\n", - path_page); - } else { - char name_buffer[BDEVNAME_SIZE]; - len += sprintf(output + len, - "For swapfile `%s`," - " use resume=swap:/dev/%s:0x%x.\n", - path_page, - bdevname(si->bdev, name_buffer), - zone << (swapf->i_blkbits - 9)); - } - } - } - - if (!haveswap) - len = sprintf(output, "You need to turn on swap partitions " - "before examining this file.\n"); - - toi_free_page(10, (unsigned long) path_page); - return len; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), - SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, - header_locations_read_sysfs, NULL, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, - attempt_to_parse_resume_device2), -}; - -static struct toi_bio_allocator_ops toi_bio_swapops = { - .register_storage = toi_swap_register_storage, - .storage_available = toi_swap_storage_available, - .allocate_storage = toi_swap_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_swap_free_storage, - .free_unused_storage = toi_swap_free_unused_storage, -}; - -static struct toi_module_ops toi_swapops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "swap storage", - .directory = "swap", - .module = THIS_MODULE, - .memory_needed = toi_swap_memory_needed, - .print_debug_info = toi_swap_print_debug_stats, - .initialise = toi_swap_initialise, - .cleanup = toi_swap_cleanup, - .bio_allocator_ops = &toi_bio_swapops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_swap_load(void) -{ - return toi_register_module(&toi_swapops); -} - -late_initcall(toi_swap_load); diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c deleted file mode 100644 index 77f36dbeb..000000000 --- a/kernel/power/tuxonice_sysfs.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.c - * - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains support for sysfs entries for tuning TuxOnIce. - * - * We have a generic handler that deals with the most common cases, and - * hooks for special handlers to use. - */ - -#include <linux/suspend.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_alloc.h" - -static int toi_sysfs_initialised; - -static void toi_initialise_sysfs(void); - -static struct toi_sysfs_data sysfs_params[]; - -#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) - -static void toi_main_wrapper(void) -{ - toi_try_hibernate(); -} - -static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - int len = 0; - int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; - - if (full_prep && toi_start_anything(0)) - return -EBUSY; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - len = (sysfs_data->data.special.read_sysfs) ? 
- (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) - : 0; - break; - case TOI_SYSFS_DATA_BIT: - len = sprintf(page, "%d\n", - -test_bit(sysfs_data->data.bit.bit, - sysfs_data->data.bit.bit_vector)); - break; - case TOI_SYSFS_DATA_INTEGER: - len = sprintf(page, "%d\n", - *(sysfs_data->data.integer.variable)); - break; - case TOI_SYSFS_DATA_LONG: - len = sprintf(page, "%ld\n", - *(sysfs_data->data.a_long.variable)); - break; - case TOI_SYSFS_DATA_UL: - len = sprintf(page, "%lu\n", - *(sysfs_data->data.ul.variable)); - break; - case TOI_SYSFS_DATA_STRING: - len = sprintf(page, "%s\n", - sysfs_data->data.string.variable); - break; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_cleanup_usm(); - - if (full_prep) - toi_finish_anything(0); - - return len; -} - -#define BOUND(_variable, _type) do { \ - if (*_variable < sysfs_data->data._type.minimum) \ - *_variable = sysfs_data->data._type.minimum; \ - else if (*_variable > sysfs_data->data._type.maximum) \ - *_variable = sysfs_data->data._type.maximum; \ -} while (0) - -static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, - const char *my_buf, size_t count) -{ - int assigned_temp_buffer = 0, result = count; - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - - if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) - return -EBUSY; - - ((char *) my_buf)[count] = 0; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - if (sysfs_data->data.special.write_sysfs) - result = (sysfs_data->data.special.write_sysfs)(my_buf, - count); - break; - case TOI_SYSFS_DATA_BIT: - { - unsigned long value; - result = kstrtoul(my_buf, 0, &value); - if (result) - break; - if (value) - set_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - else - clear_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - } - break; - case TOI_SYSFS_DATA_INTEGER: - { - long temp; - result = kstrtol(my_buf, 0, &temp); - if (result) - break; - *(sysfs_data->data.integer.variable) = (int) temp; - BOUND(sysfs_data->data.integer.variable, integer); - break; - } - case TOI_SYSFS_DATA_LONG: - { - long *variable = - sysfs_data->data.a_long.variable; - result = kstrtol(my_buf, 0, variable); - if (result) - break; - BOUND(variable, a_long); - break; - } - case TOI_SYSFS_DATA_UL: - { - unsigned long *variable = - sysfs_data->data.ul.variable; - result = kstrtoul(my_buf, 0, variable); - if (result) - break; - BOUND(variable, ul); - break; - } - break; - case TOI_SYSFS_DATA_STRING: - { - int copy_len = count; - char *variable = - sysfs_data->data.string.variable; - - if (sysfs_data->data.string.max_length && - (copy_len > sysfs_data->data.string.max_length)) - copy_len = sysfs_data->data.string.max_length; - - if (!variable) { - variable = (char *) toi_get_zeroed_page(31, - TOI_ATOMIC_GFP); - sysfs_data->data.string.variable = variable; - assigned_temp_buffer = 1; - } - strncpy(variable, my_buf, copy_len); - if (copy_len && my_buf[copy_len - 1] == '\n') - variable[count - 1] = 0; - variable[count] = 0; - } - break; - } - - if (!result) - result = count; - - /* Side effect routine? 
*/ - if (result == count && sysfs_data->write_side_effect) - sysfs_data->write_side_effect(); - - /* Free temporary buffers */ - if (assigned_temp_buffer) { - toi_free_page(31, - (unsigned long) sysfs_data->data.string.variable); - sysfs_data->data.string.variable = NULL; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_cleanup_usm(); - - toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); - - return result; -} - -static struct sysfs_ops toi_sysfs_ops = { - .show = &toi_attr_show, - .store = &toi_attr_store, -}; - -static struct kobj_type toi_ktype = { - .sysfs_ops = &toi_sysfs_ops, -}; - -struct kobject *tuxonice_kobj; - -/* Non-module sysfs entries. - * - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_HIBERNATING, toi_main_wrapper), - SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_RESUMING, toi_try_resume) -}; - -void remove_toi_sysdir(struct kobject *kobj) -{ - if (!kobj) - return; - - kobject_put(kobj); -} - -struct kobject *make_toi_sysdir(char *name) -{ - struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); - - if (!kobj) { - printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " - "dir!\n"); - return NULL; - } - - kobj->ktype = &toi_ktype; - - return kobj; -} - -/* toi_register_sysfs_file - * - * Helper for registering a new /sysfs/tuxonice entry. - */ - -int toi_register_sysfs_file( - struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - int result; - - if (!toi_sysfs_initialised) - toi_initialise_sysfs(); - - result = sysfs_create_file(kobj, &toi_sysfs_data->attr); - if (result) - printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " - "returned %d.\n", - toi_sysfs_data->attr.name, result); - kobj->ktype = &toi_ktype; - - return result; -} - -/* toi_unregister_sysfs_file - * - * Helper for removing unwanted /sys/power/tuxonice entries. - * - */ -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - sysfs_remove_file(kobj, &toi_sysfs_data->attr); -} - -void toi_cleanup_sysfs(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (!toi_sysfs_initialised) - return; - - for (i = 0; i < numfiles; i++) - toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - kobject_put(tuxonice_kobj); - toi_sysfs_initialised = 0; -} - -/* toi_initialise_sysfs - * - * Initialise the /sysfs/tuxonice directory. 
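For readers following the kobject side of this: creating the directory and populating it uses the stock kobject/sysfs pattern. A minimal hypothetical module in the same shape (modelled on samples/kobject/ in the kernel tree; kernel_kobj stands in for the power_kobj parent used here):

#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct kobject *demo_kobj;

static ssize_t enabled_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "1\n");
}

static struct kobj_attribute enabled_attr = __ATTR_RO(enabled);

static int __init demo_init(void)
{
        /* child directory, as kobject_create_and_add("tuxonice", power_kobj)
         * does in the function below */
        demo_kobj = kobject_create_and_add("demo", kernel_kobj);
        if (!demo_kobj)
                return -ENOMEM;
        return sysfs_create_file(demo_kobj, &enabled_attr.attr);
}

static void __exit demo_exit(void)
{
        kobject_put(demo_kobj);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");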
- */ - -static void toi_initialise_sysfs(void) -{ - int i; - int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (toi_sysfs_initialised) - return; - - /* Make our TuxOnIce directory a child of /sys/power */ - tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); - if (!tuxonice_kobj) - return; - - toi_sysfs_initialised = 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); -} - -int toi_sysfs_init(void) -{ - toi_initialise_sysfs(); - return 0; -} - -void toi_sysfs_exit(void) -{ - toi_cleanup_sysfs(); -} diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h deleted file mode 100644 index 1de954ce1..000000000 --- a/kernel/power/tuxonice_sysfs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#include <linux/sysfs.h> - -struct toi_sysfs_data { - struct attribute attr; - int type; - int flags; - union { - struct { - unsigned long *bit_vector; - int bit; - } bit; - struct { - int *variable; - int minimum; - int maximum; - } integer; - struct { - long *variable; - long minimum; - long maximum; - } a_long; - struct { - unsigned long *variable; - unsigned long minimum; - unsigned long maximum; - } ul; - struct { - char *variable; - int max_length; - } string; - struct { - int (*read_sysfs) (const char *buffer, int count); - int (*write_sysfs) (const char *buffer, int count); - void *data; - } special; - } data; - - /* Side effects routine. Used, eg, for reparsing the - * resume= entry when it changes */ - void (*write_side_effect) (void); - struct list_head sysfs_data_list; -}; - -enum { - TOI_SYSFS_DATA_NONE = 1, - TOI_SYSFS_DATA_CUSTOM, - TOI_SYSFS_DATA_BIT, - TOI_SYSFS_DATA_INTEGER, - TOI_SYSFS_DATA_UL, - TOI_SYSFS_DATA_LONG, - TOI_SYSFS_DATA_STRING -}; - -#define SYSFS_WRITEONLY 0200 -#define SYSFS_READONLY 0444 -#define SYSFS_RW 0644 - -#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_BIT, \ - .flags = _flags, \ - .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } - -#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_INTEGER, \ - .flags = _flags, \ - .data = { .integer = { .variable = _int, .minimum = _min, \ - .maximum = _max } }, \ - .write_side_effect = _wse } - -#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_UL, \ - .flags = _flags, \ - .data = { .ul = { .variable = _ul, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_LONG, \ - .flags = _flags, \ - .data = { .a_long = { .variable = _long, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_STRING, \ - .flags = _flags, \ - .data = { .string = { .variable = _string, .max_length = _max_len } }, \ - .write_side_effect = _wse } - -#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_CUSTOM, \ - .flags = _flags, \ - .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \ - 
.write_side_effect = _wse } - -#define SYSFS_NONE(_name, _wse) { \ - .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ - .type = TOI_SYSFS_DATA_NONE, \ - .write_side_effect = _wse, \ -} - -/* Flags */ -#define SYSFS_NEEDS_SM_FOR_READ 1 -#define SYSFS_NEEDS_SM_FOR_WRITE 2 -#define SYSFS_HIBERNATE 4 -#define SYSFS_RESUME 8 -#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) -#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_NEEDS_SM_FOR_BOTH \ - (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) - -int toi_register_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); - -extern struct kobject *tuxonice_kobj; - -struct kobject *make_toi_sysdir(char *name); -void remove_toi_sysdir(struct kobject *obj); -extern void toi_cleanup_sysfs(void); - -extern int toi_sysfs_init(void); -extern void toi_sysfs_exit(void); diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c deleted file mode 100644 index 76152f3ff..000000000 --- a/kernel/power/tuxonice_ui.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * kernel/power/tuxonice_ui.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include <linux/reboot.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" -#include "tuxonice_builtin.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ -struct ui_ops *toi_current_ui; - -/** - * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress, either from userui or /dev/console if userui isn't - * available. The non-userui path is particularly for at boot-time, prior - * to userui being started, when we have an important warning to give to - * the user. - */ -static char toi_wait_for_keypress(int timeout) -{ - if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) - return ' '; - - return toi_wait_for_keypress_dev_console(timeout); -} - -/* toi_early_boot_message() - * Description: Handle errors early in the process of booting. - * The user may press C to continue booting, perhaps - * invalidating the image, or space to reboot. - * This works from either the serial console or normally - * attached keyboard. - * - * Note that we come in here from init, while the kernel is - * locked. If we want to get events from the serial console, - * we need to temporarily unlock the kernel. - * - * toi_early_boot_message may also be called post-boot. - * In this case, it simply printks the message and returns. - * - * Arguments: int Whether we are able to erase the image. - * int default_answer. 
What to do when we timeout. This - * will normally be continue, but the user might - * provide command line options (__setup) to override - * particular cases. - * Char *. Pointer to a string explaining why we're moaning. - */ - -#define say(message, a...) printk(KERN_EMERG message, ##a) - -void toi_early_boot_message(int message_detail, int default_answer, - char *warning_reason, ...) -{ -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - unsigned long orig_state = get_toi_state(), continue_req = 0; - unsigned long orig_loglevel = console_loglevel; - int can_ask = 1; -#else - int can_ask = 0; -#endif - - va_list args; - int printed_len; - - if (!toi_wait) { - set_toi_state(TOI_CONTINUE_REQ); - can_ask = 0; - } - - if (warning_reason) { - va_start(args, warning_reason); - printed_len = vsnprintf(local_printf_buf, - sizeof(local_printf_buf), - warning_reason, - args); - va_end(args); - } - - if (!test_toi_state(TOI_BOOT_TIME)) { - printk("TuxOnIce: %s\n", local_printf_buf); - return; - } - - if (!can_ask) { - continue_req = !!default_answer; - goto post_ask; - } - -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - console_loglevel = 7; - - say("=== TuxOnIce ===\n\n"); - if (warning_reason) { - say("BIG FAT WARNING!! %s\n\n", local_printf_buf); - switch (message_detail) { - case 0: - say("If you continue booting, note that any image WILL" - "NOT BE REMOVED.\nTuxOnIce is unable to do so " - "because the appropriate modules aren't\n" - "loaded. You should manually remove the image " - "to avoid any\npossibility of corrupting your " - "filesystem(s) later.\n"); - break; - case 1: - say("If you want to use the current TuxOnIce image, " - "reboot and try\nagain with the same kernel " - "that you hibernated from. If you want\n" - "to forget that image, continue and the image " - "will be erased.\n"); - break; - } - say("Press SPACE to reboot or C to continue booting with " - "this kernel\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", - toi_wait, - default_answer == TOI_CONTINUE_REQ ? - "continue booting" : "reboot"); - } else { - say("BIG FAT WARNING!!\n\n" - "You have tried to resume from this image before.\n" - "If it failed once, it may well fail again.\n" - "Would you like to remove the image and boot " - "normally?\nThis will be equivalent to entering " - "noresume on the\nkernel command line.\n\n" - "Press SPACE to remove the image or C to continue " - "resuming.\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", toi_wait, - !!default_answer ? - "continue resuming" : "remove the image"); - } - console_loglevel = orig_loglevel; - - set_toi_state(TOI_SANITY_CHECK_PROMPT); - clear_toi_state(TOI_CONTINUE_REQ); - - if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ - continue_req = !!default_answer; - else - continue_req = test_toi_state(TOI_CONTINUE_REQ); - -#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ - -post_ask: - if ((warning_reason) && (!continue_req)) - kernel_restart(NULL); - - restore_toi_state(orig_state); - if (continue_req) - set_toi_state(TOI_CONTINUE_REQ); -} - -#undef say - -/* - * User interface specific /sys/power/tuxonice entries. 
- */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_INT("default_console_level", SYSFS_RW, - &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), - SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, - 1 << 30, 0), - SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, - 0) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "printk ui", - .directory = "user_interface", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_register_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui) { - printk(KERN_INFO "Only one TuxOnIce user interface module can " - "be loaded at a time."); - return -EBUSY; - } - - toi_current_ui = this_ui; - - return 0; -} - -void toi_remove_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui != this_ui) - return; - - toi_current_ui = NULL; -} - -/* toi_console_sysfs_init - * Description: Boot time initialisation for user interface. - */ - -int toi_ui_init(void) -{ - return toi_register_module(&userui_ops); -} - -void toi_ui_exit(void) -{ - toi_unregister_module(&userui_ops); -} diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h deleted file mode 100644 index 4934e3a91..000000000 --- a/kernel/power/tuxonice_ui.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * kernel/power/tuxonice_ui.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - */ - -enum { - DONT_CLEAR_BAR, - CLEAR_BAR -}; - -enum { - /* Userspace -> Kernel */ - USERUI_MSG_ABORT = 0x11, - USERUI_MSG_SET_STATE = 0x12, - USERUI_MSG_GET_STATE = 0x13, - USERUI_MSG_GET_DEBUG_STATE = 0x14, - USERUI_MSG_SET_DEBUG_STATE = 0x15, - USERUI_MSG_SPACE = 0x18, - USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, - USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, - USERUI_MSG_GET_LOGLEVEL = 0x1C, - USERUI_MSG_SET_LOGLEVEL = 0x1D, - USERUI_MSG_PRINTK = 0x1E, - - /* Kernel -> Userspace */ - USERUI_MSG_MESSAGE = 0x21, - USERUI_MSG_PROGRESS = 0x22, - USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, - - USERUI_MSG_MAX, -}; - -struct userui_msg_params { - u32 a, b, c, d; - char text[255]; -}; - -struct ui_ops { - char (*wait_for_key) (int timeout); - u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); - void (*prepare_status) (int clearbar, const char *fmt, ...); - void (*cond_pause) (int pause, char *message); - void (*abort)(int result_code, const char *fmt, ...); - void (*prepare)(void); - void (*cleanup)(void); - void (*message)(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...); -}; - -extern struct ui_ops *toi_current_ui; - -#define toi_update_status(val, max, fmt, args...) \ - (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \ - max) - -#define toi_prepare_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->prepare)(); \ - } while (0) - -#define toi_cleanup_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->cleanup)(); \ - } while (0) - -#define abort_hibernate(result, fmt, args...) \ - do { if (toi_current_ui) \ - (toi_current_ui->abort)(result, fmt, ##args); \ - else { \ - set_abort_result(result); \ - } \ - } while (0) - -#define toi_cond_pause(pause, message) \ - do { if (toi_current_ui) \ - (toi_current_ui->cond_pause)(pause, message); \ - } while (0) - -#define toi_prepare_status(clear, fmt, args...) 
\ - do { if (toi_current_ui) \ - (toi_current_ui->prepare_status)(clear, fmt, ##args); \ - else \ - printk(KERN_INFO fmt "%s", ##args, "\n"); \ - } while (0) - -#define toi_message(sn, lev, log, fmt, a...) \ -do { \ - if (toi_current_ui && (!sn || test_debug_state(sn))) \ - toi_current_ui->message(sn, lev, log, fmt, ##a); \ -} while (0) - -__exit void toi_ui_cleanup(void); -extern int toi_ui_init(void); -extern void toi_ui_exit(void); -extern int toi_register_ui_ops(struct ui_ops *this_ui); -extern void toi_remove_ui_ops(struct ui_ops *this_ui); diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c deleted file mode 100644 index 6aa5ac3eb..000000000 --- a/kernel/power/tuxonice_userui.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * kernel/power/user_ui.c - * - * Copyright (C) 2005-2007 Bernard Blackham - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include <linux/suspend.h> -#include <linux/freezer.h> -#include <linux/console.h> -#include <linux/ctype.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> -#include <linux/reboot.h> -#include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/vt.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ - -static struct user_helper_data ui_helper_data; -static struct toi_module_ops userui_ops; -static int orig_kmsg; - -static char lastheader[512]; -static int lastheader_message_len; -static int ui_helper_changed; /* Used at resume-time so don't overwrite value - set from initrd/ramfs. */ - -/* Number of distinct progress amounts that userspace can display */ -static int progress_granularity = 30; - -static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); -static int userui_wait_should_wake; - -#define toi_stop_waiting_for_userui_key() \ -{ \ - userui_wait_should_wake = true; \ - wake_up_interruptible(&userui_wait_for_key); \ -} - -/** - * ui_nl_set_state - Update toi_action based on a message from userui. - * - * @n: The bit (1 << bit) to set. - */ -static void ui_nl_set_state(int n) -{ - /* Only let them change certain settings */ - static const u32 toi_action_mask = - (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | - (1 << TOI_LOGALL) | - (1 << TOI_SINGLESTEP) | - (1 << TOI_PAUSE_NEAR_PAGESET_END); - static unsigned long new_action; - - new_action = (toi_bkd.toi_action & (~toi_action_mask)) | - (n & toi_action_mask); - - printk(KERN_DEBUG "n is %x. Action flags being changed from %lx " - "to %lx.", n, toi_bkd.toi_action, new_action); - toi_bkd.toi_action = new_action; - - if (!test_action_state(TOI_PAUSE) && - !test_action_state(TOI_SINGLESTEP)) - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_post_atomic_restore - Tell userui that atomic restore just happened. - * - * Tell userui that atomic restore just occured, so that it can do things like - * redrawing the screen, re-getting settings and so on. 
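Stepping back to ui_nl_set_state() above: the allow-list update is a masked merge, so userspace can only flip the bits named in toi_action_mask while every other flag keeps its kernel-side value. The arithmetic as a runnable sketch with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned long current_flags = 0xf0f0;   /* kernel-side state */
        unsigned long mask          = 0x00ff;   /* user-changeable bits */
        unsigned long request       = 0xabcd;   /* what userspace sent */
        unsigned long merged = (current_flags & ~mask) | (request & mask);

        printf("%#lx\n", merged);               /* prints 0xf0cd */
        return 0;
}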
- */ -static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); -} - -/** - * userui_storage_needed - Report how much memory in image header is needed. - */ -static int userui_storage_needed(void) -{ - return sizeof(ui_helper_data.program) + 1 + sizeof(int); -} - -/** - * userui_save_config_info - Fill buffer with config info for image header. - * - * @buf: Buffer into which to put the config info we want to save. - */ -static int userui_save_config_info(char *buf) -{ - *((int *) buf) = progress_granularity; - memcpy(buf + sizeof(int), ui_helper_data.program, - sizeof(ui_helper_data.program)); - return sizeof(ui_helper_data.program) + sizeof(int) + 1; -} - -/** - * userui_load_config_info - Restore config info from buffer. - * - * @buf: Buffer containing header info loaded. - * @size: Size of data loaded for this module. - */ -static void userui_load_config_info(char *buf, int size) -{ - progress_granularity = *((int *) buf); - size -= sizeof(int); - - /* Don't load the saved path if one has already been set */ - if (ui_helper_changed) - return; - - if (size > sizeof(ui_helper_data.program)) - size = sizeof(ui_helper_data.program); - - memcpy(ui_helper_data.program, buf + sizeof(int), size); - ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; -} - -/** - * set_ui_program_set: Record that userui program was changed. - * - * Side effect routine for when the userui program is set. In an initrd or - * ramfs, the user may set a location for the userui program. If this happens, - * we don't want to reload the value that was saved in the image header. This - * routine allows us to flag that we shouldn't restore the program name from - * the image header. - */ -static void set_ui_program_set(void) -{ - ui_helper_changed = 1; -} - -/** - * userui_memory_needed - Tell core how much memory to reserve for us. - */ -static int userui_memory_needed(void) -{ - /* ball park figure of 128 pages */ - return 128 * PAGE_SIZE; -} - -/** - * userui_update_status - Update the progress bar and (if on) in-bar message. - * - * @value: Current progress percentage numerator. - * @maximum: Current progress percentage denominator. - * @fmt: Message to be displayed in the middle of the progress bar. - * - * Note that a NULL message does not mean that any previous message is erased! - * For that, you need toi_prepare_status with clearbar on. - * - * Returns an unsigned long, being the next numerator (as determined by the - * maximum and progress granularity) where status needs to be updated. - * This is to reduce unnecessary calls to update_status. - */ -static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) -{ - static u32 last_step = 9999; - struct userui_msg_params msg; - u32 this_step, next_update; - int bitshift; - - if (ui_helper_data.pid == -1) - return 0; - - if ((!maximum) || (!progress_granularity)) - return maximum; - - if (value < 0) - value = 0; - - if (value > maximum) - value = maximum; - - /* Try to avoid math problems - we can't do 64 bit math here - * (and shouldn't need it - anyone got screen resolution - * of 65536 pixels or more?) 
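A worked example of the scaling that follows, as a standalone sketch (fls() is open-coded here because it is a kernel helper):

#include <stdio.h>

static int fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;   /* matches the kernel's fls() */
}

int main(void)
{
        unsigned int value = 0x90000, maximum = 0x120000, granularity = 30;
        int bitshift = fls(maximum) - 16;       /* fls = 21, so shift by 5 */
        unsigned int tv = value >> bitshift, tm = maximum >> bitshift;

        /* 18432 * 30 stays comfortably within 32 bits; value is half of
         * maximum, so this prints "step 15 of 30" */
        printf("step %u of %u\n", tv * granularity / tm, granularity);
        return 0;
}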
*/ - bitshift = fls(maximum) - 16; - if (bitshift > 0) { - u32 temp_maximum = maximum >> bitshift; - u32 temp_value = value >> bitshift; - this_step = (u32) - (temp_value * progress_granularity / temp_maximum); - next_update = (((this_step + 1) * temp_maximum / - progress_granularity) + 1) << bitshift; - } else { - this_step = (u32) (value * progress_granularity / maximum); - next_update = ((this_step + 1) * maximum / - progress_granularity) + 1; - } - - if (this_step == last_step) - return next_update; - - memset(&msg, 0, sizeof(msg)); - - msg.a = this_step; - msg.b = progress_granularity; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, - &msg, sizeof(msg)); - last_step = this_step; - - return next_update; -} - -/** - * userui_message - Display a message without necessarily logging it. - * - * @section: Type of message. Messages can be filtered by type. - * @level: Degree of importance of the message. Lower values = higher priority. - * @normally_logged: Whether logged even if log_everything is off. - * @fmt: Message (and parameters). - * - * This function is intended to do the same job as printk, but without normally - * logging what is printed. The point is to be able to get debugging info on - * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" - * - * It may be called from an interrupt context - can't sleep! - */ -static void userui_message(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...) -{ - struct userui_msg_params msg; - - if ((level) && (level > console_loglevel)) - return; - - memset(&msg, 0, sizeof(msg)); - - msg.a = section; - msg.b = level; - msg.c = normally_logged; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - if (test_action_state(TOI_LOGALL)) - printk(KERN_INFO "%s\n", msg.text); - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, - &msg, sizeof(msg)); -} - -/** - * wait_for_key_via_userui - Wait for userui to receive a keypress. - */ -static void wait_for_key_via_userui(void) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&userui_wait_for_key, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake); - userui_wait_should_wake = false; - - set_current_state(TASK_RUNNING); - remove_wait_queue(&userui_wait_for_key, &wait); -} - -/** - * userui_prepare_status - Display high level messages. - * - * @clearbar: Whether to clear the progress bar. - * @fmt...: New message for the title. - * - * Prepare the 'nice display', drawing the header and version, along with the - * current action and perhaps also resetting the progress bar. - */ -static void userui_prepare_status(int clearbar, const char *fmt, ...) -{ - va_list args; - - if (fmt) { - va_start(args, fmt); - lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); - va_end(args); - } - - if (clearbar) - toi_update_status(0, 1, NULL); - - if (ui_helper_data.pid == -1) - printk(KERN_EMERG "%s\n", lastheader); - else - toi_message(0, TOI_STATUS, 1, lastheader, NULL); -} - -/** - * toi_wait_for_keypress - Wait for keypress via userui. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress from userui. - * - * FIXME: Implement timeout? 
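One plausible answer to that FIXME, offered purely as an untested sketch (nothing like it exists in this tree): bound the sleep with wait_event_interruptible_timeout() and report a timeout as "no key":

static char userui_wait_for_keypress_bounded(int timeout)
{
        long left = 1;

        if (ui_helper_data.pid == -1)
                return '\0';

        if (timeout > 0)
                left = wait_event_interruptible_timeout(userui_wait_for_key,
                                userui_wait_should_wake, timeout * HZ);
        else
                wait_event_interruptible(userui_wait_for_key,
                                userui_wait_should_wake);

        userui_wait_should_wake = false;
        return left > 0 ? ' ' : '\0';   /* '\0' on timeout or signal */
}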
- */ -static char userui_wait_for_keypress(int timeout) -{ - char key = '\0'; - - if (ui_helper_data.pid != -1) { - wait_for_key_via_userui(); - key = ' '; - } - - return key; -} - -/** - * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. - * - * @result_code: Reason why we're aborting (1 << bit). - * @fmt: Message to display if telling the user what's going on. - * - * Abort a cycle. If this wasn't at the user's request (and we're displaying - * output), tell the user why and wait for them to acknowledge the message. - */ -static void userui_abort_hibernate(int result_code, const char *fmt, ...) -{ - va_list args; - int printed_len = 0; - - set_result_state(result_code); - - if (test_result_state(TOI_ABORTED)) - return; - - set_result_state(TOI_ABORTED); - - if (test_result_state(TOI_ABORT_REQUESTED)) - return; - - va_start(args, fmt); - printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), - fmt, args); - va_end(args); - if (ui_helper_data.pid != -1) - printed_len = sprintf(local_printf_buf + printed_len, - " (Press SPACE to continue)"); - - toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); - - if (ui_helper_data.pid != -1) - userui_wait_for_keypress(0); -} - -/** - * request_abort_hibernate - Abort hibernating or resuming at user request. - * - * Handle the user requesting the cancellation of a hibernation or resume by - * pressing escape. - */ -static void request_abort_hibernate(void) -{ - if (test_result_state(TOI_ABORT_REQUESTED) || - !test_action_state(TOI_CAN_CANCEL)) - return; - - if (test_toi_state(TOI_NOW_RESUMING)) { - toi_prepare_status(CLEAR_BAR, "Escape pressed. " - "Powering down again."); - set_toi_state(TOI_STOP_RESUME); - while (!test_toi_state(TOI_IO_STOPPED)) - schedule(); - if (toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(0); - toi_power_down(); - } - - toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" - " ABORTING HIBERNATION ---"); - set_abort_result(TOI_ABORT_REQUESTED); - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_user_rcv_msg - Receive a netlink message from userui. - * - * @skb: skb received. - * @nlh: Netlink header received. 
- */ -static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USERUI_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { - printk(KERN_INFO "Got NOFREEZE_ME request when " - "ui_helper_data.pid is %d.\n", ui_helper_data.pid); - return -EBUSY; - } - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USERUI_MSG_ABORT: - request_abort_hibernate(); - return 0; - case USERUI_MSG_GET_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_STATE, &toi_bkd.toi_action, - sizeof(toi_bkd.toi_action)); - return 0; - case USERUI_MSG_GET_DEBUG_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_DEBUG_STATE, - &toi_bkd.toi_debug_state, - sizeof(toi_bkd.toi_debug_state)); - return 0; - case USERUI_MSG_SET_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - ui_nl_set_state(*data); - return 0; - case USERUI_MSG_SET_DEBUG_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_debug_state = (*data); - return 0; - case USERUI_MSG_SPACE: - toi_stop_waiting_for_userui_key(); - return 0; - case USERUI_MSG_GET_POWERDOWN_METHOD: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_POWERDOWN_METHOD, - &toi_poweroff_method, - sizeof(toi_poweroff_method)); - return 0; - case USERUI_MSG_SET_POWERDOWN_METHOD: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) - return -EINVAL; - toi_poweroff_method = (unsigned long)(*data); - return 0; - case USERUI_MSG_GET_LOGLEVEL: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_LOGLEVEL, - &toi_bkd.toi_default_console_level, - sizeof(toi_bkd.toi_default_console_level)); - return 0; - case USERUI_MSG_SET_LOGLEVEL: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_default_console_level = (*data); - return 0; - case USERUI_MSG_PRINTK: - printk(KERN_INFO "%s", (char *) data); - return 0; - } - - /* Unhandled here */ - return 1; -} - -/** - * userui_cond_pause - Possibly pause at user request. - * - * @pause: Whether to pause or just display the message. - * @message: Message to display at the start of pausing. - * - * Potentially pause and wait for the user to tell us to continue. We normally - * only pause when @pause is set. While paused, the user can do things like - * changing the loglevel, toggling the display of debugging sections and such - * like. - */ -static void userui_cond_pause(int pause, char *message) -{ - int displayed_message = 0, last_key = 0; - - while (last_key != 32 && - ui_helper_data.pid != -1 && - ((test_action_state(TOI_PAUSE) && pause) || - (test_action_state(TOI_SINGLESTEP)))) { - if (!displayed_message) { - toi_prepare_status(DONT_CLEAR_BAR, - "%s Press SPACE to continue.%s", - message ? message : "", - (test_action_state(TOI_SINGLESTEP)) ? - " Single step on." : ""); - displayed_message = 1; - } - last_key = userui_wait_for_keypress(0); - } - schedule(); -} - -/** - * userui_prepare_console - Prepare the console for use. - * - * Prepare a console for use, saving current kmsg settings and attempting to - * start userui. Console loglevel changes are handled by userui. 
- */ -static void userui_prepare_console(void) -{ - orig_kmsg = vt_kmsg_redirect(fg_console + 1); - - ui_helper_data.pid = -1; - - if (!userui_ops.enabled) { - printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); - return; - } - - if (*ui_helper_data.program) - toi_netlink_setup(&ui_helper_data); - else - printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); -} - -/** - * userui_cleanup_console - Cleanup after a cycle. - * - * Tell userui to cleanup, and restore kmsg_redirect to its original value. - */ - -static void userui_cleanup_console(void) -{ - if (ui_helper_data.pid > -1) - toi_netlink_close(&ui_helper_data); - - vt_kmsg_redirect(orig_kmsg); -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, - TOI_CAN_CANCEL, 0), - SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAUSE, 0), - SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), - SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, - 2048, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, - set_ui_program_set), - SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_MODULE, - .name = "userui", - .shared_directory = "user_interface", - .module = THIS_MODULE, - .storage_needed = userui_storage_needed, - .save_config_info = userui_save_config_info, - .load_config_info = userui_load_config_info, - .memory_needed = userui_memory_needed, - .post_atomic_restore = userui_post_atomic_restore, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -static struct ui_ops my_ui_ops = { - .update_status = userui_update_status, - .message = userui_message, - .prepare_status = userui_prepare_status, - .abort = userui_abort_hibernate, - .cond_pause = userui_cond_pause, - .prepare = userui_prepare_console, - .cleanup = userui_cleanup_console, - .wait_for_key = userui_wait_for_keypress, -}; - -/** - * toi_user_ui_init - Boot time initialisation for user interface. - * - * Invoked from the core init routine. 
- */ -static __init int toi_user_ui_init(void) -{ - int result; - - ui_helper_data.nl = NULL; - strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); - ui_helper_data.pid = -1; - ui_helper_data.skb_size = sizeof(struct userui_msg_params); - ui_helper_data.pool_limit = 6; - ui_helper_data.netlink_id = NETLINK_TOI_USERUI; - ui_helper_data.name = "userspace ui"; - ui_helper_data.rcv_msg = userui_user_rcv_msg; - ui_helper_data.interface_version = 8; - ui_helper_data.must_init = 0; - ui_helper_data.not_ready = userui_cleanup_console; - init_completion(&ui_helper_data.wait_for_process); - result = toi_register_module(&userui_ops); - if (!result) { - result = toi_register_ui_ops(&my_ui_ops); - if (result) - toi_unregister_module(&userui_ops); - } - - return result; -} - -late_initcall(toi_user_ui_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8362f1979..af4e6968c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -33,7 +33,6 @@ #include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/syscalls.h> -#include <linux/suspend.h> #include <linux/kexec.h> #include <linux/kdb.h> #include <linux/ratelimit.h> @@ -49,6 +48,7 @@ #include <linux/uio.h> #include <asm/uaccess.h> +#include <asm-generic/sections.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> @@ -233,7 +233,11 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ -}; +} +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +__packed __aligned(4) +#endif +; /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken @@ -274,30 +278,12 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else #define LOG_ALIGN __alignof__(struct printk_log) -#endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; -#ifdef CONFIG_TOI_INCREMENTAL -void toi_set_logbuf_untracked(void) -{ - int i; - struct page *log_buf_start_page = virt_to_page(__log_buf); - - printk("Not protecting kernel printk log buffer (%p-%p).\n", - __log_buf, __log_buf + __LOG_BUF_LEN); - - for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++) - SetPageTOI_Untracked(log_buf_start_page + i); -} -#endif - /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -1675,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static int recursion_bug; + static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; @@ -1711,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level, * it can be printed at the next appropriate moment: */ if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = 1; + recursion_bug = true; local_irq_restore(flags); return 0; } @@ -1726,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level, static const char recursion_msg[] = "BUG: recent printk recursion!"; - recursion_bug = 0; + recursion_bug = false; /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, @@ -2706,13 +2692,36 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +/* + * Some boot consoles access data that is in the init section and 
which will + * be discarded after the initcalls have been run. To make sure that no code + * will access this data, unregister the boot consoles in a late initcall. + * + * If for some reason, such as deferred probe or the driver being a loadable + * module, the real console hasn't registered yet at this point, there will + * be a brief interval in which no messages are logged to the console, which + * makes it difficult to diagnose problems that occur during this time. + * + * To mitigate this problem somewhat, only unregister consoles whose memory + * intersects with the init section. Note that code exists elsewhere to get + * rid of the boot console as soon as the proper console shows up, so there + * won't be side-effects from postponing the removal. + */ static int __init printk_late_init(void) { struct console *con; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { - unregister_console(con); + /* + * Make sure to unregister boot consoles whose data + * resides in the init section before the init section + * is discarded. Boot consoles whose data will stick + * around will automatically be unregistered when the + * proper console replaces them. + */ + if (init_section_intersects(con, sizeof(*con))) + unregister_console(con); } } hotcpu_notifier(console_cpu_notify, 0); diff --git a/kernel/profile.c b/kernel/profile.c index 99513e116..513696974 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,6 +59,7 @@ int profile_setup(char *str) if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS + force_schedstat_enabled(); prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 3189e51db..2341efe7f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -387,8 +387,14 @@ unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: if (!retval) { - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - TASK_UNINTERRUPTIBLE); + /* + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. + */ + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); proc_ptrace_connector(task, PTRACE_ATTACH); } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e26..d2988d047 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -162,6 +162,27 @@ static int rcu_torture_writer_state; #define RTWS_SYNC 7 #define RTWS_STUTTER 8 #define RTWS_STOPPING 9 +static const char * const rcu_torture_writer_state_names[] = { + "RTWS_FIXED_DELAY", + "RTWS_DELAY", + "RTWS_REPLACE", + "RTWS_DEF_FREE", + "RTWS_EXP_SYNC", + "RTWS_COND_GET", + "RTWS_COND_SYNC", + "RTWS_SYNC", + "RTWS_STUTTER", + "RTWS_STOPPING", +}; + +static const char *rcu_torture_writer_state_getname(void) +{ + unsigned int i = READ_ONCE(rcu_torture_writer_state); + + if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) + return "???"; + return rcu_torture_writer_state_names[i]; +} #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + pr_alert("??? 
Writer stall state %s(%d) g%lu c%lu f%#x\n", + rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea5a..9b9cdd549 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_gp_is_expedited() + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54..9fd5b628a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); /* Data structures. */ -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -246,24 +242,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); - } + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -300,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -340,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -376,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. 
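The recurring change in these hunks: rcu_momentary_dyntick_idle() no longer saves and restores flags itself; it now requires interrupts to be off, and callers such as rcu_all_qs() below take on the flag juggling. The general shape of that refactor, as an illustrative sketch rather than the kernel's exact code:

static void do_momentary_qs(void)
{
        WARN_ON_ONCE(!irqs_disabled()); /* precondition replaces save/restore */
        /* ... poke the per-CPU quiescent-state machinery ... */
}

void do_momentary_qs_irqson(void)
{
        unsigned long flags;

        local_irq_save(flags);
        do_momentary_qs();
        local_irq_restore(flags);
}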
*/ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -605,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) * The caller must have disabled interrupts to prevent races with * normal callback registry. */ -static int +static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { int i; if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ + return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ + return true; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ + return false; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ + return true; /* Yes, CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ + return true; /* Yes, CBs for future grace period. */ + return false; /* No grace period needed. */ } /* @@ -740,7 +732,7 @@ void rcu_user_enter(void) * * Exit from an interrupt handler, which might possibly result in entering * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your @@ -753,11 +745,10 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; @@ -768,6 +759,17 @@ void rcu_irq_exit(void) else rcu_eqs_enter_common(oldval, true); rcu_sysidle_enter(1); +} + +/* + * Wrapper for rcu_irq_exit() where interrupts are enabled. + */ +void rcu_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_exit(); local_irq_restore(flags); } @@ -865,7 +867,7 @@ void rcu_user_exit(void) * * Enter an interrupt handler, which might possibly result in exiting * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * Note that the Linux kernel is fully capable of entering an interrupt * handler that it never exits, for example when doing upcalls to @@ -881,11 +883,10 @@ void rcu_user_exit(void) */ void rcu_irq_enter(void) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; @@ -896,6 +897,17 @@ void rcu_irq_enter(void) else rcu_eqs_exit_common(oldval, true); rcu_sysidle_exit(1); +} + +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. 
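Alongside these wrappers, the series adds small bounds-checked string tables for state reporting (rcu_torture_writer_state_getname() in rcutorture.c above, gp_state_getname() just below). The idiom as a standalone sketch:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const char * const state_names[] = { "IDLE", "WAIT", "DONE" };

static const char *state_getname(int s)
{
        if (s < 0 || (unsigned int)s >= ARRAY_SIZE(state_names))
                return "???";           /* out-of-range values print safely */
        return state_names[s];
}

int main(void)
{
        printf("%s %s\n", state_getname(1), state_getname(7)); /* WAIT ??? */
        return 0;
}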
+ */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_enter(); local_irq_restore(flags); } @@ -1187,6 +1199,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + +/* * Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) @@ -1196,12 +1218,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + if (j - gpa > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, - rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_flags, + gp_state_getname(rsp->gp_state), rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + if (rsp->gp_kthread) + sched_show_task(rsp->gp_kthread); + } } /* @@ -1214,7 +1240,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) @@ -1237,7 +1263,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1256,7 +1282,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -1327,7 +1353,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -1534,10 +1560,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) { - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); - } + if (rnp != rnp_root) + raw_spin_lock_rcu_node(rnp_root); /* * Get a new grace-period number. 
If there really is no grace @@ -1590,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_gp(rnp, rdp, c, @@ -1611,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - wake_up(&rsp->gp_wq); + swake_up(&rsp->gp_wq); } /* @@ -1786,11 +1809,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && rdp->completed == READ_ONCE(rnp->completed) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - smp_mb__after_unlock_lock(); needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -1805,21 +1827,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) } /* - * Initialize a new grace period. Return 0 if no grace period required. + * Initialize a new grace period. Return false if no grace period required. */ -static int rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(struct rcu_state *rsp) { unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1829,7 +1850,7 @@ static int rcu_gp_init(struct rcu_state *rsp) * Not supposed to be able to happen. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } /* Advance to a new grace period and initialize state. */ @@ -1847,8 +1868,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ @@ -1904,8 +1924,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1923,7 +1942,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rsp->gp_activity, jiffies); } - return 1; + return true; } /* @@ -1973,8 +1992,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) } /* Clear flag to prevent immediate re-entry. 
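The other rewrite running through this file pairs every rcu_node lock acquisition with the smp_mb__after_unlock_lock() barrier by folding both into raw_spin_lock_rcu_node() and its _irq/_irqsave variants. Judging by the before/after hunks, each wrapper amounts to exactly the two lines it replaces, roughly:

#define demo_raw_spin_lock_rcu_node(rnp)                            \
do {                                                                \
        raw_spin_lock(&(rnp)->lock);                                \
        smp_mb__after_unlock_lock();    /* full GP memory ordering */ \
} while (0)

#define demo_raw_spin_lock_irqsave_rcu_node(rnp, flags)             \
do {                                                                \
        raw_spin_lock_irqsave(&(rnp)->lock, flags);                 \
        smp_mb__after_unlock_lock();                                \
} while (0)

Centralising the pair means a forgotten barrier can no longer slip in between the lock and its first use.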
*/ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); @@ -1991,10 +2009,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + struct swait_queue_head *sq; WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -2019,8 +2037,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * grace period is recorded in any of the rcu_node structures. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); @@ -2029,14 +2046,15 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); + sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq(&rnp->lock); + rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ @@ -2076,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; @@ -2106,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2230,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2284,8 +2302,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); oldmask = rnp_c->qsmask; } @@ -2332,8 +2349,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. 
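The wait-queue side changes in the same sweep: rsp->gp_wq and rsp->expedited_wq move to the simple-waitqueue API (struct swait_queue_head, swait_event_interruptible(), swake_up()), which wakes at most one waiter and keeps the wake path short. Minimal usage shape of the API as converted above, with illustrative names:

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_ready;

static void demo_wait(void)
{
        /* sleep until the flag is set */
        swait_event_interruptible(demo_wq, READ_ONCE(demo_ready));
}

static void demo_wake(void)
{
        WRITE_ONCE(demo_ready, true);
        swake_up(&demo_wq);     /* wakes a single waiter */
}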
*/ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2355,8 +2371,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || @@ -2582,8 +2597,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (!rnp) break; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); /* GP memory ordering. */ + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { @@ -2611,8 +2625,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2809,8 +2822,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_for_each_leaf_node(rsp, rnp) { cond_resched_rcu_qs(); mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2881,8 +2893,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -2891,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2914,7 +2925,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* Does this CPU require a not-yet-started grace period? */ local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); if (needwake) @@ -3005,8 +3016,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (!rcu_gp_in_progress(rsp)) { struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); if (needwake) @@ -3365,7 +3375,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ s = (READ_ONCE(*sp) + 3) & ~0x1; smp_mb(); /* Above access must not bleed into critical section. 
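 *
 * [Editor's worked example for rcu_seq_snap() above, using the
 * convention that the low bit of the sequence counter means "grace
 * period in progress":
 *
 *	s = (READ_ONCE(*sp) + 3) & ~0x1;
 *
 *	*sp == 4 (idle):        s = (4 + 3) & ~1 = 6
 *	                        wait for the GP that ends at 6
 *	*sp == 5 (in progress): s = (5 + 3) & ~1 = 8
 *	                        the running GP may predate the caller,
 *	                        so wait for the next full GP, ending at 8
 *
 * Note also that the smp_mb() formerly at the top of this helper
 * moves into rcu_exp_gp_seq_snap() just below, keeping the "caller's
 * modifications seen first" barrier with the caller that needs it.]
 *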
*/ return s; @@ -3392,6 +3401,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + smp_mb(); /* Caller's modifications seen first by other CPUs. */ return rcu_seq_snap(&rsp->expedited_sequence); } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) @@ -3426,8 +3436,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * CPUs for the current rcu_node structure up the rcu_node tree. */ rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; /* No new CPUs, nothing to do. */ @@ -3447,8 +3456,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rnp_up = rnp->parent; done = false; while (rnp_up) { - raw_spin_lock_irqsave(&rnp_up->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; @@ -3472,8 +3480,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) sync_exp_reset_tree_hotplug(rsp); rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -3524,15 +3531,14 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore(&rnp->lock, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - wake_up(&rsp->expedited_wq); + swake_up(&rsp->expedited_wq); } break; } mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } @@ -3549,8 +3555,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); __rcu_report_exp_rnp(rsp, rnp, wake, flags); } @@ -3564,8 +3569,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -3609,7 +3613,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { - struct rcu_data *rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3623,7 +3627,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { if (sync_exp_work_done(rsp, rnp0, NULL, - &rsp->expedited_workdone0, s)) + &rdp->expedited_workdone0, s)) return NULL; return rnp0; } @@ -3637,14 +3641,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * 
strictly needed for correctness. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone2, s)) + &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) @@ -3654,7 +3657,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone3, s)) + &rdp->expedited_workdone3, s)) return NULL; return rnp1; } @@ -3708,8 +3711,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; @@ -3741,24 +3743,22 @@ retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; - } else { - /* Failed, raced with offline. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, - flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave(&rnp->lock, - flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3773,6 +3773,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; + int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; @@ -3781,28 +3782,30 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = wait_event_interruptible_timeout( + ret = swait_event_timeout( rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); - if (ret > 0) + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. 
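 *
 * [Editor's note on the exp_funnel_lock() hunk above: the
 * expedited_workdone0..3 statistics move from the single rcu_state
 * into per-CPU rcu_data, so each CPU bumps a private counter and
 * readers aggregate on demand. The tree_trace.c hunk later in this
 * patch does exactly that; in sketch form:
 *
 *	unsigned long s1 = 0;
 *	int cpu;
 *
 *	for_each_possible_cpu(cpu)
 *		s1 += atomic_long_read(
 *			&per_cpu_ptr(rsp->rda, cpu)->expedited_workdone1);
 *
 * The contention-avoidance motivation is an editorial inference.]
 *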
*/ - wait_event(rsp->expedited_wq, + swait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root)); return; } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); + ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - (void)rcu_print_task_exp_stall(rnp); + ndetected = rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; if (!(rnp->expmask & mask)) continue; + ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[cpu_online(cpu)], @@ -3811,8 +3814,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } mask <<= 1; } - pr_cont(" } %lu jiffies s: %lu\n", - jiffies - jiffies_start, rsp->expedited_sequence); + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (!ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } rcu_for_each_leaf_node(rsp, rnp) { mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { @@ -3847,6 +3865,16 @@ void synchronize_sched_expedited(void) struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); @@ -4135,7 +4163,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (rnp == NULL) return; - raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ } @@ -4152,7 +4180,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); @@ -4179,7 +4207,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -4198,8 +4226,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. 
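 *
 * [Editor's note on the synchronize_sched_expedited() hunk above: the
 * function now runs a short ladder before doing any real work. A
 * sketch of the control flow, using only code the diff itself adds:
 *
 *	if (rcu_blocking_is_gp())        // single CPU: trivially a GP
 *		return;
 *	if (rcu_gp_is_normal()) {        // rcu_normal parameter set
 *		wait_rcu_gp(call_rcu_sched); // fall back to a normal GP
 *		return;
 *	}
 *	s = rcu_exp_gp_seq_snap(rsp);    // otherwise take a ticket
 *
 * rcu_normal is the module parameter added in the kernel/rcu/update.c
 * hunk later in this patch; it trades expedited-GP IPIs for ordinary
 * grace-period latency.]
 *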
*/ rnp->qsmaskinitnext |= mask; rnp->expmaskinitnext |= mask; if (!rdp->beenonline) @@ -4327,14 +4354,14 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rsp->gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); } rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); @@ -4385,12 +4412,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) +static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4455,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } - init_waitqueue_head(&rsp->gp_wq); - init_waitqueue_head(&rsp->expedited_wq); + init_swait_queue_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -4576,8 +4605,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state); + rcu_init_one(&rcu_sched_state); if (dump_tree) rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e238d..bbd235d0e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/swait.h> #include <linux/stop_machine.h> /* @@ -178,6 +179,8 @@ struct rcu_node { /* beginning of each expedited GP. */ unsigned long expmaskinitnext; /* Online CPUs for next expedited GP. */ + /* Any CPU that has ever been online will */ + /* have its bit set. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -241,7 +244,7 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; + struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; @@ -384,6 +387,10 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + atomic_long_t expedited_workdone0; /* # done by others #0. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ /* 7) Callback offloading. 
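 *
 * [Editor's note: an easy-to-miss reordering in the tree.c portion of
 * this patch is in rcu_spawn_gp_kthread(), where the kthread is now
 * woken only after the lock is released:
 *
 *	rsp->gp_kthread = t;
 *	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 *	wake_up_process(t);              // was: woken under rnp->lock
 *
 * Waking a kthread that will promptly take rnp->lock while we still
 * hold that lock is legal but forces it to block immediately; waking
 * after the unlock avoids that. This rationale is an editorial
 * inference, not stated in the diff.]
 *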
*/ #ifdef CONFIG_RCU_NOCB_CPU @@ -393,7 +400,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -472,7 +479,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ @@ -498,13 +505,9 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ - wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -545,6 +548,18 @@ struct rcu_state { #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#ifndef RCU_TREE_NONCORE +static const char * const gp_state_names[] = { + "RCU_GP_IDLE", + "RCU_GP_WAIT_GPS", + "RCU_GP_DONE_GPS", + "RCU_GP_WAIT_FQS", + "RCU_GP_DOING_FQS", + "RCU_GP_CLEANUP", + "RCU_GP_CLEANED", +}; +#endif /* #ifndef RCU_TREE_NONCORE */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ @@ -607,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); @@ -664,3 +680,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #else /* #ifdef CONFIG_PPC */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_PPC */ + +/* + * Wrappers for the rcu_node::lock acquire. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). 
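 *
 * (Editorial addition, not in the committed comment: the practical
 * effect on call sites, visible throughout kernel/rcu/tree.c in this
 * patch, is that the open-coded pair
 *
 *	raw_spin_lock_irqsave(&rnp->lock, flags);
 *	smp_mb__after_unlock_lock();
 *
 * collapses into the single, harder-to-forget
 *
 *	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 *
 * with the trylock variant returning bool so callers such as
 * note_gp_changes() keep their early-exit shape.)
 *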
+ */ +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&(rnp)->lock, flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) +{ + bool locked = raw_spin_trylock(&rnp->lock); + + if (locked) + smp_mb__after_unlock_lock(); + return locked; +} diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772..080bd202d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ /* * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. + * messages about anything out of the ordinary. */ static void __init rcu_bootup_announce_oddness(void) { @@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. */ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -286,12 +284,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * Remove this task from the list it blocked on. 
The task - * now remains queued on the rcu_node corresponding to - * the CPU it first blocked on, so the first attempt to - * acquire the task's rcu_node's ->lock will succeed. - * Keep the loop and add a WARN_ON() out of sheer paranoia. + * now remains queued on the rcu_node corresponding to the + * CPU it first blocked on, so there is no longer any need + * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - if (rnp == t->rcu_blocked_node) - break; - WARN_ON_ONCE(1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } + rnp = t->rcu_blocked_node; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + WARN_ON_ONCE(rnp != t->rcu_blocked_node); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -527,7 +516,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void) struct rcu_state *rsp = rcu_state_p; unsigned long s; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + s = rcu_exp_gp_seq_snap(rsp); rnp_unlock = exp_funnel_lock(rsp, s); @@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(rcu_state_p, rcu_data_p); + rcu_init_one(rcu_state_p); } /* @@ -989,8 +984,7 @@ static int rcu_boost(struct rcu_node *rnp) READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* * Recheck under the lock: all tasks in need of boosting @@ -1176,8 +1170,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = kthread_prio; @@ -1524,7 +1517,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1538,10 +1532,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - /* * If a non-lazy callback arrived at a CPU having only lazy * callbacks, invoke RCU core for the side-effect of recalculating @@ -1567,8 +1557,7 @@ static void rcu_prepare_for_idle(void) if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
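 *
 * [Editor's note on the rcu_read_unlock_special() hunk above: the old
 * retry loop existed to cope with t->rcu_blocked_node changing under
 * the reader. The simplification rests on the invariant that a
 * blocked task stays queued on the rcu_node of the CPU it first
 * blocked on, so one locked check suffices:
 *
 *	rnp = t->rcu_blocked_node;
 *	raw_spin_lock_rcu_node(rnp);     // irqs already disabled
 *	WARN_ON_ONCE(rnp != t->rcu_blocked_node);
 *
 * The WARN_ON_ONCE() survives purely as a tripwire should that
 * invariant ever be violated.]
 *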
*/ if (needwake) @@ -1822,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + swake_up_all(sq); } /* @@ -1840,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1868,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - wake_up(&rdp_leader->nocb_wq); + swake_up(&rdp_leader->nocb_wq); } } @@ -2068,8 +2062,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -2081,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - wait_event_interruptible( + swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2109,7 +2102,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); - wait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { @@ -2184,7 +2177,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - wake_up(&rdp->nocb_wq); + swake_up(&rdp->nocb_wq); } } @@ -2205,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); - wait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! 
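 *
 * [Editor's note: the two-element nocb_gp_wq[] array used above is
 * indexed by the low bit of the grace-period number, so waiters for
 * GP "c" and waiters for GP "c + 1" never share a queue:
 *
 *	// waiter, in rcu_nocb_wait_gp():
 *	swait_event_interruptible(rnp->nocb_gp_wq[c & 0x1],
 *		ULONG_CMP_GE(READ_ONCE(rnp->completed), c));
 *
 *	// waker, at grace-period cleanup time:
 *	sq = &rnp->nocb_gp_wq[rnp->completed & 0x1];
 *	swake_up_all(sq);
 *
 * Both fragments are lifted from hunks in this patch; only their
 * juxtaposition is editorial.]
 *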
*/ @@ -2364,7 +2357,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } @@ -2514,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2522,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093cc9..1088e64f0 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update tracing for classic implementation + * Read-Copy Update tracing for hierarchical implementation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 + * Author: Paul E. McKenney * * Papers: http://www.rdrop.com/users/paulmck/RCU * @@ -33,9 +34,7 @@ #include <linux/sched.h> #include <linux/atomic.h> #include <linux/bitops.h> -#include <linux/module.h> #include <linux/completion.h> -#include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = { static int show_rcuexp(struct seq_file *m, void *v) { + int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; - + struct rcu_data *rdp; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->expedited_workdone0); + s1 += atomic_long_read(&rdp->expedited_workdone1); + s2 += atomic_long_read(&rdp->expedited_workdone2); + s3 += atomic_long_read(&rdp->expedited_workdone3); + } seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_workdone0), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_workdone3), + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); @@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) unsigned long gpmax; struct rcu_node *rnp = &rsp->node[0]; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); completed = READ_ONCE(rsp->completed); gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) @@ -487,16 +492,4 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. 
McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutree_trace_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a4..76b94e194 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate"); #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); +module_param(rcu_normal, int, 0); +static int rcu_normal_after_boot; +module_param(rcu_normal_after_boot, int, 0); +#endif /* #ifndef CONFIG_TINY_RCU */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -113,6 +118,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); #ifndef CONFIG_TINY_RCU +/* + * Should expedited grace-period primitives always fall back to their + * non-expedited counterparts? Intended for use within RCU. Note + * that if the user specifies both rcu_expedited and rcu_normal, then + * rcu_normal wins. + */ +bool rcu_gp_is_normal(void) +{ + return READ_ONCE(rcu_normal); +} + static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); @@ -157,8 +173,6 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); -#endif /* #ifndef CONFIG_TINY_RCU */ - /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -166,8 +180,12 @@ void rcu_end_inkernel_boot(void) { if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) rcu_unexpedite_gp(); + if (rcu_normal_after_boot) + WRITE_ONCE(rcu_normal, 1); } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PREEMPT_RCU /* diff --git a/kernel/relay.c b/kernel/relay.c index 0b4570cfa..074994bcf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&file_inode(filp)->i_mutex); + inode_lock(file_inode(filp)); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&file_inode(filp)->i_mutex); + inode_unlock(file_inode(filp)); return desc->written; } diff --git a/kernel/resource.c b/kernel/resource.c index 249b1eb1e..3669d1bfc 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1499,8 +1499,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. 
+ */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973c..7d4cba227 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed601d..a5d966cb8 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); if (!err) ag->nice = nice; up_write(&ag->lock); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index caf4041f5..bc54e8467 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eb70592f0..05114b15b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> @@ -66,12 +67,10 @@ #include <linux/pagemap.h> #include <linux/hrtimer.h> #include <linux/tick.h> -#include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> -#include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> @@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features = #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static const char * const sched_feat_names[] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true STATIC_KEY_INIT_TRUE -#define jump_label_key__false STATIC_KEY_INIT_FALSE - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - static_key_disable(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - static_key_enable(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static int sched_feat_set(char *cmp) -{ - int i; - int neg = 0; - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } 
else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } - } - - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - struct inode *inode; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - /* Ensure the static_key remains in a consistent state */ - inode = file_inode(filp); - mutex_lock(&inode->i_mutex); - i = sched_feat_set(cmp); - mutex_unlock(&inode->i_mutex); - if (i == __SCHED_FEAT_NR) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ - /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. @@ -731,7 +598,7 @@ bool sched_can_stop_tick(void) if (current->policy == SCHED_RR) { struct sched_rt_entity *rt_se = ¤t->rt; - return rt_se->run_list.prev == rt_se->run_list.next; + return list_is_singular(&rt_se->run_list); } /* @@ -823,8 +690,8 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(prio_to_weight[prio]); - load->inv_weight = prio_to_wmult[prio]; + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1071,8 +938,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); set_task_cpu(p, new_cpu); raw_spin_unlock(&rq->lock); @@ -1080,8 +947,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1274,6 +1141,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. + */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1310,9 +1186,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* @@ -1905,6 +1783,97 @@ static void ttwu_queue(struct task_struct *p, int cpu) raw_spin_unlock(&rq->lock); } +/* + * Notes on Program-Order guarantees on SMP systems. 
+ * + * MIGRATION + * + * The basic program-order guarantee on SMP systems is that when a task [t] + * migrates, all its activity on its old cpu [c0] happens-before any subsequent + * execution on its new cpu [c1]. + * + * For migration (of runnable tasks) this is provided by the following means: + * + * A) UNLOCK of the rq(c0)->lock scheduling out task t + * B) migration for t is required to synchronize *both* rq(c0)->lock and + * rq(c1)->lock (if not at the same time, then in that order). + * C) LOCK of the rq(c1)->lock scheduling in task + * + * Transitivity guarantees that B happens after A and C after B. + * Note: we only require RCpc transitivity. + * Note: the cpu doing B need not be c0 or c1 + * + * Example: + * + * CPU0 CPU1 CPU2 + * + * LOCK rq(0)->lock + * sched-out X + * sched-in Y + * UNLOCK rq(0)->lock + * + * LOCK rq(0)->lock // orders against CPU0 + * dequeue X + * UNLOCK rq(0)->lock + * + * LOCK rq(1)->lock + * enqueue X + * UNLOCK rq(1)->lock + * + * LOCK rq(1)->lock // orders against CPU2 + * sched-out Z + * sched-in X + * UNLOCK rq(1)->lock + * + * + * BLOCKING -- aka. SLEEP + WAKEUP + * + * For blocking we (obviously) need to provide the same guarantee as for + * migration. However the means are completely different as there is no lock + * chain to provide order. Instead we do: + * + * 1) smp_store_release(X->on_cpu, 0) + * 2) smp_cond_acquire(!X->on_cpu) + * + * Example: + * + * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) + * + * LOCK rq(0)->lock LOCK X->pi_lock + * dequeue X + * sched-out X + * smp_store_release(X->on_cpu, 0); + * + * smp_cond_acquire(!X->on_cpu); + * X->state = WAKING + * set_task_cpu(X,2) + * + * LOCK rq(2)->lock + * enqueue X + * X->state = RUNNING + * UNLOCK rq(2)->lock + * + * LOCK rq(2)->lock // orders against CPU1 + * sched-out Z + * sched-in X + * UNLOCK rq(2)->lock + * + * UNLOCK X->pi_lock + * UNLOCK rq(0)->lock + * + * + * However; for wakeups there is a second guarantee we must provide, namely we + * must observe the state that lead to our wakeup. That is, not only must our + * task observe its own prior state, it must also observe the stores prior to + * its wakeup. + * + * This means that any means of doing remote wakeups must order the CPU doing + * the wakeup against the CPU the task is going to end up running on. This, + * however, is already required for the regular Program-Order guarantee above, + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * + */ + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -1968,19 +1937,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. - */ - while (p->on_cpu) - cpu_relax(); - /* - * Combined with the control dependency above, we have an effective - * smp_load_acquire() without the need for full barriers. * * Pairs with the smp_store_release() in finish_lock_switch(). * * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. 
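 *
 * [Editor's note: smp_cond_acquire(), used just below, is new in this
 * kernel generation. To the best of the editor's recollection it is
 * defined in <linux/compiler.h> roughly as
 *
 *	#define smp_cond_acquire(cond) do {	\
 *		while (!(cond))			\
 *			cpu_relax();		\
 *		smp_rmb();			\
 *	} while (0)
 *
 * i.e. exactly the open-coded while/cpu_relax()/smp_rmb() sequence it
 * replaces in try_to_wake_up(), with the control dependency plus read
 * barrier together providing ACQUIRE semantics. Treat the definition
 * as a from-memory sketch rather than a quotation.]
 *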
*/ - smp_rmb(); + smp_cond_acquire(!p->on_cpu); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -1997,7 +1960,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: - ttwu_stat(p, cpu, wake_flags); + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2045,7 +2009,8 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2087,7 +2052,6 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_throttled = 0; - dl_se->dl_new = 1; dl_se->dl_yielded = 0; } @@ -2109,7 +2073,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2118,6 +2087,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2181,6 +2154,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_schedstats); + +#ifdef CONFIG_SCHEDSTATS +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + set_schedstats(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_schedstats(false); + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif +#endif + /* * fork()/clone()-time setup: */ @@ -2910,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void) } #endif -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -2941,7 +2967,7 @@ void preempt_count_add(int val) PREEMPT_MASK - 10); #endif if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); + unsigned long ip = get_lock_parent_ip(); #ifdef 
CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif @@ -2968,7 +2994,7 @@ void preempt_count_sub(int val) #endif if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); @@ -3109,7 +3135,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3128,13 +3153,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3178,7 +3206,6 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ - cpu = cpu_of(rq); } else { lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); @@ -3364,7 +3391,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3392,11 +3419,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3414,7 +3445,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3422,7 +3453,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3437,7 +3468,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3793,6 +3824,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -3975,17 +4007,14 @@ change: * itself. 
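 *
 * [Editor's note: DEQUEUE_SAVE/DEQUEUE_MOVE, threaded through both
 * rt_mutex_setprio() and __sched_setscheduler() in this patch, appear
 * to encode two separate questions: "will the task be re-enqueued?"
 * and "may it change position in its runqueue list?". When the
 * effective priority is unchanged, the MOVE bit is dropped:
 *
 *	if (new_effective_prio == oldprio)
 *		queue_flags &= ~DEQUEUE_MOVE;  // keep list position
 *
 * so a no-op priority update no longer requeues the task. This
 * reading of the flag semantics is editorial; the diff defines
 * neither flag here.]
 *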
*/ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -3995,15 +4024,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -4994,6 +5022,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, @@ -5303,183 +5333,6 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. 
- */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called 
multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ - static void set_rq_online(struct rq *rq) { if (!rq->online) { @@ -6071,11 +5924,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + int ret; + alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } return 1; } - __setup("isolcpus=", isolated_cpu_setup); struct s_data { @@ -7355,6 +7213,9 @@ int in_sched_functions(unsigned long addr) */ struct task_group root_task_group; LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); @@ -7412,11 +7273,12 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); - #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { @@ -7697,7 +7559,7 @@ static void free_sched_group(struct task_group *tg) free_fair_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); - kfree(tg); + kmem_cache_free(task_group_cache, tg); } /* allocate runqueue etc for a new task group */ @@ -7705,7 +7567,7 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); if (!tg) return ERR_PTR(-ENOMEM); @@ -7754,11 +7616,9 @@ void sched_destroy_group(struct task_group *tg) void sched_offline_group(struct task_group *tg) { unsigned long flags; - int i; /* end participation in shares distribution */ - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); + unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); @@ -7784,7 +7644,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7808,7 +7668,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } @@ -8236,7 +8096,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task, void *private) +static void cpu_cgroup_fork(struct task_struct *task) { sched_move_task(task); } @@ -8610,3 +8470,44 @@ void dump_cpu_task(int cpu) pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } + +/* + * Nice levels are multiplicative, with a gentle 10% change for 
every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +const int sched_prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +const u32 sched_prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 05de80b48..75f98c549 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,9 @@ #include <linux/static_key.h> #include <linux/context_tracking.h> #include "sched.h" +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -259,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. 
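A userspace sketch of the carry scheme this comment describes: only whole jiffies of steal time are accounted, and the saved prev_steal_time advances by exactly the amount accounted, so the sub-jiffy remainder is picked up on a later tick. NSEC_PER_JIFFY and the helper below are illustrative (HZ=250 assumed), not kernel API.

#include <stdio.h>

#define NSEC_PER_JIFFY 4000000ULL        /* assumes HZ=250; illustrative */

static unsigned long long prev_steal;    /* stands in for rq->prev_steal_time */

/* Account whole jiffies of steal time; the sub-jiffy remainder stays in
 * the delta and is picked up on a later tick. */
static unsigned long long account_steal(unsigned long long steal_now)
{
        unsigned long long delta = steal_now - prev_steal;
        unsigned long long ticks = delta / NSEC_PER_JIFFY;

        prev_steal += ticks * NSEC_PER_JIFFY;       /* carry the remainder */
        return ticks;
}

int main(void)
{
        printf("%llu\n", account_steal(9000000));   /* 9 ms -> 2 jiffies, 1 ms carried */
        printf("%llu\n", account_steal(12000000));  /* +3 ms -> 1 jiffy, 0 carried */
        return 0;
}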
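Looking back at the sched_prio_to_weight[] and sched_prio_to_wmult[] tables above, a quick standalone check of both claims: the ~1.25 multiplier between adjacent nice levels, and the division-free use of the precomputed 2^32/weight inverse. Numbers are taken straight from the tables.

#include <stdio.h>

int main(void)
{
        double w0 = 1024.0, w1 = 820.0;   /* nice 0 and nice 1 weights */

        /* CPU shares of two CPU-bound tasks sharing one CPU: */
        printf("nice 0: %.1f%%\n", 100.0 * w0 / (w0 + w1));  /* ~55.5% */
        printf("nice 1: %.1f%%\n", 100.0 * w1 / (w0 + w1));  /* ~44.5% */
        printf("ratio : %.3f\n", w0 / w1);                   /* ~1.249 */

        /* sched_prio_to_wmult[] turns a division by the weight into a
         * multiply-and-shift: delta/820 becomes (delta * 5237765) >> 32. */
        unsigned long long delta = 1000;
        printf("%llu %llu\n", delta * 5237765ULL >> 32, delta / 820);  /* 1 1 */
        return 0;
}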
*/ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; @@ -466,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) return; if (sched_clock_irqtime) { @@ -665,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk) { - unsigned long long clock; + unsigned long now = READ_ONCE(jiffies); - clock = local_clock(); - if (clock < tsk->vtime_snap) + if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return clock - tsk->vtime_snap; + return jiffies_to_cputime(now - tsk->vtime_snap); } static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta = vtime_delta(tsk); + unsigned long now = READ_ONCE(jiffies); + unsigned long delta = now - tsk->vtime_snap; - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); - tsk->vtime_snap += delta; + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); + tsk->vtime_snap = now; - /* CHECKME: always safe to convert nsecs to cputime? */ - return nsecs_to_cputime(delta); + return jiffies_to_cputime(delta); } static void __vtime_account_system(struct task_struct *tsk) @@ -696,37 +698,44 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + if (!vtime_delta(tsk)) + return; + + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_gen_account_irq_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - write_seqlock(&tsk->vtime_seqlock); - delta_cpu = get_vtime_delta(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); tsk->vtime_snap_whence = VTIME_SYS; - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); - write_sequnlock(&tsk->vtime_seqlock); + if (vtime_delta(tsk)) { + delta_cpu = get_vtime_delta(tsk); + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + } + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_guest_enter(struct task_struct *tsk) @@ -738,19 +747,20 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the 
reader (task_gtime()) * that can thus safely catch up with a tickless delta. */ - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -763,24 +773,26 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { - write_seqlock(&prev->vtime_seqlock); - prev->vtime_snap_whence = VTIME_SLEEPING; - write_sequnlock(&prev->vtime_seqlock); + write_seqcount_begin(&prev->vtime_seqcount); + prev->vtime_snap_whence = VTIME_INACTIVE; + write_seqcount_end(&prev->vtime_seqcount); - write_seqlock(¤t->vtime_seqlock); + write_seqcount_begin(¤t->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock_cpu(smp_processor_id()); - write_sequnlock(¤t->vtime_seqlock); + current->vtime_snap = jiffies; + write_seqcount_end(¤t->vtime_seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; - write_seqlock_irqsave(&t->vtime_seqlock, flags); + local_irq_save(flags); + write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock_cpu(cpu); - write_sequnlock_irqrestore(&t->vtime_seqlock, flags); + t->vtime_snap = jiffies; + write_seqcount_end(&t->vtime_seqcount); + local_irq_restore(flags); } cputime_t task_gtime(struct task_struct *t) @@ -788,17 +800,17 @@ cputime_t task_gtime(struct task_struct *t) unsigned int seq; cputime_t gtime; - if (!context_tracking_is_enabled()) + if (!vtime_accounting_enabled()) return t->gtime; do { - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); gtime = t->gtime; - if (t->flags & PF_VCPU) + if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); return gtime; } @@ -821,7 +833,7 @@ fetch_task_cputime(struct task_struct *t, *udelta = 0; *sdelta = 0; - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); if (u_dst) *u_dst = *u_src; @@ -829,7 +841,7 @@ fetch_task_cputime(struct task_struct *t, *s_dst = *s_src; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_SLEEPING || + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; @@ -845,7 +857,7 @@ fetch_task_cputime(struct task_struct *t, if (t->vtime_snap_whence == VTIME_SYS) *sdelta = delta; } - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } @@ -853,6 +865,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { cputime_t udelta, sdelta; + if (!vtime_accounting_enabled()) { + if (utime) + *utime = t->utime; + if (stime) + *stime = t->stime; + return; + } + fetch_task_cputime(t, utime, stime, &t->utime, &t->stime, &udelta, &sdelta); if (utime) @@ -866,6 +886,14 @@ void task_cputime_scaled(struct task_struct *t, { cputime_t udelta, sdelta; + if 
(!vtime_accounting_enabled()) { + if (utimescaled) + *utimescaled = t->utimescaled; + if (stimescaled) + *stimescaled = t->stimescaled; + return; + } + fetch_task_cputime(t, utimescaled, stimescaled, &t->utimescaled, &t->stimescaled, &udelta, &sdelta); if (utimescaled) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e28..c7a036fac 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) } } - if (leftmost) + if (leftmost) { dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + dl_rq->earliest_dl.next = p->dl.deadline; + } rb_link_node(&p->pushable_dl_tasks, parent, link); rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) next_node = rb_next(&p->pushable_dl_tasks); dl_rq->pushable_dl_tasks_leftmost = next_node; + if (next_node) { + dl_rq->earliest_dl.next = rb_entry(next_node, + struct task_struct, pushable_dl_tasks)->dl.deadline; + } } rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -346,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); + + /* + * We are racing with the deadline timer. So, do nothing because + * the deadline timer handler will take care of properly recharging + * the runtime and postponing the deadline + */ + if (dl_se->dl_throttled) + return; /* * We use the regular wall clock time to set deadlines in the @@ -355,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, */ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; - dl_se->dl_new = 0; } /* @@ -393,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->runtime = pi_se->dl_runtime; } + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; + /* * We keep moving the deadline away until we get some * available runtime for the entity. This ensures correct @@ -414,7 +430,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -494,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - /* - * The arrival of a new instance needs special treatment, i.e., - * the actual scheduling parameters have to be "renewed". - */ - if (dl_se->dl_new) { - setup_new_dl_entity(dl_se, pi_se); - return; - } - if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; @@ -599,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) } /* - * This is possible if switched_from_dl() raced against a running - * callback that took the above !dl_task() path and we've since then - * switched back into SCHED_DEADLINE. - * - * There's nothing to do except drop our task reference. 
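Worth spelling out why the seqlock_t to seqcount_t conversion in the vtime hunks above is safe: the conversion relies on vtime updates already being serialized per task (they happen from the task's own context, or at context switch and init while the task is off the CPU), so the spinlock half of the old seqlock only ever excluded a writer from itself. A bare sequence counter is enough to let lockless readers such as task_gtime() retry around a concurrent update. The writer bracket, as a sketch using the kernel's own primitives:

        write_seqcount_begin(&tsk->vtime_seqcount);  /* count odd: readers retry */
        tsk->vtime_snap_whence = VTIME_SYS;
        tsk->vtime_snap = jiffies;
        write_seqcount_end(&tsk->vtime_seqcount);    /* count even: readers settle */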
- */ - if (dl_se->dl_new) - goto unlock; - - /* * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ @@ -729,8 +726,11 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + if (unlikely((s64)delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; return; + } schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -743,8 +743,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(dl_se)) { + dl_se->runtime -= delta_exec; + +throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) @@ -782,42 +784,14 @@ static void update_curr_dl(struct rq *rq) #ifdef CONFIG_SMP -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); - -static inline u64 next_deadline(struct rq *rq) -{ - struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); - - if (next && dl_prio(next->prio)) - return next->dl.deadline; - else - return 0; -} - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { - /* - * If the dl_rq had no -deadline tasks, or if the new task - * has shorter deadline than the current one on dl_rq, we - * know that the previous earliest becomes our next earliest, - * as the new task becomes the earliest itself. - */ - dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); - } else if (dl_rq->earliest_dl.next == 0 || - dl_time_before(deadline, dl_rq->earliest_dl.next)) { - /* - * On the other hand, if the new -deadline task has a - * a later deadline than the earliest one on dl_rq, but - * it is earlier than the next (if any), we must - * recompute the next-earliest. - */ - dl_rq->earliest_dl.next = next_deadline(rq); } } @@ -839,7 +813,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - dl_rq->earliest_dl.next = next_deadline(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); } } @@ -940,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); else if (flags & ENQUEUE_REPLENISH) replenish_dl_entity(dl_se, pi_se); @@ -1017,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ static void yield_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - /* * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). 
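The bandwidth refresh that dl_yielded relies on is the replenishment loop in replenish_dl_entity(): deadlines are pushed out one period at a time until the runtime is positive again. A standalone worked example of that loop with made-up parameters:

#include <stdio.h>

/* Worked example (invented numbers): runtime budget 10 ms per 100 ms
 * period, and the task overran so that runtime == -25 ms. Mirrors the
 * while loop in replenish_dl_entity(). */
int main(void)
{
        long runtime = -25, dl_runtime = 10;   /* ms */
        long deadline = 200, dl_period = 100;  /* ms */

        while (runtime <= 0) {
                deadline += dl_period;
                runtime += dl_runtime;
        }
        /* three iterations: runtime -25 -> -15 -> -5 -> +5, deadline
         * pushed out three periods, so the bandwidth cap is preserved. */
        printf("runtime=%ld deadline=%ld\n", runtime, deadline);  /* 5 500 */
        return 0;
}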
*/ - if (p->dl.runtime > 0) { - rq->curr->dl.dl_yielded = 1; - p->dl.runtime = 0; - } + rq->curr->dl.dl_yielded = 1; + update_rq_clock(rq); update_curr_dl(rq); /* @@ -1274,28 +1243,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) return 0; } -/* Returns the second earliest -deadline task, NULL otherwise */ -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) -{ - struct rb_node *next_node = rq->dl.rb_leftmost; - struct sched_dl_entity *dl_se; - struct task_struct *p = NULL; - -next_node: - next_node = rb_next(next_node); - if (next_node) { - dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); - p = dl_task_of(dl_se); - - if (pick_dl_task(rq, p, cpu)) - return p; - - goto next_node; - } - - return NULL; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -1767,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (dl_time_before(p->dl.deadline, rq_clock(rq))) + setup_new_dl_entity(&p->dl, &p->dl); + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) @@ -1813,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ resched_curr(rq); #endif /* CONFIG_SMP */ - } else - switched_to_dl(rq, p); + } } const struct sched_class dl_sched_class = { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771..4fbc3bd5f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -16,6 +16,7 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> #include <linux/mempolicy.h> +#include <linux/debugfs.h> #include "sched.h" @@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#define SCHED_FEAT(name, enabled) \ + #name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + static_key_disable(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + static_key_enable(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ + int i; + int neg = 0; + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } + break; + } + } + + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + struct inode *inode; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, 
ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + inode_lock(inode); + i = sched_feat_set(cmp); + inode_unlock(inode); + if (i == __SCHED_FEAT_NR) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#ifdef CONFIG_SMP + +#ifdef CONFIG_SYSCTL + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 
0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + if (schedstat_enabled()) { + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); + } #endif P(se->load.weight); #ifdef CONFIG_SMP @@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + if (schedstat_enabled()) { + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.statistics.wait_sum), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + } #else SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 0LL, 0L, @@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq 
*rt_rq) void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { + struct dl_bw *dl_bw; + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +#ifdef CONFIG_SMP + dl_bw = &cpu_rq(cpu)->rd->dl_bw; +#else + dl_bw = &dl_rq->dl_bw; +#endif + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); } extern __read_mostly int sched_clock_running; @@ -313,17 +630,18 @@ do { \ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - P(yld_count); - - P(sched_count); - P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); P64(max_idle_balance_cost); #endif - P(ttwu_count); - P(ttwu_local); + if (schedstat_enabled()) { + P(yld_count); + P(sched_count); + P(sched_goidle); + P(ttwu_count); + P(ttwu_local); + } #undef P #undef P64 @@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); - { + if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; + PN(se.statistics.sum_sleep_runtime); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82e905862..ac7fb39c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/latencytop.h> #include <linux/cpumask.h> #include 
<linux/cpuidle.h> #include <linux/slab.h> @@ -763,16 +763,52 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; } /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? (for current tasks @@ -782,25 +818,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -808,8 +827,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ } +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ +} +#endif + /* * We are picking a new current task - update its stats: */ @@ -905,10 +957,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ 
int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -967,6 +1020,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1116,27 +1181,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* - * Do not migrate if the destination is not a node that - * is actively used by this numa group. + * Destination node is much more heavily used than the source + * node? Allow migration. */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* - * Source is a node that is not actively used by this - * numa group, while the destination is. Migrate. - */ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* - * Both source and destination are nodes in active - * use by this numa group. Maximize memory bandwidth - * by migrating from more heavily used groups, to less - * heavily used ones, spreading the load around. - * Use a 1/4 hysteresis to avoid spurious page movement. + * Distribute memory according to CPU & memory use on each node, + * with 3/4 hysteresis to avoid unnecessary memory migrations: + * + * faults_cpu(dst) 3 faults_cpu(src) + * --------------- * - > --------------- + * faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } static unsigned long weighted_cpuload(const int cpu); @@ -1218,8 +1279,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1287,20 +1346,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. 
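The 3/4-hysteresis comparison in should_numa_migrate_memory() above is worth unpacking: it is faults_cpu(dst)/faults_mem(dst) * 3/4 > faults_cpu(src)/faults_mem(src), cross-multiplied so the kernel never divides (and never divides by zero). A minimal standalone check; the helper name is invented:

#include <stdbool.h>
#include <stdio.h>

/* Division-free form of the 3/4 test: migrate toward dst only when its
 * CPU-faults-per-memory-fault ratio beats src's by more than 4/3. */
static bool prefer_dst(unsigned long fc_dst, unsigned long fm_dst,
                       unsigned long fc_src, unsigned long fm_src)
{
        return fc_dst * fm_src * 3 > fc_src * fm_dst * 4;
}

int main(void)
{
        /* dst does twice the CPU faults per memory fault of src: migrate. */
        printf("%d\n", prefer_dst(200, 100, 100, 100));  /* 1 */
        /* equal usage: the hysteresis says stay put. */
        printf("%d\n", prefer_dst(100, 100, 100, 100));  /* 0 */
        return 0;
}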
*/ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1384,6 +1453,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1409,9 +1479,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * cur was not handed off to task_numa_assign() as the best task, so + * the reference taken above is no longer needed. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -1466,7 +1543,7 @@ static int task_numa_migrate(struct task_struct *p) .best_task = NULL, .best_imp = 0, - .best_cpu = -1 + .best_cpu = -1, }; struct sched_domain *sd; unsigned long taskweight, groupweight; @@ -1518,8 +1595,7 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && - nodes_weight(p->numa_group->active_nodes) > 1)) { + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1554,12 +1630,14 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { + struct numa_group *ng = p->numa_group; + if (env.best_cpu == -1) nid = env.src_nid; else nid = env.dst_nid; - if (node_isset(nid, p->numa_group->active_nodes)) + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) sched_setnuma(p, env.dst_nid); } @@ -1609,20 +1687,15 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16. 
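The replacement 1/3 rule in numbers: with max_faults_cpu = 900, a node with 350 CPU faults counts as active (350 * 3 = 1050 > 900), while one with 250 does not (750 <= 900). A sketch with the same shape as numa_is_active_node(); names are local to this sketch:

static int node_is_active(unsigned long faults, unsigned long max_faults)
{
        return faults * 3 > max_faults;   /* ACTIVE_NODE_FRACTION == 3 */
}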
*/ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group) { unsigned long faults, max_faults = 0; - int nid; + int nid, active_nodes = 0; for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); @@ -1632,12 +1705,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); - if (!node_isset(nid, numa_group->active_nodes)) { - if (faults > max_faults * 6 / 16) - node_set(nid, numa_group->active_nodes); - } else if (faults < max_faults * 3 / 16) - node_clear(nid, numa_group->active_nodes); + if (faults * ACTIVE_NODE_FRACTION > max_faults) + active_nodes++; } + + numa_group->max_faults_cpu = max_faults; + numa_group->active_nodes = active_nodes; } /* @@ -1928,7 +2001,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { - update_numa_active_node_mask(p->numa_group); + numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_group_nid); } @@ -1972,14 +2045,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, return; atomic_set(&grp->refcount, 1); + grp->active_nodes = 1; + grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * nr_node_ids; - node_set(task_node(current), grp->active_nodes); - for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2093,6 +2166,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; int priv; if (!static_branch_likely(&sched_numa_balancing)) @@ -2133,9 +2207,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - if (!priv && !local && p->numa_group && - node_isset(cpu_node, p->numa_group->active_nodes) && - node_isset(mem_node, p->numa_group->active_nodes)) + ng = p->numa_group; + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) local = 1; task_numa_placement(p); @@ -2180,6 +2255,7 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2302,6 +2378,17 @@ out: else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); + + /* + * Make sure tasks use at least 32x as much time to run other code + * than they used here, to limit NUMA PTE scanning overhead to 3% max. + * Usually update_task_scan_period slows down scanning enough; on an + * overloaded system we need to limit overhead on a per task basis. 
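The 3% figure in the comment above follows directly from the 32x deferral: if one task_numa_work() pass consumed d ns of runtime, advancing node_stamp by 32d means the next pass cannot start until the task has run 32d more, so in the worst case

\[ \text{scan overhead} \;\le\; \frac{d}{d + 32d} \;=\; \frac{1}{33} \;\approx\; 3\% . \]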
+ */ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } } /* @@ -2695,12 +2782,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2834,48 +2973,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now). - */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + return last_update_time; } - -/* - * Update the rq's load with the elapsed running time before entering - * idle. if the last scheduled task is not a CFS task, idle_enter will - * be the only way to update the runnable statistic. 
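The copy/smp_rmb() retry loop in cfs_rq_last_update_time() above only works if the writer publishes in the opposite order. The store side is not part of this hunk; as a hedged sketch of how it pairs up (mainline does this from the load-average update path):

        /* writer side (sketch): cfs_rq->avg.last_update_time was just
         * stored by the load-average update; publish the copy only after
         * a write barrier, so a reader that sees copy == value knows the
         * 64-bit timestamp did not change under its two 32-bit loads. */
#ifndef CONFIG_64BIT
        smp_wmb();
        cfs_rq->load_last_update_time_copy = cfs_rq->avg.last_update_time;
#endif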
- */ -void idle_enter_fair(struct rq *this_rq) +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { + return cfs_rq->avg.last_update_time; } +#endif /* - * Update the rq's load with the elapsed idle time before a task is - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will - * be the only way to update the runnable statistic. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -void idle_exit_fair(struct rq *this_rq) +void remove_entity_load_avg(struct sched_entity *se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) @@ -3020,6 +3159,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -3040,11 +3199,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3111,19 +3274,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3197,7 +3349,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. 
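The schedstat_enabled() checks sprinkled through these hunks are cheap in the disabled case, assuming the gate is backed by a jump label like other scheduler toggles: the branch compiles to a NOP that is live-patched when the key flips. The pattern, as a sketch with invented names:

#include <linux/jump_label.h>
#include <linux/types.h>

/* Sketch of a schedstat_enabled()-style gate: zero-cost while off. */
static DEFINE_STATIC_KEY_FALSE(stats_key);

static inline bool stats_enabled(void)
{
        return static_branch_unlikely(&stats_key);
}

/* flipped at runtime, e.g. from a sysctl or debugfs handler: */
/*      static_branch_enable(&stats_key);   */
/*      static_branch_disable(&stats_key);  */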
*/ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3210,7 +3363,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3293,9 +3446,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4265,42 +4422,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ /* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * The exact cpuload calculated at every tick would be: + * + * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * If a cpu misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when cpu may be busy, then we have: + * + * load_n = (1 - 1/2^i)^n * load_0 + * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load * * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * load' = (1 - 1/2^i)^n * load + * + * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. + * This allows us to precompute the above in said factors, thereby allowing the + * reduction of an arbitrary n in O(log_2 n) steps. (See also + * fixed_power_int()) * * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. 
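The "mult/shift per set bit" idea the removed comment describes, and that decay_load_missed() keeps using, in runnable form. This uses row 1 of the table above, where each missed tick halves the load; the real function also short-circuits via degrade_zero_ticks[] and the idx == 1 shift fast path.

#include <stdio.h>

#define DEGRADE_SHIFT 7
/* degrade_factor row for idx == 1: factors for 1, 2, 4, ... missed ticks. */
static const unsigned char row1[DEGRADE_SHIFT + 1] = {64, 32, 8, 0, 0, 0, 0, 0};

/* Walk the set bits of `missed` and apply the precomputed factor for each
 * power-of-two chunk: O(log2 n) multiplies instead of n. */
static unsigned long decay(unsigned long load, unsigned long missed)
{
        int j = 0;

        while (missed) {
                if (missed & 1)
                        load = (load * row1[j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        /* 3 missed ticks at idx 1: bits 0 and 1, i.e. 1/2 then 1/4. */
        printf("%lu\n", decay(1024, 3));   /* 128 == 1024 / 2^3 */
        return 0;
}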
*/ #define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; + +static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 64, 32, 8, 0, 0, 0, 0, 0 }, + { 96, 72, 40, 12, 1, 0, 0, 0 }, + { 112, 98, 75, 43, 15, 1, 0, 0 }, + { 120, 112, 98, 76, 45, 16, 2, 0 } +}; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog @@ -4331,14 +4483,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) return load; } -/* +/** + * __update_cpu_load - update the rq->cpu_load[] statistics + * @this_rq: The rq to update statistics for + * @this_load: The current load + * @pending_updates: The number of missed updates + * @active: !0 for NOHZ_FULL + * * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. + * scheduler tick (TICK_NSEC). + * + * This function computes a decaying average: + * + * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load + * + * Because of NOHZ it might not get called on every tick, which creates the need + * for the @pending_updates argument. + * + * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 + * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load + * = A * (A * load[i]_n-2 + B) + B + * = A * (A * (A * load[i]_n-3 + B) + B) + B + * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B + * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B + * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B + * = (1 - 1/2^i)^n * (load[i]_0 - load) + load + * + * In the above we've assumed load_n := load, which is true for NOHZ_FULL as + * any change in load would have resulted in the tick being turned back on. + * + * For regular NOHZ, this reduces to: + * + * load[i]_n = (1 - 1/2^i)^n * load[i]_0 + * + * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra + * term. See the @active parameter. */ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) + unsigned long pending_updates, int active) { + unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; int i, scale; this_rq->nr_load_updates++; @@ -4352,6 +4536,15 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, old_load = this_rq->cpu_load[i]; old_load = decay_load_missed(old_load, pending_updates - 1, i); + if (tickless_load) { + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); + /* + * old_load can never be a negative value because a + * decayed tickless_load cannot be greater than the + * original tickless_load. + */ + old_load += tickless_load; + } new_load = this_load; /* * Round up the averaging division if load is increasing.
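The closed form derived in the comment above is easy to verify outside the kernel. A standalone sketch in userspace C (double arithmetic standing in for the kernel's 128-point fixed point), comparing n iterative updates against (1 - 1/2^i)^n * (load_0 - load) + load:

#include <math.h>
#include <stdio.h>

int main(void)
{
	double A = 1.0 - 1.0 / 4.0;	/* i = 2, so A = 1 - 1/2^i = 3/4 */
	double load0 = 1000.0, load = 200.0;
	double B = (1.0 - A) * load;	/* B = (1/2^i) * load */
	double iter = load0;
	int n = 10;

	for (int k = 0; k < n; k++)
		iter = A * iter + B;	/* load' = A * load + B, n times */

	double closed = pow(A, n) * (load0 - load) + load;

	printf("iterative %.4f, closed %.4f\n", iter, closed);	/* both ~245.05 */
	return 0;
}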
This @@ -4374,6 +4567,25 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON +static void __update_cpu_load_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load, + int active) +{ + unsigned long pending_updates; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. + */ + __update_cpu_load(this_rq, load, pending_updates, active); + } +} + /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -4391,46 +4603,31 @@ static unsigned long weighted_cpuload(const int cpu) * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_idle_cpu_load(struct rq *this_rq) +static void update_cpu_load_idle(struct rq *this_rq) { - unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = weighted_cpuload(cpu_of(this_rq)); - unsigned long pending_updates; - /* * bail if there's load or we're actually up-to-date. */ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (weighted_cpuload(cpu_of(this_rq))) return; - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); + __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); } /* * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. */ -void update_cpu_load_nohz(void) +void update_cpu_load_nohz(int active) { struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long pending_updates; + unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; if (curr_jiffies == this_rq->last_load_update_tick) return; raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } + __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); raw_spin_unlock(&this_rq->lock); } #endif /* CONFIG_NO_HZ */ @@ -4442,10 +4639,10 @@ void update_cpu_load_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); + __update_cpu_load(this_rq, load, 1, 1); } /* @@ -5032,8 +5229,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. However, the caller only guarantees p->pi_lock is held; no - * other assumptions, including the state of rq->lock, should be made. + * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 
*/ static void migrate_task_rq_fair(struct task_struct *p) { @@ -5746,8 +5942,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); } @@ -5880,8 +6076,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6327,7 +6523,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -6344,7 +6540,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6352,7 +6549,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; } @@ -7273,8 +7473,6 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; - idle_enter_fair(this_rq); - /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -7355,10 +7553,8 @@ out: if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; - if (pulled_task) { - idle_exit_fair(this_rq); + if (pulled_task) this_rq->idle_stamp = 0; - } return pulled_task; } @@ -7737,7 +7933,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + update_cpu_load_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8123,11 +8319,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -8175,21 +8368,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8271,7 +8472,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02..544a7133c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -97,12 +97,6 @@ void default_idle_call(void) static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { - /* Fall back to the default arch idle method on errors. */ - if (next_state < 0) { - default_idle_call(); - return next_state; - } - /* * The idle task must be scheduled, it is pointless to go to idle, just * update no idle residency and return. @@ -168,7 +162,7 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state >= 0) { + if (entered_state > 0) { local_irq_enable(); goto exit_idle; } @@ -219,6 +213,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fd..47ce94931 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { - idle_exit_fair(rq); rq_last_tick_reset(rq); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0..a774b4dbf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; - hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + /* + * SCHED_DEADLINE updates the bandwidth, as a runaway + * RT task with a DL task could hog a CPU. But DL does + * not reset the period. If a deadline task was running + * without an RT task running, it can cause RT tasks to + * throttle when they start up. Kick the timer right away + * to update the period.
+ */ + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); } raw_spin_unlock(&rt_b->rt_runtime_lock); @@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1166,7 +1174,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1179,26 +1210,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. 
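The SAVE/MOVE convention that move_entity() encodes is small enough to check exhaustively. A userspace sketch, assuming the flag values this patch adds to sched.h later in the diff (DEQUEUE_SAVE = 0x02, DEQUEUE_MOVE = 0x04):

#include <stdbool.h>
#include <stdio.h>

#define DEQUEUE_SAVE	0x02
#define DEQUEUE_MOVE	0x04

static bool move_entity(unsigned int flags)
{
	/* only SAVE without MOVE leaves the entity's list position alone */
	return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
}

int main(void)
{
	printf("%d\n", move_entity(0));					/* 1: ordinary (de)queue */
	printf("%d\n", move_entity(DEQUEUE_SAVE));			/* 0: save in place */
	printf("%d\n", move_entity(DEQUEUE_SAVE | DEQUEUE_MOVE));	/* 1: save and relocate */
	return 0;
}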
*/ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1207,7 +1249,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1220,31 +1262,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1260,7 +1302,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1271,7 +1313,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775bf..ef5875fff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> +#include <linux/binfmts.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include 
<linux/stop_machine.h> @@ -248,7 +249,12 @@ struct task_group { unsigned long shares; #ifdef CONFIG_SMP - atomic_long_t load_avg; + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; #endif #endif @@ -308,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); @@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -896,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + #else static inline void sched_ttwu_pending(void) { } @@ -933,6 +958,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif @@ -1008,6 +1034,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; static inline u64 global_rt_period(void) { @@ -1076,7 +1103,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the control dependency and rmb in try_to_wake_up(). + * Pairs with the smp_cond_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif @@ -1113,59 +1140,43 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WEIGHT_IDLEPRIO 3 #define WMULT_IDLEPRIO 1431655765 -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. 
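The "~10% less CPU time" rule in the comment above can be checked directly against the table entries for nice 0 and nice 1 (1024 and 820). A quick userspace computation:

#include <stdio.h>

int main(void)
{
	int w0 = 1024, w1 = 820;	/* weights for nice 0 and nice 1 */

	printf("nice 0 share: %.1f%%\n", 100.0 * w0 / (w0 + w1));	/* ~55.5% */
	printf("nice 1 share: %.1f%%\n", 100.0 * w1 / (w0 + w1));	/* ~44.5% */
	printf("weight ratio: %.2f\n", (double)w0 / w1);		/* ~1.25 */
	return 0;
}

Moving one of two otherwise equal tasks from nice 0 to nice 1 drops its share from 50% to roughly 44.5%, i.e. about 10% relative, which is the effect the comment continues to describe.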
- * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; /* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) @@ -1252,16 +1263,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_enter_fair(struct rq *this_rq); -extern void idle_exit_fair(struct rq *this_rq); - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -#else - -static inline void idle_enter_fair(struct rq *rq) { } -static inline void idle_exit_fair(struct rq *rq) { } - #endif #ifdef CONFIG_CPU_IDLE diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632..70b3b6a20 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define 
schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val) do { var = (val); } while (0) +# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} +# define schedstat_enabled() 0 # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000..82f0dff90 --- /dev/null +++ b/kernel/sched/swait.c @@ -0,0 +1,123 @@ +#include <linux/sched.h> +#include <linux/swait.h> + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key) +{ + raw_spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value: I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). + */ +void swake_up_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + + if (list_empty(&q->task_list)) + return; + + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up(struct swait_queue_head *q) +{ + unsigned long flags; + + if (!swait_active(q)) + return; + + raw_spin_lock_irqsave(&q->lock, flags); + swake_up_locked(q); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ-disabled context, since we must be able to + * release IRQs to guarantee a bounded hold time.
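For orientation, a hedged usage sketch of this new simple wait-queue API, using the declaration macros from <linux/swait.h>; the my_* names and the condition flag are illustrative, not part of this patch:

static DECLARE_SWAIT_QUEUE_HEAD(my_swq);
static bool my_cond;

static void my_waiter(void)
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait(&my_swq, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(my_cond))
			break;
		schedule();
	}
	finish_swait(&my_swq, &wait);
}

static void my_waker(void)
{
	WRITE_ONCE(my_cond, true);
	swake_up(&my_swq);	/* wakes at most one waiter */
}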
+ */ +void swake_up_all(struct swait_queue_head *q) +{ + struct swait_queue *curr; + LIST_HEAD(tmp); + + if (!swait_active(q)) + return; + + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { + curr = list_first_entry(&tmp, typeof(*curr), task_list); + + wake_up_state(curr->task, TASK_NORMAL); + list_del_init(&curr->task_list); + + if (list_empty(&tmp)) + break; + + raw_spin_unlock_irq(&q->lock); + raw_spin_lock_irq(&q->lock); + } + raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + wait->task = current; + if (list_empty(&wait->task_list)) + list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&q->lock, flags); + __prepare_to_swait(q, wait); + set_current_state(state); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + prepare_to_swait(q, wait, state); + + return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait->task_list)) { + raw_spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + raw_spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_swait); diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a97..0508544c8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 28c8e736d..d264f59bf 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -174,7 +174,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) if (tsk) return 0; - td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu)); + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); if (!td) return -ENOMEM; td->cpu = cpu; diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f..8aae49dd7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) { #ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); + current->preempt_disable_ip = get_lock_parent_ip(); #endif - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); } } EXPORT_SYMBOL(__local_bh_disable_ip); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a3bbaee77..a467e6c28 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -28,7 +28,6 @@ */ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ - bool executed; /* 
actually executed? */ int ret; /* collected return value */ struct completion completion; /* fired if nr_todo reaches 0 */ }; @@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) } /* signal completion unless @done is NULL */ -static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +static void cpu_stop_signal_done(struct cpu_stop_done *done) { - if (done) { - if (executed) - done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); - } + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, @@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper, } /* queue @work to @stopper. if offline, @work is completed immediately */ -static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) +static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); unsigned long flags; + bool enabled; spin_lock_irqsave(&stopper->lock, flags); - if (stopper->enabled) + enabled = stopper->enabled; + if (enabled) __cpu_stop_queue_work(stopper, work); - else - cpu_stop_signal_done(work->done, false); + else if (work->done) + cpu_stop_signal_done(work->done); spin_unlock_irqrestore(&stopper->lock, flags); + + return enabled; } /** @@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; cpu_stop_init_done(&done, 1); - cpu_stop_queue_work(cpu, &work); + if (!cpu_stop_queue_work(cpu, &work)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + return done.ret; } /* This controls the threads on each CPU. */ @@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * struct cpu_stop_work work1, work2; struct multi_stop_data msdata; - preempt_disable(); msdata = (struct multi_stop_data){ .fn = fn, .data = arg, @@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * if (cpu1 > cpu2) swap(cpu1, cpu2); - if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { - preempt_enable(); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) return -ENOENT; - } - - preempt_enable(); wait_for_completion(&done.completion); - - return done.executed ? done.ret : -ENOENT; + return done.ret; } /** @@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * * CONTEXT: * Don't care. + * + * RETURNS: + * true if cpu_stop_work was queued successfully and @fn will be called, + * false otherwise. 
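Given the RETURNS contract above, fire-and-forget callers can now detect an offline stopper instead of silently losing work. A sketch of a caller honoring the new return value (kick_cpu, kick_fn and the static work buffer are hypothetical names):

static struct cpu_stop_work kick_work;	/* must stay valid until the callback runs */

static int kick_cpu(unsigned int cpu, cpu_stop_fn_t kick_fn, void *arg)
{
	if (!stop_one_cpu_nowait(cpu, kick_fn, arg, &kick_work))
		return -ENOENT;	/* stopper offline: kick_fn will never run */
	return 0;
}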
*/ -void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, +bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; - cpu_stop_queue_work(cpu, work_buf); + return cpu_stop_queue_work(cpu, work_buf); } /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static void queue_stop_cpus_work(const struct cpumask *cpumask, +static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) { struct cpu_stop_work *work; unsigned int cpu; + bool queued = false; /* * Disable preemption while queueing to avoid getting @@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, work->fn = fn; work->arg = arg; work->done = done; - cpu_stop_queue_work(cpu, work); + if (cpu_stop_queue_work(cpu, work)) + queued = true; } lg_global_unlock(&stop_cpus_lock); + + return queued; } static int __stop_cpus(const struct cpumask *cpumask, @@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); + if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + return done.ret; } /** @@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; - int ret; repeat: work = NULL; @@ -448,23 +450,19 @@ repeat: cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN] __maybe_unused; - - /* cpu stop callbacks are not allowed to sleep */ - preempt_disable(); + int ret; + /* cpu stop callbacks must not sleep, make in_atomic() == T */ + preempt_count_inc(); ret = fn(arg); - if (ret) - done->ret = ret; - - /* restore preemption and check it's still balanced */ - preempt_enable(); + if (done) { + if (ret) + done->ret = ret; + cpu_stop_signal_done(done); + } + preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %s(%p) leaked preempt count\n", - kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, - ksym_buf), arg); - - cpu_stop_signal_done(done, true); + "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); goto repeat; } } @@ -531,8 +529,6 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) - static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { @@ -630,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; } - -#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0623787ec..2c5e3a8e0 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid); cond_syscall(sys_setfsgid); cond_syscall(sys_capget); cond_syscall(sys_capset); +cond_syscall(sys_copy_file_range); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d66..f5102fabe 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,7 +173,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = 
SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_latencytop, }, #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -1568,6 +1579,28 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif { } }; @@ -1735,6 +1768,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; @@ -2047,9 +2094,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, void *data) { int *i, vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2078,15 +2124,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first=0) { @@ -2094,11 +2134,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, bool neg; if (write) { - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); if (!left) break; - err = proc_get_long(&kbuf, &left, &lval, &neg, + err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2125,10 +2165,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err && left) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? 
: -EINVAL; } @@ -2310,9 +2349,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2340,15 +2378,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first = 0) { @@ -2357,9 +2389,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (write) { bool neg; - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); - err = proc_get_long(&kbuf, &left, &val, &neg, + err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2385,10 +2417,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2650,34 +2681,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - unsigned long page = 0; - char *kbuf; + char *kbuf, *p; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - free_page(page); - return -EFAULT; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), GFP_KERNEL); if (!tmp_bitmap) { - free_page(page); + kfree(kbuf); return -ENOMEM; } - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; bool neg; - err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); if (err) break; @@ -2688,12 +2712,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, val_b = val_a; if (left) { - kbuf++; + p++; left--; } if (c == '-') { - err = proc_get_long(&kbuf, &left, &val_b, + err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); if (err) @@ -2704,16 +2728,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, break; } if (left) { - kbuf++; + p++; left--; } } bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); } - free_page(page); + kfree(kbuf); } else { unsigned long bit_a, bit_b = 0; diff --git a/kernel/task_work.c b/kernel/task_work.c index bce3211e7..53fa971d0 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -118,4 +118,3 @@ void task_work_run(void) } while (work); } } -EXPORT_SYMBOL_GPL(task_work_run); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 7fbba635a..e840ed867 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -271,11 +271,27 @@ static int alarmtimer_suspend(struct device *dev) __pm_wakeup_event(ws, 
MSEC_PER_SEC); return ret; } + +static int alarmtimer_resume(struct device *dev) +{ + struct rtc_device *rtc; + + rtc = alarmtimer_get_rtcdev(); + if (rtc) + rtc_timer_cancel(rtc, &rtctimer); + return 0; +} + #else static int alarmtimer_suspend(struct device *dev) { return 0; } + +static int alarmtimer_resume(struct device *dev) +{ + return 0; +} #endif static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) @@ -800,6 +816,7 @@ out: /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { .suspend = alarmtimer_suspend, + .resume = alarmtimer_resume, }; static struct platform_driver alarmtimer_driver = { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1347882d1..664de5392 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -218,8 +218,8 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. */ if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { - pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", - cs->name); + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", + smp_processor_id(), cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 149cc8086..6df8927c5 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,8 +16,11 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/rtc.h> +#include <linux/math64.h> #include "ntp_internal.h" +#include "timekeeping_internal.h" + /* * NTP timekeeping variables: @@ -70,7 +73,7 @@ static long time_esterror = NTP_PHASE_LIMIT; static s64 time_freq; /* time at last adjustment (secs): */ -static long time_reftime; +static time64_t time_reftime; static long time_adjust; @@ -297,25 +300,27 @@ static void ntp_update_offset(long offset) if (!(time_status & STA_PLL)) return; - if (!(time_status & STA_NANO)) + if (!(time_status & STA_NANO)) { + /* Make sure the multiplication below won't overflow */ + offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); offset *= NSEC_PER_USEC; + } /* * Scale the phase adjustment and * clamp to the operating range. */ - offset = min(offset, MAXPHASE); - offset = max(offset, -MAXPHASE); + offset = clamp(offset, -MAXPHASE, MAXPHASE); /* * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - secs = get_seconds() - time_reftime; + secs = (long)(__ktime_get_real_seconds() - time_reftime); if (unlikely(time_status & STA_FREQHOLD)) secs = 0; - time_reftime = get_seconds(); + time_reftime = __ktime_get_real_seconds(); offset64 = offset; freq_adj = ntp_update_offset_fll(offset64, secs); @@ -390,10 +395,11 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(unsigned long secs) +int second_overflow(time64_t secs) { s64 delta; int leap = 0; + s32 rem; /* * Leap second processing. 
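The clamp added to ntp_update_offset() above protects the microsecond-to-nanosecond scaling that follows it. A worked illustration in kernel style, assuming a 32-bit long:

long offset = 3000000;	/* 3 s, expressed in microseconds */

/*
 * Unclamped, offset * NSEC_PER_USEC would be 3,000,000,000, which
 * overflows a 32-bit long (LONG_MAX is 2,147,483,647) before the
 * later MAXPHASE clamp ever sees the value.
 */
offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC);	/* now within +-1000000 */
offset *= NSEC_PER_USEC;	/* at most 1e9 ns, which always fits */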
If in leap-insert state at the end of the @@ -404,19 +410,19 @@ int second_overflow(unsigned long secs) case TIME_OK: if (time_status & STA_INS) { time_state = TIME_INS; - ntp_next_leap_sec = secs + SECS_PER_DAY - - (secs % SECS_PER_DAY); + div_s64_rem(secs, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } else if (time_status & STA_DEL) { time_state = TIME_DEL; - ntp_next_leap_sec = secs + SECS_PER_DAY - - ((secs+1) % SECS_PER_DAY); + div_s64_rem(secs + 1, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: if (!(time_status & STA_INS)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - } else if (secs % SECS_PER_DAY == 0) { + } else if (secs == ntp_next_leap_sec) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -427,7 +433,7 @@ int second_overflow(unsigned long secs) if (!(time_status & STA_DEL)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - } else if ((secs + 1) % SECS_PER_DAY == 0) { + } else if (secs == ntp_next_leap_sec) { leap = 1; ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; @@ -590,7 +596,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) * reference time to current time. */ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = get_seconds(); + time_reftime = __ktime_get_real_seconds(); /* only set allowed bits */ time_status &= STA_RONLY; @@ -674,8 +680,24 @@ int ntp_validate_timex(struct timex *txc) return -EINVAL; } - if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) - return -EPERM; + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } + } /* * Check for potential multiplication overflows that can diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index af924470e..d8a7c11fa 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -6,7 +6,7 @@ extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(unsigned long secs); +extern int second_overflow(time64_t secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 22c57e191..0b1742434 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -36,16 +36,17 @@ */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); -/* - * The time, when the last jiffy update happened. Protected by jiffies_lock. - */ -static ktime_t last_jiffies_update; - struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + /* * Must be called with interrupts disabled ! 
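The div_s64_rem() form used in second_overflow() above also sidesteps a direct 64-bit modulo, which 32-bit kernels cannot emit without libgcc helpers now that secs is time64_t. A worked example with hypothetical values:

s32 rem;
time64_t secs = 1435708800 + 12345;	/* 12345 s into 2015-07-01 (UTC) */

div_s64_rem(secs, 86400, &rem);		/* SECS_PER_DAY = 86400, so rem = 12345 */
/* ntp_next_leap_sec = secs + 86400 - rem, the midnight ending that UTC day */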
*/ @@ -143,7 +144,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; } @@ -151,6 +152,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } +#endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; @@ -387,7 +389,7 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode @@ -430,7 +432,7 @@ static void tick_nohz_update_jiffies(ktime_t now) tick_do_update_jiffies64(now); local_irq_restore(flags); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } /* @@ -603,15 +605,31 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* * If the tick is due in the next period, keep it ticking or - * restart it proper. + * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { tick.tv64 = 0; + /* + * We've not stopped the tick yet, and there's a timer in the + * next period, so no point in stopping it either, bail. + */ if (!ts->tick_stopped) goto out; + + /* + * If, OTOH, we did stop it, but there's a pending (expired) + * timer reprogram the timer hardware to fire now. + * + * We will not restart the tick proper, just prod the timer + * hardware into firing an interrupt to process the pending + * timers. Just like tick_irq_exit() will not restart the tick + * for 'normal' interrupts. + * + * Only once we exit the idle loop will we re-enable the tick, + * see tick_nohz_idle_exit(). + */ if (delta == 0) { - /* Tick is stopped, but required now. Enforce it */ tick_nohz_restart(ts, now); goto out; } @@ -694,14 +712,14 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(); + update_cpu_load_nohz(active); calc_load_exit_idle(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ @@ -725,7 +743,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (can_stop_full_tick()) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get()); + tick_nohz_restart_sched_tick(ts, ktime_get(), 1); #endif } @@ -875,7 +893,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) return; /* * We stopped the tick in idle. 
Update process times would miss the @@ -916,7 +934,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); + tick_nohz_restart_sched_tick(ts, now, 0); tick_nohz_account_idle_ticks(ts); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 99188ee5d..34b4cedfa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -845,6 +845,19 @@ time64_t ktime_get_real_seconds(void) } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); +/** + * __ktime_get_real_seconds - The same as ktime_get_real_seconds, + * but without the sequence counter protection. This internal function + * must only be called when the timekeeping lock is already held. + */ +time64_t __ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->xtime_sec; +} + + #ifdef CONFIG_NTP_PPS /** @@ -958,7 +971,7 @@ int timekeeping_inject_offset(struct timespec *ts) struct timespec64 ts64, tmp; int ret = 0; - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + if (!timespec_inject_offset_valid(ts)) return -EINVAL; ts64 = timespec_to_timespec64(*ts); @@ -1591,9 +1604,12 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, { s64 interval = tk->cycle_interval; s64 xinterval = tk->xtime_interval; + u32 base = tk->tkr_mono.clock->mult; + u32 max = tk->tkr_mono.clock->maxadj; + u32 cur_adj = tk->tkr_mono.mult; s64 tick_error; bool negative; - u32 adj; + u32 adj_scale; /* Remove any current error adj from freq calculation */ if (tk->ntp_err_mult) @@ -1612,13 +1628,33 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, /* preserve the direction of correction */ negative = (tick_error < 0); - /* Sort out the magnitude of the correction */ + /* If any adjustment would pass the max, just return */ + if (negative && (cur_adj - 1) <= (base - max)) + return; + if (!negative && (cur_adj + 1) >= (base + max)) + return; + /* + * Sort out the magnitude of the correction, but + * avoid making so large a correction that we go + * over the max adjustment. + */ + adj_scale = 0; tick_error = abs(tick_error); - for (adj = 0; tick_error > interval; adj++) + while (tick_error > interval) { + u32 adj = 1 << (adj_scale + 1); + + /* Check if adjustment gets us within 1 unit from the max */ + if (negative && (cur_adj - adj) <= (base - max)) + break; + if (!negative && (cur_adj + adj) >= (base + max)) + break; + + adj_scale++; tick_error >>= 1; + } /* scale the corrections */ - timekeeping_apply_adjustment(tk, offset, negative, adj); + timekeeping_apply_adjustment(tk, offset, negative, adj_scale); } /* diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ea005a7f..5be76270e 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -17,7 +17,11 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) { cycle_t ret = (now - last) & mask; - return (s64) ret > 0 ? ret : 0; + /* + * Prevent time going backwards by checking the MSB of mask in + * the result. If set, return 0. + */ + return ret & ~(mask >> 1) ?
0 : ret; } #else static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) @@ -26,4 +30,6 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) } #endif +extern time64_t __ktime_get_real_seconds(void); + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a990824c8..2aeb6ffc0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, if (count >= BLK_TN_MAX_MSG) return -EINVAL; - msg = kmalloc(count + 1, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } + msg = memdup_user_nul(buffer, count); + if (IS_ERR(msg)) + return PTR_ERR(msg); - msg[count] = '\0'; bt = filp->private_data; __trace_note_message(bt, "%s", msg); kfree(msg); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4228fd368..326a75e88 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (!event) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; + struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (unlikely(!event)) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; @@ -316,7 +322,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -static struct bpf_verifier_ops kprobe_prog_ops = { +static const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f743b147..57a6eea84 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,8 +62,6 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) - #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ @@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; -static struct 
ftrace_ops control_ops; - -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -203,7 +196,7 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static void control_ops_disable_all(struct ftrace_ops *ops) +static void per_cpu_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops) *per_cpu_ptr(ops->disabled, cpu) = 1; } -static int control_ops_alloc(struct ftrace_ops *ops) +static int per_cpu_ops_alloc(struct ftrace_ops *ops) { int __percpu *disabled; + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) + return -EINVAL; + disabled = alloc_percpu(int); if (!disabled) return -ENOMEM; ops->disabled = disabled; - control_ops_disable_all(ops); + per_cpu_ops_disable_all(ops); return 0; } @@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { } static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic ops or we force list func, + * If this is a dynamic, RCU, or per CPU ops, or we force list func, * then it needs to call the list anyway. */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU | + FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC) return ftrace_ops_list_func; return ftrace_ops_get_func(ops); @@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } -static void add_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int first = *list == &ftrace_list_end; - add_ftrace_ops(list, ops); - if (first) - add_ftrace_ops(&ftrace_ops_list, main_ops); -} - -static int remove_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int ret = remove_ftrace_ops(list, ops); - if (!ret && *list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); - return ret; -} - static void ftrace_update_trampoline(struct ftrace_ops *ops); static int __register_ftrace_function(struct ftrace_ops *ops) @@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - if (control_ops_alloc(ops)) + if (ops->flags & FTRACE_OPS_FL_PER_CPU) { + if (per_cpu_ops_alloc(ops)) return -ENOMEM; - add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - /* The control_ops needs the trampoline update */ - ops = &control_ops; - } else - add_ftrace_ops(&ftrace_ops_list, ops); + } + + add_ftrace_ops(&ftrace_ops_list, ops); /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; @@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - ret = remove_ftrace_list_ops(&ftrace_control_list, - &control_ops, ops); - } else - ret = remove_ftrace_ops(&ftrace_ops_list, ops); + ret = remove_ftrace_ops(&ftrace_ops_list, ops); if (ret < 0) return ret; @@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (all) { /* * 
Only the filter_hash affects all records. @@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); } -static void print_ip_ins(const char *fmt, unsigned char *p) +static void print_ip_ins(const char *fmt, const unsigned char *p) { int i; @@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + +enum ftrace_bug_type ftrace_bug_type; +const void *ftrace_expected; + +static void print_bug_type(void) +{ + switch (ftrace_bug_type) { + case FTRACE_BUG_UNKNOWN: + break; + case FTRACE_BUG_INIT: + pr_info("Initializing ftrace call sites\n"); + break; + case FTRACE_BUG_NOP: + pr_info("Setting ftrace call site to NOP\n"); + break; + case FTRACE_BUG_CALL: + pr_info("Setting ftrace call site to call ftrace function\n"); + break; + case FTRACE_BUG_UPDATE: + pr_info("Updating ftrace call site to call a different ftrace function\n"); + break; + } +} /** * ftrace_bug - report and shutdown function tracer @@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); - print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); + if (ftrace_expected) { + print_ip_ins(" expected: ", ftrace_expected); + pr_cont("\n"); + } break; case -EPERM: FTRACE_WARN_ON_ONCE(1); @@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } + print_bug_type(); if (rec) { struct ftrace_ops *ops = NULL; @@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) rec->flags & FTRACE_FL_REGS ? " R" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - pr_cont("\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + pr_cont("\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else pr_cont("\ttramp: ERROR!"); } ip = ftrace_get_addr_curr(rec); - pr_cont(" expected tramp: %lx\n", ip); + pr_cont("\n expected tramp: %lx\n", ip); } } @@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { unsigned long flag = 0UL; + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + + if (rec->flags & FTRACE_FL_DISABLED) + return FTRACE_UPDATE_IGNORE; + /* * If we are updating calls: * @@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * from the save regs, to a non-save regs function or * vice versa, or from a trampoline call. 
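
How ftrace_check_record() resolves to an action after this change, distilled as a reading aid (a summary comment, not kernel code):

/*
 *   rec marked FTRACE_FL_DISABLED        -> FTRACE_UPDATE_IGNORE
 *   should trace, currently a nop        -> FTRACE_UPDATE_MAKE_CALL
 *                                           (ftrace_bug_type = FTRACE_BUG_CALL)
 *   already calls, but the wrong variant
 *   (save-regs / trampoline mismatch)    -> FTRACE_UPDATE_MODIFY_CALL
 *                                           (ftrace_bug_type = FTRACE_BUG_UPDATE)
 *   should stop tracing, currently calls -> FTRACE_UPDATE_MAKE_NOP
 *                                           (ftrace_bug_type = FTRACE_BUG_NOP)
 *   nothing to change                    -> FTRACE_UPDATE_IGNORE
 */
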
*/ - if (flag & FTRACE_FL_ENABLED) + if (flag & FTRACE_FL_ENABLED) { + ftrace_bug_type = FTRACE_BUG_CALL; return FTRACE_UPDATE_MAKE_CALL; + } + ftrace_bug_type = FTRACE_BUG_UPDATE; return FTRACE_UPDATE_MODIFY_CALL; } @@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) FTRACE_FL_REGS_EN); } + ftrace_bug_type = FTRACE_BUG_NOP; return FTRACE_UPDATE_MAKE_NOP; } @@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) } static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, + struct ftrace_ops *op) +{ + unsigned long ip = rec->ip; + + while_for_each_ftrace_op(op) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } + + return NULL; +} + +static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { struct ftrace_ops *op; @@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; case FTRACE_UPDATE_MAKE_CALL: + ftrace_bug_type = FTRACE_BUG_CALL; return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: + ftrace_bug_type = FTRACE_BUG_NOP; return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: + ftrace_bug_type = FTRACE_BUG_UPDATE; return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } @@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { + ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); return 0; } @@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } -static void control_ops_free(struct ftrace_ops *ops) +static void per_cpu_ops_free(struct ftrace_ops *ops) { free_percpu(ops->disabled); } @@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are control ops, they still need their + * If these are per_cpu ops, they still need their * per_cpu field freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); return 0; } @@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the control + * The same goes for freeing the per_cpu data of the per_cpu * ops. * * Again, normal synchronize_sched() is not good enough. @@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. 
*/ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { schedule_on_each_cpu(ftrace_sync); arch_ftrace_trampoline_free(ops); - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); } return 0; @@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return 0; - /* If ops traces all mods, we already accounted for it */ + /* If ops traces all then it includes this function */ if (ops_traces_mod(ops)) - return 0; + return 1; /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && @@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 1; } -static int referenced_filters(struct dyn_ftrace *rec) -{ - struct ftrace_ops *ops; - int cnt = 0; - - for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; - } - - return cnt; -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long update_cnt = 0; - unsigned long ref = 0; - bool test = false; + unsigned long rec_flags = 0; int i; + start = ftrace_now(raw_smp_processor_id()); + /* - * When adding a module, we need to check if tracers are - * currently enabled and if they are set to trace all functions. - * If they are, we need to enable the module functions as well - * as update the reference counts for those function records. + * When a module is loaded, this function is called to convert + * the calls to mcount in its text to nops, and also to create + * an entry in the ftrace data. Now, if ftrace is activated + * after this call, but before the module sets its text to + * read-only, the modification of enabling ftrace can fail if + * the read-only is done while ftrace is converting the calls. + * To prevent this, the module's records are set as disabled + * and will be enabled after the call to set the module's text + * to read-only. */ - if (mod) { - struct ftrace_ops *ops; - - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - if (ops_traces_mod(ops)) - ref++; - else - test = true; - } - } - } - - start = ftrace_now(raw_smp_processor_id()); + if (mod) + rec_flags |= FTRACE_FL_DISABLED; for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { - int cnt = ref; /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - if (test) - cnt += referenced_filters(p); - p->flags = cnt; + p->flags = rec_flags; /* * Do the initial record conversion from mcount jump @@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) break; update_cnt++; - - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. 
- */ - if (ftrace_start_up && cnt) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p); - } } } @@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) { - struct ftrace_ops *ops = NULL; + struct ftrace_ops *ops; seq_printf(m, " (%ld)%s%s", ftrace_rec_count(rec), @@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v) rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - seq_printf(m, "\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + seq_printf(m, "\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + add_trampoline_func(m, ops, rec); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else seq_puts(m, "\ttramp: ERROR!"); - + } else { + add_trampoline_func(m, NULL, rec); } - add_trampoline_func(m, ops, rec); } seq_putc(m, '\n'); @@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod, #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; @@ -4940,44 +4961,85 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +void ftrace_module_enable(struct module *mod) { - if (ftrace_disabled || start == end) - return; - ftrace_process_locs(mod, start, end); -} + struct dyn_ftrace *rec; + struct ftrace_page *pg; -void ftrace_module_init(struct module *mod) -{ - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); -} + mutex_lock(&ftrace_lock); -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; + if (ftrace_disabled) + goto out_unlock; - if (val == MODULE_STATE_GOING) - ftrace_release_mod(mod); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + * + * We also delay this to after the module code already set the + * text to read-only, as we now need to set it back to read-write + * so that we can modify the text. + */ + if (ftrace_start_up) + ftrace_arch_code_modify_prepare(); - return 0; + do_for_each_ftrace_rec(pg, rec) { + int cnt; + /* + * do_for_each_ftrace_rec() is a double loop. + * module text shares the pg. If a record is + * not part of this module, then skip this pg, + * which the "break" will do. + */ + if (!within_module_core(rec->ip, mod)) + break; + + cnt = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are, and can trace this record, + * we need to enable the module functions as well as update the + * reference counts for those function records. 
+ */ + if (ftrace_start_up) + cnt += referenced_filters(rec); + + /* This clears FTRACE_FL_DISABLED */ + rec->flags = cnt; + + if (ftrace_start_up && cnt) { + int failed = __ftrace_replace_code(rec, 1); + if (failed) { + ftrace_bug(failed, rec); + goto out_loop; + } + } + + } while_for_each_ftrace_rec(); + + out_loop: + if (ftrace_start_up) + ftrace_arch_code_modify_post_process(); + + out_unlock: + mutex_unlock(&ftrace_lock); } -#else -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) + +void ftrace_module_init(struct module *mod) { - return 0; + if (ftrace_disabled || !mod->num_ftrace_callsites) + return; + + ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_exit_nb = { - .notifier_call = ftrace_module_notify_exit, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5006,10 +5068,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_exit_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; @@ -5116,44 +5174,6 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) - return; - - /* - * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). - */ - preempt_disable_notrace(); - trace_recursion_set(TRACE_CONTROL_BIT); - - /* - * Control funcs (perf) uses RCU. Only trace if - * RCU is currently active. - */ - if (!rcu_is_watching()) - goto out; - - do_for_each_ftrace_op(op, ftrace_control_list) { - if (!(op->flags & FTRACE_OPS_FL_STUB) && - !ftrace_function_local_disabled(op) && - ftrace_ops_test(op, ip, regs)) - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - out: - trace_recursion_clear(TRACE_CONTROL_BIT); - preempt_enable_notrace(); -} - -static struct ftrace_ops control_ops = { - .func = ftrace_ops_control_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(control_ops) -}; - static inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) @@ -5170,8 +5190,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); + do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) { + /* + * Check the following for each ops before calling their func: + * if RCU flag is set, then rcu_is_watching() must be true + * if PER_CPU is set, then ftrace_function_local_disable() + * must be false + * Otherwise test if the ip matches the ops filter + * + * If any of the above fails then the op->func() is not executed. + */ + if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && + (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) && + ftrace_ops_test(op, ip, regs)) { + if (FTRACE_WARN_ON(!op->func)) { pr_warn("op=%p %pS\n", op, op); goto out; @@ -5195,7 +5229,7 @@ out: * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. 
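
Taken together, the module hunks change when a module's records go live. A condensed, hypothetical timeline (the module-loader call sites are not part of this diff; ordering inferred from the comments above):

/*
 * load_module()
 *   ftrace_module_init(mod);    // mcount sites -> NOP, FTRACE_FL_DISABLED
 *   ...module loader maps the module text read-only...
 *   ftrace_module_enable(mod);  // recount references, clear DISABLED,
 *                               // patch enabled records live
 *
 * module unload or failed load
 *   ftrace_release_mod(mod);    // now called directly; the exit
 *                               // notifier above is removed
 */
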
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still - * set the ARCH_SUPPORT_FTARCE_OPS. + * set the ARCH_SUPPORTS_FTRACE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -5212,20 +5246,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) /* * If there's only one function registered but it does not support - * recursion, this function will be called by the mcount trampoline. - * This function will handle recursion protection. + * recursion, needs RCU protection and/or requires per cpu handling, then + * this function will be called by the mcount trampoline. */ -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, +static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { int bit; + if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) + return; + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; - op->func(ip, parent_ip, op, regs); + preempt_disable_notrace(); + if (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) { + op->func(ip, parent_ip, op, regs); + } + + preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -5243,12 +5286,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the func handles its own recursion, call it directly. - * Otherwise call the recursion protected function that - * will call the ftrace ops function. + * If the function does not handle recursion, needs to be RCU safe, + * or does per cpu logic, then we need to call the assist handler. */ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - return ftrace_ops_recurs_func; + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || + ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU)) + return ftrace_ops_assist_func; return ops->func; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c6045a27..95181e368 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old, /* * rb_tail_page_update - move the tail page forward - * - * Returns 1 if moved tail page, 0 if someone else did. */ -static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, +static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { - struct buffer_page *old_tail; unsigned long old_entries; unsigned long old_write; - int ret = 0; /* * The tail page now needs to be moved forward. @@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * it is, then it is up to us to update the tail * pointer. 
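
rb_tail_page_update() can now discard the cmpxchg() result because every racer wants the same outcome. A minimal generic sketch of the idiom (page_cursor and advance_tail() are hypothetical stand-ins, not ring-buffer code):

#include <linux/atomic.h>

struct page_cursor {
	void *tail;
};

static void advance_tail(struct page_cursor *c, void *old_page, void *new_page)
{
	/* The writer and any interrupt that preempted it race to move
	 * tail from old_page to new_page. Exactly one cmpxchg() wins,
	 * but since both supply the same new value, the loser needs no
	 * retry and the return value can be ignored. */
	(void)cmpxchg(&c->tail, old_page, new_page);
}
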
*/ - if (tail_page == cpu_buffer->tail_page) { + if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; @@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - old_tail = cmpxchg(&cpu_buffer->tail_page, - tail_page, next_page); - - if (old_tail == tail_page) - ret = 1; + /* Again, either we update tail_page or an interrupt does */ + (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); } - - return ret; } static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, @@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { + struct buffer_page *buffer_tail_page; + + buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ - if (cpu_buffer->tail_page != tail_page && - cpu_buffer->tail_page != next_page) + if (buffer_tail_page != tail_page && + buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); @@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; - u64 ts; next_page = tail_page; @@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, } } - ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); - if (ret) { - /* - * Nested commits always have zero deltas, so - * just reread the time stamp - */ - ts = rb_time_stamp(buffer); - next_page->page->time_stamp = ts; - } + rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now. 
*/ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, addr = (unsigned long)event; addr &= PAGE_MASK; - bpage = cpu_buffer->tail_page; + bpage = READ_ONCE(cpu_buffer->tail_page); if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = @@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) again: max_count = cpu_buffer->nr_pages * 100; - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, @@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; + /* Only update the write stamp if the page has an event */ + if (rb_page_write(cpu_buffer->commit_page)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } @@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - tail_page = info->tail_page = cpu_buffer->tail_page; + /* Don't let the compiler play games with cpu_buffer->tail_page */ + tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 919d9d076..8414fa40b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -363,8 +363,8 @@ struct trace_option_dentry { * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer - * @start: called when tracing is unpaused (echo 1 > tracing_enabled) - * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @start: called when tracing is unpaused (echo 1 > tracing_on) + * @stop: called when tracing is paused (echo 0 > tracing_on) * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened @@ -467,8 +467,6 @@ enum { TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, - TRACE_CONTROL_BIT, - TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cc9f7a931..00df25fd8 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; return register_ftrace_function(ops); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d202d991e..05ddc0820 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1343,15 +1343,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_data(filp); @@ -1359,7 +1353,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; @@ -1510,18 +1504,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 42a4009fd..b38f617b6 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -237,28 +237,23 @@ static ssize_t event_trigger_regex_write(struct file *file, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long)buf); - return -EFAULT; - } - buf[cnt] = '\0'; strim(buf); mutex_lock(&event_mutex); event_file = event_file_data(file); if (unlikely(!event_file)) { mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); return -ENODEV; } ret = trigger_process_regex(event_file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); if (ret < 0) goto out; @@ -543,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -575,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } @@ -1319,11 +1315,12 @@ static int 
event_enable_register_trigger(char *glob, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -1344,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob, (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 202df6cff..2a1abbaca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e3..f8e26ab96 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; + do_div(stats->coremem, 1000 * KB); + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; + do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { - if (likely(tsk->mm)) { - cputime_t time, dtime; - struct timeval value; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = stime + utime; - dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; - - if (delta == 0) - goto out; - tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - out: - local_irq_restore(flags); - } + cputime_t time, dtime; + u64 delta; + + if (!likely(tsk->mm)) + return; + + time = stime + utime; + dtime = time - tsk->acct_timexpd; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); + + if (delta < TICK_NSEC) + return; + + tsk->acct_timexpd = time; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. 
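
A unit check of the new accounting math, derived from the __acct_update_integrals() and xacct_add_tsk() hunks above:

/*
 * Accumulation (per sample, in __acct_update_integrals):
 *   acct_rss_mem1 += delta_ns * rss_pages >> 10     [page * ns / 1024]
 *
 * Reporting (in xacct_add_tsk):
 *   * PAGE_SIZE    -> byte * ns / 1024
 *   / (1000 * KB)  -> byte * ns / (1024 * 1000 * 1024)
 *                  =  (byte / 2^20) * (ns / 1000)
 *                  =  Mbyte * usec
 *
 * The >>10 at accumulation time keeps the 64-bit counters from
 * overflowing on long-lived tasks and moves the costly division to the
 * rarely-executed read side.
 */
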
+ */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; } /** @@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk, void acct_update_integrals(struct task_struct *tsk) { cputime_t utime, stime; + unsigned long flags; + local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); + local_irq_restore(flags); } /** diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 88fefa68c..9bafc2119 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent *extent = NULL; - unsigned long page = 0; - char *kbuf, *pos, *next_line; + char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; /* @@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Get a buffer */ - ret = -ENOMEM; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!page) - goto out; - /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; /* Slurp in the user data */ - ret = -EFAULT; - if (copy_from_user(kbuf, buf, count)) + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) { + ret = PTR_ERR(kbuf); + kbuf = NULL; goto out; - kbuf[count] = '\0'; + } /* Parse the user data */ ret = -EINVAL; @@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = count; out: mutex_unlock(&userns_state_mutex); - if (page) - free_page(page); + kfree(kbuf); return ret; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18f34cf75..b3ace6ebb 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -20,6 +20,7 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -225,7 +226,15 @@ static void __touch_watchdog(void) __this_cpu_write(watchdog_touch_ts, get_timestamp()); } -void touch_softlockup_watchdog(void) +/** + * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls + * + * Call when the scheduler may have stalled for legitimate reasons + * preventing the watchdog task from executing - e.g. the scheduler + * entering idle state. This should only be used for scheduler events. + * Use touch_softlockup_watchdog() for everything else. + */ +void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. 
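
Several hunks in this patch (blk_msg_write(), event_filter_write(), subsystem_filter_write(), event_trigger_regex_write(), map_write()) replace the open-coded allocate/copy/NUL-terminate sequence with memdup_user_nul(); since the buffer now comes from kmalloc(), the matching release switches from free_page() to kfree(). A minimal sketch of the idiom, with my_write() as a hypothetical handler:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/string.h>

static ssize_t my_write(struct file *filp, const char __user *ubuf,
			size_t cnt, loff_t *ppos)
{
	char *buf;

	if (cnt >= PAGE_SIZE)
		return -EINVAL;

	buf = memdup_user_nul(ubuf, cnt); /* kmalloc + copy + '\0' in one call */
	if (IS_ERR(buf))
		return PTR_ERR(buf);      /* -ENOMEM or -EFAULT */

	/* ... parse buf as a NUL-terminated string ... */

	kfree(buf);                       /* plain kfree(), not free_page() */
	return cnt;
}
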
It doesn't matter which CPU's timestamp @@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void) */ raw_cpu_write(watchdog_touch_ts, 0); } + +void touch_softlockup_watchdog(void) +{ + touch_softlockup_watchdog_sched(); + wq_watchdog_touch(raw_smp_processor_id()); +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void) */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + wq_watchdog_touch(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -351,7 +367,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - panic("Hard LOCKUP"); + nmi_panic(regs, "Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 450c21fd0..7ff5dc7d2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -299,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ +/* PL: allowable cpus for unbound wqs and work items */ +static cpumask_var_t wq_unbound_cpumask; + +/* CPU where unbound work was last round robin scheduled from this CPU */ +static DEFINE_PER_CPU(int, wq_rr_cpu_last); + +/* + * Local execution of unbound work items is no longer guaranteed. The + * following always forces round-robin CPU selection on unbound work items + * to uncover usages which depend on it. + */ +#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU +static bool wq_debug_force_rr_cpu = true; +#else +static bool wq_debug_force_rr_cpu = false; +#endif +module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], @@ -1093,6 +1111,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1304,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } +/* + * When queueing an unbound work item to a wq, prefer local CPU if allowed + * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to + * avoid perturbing sensitive tasks. 
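
A caller-side view of the new unbound CPU selection (my_unbound_wq and my_work are hypothetical; queue_work_on() and WORK_CPU_UNBOUND are the real APIs). CONFIG_DEBUG_WQ_FORCE_RR_CPU, or the workqueue.debug_force_rr_cpu module parameter, forces the round-robin path to flush out code that silently depends on local execution:

#include <linux/workqueue.h>

static struct workqueue_struct *my_unbound_wq;	/* hypothetical */
static struct work_struct my_work;		/* hypothetical */

static void kick_unbound_work(void)
{
	/* With WORK_CPU_UNBOUND the executing CPU is chosen by
	 * wq_select_unbound_cpu(): the local CPU when wq_unbound_cpumask
	 * allows it, otherwise round-robin over the allowed online CPUs. */
	queue_work_on(WORK_CPU_UNBOUND, my_unbound_wq, &my_work);
}
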
+ */ +static int wq_select_unbound_cpu(int cpu) +{ + static bool printed_dbg_warning; + int new_cpu; + + if (likely(!wq_debug_force_rr_cpu)) { + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + } else if (!printed_dbg_warning) { + pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); + printed_dbg_warning = true; + } + + if (cpumask_empty(wq_unbound_cpumask)) + return cpu; + + new_cpu = __this_cpu_read(wq_rr_cpu_last); + new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) { + new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; + } + __this_cpu_write(wq_rr_cpu_last, new_cpu); + + return new_cpu; +} + static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -1329,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; retry: if (req_cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) @@ -1395,6 +1448,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; + if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -2167,6 +2222,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2250,6 +2307,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2266,9 +2324,14 @@ repeat: * process'em. */ WARN_ON_ONCE(!list_empty(scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_pwq(work) == pwq) + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + if (first) + pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + } + first = false; + } if (!list_empty(scheduled)) { process_scheduled_works(rescuer); @@ -2326,6 +2389,38 @@ repeat: goto repeat; } +/** + * check_flush_dependency - check for flush dependency sanity + * @target_wq: workqueue being flushed + * @target_work: work item being flushed (NULL for workqueue flushes) + * + * %current is trying to flush the whole @target_wq or @target_work on it. + * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not + * reclaiming memory or running on a workqueue which doesn't have + * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to + * a deadlock. + */ +static void check_flush_dependency(struct workqueue_struct *target_wq, + struct work_struct *target_work) +{ + work_func_t target_func = target_work ? 
target_work->func : NULL; + struct worker *worker; + + if (target_wq->flags & WQ_MEM_RECLAIM) + return; + + worker = current_wq_worker(); + + WARN_ONCE(current->flags & PF_MEMALLOC, + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + current->pid, current->comm, target_wq->name, target_func); + WARN_ONCE(worker && ((worker->current_pwq->wq->flags & + (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), + "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + worker->current_pwq->wq->name, worker->current_func, + target_wq->name, target_func); +} + struct wq_barrier { struct work_struct work; struct completion done; @@ -2535,6 +2630,8 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } + check_flush_dependency(wq, NULL); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2707,6 +2804,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) pwq = worker->current_pwq; } + check_flush_dependency(pwq->wq, work); + insert_wq_barrier(pwq, barr, work, worker); spin_unlock_irq(&pool->lock); @@ -3079,6 +3178,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; + pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -3611,7 +3711,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; - int ret = -ENOMEM; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3622,16 +3721,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs); + if (!ctx) + return -ENOMEM; /* the ctx has been prepared successfully, let's commit it */ - if (ctx) { - apply_wqattrs_commit(ctx); - ret = 0; - } - + apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); - return ret; + return 0; } /** @@ -4318,7 +4415,9 @@ void show_workqueue_state(void) pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" workers=%d", pool->nr_workers); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -5177,6 +5276,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ +/* + * Workqueue watchdog. + * + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal + * flush dependency, a concurrency managed work item which stays RUNNING + * indefinitely. Workqueue stalls can be very difficult to debug as the + * usual warning mechanisms don't trigger and internal workqueue state is + * largely opaque. + * + * Workqueue watchdog monitors all worker pools periodically and dumps + * state if some pools failed to make forward progress for a while where + * forward progress is defined as the first item on ->worklist changing. + * + * This mechanism is controlled through the kernel parameter + * "workqueue.watchdog_thresh" which can be updated at runtime through the + * corresponding sysfs parameter file. 
+ */ +#ifdef CONFIG_WQ_WATCHDOG + +static void wq_watchdog_timer_fn(unsigned long data); + +static unsigned long wq_watchdog_thresh = 30; +static struct timer_list wq_watchdog_timer = + TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); + +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; + +static void wq_watchdog_reset_touched(void) +{ + int cpu; + + wq_watchdog_touched = jiffies; + for_each_possible_cpu(cpu) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; +} + +static void wq_watchdog_timer_fn(unsigned long data) +{ + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + bool lockup_detected = false; + struct worker_pool *pool; + int pi; + + if (!thresh) + return; + + rcu_read_lock(); + + for_each_pool(pool, pi) { + unsigned long pool_ts, touched, ts; + + if (list_empty(&pool->worklist)) + continue; + + /* get the latest of pool and touched timestamps */ + pool_ts = READ_ONCE(pool->watchdog_ts); + touched = READ_ONCE(wq_watchdog_touched); + + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + + if (pool->cpu >= 0) { + unsigned long cpu_touched = + READ_ONCE(per_cpu(wq_watchdog_touched_cpu, + pool->cpu)); + if (time_after(cpu_touched, ts)) + ts = cpu_touched; + } + + /* did we stall? */ + if (time_after(jiffies, ts + thresh)) { + lockup_detected = true; + pr_emerg("BUG: workqueue lockup - pool"); + pr_cont_pool_info(pool); + pr_cont(" stuck for %us!\n", + jiffies_to_msecs(jiffies - pool_ts) / 1000); + } + } + + rcu_read_unlock(); + + if (lockup_detected) + show_workqueue_state(); + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh); +} + +void wq_watchdog_touch(int cpu) +{ + if (cpu >= 0) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + wq_watchdog_touched = jiffies; +} + +static void wq_watchdog_set_thresh(unsigned long thresh) +{ + wq_watchdog_thresh = 0; + del_timer_sync(&wq_watchdog_timer); + + if (thresh) { + wq_watchdog_thresh = thresh; + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); + } +} + +static int wq_watchdog_param_set_thresh(const char *val, + const struct kernel_param *kp) +{ + unsigned long thresh; + int ret; + + ret = kstrtoul(val, 0, &thresh); + if (ret) + return ret; + + if (system_wq) + wq_watchdog_set_thresh(thresh); + else + wq_watchdog_thresh = thresh; + + return 0; +} + +static const struct kernel_param_ops wq_watchdog_thresh_ops = { + .set = wq_watchdog_param_set_thresh, + .get = param_get_ulong, +}; + +module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, + 0644); + +static void wq_watchdog_init(void) +{ + wq_watchdog_set_thresh(wq_watchdog_thresh); +} + +#else /* CONFIG_WQ_WATCHDOG */ + +static inline void wq_watchdog_init(void) { } + +#endif /* CONFIG_WQ_WATCHDOG */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; @@ -5300,6 +5547,9 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + wq_watchdog_init(); + return 0; } early_initcall(init_workqueues);
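
Condensing the watchdog's stall test from wq_watchdog_timer_fn() above (a reading aid, not kernel code); per the doc comment, the threshold is settable at boot via workqueue.watchdog_thresh=<seconds> and at runtime through the module parameter file registered by module_param_cb():

/*
 * For each pool with pending work, take the most recent of:
 *   pool->watchdog_ts         - first worklist entry last changed
 *   wq_watchdog_touched       - global touch (wq_watchdog_touch(-1))
 *   wq_watchdog_touched_cpu   - per-CPU touch, for bound pools only
 * and report a lockup when:
 *   time_after(jiffies, ts + wq_watchdog_thresh * HZ)
 */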