From 670027c507e99521d416994a18a498def9ef2ea3 Mon Sep 17 00:00:00 2001
From: André Fabian Silva Delgado
Date: Sat, 22 Oct 2016 19:31:08 -0300
Subject: Linux-libre 4.8.3-gnu

---
 kernel/Makefile                            |    2 +-
 kernel/power/tuxonice.h                    |  260 -
 kernel/power/tuxonice_alloc.c              |  308 --
 kernel/power/tuxonice_alloc.h              |   54 -
 kernel/power/tuxonice_atomic_copy.c        |  469 --
 kernel/power/tuxonice_atomic_copy.h        |   25 -
 kernel/power/tuxonice_bio.h                |   80 -
 kernel/power/tuxonice_bio_chains.c         | 1121 ----
 kernel/power/tuxonice_bio_core.c           | 1937 -------
 kernel/power/tuxonice_bio_internal.h       |  101 -
 kernel/power/tuxonice_bio_signature.c      |  403 --
 kernel/power/tuxonice_builtin.c            |  498 --
 kernel/power/tuxonice_builtin.h            |   41 -
 kernel/power/tuxonice_checksum.c           |  392 --
 kernel/power/tuxonice_checksum.h           |   31 -
 kernel/power/tuxonice_cluster.c            | 1058 ----
 kernel/power/tuxonice_cluster.h            |   18 -
 kernel/power/tuxonice_compress.c           |  452 --
 kernel/power/tuxonice_copy_before_write.c  |  240 -
 kernel/power/tuxonice_extent.c             |  144 -
 kernel/power/tuxonice_extent.h             |   45 -
 kernel/power/tuxonice_file.c               |  484 --
 kernel/power/tuxonice_highlevel.c          | 1414 -----
 kernel/power/tuxonice_incremental.c        |  402 --
 kernel/power/tuxonice_io.c                 | 1936 -------
 kernel/power/tuxonice_io.h                 |   72 -
 kernel/power/tuxonice_modules.c            |  520 --
 kernel/power/tuxonice_modules.h            |  212 -
 kernel/power/tuxonice_netlink.c            |  324 --
 kernel/power/tuxonice_netlink.h            |   62 -
 kernel/power/tuxonice_pagedir.c            |  345 --
 kernel/power/tuxonice_pagedir.h            |   50 -
 kernel/power/tuxonice_pageflags.c          |   18 -
 kernel/power/tuxonice_pageflags.h          |  106 -
 kernel/power/tuxonice_power_off.c          |  286 -
 kernel/power/tuxonice_power_off.h          |   24 -
 kernel/power/tuxonice_prepare_image.c      | 1089 ----
 kernel/power/tuxonice_prepare_image.h      |   38 -
 kernel/power/tuxonice_prune.c              |  406 --
 kernel/power/tuxonice_storage.c            |  282 -
 kernel/power/tuxonice_storage.h            |   45 -
 kernel/power/tuxonice_swap.c               |  474 --
 kernel/power/tuxonice_sysfs.c              |  333 --
 kernel/power/tuxonice_sysfs.h              |  137 -
 kernel/power/tuxonice_ui.c                 |  247 -
 kernel/power/tuxonice_ui.h                 |   97 -
 kernel/power/tuxonice_userui.c             |  658 ---
 kernel/sched/Makefile                      |    4 +-
 kernel/sched/MuQSS.c                       | 8247 +++++++++++++++++++++++++++++
 kernel/sched/MuQSS.h                       |  274 +
 kernel/sched/bfs.c                         | 7671 ---------------------------
 kernel/sched/bfs_sched.h                   |  224 -
 kernel/sched/cpufreq.c                     |    4 +-
 kernel/sched/cpufreq_schedutil.c           |    4 +-
 kernel/sched/idle.c                        |    4 +-
 kernel/sched/stats.c                       |    4 +-
 kernel/skip_list.c                         |  174 +
 kernel/skip_lists.c                        |  174 -
 kernel/smpboot.c                           |    2 +-
 kernel/sysctl.c                            |   10 +-
 kernel/time/Kconfig                        |    2 +-
 kernel/trace/trace_selftest.c              |    4 +-
 62 files changed, 8715 insertions(+), 25827 deletions(-)
 delete mode 100644 kernel/power/tuxonice.h
 delete mode 100644 kernel/power/tuxonice_alloc.c
 delete mode 100644 kernel/power/tuxonice_alloc.h
 delete mode 100644 kernel/power/tuxonice_atomic_copy.c
 delete mode 100644 kernel/power/tuxonice_atomic_copy.h
 delete mode 100644 kernel/power/tuxonice_bio.h
 delete mode 100644 kernel/power/tuxonice_bio_chains.c
 delete mode 100644 kernel/power/tuxonice_bio_core.c
 delete mode 100644 kernel/power/tuxonice_bio_internal.h
 delete mode 100644 kernel/power/tuxonice_bio_signature.c
 delete mode 100644 kernel/power/tuxonice_builtin.c
 delete mode 100644 kernel/power/tuxonice_builtin.h
 delete mode 100644 kernel/power/tuxonice_checksum.c
 delete mode 100644 kernel/power/tuxonice_checksum.h
 delete mode 100644 kernel/power/tuxonice_cluster.c
 delete mode 100644 kernel/power/tuxonice_cluster.h
 delete mode 100644 kernel/power/tuxonice_compress.c
 delete mode 100644 kernel/power/tuxonice_copy_before_write.c
 delete mode 100644 kernel/power/tuxonice_extent.c
 delete mode 100644 kernel/power/tuxonice_extent.h
 delete mode 100644 kernel/power/tuxonice_file.c
 delete mode 100644 kernel/power/tuxonice_highlevel.c
 delete mode 100644 kernel/power/tuxonice_incremental.c
 delete mode 100644 kernel/power/tuxonice_io.c
 delete mode 100644 kernel/power/tuxonice_io.h
 delete mode 100644 kernel/power/tuxonice_modules.c
 delete mode 100644 kernel/power/tuxonice_modules.h
 delete mode 100644 kernel/power/tuxonice_netlink.c
 delete mode 100644 kernel/power/tuxonice_netlink.h
 delete mode 100644 kernel/power/tuxonice_pagedir.c
 delete mode 100644 kernel/power/tuxonice_pagedir.h
 delete mode 100644 kernel/power/tuxonice_pageflags.c
 delete mode 100644 kernel/power/tuxonice_pageflags.h
 delete mode 100644 kernel/power/tuxonice_power_off.c
 delete mode 100644 kernel/power/tuxonice_power_off.h
 delete mode 100644 kernel/power/tuxonice_prepare_image.c
 delete mode 100644 kernel/power/tuxonice_prepare_image.h
 delete mode 100644 kernel/power/tuxonice_prune.c
 delete mode 100644 kernel/power/tuxonice_storage.c
 delete mode 100644 kernel/power/tuxonice_storage.h
 delete mode 100644 kernel/power/tuxonice_swap.c
 delete mode 100644 kernel/power/tuxonice_sysfs.c
 delete mode 100644 kernel/power/tuxonice_sysfs.h
 delete mode 100644 kernel/power/tuxonice_ui.c
 delete mode 100644 kernel/power/tuxonice_ui.h
 delete mode 100644 kernel/power/tuxonice_userui.c
 create mode 100644 kernel/sched/MuQSS.c
 create mode 100644 kernel/sched/MuQSS.h
 delete mode 100644 kernel/sched/bfs.c
 delete mode 100644 kernel/sched/bfs_sched.h
 create mode 100644 kernel/skip_list.c
 delete mode 100644 kernel/skip_lists.c

diff --git a/kernel/Makefile b/kernel/Makefile
index dd84575cb..7241e3459 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
 	extable.o params.o \
 	kthread.o sys_ni.o nsproxy.o \
 	notifier.o ksysfs.o cred.o reboot.o \
-	async.o range.o smpboot.o skip_lists.o
+	async.o range.o smpboot.o skip_list.o
 
 obj-$(CONFIG_MULTIUSER) += groups.o

diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
deleted file mode 100644
index 10b65633f..000000000
--- a/kernel/power/tuxonice.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * kernel/power/tuxonice.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations used throughout swsusp.
- *
- */
-
-#ifndef KERNEL_POWER_TOI_H
-#define KERNEL_POWER_TOI_H
-
-#include
-#include
-#include
-#include
-#include
-#include "tuxonice_pageflags.h"
-#include "power.h"
-
-#define TOI_CORE_VERSION "3.3"
-#define TOI_HEADER_VERSION 3
-#define MY_BOOT_KERNEL_DATA_VERSION 4
-
-struct toi_boot_kernel_data {
-	int version;
-	int size;
-	unsigned long toi_action;
-	unsigned long toi_debug_state;
-	u32 toi_default_console_level;
-	int toi_io_time[2][2];
-	char toi_nosave_commandline[COMMAND_LINE_SIZE];
-	unsigned long pages_used[33];
-	unsigned long incremental_bytes_in;
-	unsigned long incremental_bytes_out;
-	unsigned long compress_bytes_in;
-	unsigned long compress_bytes_out;
-	unsigned long pruned_pages;
-};
-
-extern struct toi_boot_kernel_data toi_bkd;
-
-/* Location of boot kernel data struct in kernel being resumed */
-extern unsigned long boot_kernel_data_buffer;
-
-/* == Action states == */
-
-enum {
-	TOI_REBOOT,
-	TOI_PAUSE,
-	TOI_LOGALL,
-	TOI_CAN_CANCEL,
-	TOI_KEEP_IMAGE,
-	TOI_FREEZER_TEST,
-	TOI_SINGLESTEP,
-	TOI_PAUSE_NEAR_PAGESET_END,
-	TOI_TEST_FILTER_SPEED,
-	TOI_TEST_BIO,
-	TOI_NO_PAGESET2,
-	TOI_IGNORE_ROOTFS,
-	TOI_REPLACE_SWSUSP,
-	TOI_PAGESET2_FULL,
-	TOI_ABORT_ON_RESAVE_NEEDED,
-	TOI_NO_MULTITHREADED_IO,
-	TOI_NO_DIRECT_LOAD,	/* Obsolete */
-	TOI_LATE_CPU_HOTPLUG,	/* Obsolete */
-	TOI_GET_MAX_MEM_ALLOCD,
-	TOI_NO_FLUSHER_THREAD,
-	TOI_NO_PS2_IF_UNNEEDED,
-	TOI_POST_RESUME_BREAKPOINT,
-	TOI_NO_READAHEAD,
-	TOI_TRACE_DEBUG_ON,
-	TOI_INCREMENTAL_IMAGE,
-};
-
-extern unsigned long toi_bootflags_mask;
-
-#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
-
-/* == Result states == */
-
-enum {
-	TOI_ABORTED,
-	TOI_ABORT_REQUESTED,
-	TOI_NOSTORAGE_AVAILABLE,
-	TOI_INSUFFICIENT_STORAGE,
-	TOI_FREEZING_FAILED,
-	TOI_KEPT_IMAGE,
-	TOI_WOULD_EAT_MEMORY,
-	TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
-	TOI_PM_SEM,
-	TOI_DEVICE_REFUSED,
-	TOI_SYSDEV_REFUSED,
-	TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
-	TOI_UNABLE_TO_PREPARE_IMAGE,
-	TOI_FAILED_MODULE_INIT,
-	TOI_FAILED_MODULE_CLEANUP,
-	TOI_FAILED_IO,
-	TOI_OUT_OF_MEMORY,
-	TOI_IMAGE_ERROR,
-	TOI_PLATFORM_PREP_FAILED,
-	TOI_CPU_HOTPLUG_FAILED,
-	TOI_ARCH_PREPARE_FAILED,	/* Removed Linux-3.0 */
-	TOI_RESAVE_NEEDED,
-	TOI_CANT_SUSPEND,
-	TOI_NOTIFIERS_PREPARE_FAILED,
-	TOI_PRE_SNAPSHOT_FAILED,
-	TOI_PRE_RESTORE_FAILED,
-	TOI_USERMODE_HELPERS_ERR,
-	TOI_CANT_USE_ALT_RESUME,
-	TOI_HEADER_TOO_BIG,
-	TOI_WAKEUP_EVENT,
-	TOI_SYSCORE_REFUSED,
-	TOI_DPM_PREPARE_FAILED,
-	TOI_DPM_SUSPEND_FAILED,
-	TOI_NUM_RESULT_STATES	/* Used in printing debug info only */
-};
-
-extern unsigned long toi_result;
-
-#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
-#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
-				test_and_set_bit(bit, &toi_result))
-#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
-#define test_result_state(bit) (test_bit(bit, &toi_result))
-
-/* == Debug sections and levels == */
-
-/* debugging levels.
*/ -enum { - TOI_STATUS = 0, - TOI_ERROR = 2, - TOI_LOW, - TOI_MEDIUM, - TOI_HIGH, - TOI_VERBOSE, -}; - -enum { - TOI_ANY_SECTION, - TOI_EAT_MEMORY, - TOI_IO, - TOI_HEADER, - TOI_WRITER, - TOI_MEMORY, - TOI_PAGEDIR, - TOI_COMPRESS, - TOI_BIO, -}; - -#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) -#define clear_debug_state(bit) \ - (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) -#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) - -/* == Steps in hibernating == */ - -enum { - STEP_HIBERNATE_PREPARE_IMAGE, - STEP_HIBERNATE_SAVE_IMAGE, - STEP_HIBERNATE_POWERDOWN, - STEP_RESUME_CAN_RESUME, - STEP_RESUME_LOAD_PS1, - STEP_RESUME_DO_RESTORE, - STEP_RESUME_READ_PS2, - STEP_RESUME_GO, - STEP_RESUME_ALT_IMAGE, - STEP_CLEANUP, - STEP_QUIET_CLEANUP -}; - -/* == TuxOnIce states == - (see also include/linux/suspend.h) */ - -#define get_toi_state() (toi_state) -#define restore_toi_state(saved_state) \ - do { toi_state = saved_state; } while (0) - -/* == Module support == */ - -struct toi_core_fns { - int (*post_context_save)(void); - unsigned long (*get_nonconflicting_page)(void); - int (*try_hibernate)(void); - void (*try_resume)(void); -}; - -extern struct toi_core_fns *toi_core_fns; - -/* == All else == */ -#define KB(x) ((x) << (PAGE_SHIFT - 10)) -#define MB(x) ((x) >> (20 - PAGE_SHIFT)) - -extern int toi_start_anything(int toi_or_resume); -extern void toi_finish_anything(int toi_or_resume); - -extern int save_image_part1(void); -extern int toi_atomic_restore(void); - -extern int toi_try_hibernate(void); -extern void toi_try_resume(void); - -extern int __toi_post_context_save(void); - -extern unsigned int nr_hibernates; -extern char alt_resume_param[256]; - -extern void copyback_post(void); -extern int toi_hibernate(void); -extern unsigned long extra_pd1_pages_used; - -#define SECTOR_SIZE 512 - -extern void toi_early_boot_message(int can_erase_image, int default_answer, - char *warning_reason, ...); - -extern int do_check_can_resume(void); -extern int do_toi_step(int step); -extern int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug); - -extern char tuxonice_signature[9]; - -extern int toi_start_other_threads(void); -extern void toi_stop_other_threads(void); - -extern int toi_trace_index; -#define TOI_TRACE_DEBUG(PFN, DESC, ...) 
\ - do { \ - if (test_action_state(TOI_TRACE_DEBUG_ON)) { \ - printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \ - } \ - } while(0) - -#ifdef CONFIG_TOI_KEEP_IMAGE -#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE)) -#else -#define toi_keeping_image (0) -#endif - -#ifdef CONFIG_TOI_INCREMENTAL -extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose); -extern int toi_reset_dirtiness(int verbose); -extern void toi_cbw_write(void); -extern void toi_cbw_restore(void); -extern int toi_allocate_cbw_data(void); -extern void toi_free_cbw_data(void); -extern int toi_cbw_init(void); -extern void toi_mark_tasks_cbw(void); -#else -static inline int toi_reset_dirtiness(int verbose) { return 0; } -#define toi_cbw_write() do { } while(0) -#define toi_cbw_restore() do { } while(0) -#define toi_allocate_cbw_data() do { } while(0) -#define toi_free_cbw_data() do { } while(0) -static inline int toi_cbw_init(void) { return 0; } -#endif -#endif diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c deleted file mode 100644 index 1d8b1cbda..000000000 --- a/kernel/power/tuxonice_alloc.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.c - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include -#include -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -#define TOI_ALLOC_PATHS 41 - -static DEFINE_MUTEX(toi_alloc_mutex); - -static struct toi_module_ops toi_alloc_ops; - -static int toi_fail_num; - -static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], - toi_free_count[TOI_ALLOC_PATHS], - toi_test_count[TOI_ALLOC_PATHS], - toi_fail_count[TOI_ALLOC_PATHS]; -static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; -static int cur_allocd, max_allocd; - -static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { - "", /* 0 */ - "get_io_info_struct", - "extent", - "extent (loading chain)", - "userui channel", - "userui arg", /* 5 */ - "attention list metadata", - "extra pagedir memory metadata", - "bdev metadata", - "extra pagedir memory", - "header_locations_read", /* 10 */ - "bio queue", - "prepare_readahead", - "i/o buffer", - "writer buffer in bio_init", - "checksum buffer", /* 15 */ - "compression buffer", - "filewriter signature op", - "set resume param alloc1", - "set resume param alloc2", - "debugging info buffer", /* 20 */ - "check can resume buffer", - "write module config buffer", - "read module config buffer", - "write image header buffer", - "read pageset1 buffer", /* 25 */ - "get_have_image_data buffer", - "checksum page", - "worker rw loop", - "get nonconflicting page", - "ps1 load addresses", /* 30 */ - "remove swap image", - "swap image exists", - "swap parse sig location", - "sysfs kobj", - "swap mark resume attempted buffer", /* 35 */ - "cluster member", - "boot kernel data buffer", - "setting swap signature", - "block i/o bdev struct", - "copy before write", /* 40 */ -}; - -#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ - do { \ - BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ - \ - if (FAIL_NUM == toi_fail_num) { \ - atomic_inc(&toi_test_count[FAIL_NUM]); \ - toi_fail_num = 0; \ - return FAIL_VAL; \ - } \ - } while (0) - -static void alloc_update_stats(int fail_num, void *result, int size) -{ - if (!result) { - atomic_inc(&toi_fail_count[fail_num]); - return; - } - - atomic_inc(&toi_alloc_count[fail_num]); - if 
(unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - toi_cur_allocd[fail_num]++; - cur_allocd += size; - if (unlikely(cur_allocd > max_allocd)) { - int i; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - toi_max_allocd[i] = toi_cur_allocd[i]; - max_allocd = cur_allocd; - } - mutex_unlock(&toi_alloc_mutex); - } -} - -static void free_update_stats(int fail_num, int size) -{ - BUG_ON(fail_num >= TOI_ALLOC_PATHS); - atomic_inc(&toi_free_count[fail_num]); - if (unlikely(atomic_read(&toi_free_count[fail_num]) > - atomic_read(&toi_alloc_count[fail_num]))) - dump_stack(); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - cur_allocd -= size; - toi_cur_allocd[fail_num]--; - mutex_unlock(&toi_alloc_mutex); - } -} - -void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) -{ - void *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - result = kzalloc(size, flags); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, result, size); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order) -{ - unsigned long result; - - mask |= ___GFP_TOI_NOTRACK; - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - result = __get_free_pages(mask, order); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, - PAGE_SIZE << order); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -struct page *toi_alloc_page(int fail_num, gfp_t mask) -{ - struct page *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - mask |= ___GFP_TOI_NOTRACK; - result = alloc_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) -{ - unsigned long result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - mask |= ___GFP_TOI_NOTRACK; - result = get_zeroed_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -void toi_kfree(int fail_num, const void *arg, int size) -{ - if (arg && toi_alloc_ops.enabled) - free_update_stats(fail_num, size); - - if (fail_num == toi_trace_allocs) - dump_stack(); - kfree(arg); -} - -void toi_free_page(int fail_num, unsigned long virt) -{ - if (virt && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - free_page(virt); -} - -void toi__free_page(int fail_num, struct page *page) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_page(page); -} - -void toi_free_pages(int fail_num, struct page *page, int order) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE << order); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_pages(page, order); -} - -void toi_alloc_print_debug_stats(void) -{ - int i, header_done = 0; - - if (!toi_alloc_ops.enabled) - return; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - if (atomic_read(&toi_alloc_count[i]) != - atomic_read(&toi_free_count[i])) { - if (!header_done) { - printk(KERN_INFO "Idx Allocs Frees Tests " - " Fails Max Description\n"); - header_done = 1; - } - - printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, - 
atomic_read(&toi_alloc_count[i]), - atomic_read(&toi_free_count[i]), - atomic_read(&toi_test_count[i]), - atomic_read(&toi_fail_count[i]), - toi_max_allocd[i], - toi_alloc_desc[i]); - } -} - -static int toi_alloc_initialise(int starting_cycle) -{ - int i; - - if (!starting_cycle) - return 0; - - if (toi_trace_allocs) - dump_stack(); - - for (i = 0; i < TOI_ALLOC_PATHS; i++) { - atomic_set(&toi_alloc_count[i], 0); - atomic_set(&toi_free_count[i], 0); - atomic_set(&toi_test_count[i], 0); - atomic_set(&toi_fail_count[i], 0); - toi_cur_allocd[i] = 0; - toi_max_allocd[i] = 0; - }; - - max_allocd = 0; - cur_allocd = 0; - return 0; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), - SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, - NULL), - SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, - TOI_GET_MAX_MEM_ALLOCD, 0), - SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, - NULL) -}; - -static struct toi_module_ops toi_alloc_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "allocation debugging", - .directory = "alloc", - .module = THIS_MODULE, - .early = 1, - .initialise = toi_alloc_initialise, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_alloc_init(void) -{ - int result = toi_register_module(&toi_alloc_ops); - return result; -} - -void toi_alloc_exit(void) -{ - toi_unregister_module(&toi_alloc_ops); -} diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h deleted file mode 100644 index 0cd6b686f..000000000 --- a/kernel/power/tuxonice_alloc.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.h - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - */ - -#include -#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) -#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) - -#ifdef CONFIG_PM_DEBUG -extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); -extern void toi_kfree(int fail_num, const void *arg, int size); - -extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order); -#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) -extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); -extern void toi_free_page(int fail_num, unsigned long buf); -extern void toi__free_page(int fail_num, struct page *page); -extern void toi_free_pages(int fail_num, struct page *page, int order); -extern struct page *toi_alloc_page(int fail_num, gfp_t mask); -extern int toi_alloc_init(void); -extern void toi_alloc_exit(void); - -extern void toi_alloc_print_debug_stats(void); - -#else /* CONFIG_PM_DEBUG */ - -#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) -#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) - -#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) -#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) -#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) -#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) -#define toi__free_page(FAIL, PAGE) __free_page(PAGE) -#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) -#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) -static inline int toi_alloc_init(void) -{ - return 0; -} - -static inline void toi_alloc_exit(void) { } - -static inline void toi_alloc_print_debug_stats(void) { } - -#endif - -extern int toi_trace_allocs; diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c deleted file mode 100644 index 5845217f8..000000000 --- a/kernel/power/tuxonice_atomic_copy.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.c - * - * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_power_off.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" - -unsigned long extra_pd1_pages_used; - -/** - * free_pbe_list - free page backup entries used by the atomic copy code. - * @list: List to free. - * @highmem: Whether the list is in highmem. - * - * Normally, this function isn't used. If, however, we need to abort before - * doing the atomic copy, we use this to free the pbes previously allocated. 
- **/
-static void free_pbe_list(struct pbe **list, int highmem)
-{
-	while (*list) {
-		int i;
-		struct pbe *free_pbe, *next_page = NULL;
-		struct page *page;
-
-		if (highmem) {
-			page = (struct page *) *list;
-			free_pbe = (struct pbe *) kmap(page);
-		} else {
-			page = virt_to_page(*list);
-			free_pbe = *list;
-		}
-
-		for (i = 0; i < PBES_PER_PAGE; i++) {
-			if (!free_pbe)
-				break;
-			if (highmem)
-				toi__free_page(29, free_pbe->address);
-			else
-				toi_free_page(29,
-					(unsigned long) free_pbe->address);
-			free_pbe = free_pbe->next;
-		}
-
-		if (highmem) {
-			if (free_pbe)
-				next_page = free_pbe;
-			kunmap(page);
-		} else {
-			if (free_pbe)
-				next_page = free_pbe;
-		}
-
-		toi__free_page(29, page);
-		*list = (struct pbe *) next_page;
-	};
-}
-
-/**
- * copyback_post - post atomic-restore actions
- *
- * After doing the atomic restore, we have a few more things to do:
- * 1) We want to retain some values across the restore, so we now copy
- * these from the nosave variables to the normal ones.
- * 2) Set the status flags.
- * 3) Resume devices.
- * 4) Tell userui so it can redraw & restore settings.
- * 5) Reread the page cache.
- **/
-void copyback_post(void)
-{
-	struct toi_boot_kernel_data *bkd =
-		(struct toi_boot_kernel_data *) boot_kernel_data_buffer;
-
-	if (toi_activate_storage(1))
-		panic("Failed to reactivate our storage.");
-
-	toi_post_atomic_restore_modules(bkd);
-
-	toi_cond_pause(1, "About to reload secondary pagedir.");
-
-	if (read_pageset2(0))
-		panic("Unable to successfully reread the page cache.");
-
-	/*
-	 * If the user wants to sleep again after resuming from full-off,
-	 * it's most likely to be in order to suspend to ram, so we'll
-	 * do this check after loading pageset2, to give them the fastest
-	 * wakeup when they are ready to use the computer again.
-	 */
-	toi_check_resleep();
-
-	if (test_action_state(TOI_INCREMENTAL_IMAGE))
-		toi_reset_dirtiness(1);
-}
-
-/**
- * toi_copy_pageset1 - do the atomic copy of pageset1
- *
- * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
- * because we can't be sure what side effects it has. On my old Duron, with
- * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
- * count at resume time 4 instead of 3.
- *
- * We don't want to call kmap_atomic unconditionally because it has the side
- * effect of incrementing the preempt count, which will leave it one too high
- * post resume (the page containing the preempt count will be copied after
- * it's incremented. This is essentially the same problem.)
- **/
-void toi_copy_pageset1(void)
-{
-	int i;
-	unsigned long source_index, dest_index;
-
-	memory_bm_position_reset(pageset1_map);
-	memory_bm_position_reset(pageset1_copy_map);
-
-	source_index = memory_bm_next_pfn(pageset1_map, 0);
-	dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
-
-	for (i = 0; i < pagedir1.size; i++) {
-		unsigned long *origvirt, *copyvirt;
-		struct page *origpage, *copypage;
-		int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1,
-		    was_present1, was_present2;
-
-		origpage = pfn_to_page(source_index);
-		copypage = pfn_to_page(dest_index);
-
-		origvirt = PageHighMem(origpage) ?
-			kmap_atomic(origpage) :
-			page_address(origpage);
-
-		copyvirt = PageHighMem(copypage) ?
- kmap_atomic(copypage) : - page_address(copypage); - - was_present1 = kernel_page_present(origpage); - if (!was_present1) - kernel_map_pages(origpage, 1, 1); - - was_present2 = kernel_page_present(copypage); - if (!was_present2) - kernel_map_pages(copypage, 1, 1); - - while (loop >= 0) { - *(copyvirt + loop) = *(origvirt + loop); - loop--; - } - - if (!was_present1) - kernel_map_pages(origpage, 1, 0); - - if (!was_present2) - kernel_map_pages(copypage, 1, 0); - - if (PageHighMem(origpage)) - kunmap_atomic(origvirt); - - if (PageHighMem(copypage)) - kunmap_atomic(copyvirt); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - } -} - -/** - * __toi_post_context_save - steps after saving the cpu context - * - * Steps taken after saving the CPU state to make the actual - * atomic copy. - * - * Called from swsusp_save in snapshot.c via toi_post_context_save. - **/ -int __toi_post_context_save(void) -{ - unsigned long old_ps1_size = pagedir1.size; - - check_checksums(); - - free_checksum_pages(); - - toi_recalculate_image_contents(1); - - extra_pd1_pages_used = pagedir1.size > old_ps1_size ? - pagedir1.size - old_ps1_size : 0; - - if (extra_pd1_pages_used > extra_pd1_pages_allowance) { - printk(KERN_INFO "Pageset1 has grown by %lu pages. " - "extra_pages_allowance is currently only %lu.\n", - pagedir1.size - old_ps1_size, - extra_pd1_pages_allowance); - - /* - * Highlevel code will see this, clear the state and - * retry if we haven't already done so twice. - */ - if (any_to_free(1)) { - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - if (try_allocate_extra_memory()) { - printk(KERN_INFO "Failed to allocate the extra memory" - " needed. Restarting the process."); - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - printk(KERN_INFO "However it looks like there's enough" - " free ram and storage to handle this, so " - " continuing anyway."); - /* - * What if try_allocate_extra_memory above calls - * toi_allocate_extra_pagedir_memory and it allocs a new - * slab page via toi_kzalloc which should be in ps1? So... - */ - toi_recalculate_image_contents(1); - } - - if (!test_action_state(TOI_TEST_FILTER_SPEED) && - !test_action_state(TOI_TEST_BIO)) - toi_copy_pageset1(); - - return 0; -} - -/** - * toi_hibernate - high level code for doing the atomic copy - * - * High-level code which prepares to do the atomic copy. Loosely based - * on the swsusp version, but with the following twists: - * - We set toi_running so the swsusp code uses our code paths. - * - We give better feedback regarding what goes wrong if there is a - * problem. - * - We use an extra function to call the assembly, just in case this code - * is in a module (return address). - **/ -int toi_hibernate(void) -{ - int error; - - error = toi_lowlevel_builtin(); - - if (!error) { - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - /* - * The boot kernel's data may be larger (newer version) or - * smaller (older version) than ours. Copy the minimum - * of the two sizes, so that we don't overwrite valid values - * from pre-atomic copy. - */ - - memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, - min_t(int, sizeof(struct toi_boot_kernel_data), - bkd->size)); - } - - return error; -} - -/** - * toi_atomic_restore - prepare to do the atomic restore - * - * Get ready to do the atomic restore. 
This part gets us into the same
- * state we are in prior to calling do_toi_lowlevel while
- * hibernating: hot-unplugging secondary cpus and freezing processes,
- * before starting the thread that will do the restore.
- **/
-int toi_atomic_restore(void)
-{
-	int error;
-
-	toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore.");
-
-	memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line,
-		strlen(saved_command_line));
-
-	toi_pre_atomic_restore_modules(&toi_bkd);
-
-	if (add_boot_kernel_data_pbe())
-		goto Failed;
-
-	toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
-
-	if (toi_go_atomic(PMSG_QUIESCE, 0))
-		goto Failed;
-
-	/* We'll ignore saved state, but this gets preempt count (etc) right */
-	save_processor_state();
-
-	error = swsusp_arch_resume();
-	/*
-	 * Code below is only ever reached in case of failure. Otherwise
-	 * execution continues at the place where swsusp_arch_suspend was
-	 * called.
-	 *
-	 * We don't know whether it's safe to continue (this shouldn't happen),
-	 * so let's err on the side of caution.
-	 */
-	BUG();
-
-Failed:
-	free_pbe_list(&restore_pblist, 0);
-#ifdef CONFIG_HIGHMEM
-	free_pbe_list(&restore_highmem_pblist, 1);
-#endif
-	return 1;
-}
-
-/**
- * toi_go_atomic - do the actual atomic copy/restore
- * @state: The state to use for dpm_suspend_start & power_down calls.
- * @suspend_time: Whether we're suspending or resuming.
- **/
-int toi_go_atomic(pm_message_t state, int suspend_time)
-{
-	if (suspend_time) {
-		if (platform_begin(1)) {
-			set_abort_result(TOI_PLATFORM_PREP_FAILED);
-			toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
-			return 1;
-		}
-
-		if (dpm_prepare(PMSG_FREEZE)) {
-			set_abort_result(TOI_DPM_PREPARE_FAILED);
-			dpm_complete(PMSG_RECOVER);
-			toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
-			return 1;
-		}
-	}
-
-	suspend_console();
-	pm_restrict_gfp_mask();
-
-	if (suspend_time) {
-		if (dpm_suspend(state)) {
-			set_abort_result(TOI_DPM_SUSPEND_FAILED);
-			toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
-			return 1;
-		}
-	} else {
-		if (dpm_suspend_start(state)) {
-			set_abort_result(TOI_DPM_SUSPEND_FAILED);
-			toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
-			return 1;
-		}
-	}
-
-	/* At this point, dpm_suspend_start() has been called, but *not*
-	 * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now.
-	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
-	 * become desynchronized with the actual state of the hardware
-	 * at resume time, and evil weirdness ensues.
- */ - - if (dpm_suspend_end(state)) { - set_abort_result(TOI_DEVICE_REFUSED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); - return 1; - } - - if (suspend_time) { - if (platform_pre_snapshot(1)) - set_abort_result(TOI_PRE_SNAPSHOT_FAILED); - } else { - if (platform_pre_restore(1)) - set_abort_result(TOI_PRE_RESTORE_FAILED); - } - - if (test_result_state(TOI_ABORTED)) { - toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); - return 1; - } - - if (disable_nonboot_cpus()) { - set_abort_result(TOI_CPU_HOTPLUG_FAILED); - toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, - suspend_time, 1); - return 1; - } - - local_irq_disable(); - - if (syscore_suspend()) { - set_abort_result(TOI_SYSCORE_REFUSED); - toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); - return 1; - } - - if (suspend_time && pm_wakeup_pending()) { - set_abort_result(TOI_WAKEUP_EVENT); - toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); - return 1; - } - return 0; -} - -/** - * toi_end_atomic - post atomic copy/restore routines - * @stage: What step to start at. - * @suspend_time: Whether we're suspending or resuming. - * @error: Whether we're recovering from an error. - **/ -void toi_end_atomic(int stage, int suspend_time, int error) -{ - pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) : - PMSG_RESTORE; - - switch (stage) { - case ATOMIC_ALL_STEPS: - if (!suspend_time) { - events_check_enabled = false; - } - platform_leave(1); - case ATOMIC_STEP_SYSCORE_RESUME: - syscore_resume(); - case ATOMIC_STEP_IRQS: - local_irq_enable(); - case ATOMIC_STEP_CPU_HOTPLUG: - enable_nonboot_cpus(); - case ATOMIC_STEP_PLATFORM_FINISH: - if (!suspend_time && error & 2) - platform_restore_cleanup(1); - else - platform_finish(1); - dpm_resume_start(msg); - case ATOMIC_STEP_DEVICE_RESUME: - if (suspend_time && (error & 2)) - platform_recover(1); - dpm_resume(msg); - if (!toi_in_suspend()) { - dpm_resume_end(PMSG_RECOVER); - } - if (error || !toi_in_suspend()) { - pm_restore_gfp_mask(); - } - resume_console(); - case ATOMIC_STEP_DPM_COMPLETE: - dpm_complete(msg); - case ATOMIC_STEP_PLATFORM_END: - platform_end(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); - } -} diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h deleted file mode 100644 index e2d2b4fb3..000000000 --- a/kernel/power/tuxonice_atomic_copy.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.h - * - * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -enum { - ATOMIC_ALL_STEPS, - ATOMIC_STEP_SYSCORE_RESUME, - ATOMIC_STEP_IRQS, - ATOMIC_STEP_CPU_HOTPLUG, - ATOMIC_STEP_PLATFORM_FINISH, - ATOMIC_STEP_DEVICE_RESUME, - ATOMIC_STEP_DPM_COMPLETE, - ATOMIC_STEP_PLATFORM_END, -}; - -int toi_go_atomic(pm_message_t state, int toi_time); -void toi_end_atomic(int stage, int toi_time, int error); - -extern void platform_recover(int platform_mode); diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h deleted file mode 100644 index 2f717f5c5..000000000 --- a/kernel/power/tuxonice_bio.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * kernel/power/tuxonice_bio.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
- */ - -#include -#include "tuxonice_extent.h" - -void toi_put_extent_chain(struct hibernate_extent_chain *chain); -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end); - -struct hibernate_extent_saved_state { - int extent_num; - struct hibernate_extent *extent_ptr; - unsigned long offset; -}; - -struct toi_bdev_info { - struct toi_bdev_info *next; - struct hibernate_extent_chain blocks; - struct block_device *bdev; - struct toi_module_ops *allocator; - int allocator_index; - struct hibernate_extent_chain allocations; - char name[266]; /* "swap on " or "file " + up to 256 chars */ - - /* Saved in header */ - char uuid[17]; - dev_t dev_t; - int prio; - int bmap_shift; - int blocks_per_page; - unsigned long pages_used; - struct hibernate_extent_saved_state saved_state[4]; -}; - -struct toi_extent_iterate_state { - struct toi_bdev_info *current_chain; - int num_chains; - int saved_chain_number[4]; - struct toi_bdev_info *saved_chain_ptr[4]; -}; - -/* - * Our exported interface so the swapwriter and filewriter don't - * need these functions duplicated. - */ -struct toi_bio_ops { - int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, - struct page *page); - int (*register_storage)(struct toi_bdev_info *new); - void (*free_storage)(void); -}; - -struct toi_allocator_ops { - unsigned long (*toi_swap_storage_available) (void); -}; - -extern struct toi_bio_ops toi_bio_ops; - -extern char *toi_writer_buffer; -extern int toi_writer_buffer_posn; - -struct toi_bio_allocator_ops { - int (*register_storage) (void); - unsigned long (*storage_available)(void); - int (*allocate_storage) (struct toi_bdev_info *, unsigned long); - int (*bmap) (struct toi_bdev_info *); - void (*free_storage) (struct toi_bdev_info *); - unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used); -}; - -extern int toi_bio_register_storage(void); diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c deleted file mode 100644 index 11cd37f77..000000000 --- a/kernel/power/tuxonice_bio_chains.c +++ /dev/null @@ -1,1121 +0,0 @@ -/* - * kernel/power/tuxonice_bio_devinfo.c - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - */ - -#include -#include "tuxonice_bio.h" -#include "tuxonice_bio_internal.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" -#include "tuxonice_io.h" - -static struct toi_bdev_info *prio_chain_head; -static int num_chains; - -/* Pointer to current entry being loaded/saved. */ -struct toi_extent_iterate_state toi_writer_posn; - -#define metadata_size (sizeof(struct toi_bdev_info) - \ - offsetof(struct toi_bdev_info, uuid)) - -/* - * After section 0 (header) comes 2 => next_section[0] = 2 - */ -static int next_section[3] = { 2, 3, 1 }; - -/** - * dump_block_chains - print the contents of the bdev info array. - **/ -void dump_block_chains(void) -{ - int i = 0; - int j; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - struct hibernate_extent *this = cur_chain->blocks.first; - - printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); - - while (this) { - printk(KERN_CONT " [%lu-%lu]%s", this->start, - this->end, this->next ? 
"," : ""); - this = this->next; - } - - printk("\n"); - cur_chain = cur_chain->next; - i++; - } - - printk(KERN_DEBUG "Saved states:\n"); - for (i = 0; i < 4; i++) { - printk(KERN_DEBUG "Slot %d: Chain %d.\n", - i, toi_writer_posn.saved_chain_number[i]); - - cur_chain = prio_chain_head; - j = 0; - while (cur_chain) { - printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", - j, cur_chain->saved_state[i].extent_num, - cur_chain->saved_state[i].offset); - cur_chain = cur_chain->next; - j++; - } - printk(KERN_CONT "\n"); - } -} - -/** - * - **/ -static void toi_extent_chain_next(void) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain; - - if (!this->blocks.current_extent) - return; - - if (this->blocks.current_offset == this->blocks.current_extent->end) { - if (this->blocks.current_extent->next) { - this->blocks.current_extent = - this->blocks.current_extent->next; - this->blocks.current_offset = - this->blocks.current_extent->start; - } else { - this->blocks.current_extent = NULL; - this->blocks.current_offset = 0; - } - } else - this->blocks.current_offset++; -} - -/** - * - */ - -static struct toi_bdev_info *__find_next_chain_same_prio(void) -{ - struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; - struct toi_bdev_info *this = start_chain; - int orig_prio = this->prio; - - do { - this = this->next; - - if (!this) - this = prio_chain_head; - - /* Back on original chain? Use it again. */ - if (this == start_chain) - return start_chain; - - } while (!this->blocks.current_extent || this->prio != orig_prio); - - return this; -} - -static void find_next_chain(void) -{ - struct toi_bdev_info *this; - - this = __find_next_chain_same_prio(); - - /* - * If we didn't get another chain of the same priority that we - * can use, look for the next priority. - */ - while (this && !this->blocks.current_extent) - this = this->next; - - toi_writer_posn.current_chain = this; -} - -/** - * toi_extent_state_next - go to the next extent - * @blocks: The number of values to progress. - * @stripe_mode: Whether to spread usage across all chains. - * - * Given a state, progress to the next valid entry. We may begin in an - * invalid state, as we do when invoked after extent_state_goto_start below. - * - * When using compression and expected_compression > 0, we let the image size - * be larger than storage, so we can validly run out of data to return. - **/ -static unsigned long toi_extent_state_next(int blocks, int current_stream) -{ - int i; - - if (!toi_writer_posn.current_chain) - return -ENOSPC; - - /* Assume chains always have lengths that are multiples of @blocks */ - for (i = 0; i < blocks; i++) - toi_extent_chain_next(); - - /* The header stream is not striped */ - if (current_stream || - !toi_writer_posn.current_chain->blocks.current_extent) - find_next_chain(); - - return toi_writer_posn.current_chain ? 
0 : -ENOSPC; -} - -static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) -{ - struct toi_bdev_info **prev_ptr; - struct toi_bdev_info *cur; - - /* Loop through the existing chain, finding where to insert it */ - prev_ptr = &prio_chain_head; - cur = prio_chain_head; - - while (cur && cur->prio >= this->prio) { - prev_ptr = &cur->next; - cur = cur->next; - } - - this->next = *prev_ptr; - *prev_ptr = this; - - this = prio_chain_head; - while (this) - this = this->next; - num_chains++; -} - -/** - * toi_extent_state_goto_start - reinitialize an extent chain iterator - * @state: Iterator to reinitialize - **/ -void toi_extent_state_goto_start(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current extent to %p.", this->blocks.first); - this->blocks.current_extent = this->blocks.first; - if (this->blocks.current_extent) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current offset to %lu.", - this->blocks.current_extent->start); - this->blocks.current_offset = - this->blocks.current_extent->start; - } - - this = this->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", - prio_chain_head); - toi_writer_posn.current_chain = prio_chain_head; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); -} - -/** - * toi_extent_state_save - save state of the iterator - * @state: Current state of the chain - * @saved_state: Iterator to populate - * - * Given a state and a struct hibernate_extent_state_store, save the current - * position in a format that can be used with relocated chains (at - * resume time). - **/ -void toi_extent_state_save(int slot) -{ - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent *extent; - struct hibernate_extent_saved_state *chain_state; - int i = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", - slot); - - if (!toi_writer_posn.current_chain) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " - "chain_num = -1."); - toi_writer_posn.saved_chain_number[slot] = -1; - return; - } - - while (cur_chain) { - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - chain_state->offset = cur_chain->blocks.current_offset; - - if (toi_writer_posn.current_chain == cur_chain) { - toi_writer_posn.saved_chain_number[slot] = i; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " - "we were on => chain_num is %d.", i); - } - - if (!cur_chain->blocks.current_extent) { - chain_state->extent_num = 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " - "for this chain => extent_num %d is 0.", - i); - cur_chain = cur_chain->next; - continue; - } - - extent = cur_chain->blocks.first; - chain_state->extent_num = 1; - - while (extent != cur_chain->blocks.current_extent) { - chain_state->extent_num++; - extent = extent->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, - chain_state->extent_num); - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Completed saving extent state slot %d.", slot); -} - -/** - * toi_extent_state_restore - restore the position saved by extent_state_save - * @state: State to populate - * @saved_state: Iterator saved to restore - **/ -void toi_extent_state_restore(int slot) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent_saved_state 
*chain_state; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "toi_extent_state_restore - slot %d.", slot); - - if (toi_writer_posn.saved_chain_number[slot] == -1) { - toi_writer_posn.current_chain = NULL; - return; - } - - while (cur_chain) { - int posn; - int j; - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - posn = chain_state->extent_num; - - cur_chain->blocks.current_extent = cur_chain->blocks.first; - cur_chain->blocks.current_offset = chain_state->offset; - - if (i == toi_writer_posn.saved_chain_number[slot]) { - toi_writer_posn.current_chain = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found current chain."); - } - - for (j = 0; j < 4; j++) - if (i == toi_writer_posn.saved_chain_number[j]) { - toi_writer_posn.saved_chain_ptr[j] = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found saved chain ptr %d (%p) (offset" - " %d).", j, cur_chain, - cur_chain->saved_state[j].offset); - } - - if (posn) { - while (--posn) - cur_chain->blocks.current_extent = - cur_chain->blocks.current_extent->next; - } else - cur_chain->blocks.current_extent = NULL; - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); -} - -/* - * Storage needed - * - * Returns amount of space in the image header required - * for the chain data. This ignores the links between - * pages, which we factor in when allocating the space. - */ -int toi_bio_devinfo_storage_needed(void) -{ - int result = sizeof(num_chains); - struct toi_bdev_info *chain = prio_chain_head; - - while (chain) { - result += metadata_size; - - /* Chain size */ - result += sizeof(int); - - /* Extents */ - result += (2 * sizeof(unsigned long) * - chain->blocks.num_extents); - - chain = chain->next; - } - - result += 4 * sizeof(int); - return result; -} - -static unsigned long chain_pages_used(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this = chain->blocks.first; - struct hibernate_extent_saved_state *state = &chain->saved_state[3]; - unsigned long size = 0; - int extent_idx = 1; - - if (!state->extent_num) { - if (!this) - return 0; - else - return chain->blocks.size; - } - - while (extent_idx < state->extent_num) { - size += (this->end - this->start + 1); - this = this->next; - extent_idx++; - } - - /* We didn't use the one we're sitting on, so don't count it */ - return size + state->offset - this->start; -} - -void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain) -{ - unsigned long used = chain_pages_used(chain); - - /* Free the storage */ - unsigned long first_freed = 0; - - if (chain->allocator->bio_allocator_ops->free_unused_storage) - first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used); - - printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed); - - /* Adjust / free the extents. */ - toi_put_extent_chain_from(&chain->blocks, first_freed); - - { - struct hibernate_extent *this = chain->blocks.first; - while (this) { - printk("Extent %lx-%lx.\n", this->start, this->end); - this = this->next; - } - } -} - -/** - * toi_serialise_extent_chain - write a chain in the image - * @chain: Chain to write. 
- **/ -static int toi_serialise_extent_chain(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this; - int ret; - int i = 1; - - chain->pages_used = chain_pages_used(chain); - - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", - chain->dev_t); - /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->uuid, metadata_size); - if (ret) - return ret; - - /* Num extents */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) - return ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - this = chain->blocks.first; - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); - ret = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) this, 2 * sizeof(this->start)); - if (ret) - return ret; - this = this->next; - i++; - } - - return ret; -} - -int toi_serialise_extent_chains(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - /* Write the number of chains */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", - num_chains); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, (char *) &num_chains, - sizeof(int)); - if (result) - return result; - - /* Then the chains themselves */ - while (this) { - result = toi_serialise_extent_chain(this); - if (result) - return result; - this = this->next; - } - - /* - * Finally, the chain we should be on at the start of each - * section. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -int toi_register_storage_chain(struct toi_bdev_info *new) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", - new); - toi_insert_chain_in_prio_list(new); - return 0; -} - -static void free_bdev_info(struct toi_bdev_info *chain) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); - toi_put_extent_chain(&chain->blocks); - - /* - * The allocator may need to do more than just free the chains - * (swap_free, for example). Don't call from boot kernel. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); - if (chain->allocator) - chain->allocator->bio_allocator_ops->free_storage(chain); - - /* - * Dropping out of reading atomic copy? Need to undo - * toi_open_by_devnum. 
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); - if (chain->bdev && !IS_ERR(chain->bdev) && - chain->bdev != resume_block_device && - chain->bdev != header_block_device && - test_toi_state(TOI_TRYING_TO_RESUME)) - toi_close_bdev(chain->bdev); - - /* Poison */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); - toi_kfree(39, chain, sizeof(*chain)); - - if (prio_chain_head == chain) - prio_chain_head = NULL; - - num_chains--; -} - -void free_all_bdev_info(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - struct toi_bdev_info *next = this->next; - free_bdev_info(this); - this = next; - } - - memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); - prio_chain_head = NULL; -} - -static void set_up_start_position(void) -{ - toi_writer_posn.current_chain = prio_chain_head; - go_next_page(0, 0); -} - -/** - * toi_load_extent_chain - read back a chain saved in the image - * @chain: Chain to load - * - * The linked list of extents is reconstructed from the disk. chain will point - * to the first entry. - **/ -int toi_load_extent_chain(int index, int *num_loaded) -{ - struct toi_bdev_info *chain = toi_kzalloc(39, - sizeof(struct toi_bdev_info), GFP_ATOMIC); - struct hibernate_extent *this, *last = NULL; - int i, ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); - /* Get dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->uuid, metadata_size); - - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_bkd.pages_used[index] = chain->pages_used; - - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - for (i = 0; i < chain->blocks.num_extents; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); - - this = toi_kzalloc(2, sizeof(struct hibernate_extent), - TOI_ATOMIC_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate a new extent.\n"); - free_bdev_info(chain); - return -ENOMEM; - } - this->next = NULL; - /* Get the next page */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - NULL, (char *) this, 2 * sizeof(this->start)); - if (ret) { - printk(KERN_INFO "Failed to read an extent.\n"); - toi_kfree(2, this, sizeof(struct hibernate_extent)); - free_bdev_info(chain); - return 1; - } - - if (last) - last->next = this; - else { - char b1[32], b2[32], b3[32]; - /* - * Open the bdev - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Chain dev_t is %s. Resume dev t is %s. 
Header" - " bdev_t is %s.\n", - format_dev_t(b1, chain->dev_t), - format_dev_t(b2, resume_dev_t), - format_dev_t(b3, toi_sig_data->header_dev_t)); - - if (chain->dev_t == resume_dev_t) - chain->bdev = resume_block_device; - else if (chain->dev_t == toi_sig_data->header_dev_t) - chain->bdev = header_block_device; - else { - chain->bdev = toi_open_bdev(chain->uuid, - chain->dev_t, 1); - if (IS_ERR(chain->bdev)) { - free_bdev_info(chain); - return -ENODEV; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " - "is %d and blocks per page is %d.", - chain->bmap_shift, - chain->blocks_per_page); - - chain->blocks.first = this; - - /* - * Couldn't do this earlier, but can't do - * goto_start now - we may have already used blocks - * in the first chain. - */ - chain->blocks.current_extent = this; - chain->blocks.current_offset = this->start; - - /* - * Can't wait until we've read the whole chain - * before we insert it in the list. We might need - * this chain to read the next page in the header - */ - toi_insert_chain_in_prio_list(chain); - } - - /* - * We have to wait until 2 extents are loaded before setting up - * properly because if the first extent has only one page, we - * will need to put the position on the second extent. Sounds - * obvious, but it wasn't! - */ - (*num_loaded)++; - if ((*num_loaded) == 2) - set_up_start_position(); - last = this; - } - - /* - * Shouldn't get empty chains, but it's not impossible. Link them in so - * they get freed properly later. - */ - if (!chain->blocks.num_extents) - toi_insert_chain_in_prio_list(chain); - - if (!chain->blocks.current_extent) { - chain->blocks.current_extent = chain->blocks.first; - if (chain->blocks.current_extent) - chain->blocks.current_offset = - chain->blocks.current_extent->start; - } - return 0; -} - -int toi_load_extent_chains(void) -{ - int result; - int to_load; - int i; - int extents_loaded = 0; - - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &to_load, - sizeof(int)); - if (result) - return result; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); - - for (i = 0; i < to_load; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", - i, to_load); - result = toi_load_extent_chain(i, &extents_loaded); - if (result) - return result; - } - - /* If we never got to a second extent, we still need to do this. */ - if (extents_loaded == 1) - set_up_start_position(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -static int toi_end_of_stream(int writing, int section_barrier) -{ - struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; - int compare_to = next_section[current_stream]; - struct toi_bdev_info *compare_chain = - toi_writer_posn.saved_chain_ptr[compare_to]; - int compare_offset = compare_chain ? 
-		compare_chain->saved_state[compare_to].offset : 0;
-
-	if (!section_barrier)
-		return 0;
-
-	if (!cur_chain)
-		return 1;
-
-	if (cur_chain == compare_chain &&
-	    cur_chain->blocks.current_offset == compare_offset) {
-		if (writing) {
-			if (!current_stream) {
-				debug_broken_header();
-				return 1;
-			}
-		} else {
-			more_readahead = 0;
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Reached the end of stream %d "
-					"(not an error).", current_stream);
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * go_next_page - skip blocks to the start of the next page
- * @writing: Whether we're reading or writing the image.
- * @section_barrier: Whether to stop at the end of the current section.
- *
- * Go forward one page.
- **/
-int go_next_page(int writing, int section_barrier)
-{
-	struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
-	int max = cur_chain ? cur_chain->blocks_per_page : 1;
-
-	/* Go forward a page - or maybe two. Don't stripe the header,
-	 * so that bad fragmentation doesn't put the extent data containing
-	 * the location of the second page out of the first header page.
-	 */
-	if (toi_extent_state_next(max, current_stream)) {
-		/* Don't complain if readahead falls off the end */
-		if (writing && section_barrier) {
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
-					"Expected compression ratio too optimistic?");
-			if (test_action_state(TOI_LOGALL))
-				dump_block_chains();
-		}
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
-				"read/write. (Not necessarily a fatal error.)");
-		return -ENOSPC;
-	}
-
-	return 0;
-}
-
-int devices_of_same_priority(struct toi_bdev_info *this)
-{
-	struct toi_bdev_info *check = prio_chain_head;
-	int i = 0;
-
-	while (check) {
-		if (check->prio == this->prio)
-			i++;
-		check = check->next;
-	}
-
-	return i;
-}
-
-/**
- * toi_bio_rw_page - do i/o on the next disk page in the image
- * @writing: Whether reading or writing.
- * @page: Page to do i/o on.
- * @is_readahead: Whether we're doing readahead.
- * @free_group: The group used in allocating the page.
- *
- * Submit a page for reading or writing, possibly readahead.
- * Pass the group used in allocating the page as well, as it should
- * be freed on completion of the bio if we're writing the page.
- **/
-int toi_bio_rw_page(int writing, struct page *page,
-		int is_readahead, int free_group)
-{
-	int result = toi_end_of_stream(writing, 1);
-	struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
-
-	if (result) {
-		if (writing)
-			abort_hibernate(TOI_INSUFFICIENT_STORAGE,
-				"Insufficient storage for your image.");
-		else
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
-				"read/write another page when stream has "
-				"ended.");
-		return -ENOSPC;
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0,
-		"%s %lx:%ld",
-		writing ?
"Write" : "Read", - dev_info->dev_t, dev_info->blocks.current_offset); - - result = toi_do_io(writing, dev_info->bdev, - dev_info->blocks.current_offset << dev_info->bmap_shift, - page, is_readahead, 0, free_group); - - /* Ignore the result here - will check end of stream if come in again */ - go_next_page(writing, 1); - - if (result) - printk(KERN_ERR "toi_do_io returned %d.\n", result); - return result; -} - -dev_t get_header_dev_t(void) -{ - return prio_chain_head->dev_t; -} - -struct block_device *get_header_bdev(void) -{ - return prio_chain_head->bdev; -} - -unsigned long get_headerblock(void) -{ - return prio_chain_head->blocks.first->start << - prio_chain_head->bmap_shift; -} - -int get_main_pool_phys_params(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - while (this) { - result = this->allocator->bio_allocator_ops->bmap(this); - if (result) - return result; - this = this->next; - } - - return 0; -} - -static int apply_header_reservation(void) -{ - int i; - - if (!header_pages_reserved) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "No header pages reserved at the moment."); - return 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); - - /* Apply header space reservation */ - toi_extent_state_goto_start(); - - for (i = 0; i < header_pages_reserved; i++) - if (go_next_page(1, 0)) - return -ENOSPC; - - /* The end of header pages will be the start of pageset 2 */ - toi_extent_state_save(2); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Finished applying header reservation."); - return 0; -} - -int toi_bio_register_storage(void) -{ - int result = 0; - struct toi_module_ops *this_module; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Registering storage."); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Registering storage from %s.", - this_module->name); - result = this_module->bio_allocator_ops->register_storage(); - if (result) - break; - } - - return result; -} - -void toi_bio_free_unused_storage(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_bio_free_unused_storage_chain(this); - this = this->next; - } -} - -int toi_bio_allocate_storage(unsigned long request) -{ - struct toi_bdev_info *chain = prio_chain_head; - unsigned long to_get = request; - unsigned long extra_pages, needed; - int no_free = 0; - - if (!chain) { - printk("TuxOnIce: No storage was registered.\n"); - return 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Request is %lu pages.", request); - extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) - + sizeof(int)), PAGE_SIZE); - needed = request + extra_pages + header_pages_reserved; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " - "for header => %lu.", - extra_pages, header_pages_reserved, needed); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.", - raw_pages_allocd); - - to_get = needed > raw_pages_allocd ? 
needed - raw_pages_allocd : 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); - - if (!to_get) - return apply_header_reservation(); - - while (to_get && chain) { - int num_group = devices_of_same_priority(chain); - int divisor = num_group - no_free; - int i; - unsigned long portion = DIV_ROUND_UP(to_get, divisor); - unsigned long got = 0; - unsigned long got_this_round = 0; - struct toi_bdev_info *top = chain; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Start of loop. To get is %lu. Divisor is %d.", - to_get, divisor); - no_free = 0; - - /* - * We're aiming to spread the allocated storage as evenly - * as possible, but we also want to get all the storage we - * can off this priority. - */ - for (i = 0; i < num_group; i++) { - struct toi_bio_allocator_ops *ops = - chain->allocator->bio_allocator_ops; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Asking for %lu pages from chain %p.", - portion, chain); - got = ops->allocate_storage(chain, portion); - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Got %lu pages from allocator %p.", - got, chain); - if (!got) - no_free++; - got_this_round += got; - chain = chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " - "total of %lu pages from %d allocators.", - got_this_round, divisor - no_free); - - raw_pages_allocd += got_this_round; - to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : - 0; - - /* - * If we got anything from chains of this priority and we - * still have storage to allocate, go over this priority - * again. - */ - if (got_this_round && to_get) - chain = top; - else - no_free = 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " - "get_main_pool_phys_params"); - /* Now let swap allocator bmap the pages */ - get_main_pool_phys_params(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. 
Reserving header."); - return apply_header_reservation(); -} - -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - cur_chain->pages_used = bkd->pages_used[i]; - cur_chain = cur_chain->next; - i++; - } -} - -int toi_bio_chains_debug_info(char *buffer, int size) -{ - /* Show what we actually used */ - struct toi_bdev_info *cur_chain = prio_chain_head; - int len = 0; - - while (cur_chain) { - len += scnprintf(buffer + len, size - len, " Used %lu pages " - "from %s.\n", cur_chain->pages_used, - cur_chain->name); - cur_chain = cur_chain->next; - } - - return len; -} - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain, - *cmp = prio_chain_head; - - ptr->save.chain = 1; - while (this != cmp) { - ptr->save.chain++; - cmp = cmp->next; - } - ptr->save.block = this->blocks.current_offset; - - /* Save the raw info internally for quicker access when updating pointers */ - ptr->bdev = this->bdev; - ptr->block = this->blocks.current_offset << this->bmap_shift; -} - -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - int i = ptr->save.chain - 1; - struct toi_bdev_info *this; - struct hibernate_extent *hib; - - /* Find chain by stored index */ - this = prio_chain_head; - while (i) { - this = this->next; - i--; - } - toi_writer_posn.current_chain = this; - - /* Restore block */ - this->blocks.current_offset = ptr->save.block; - - /* Find current offset from block number */ - hib = this->blocks.first; - - while (hib->start > ptr->save.block) { - hib = hib->next; - } - - this->blocks.last_touched = this->blocks.current_extent = hib; -} diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c deleted file mode 100644 index bc27662d7..000000000 --- a/kernel/power/tuxonice_bio_core.c +++ /dev/null @@ -1,1937 +0,0 @@ -/* - * kernel/power/tuxonice_bio.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains block io functions for TuxOnIce. These are - * used by the swapwriter and it is planned that they will also - * be used by the NFSwriter. 
- * - */ - -#include -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -#define MEMORY_ONLY 1 -#define THROTTLE_WAIT 2 - -/* #define MEASURE_MUTEX_CONTENTION */ -#ifndef MEASURE_MUTEX_CONTENTION -#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) -#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) -#else -unsigned long mutex_times[2][2][NR_CPUS]; -#define my_mutex_lock(index, the_lock) do { \ - int have_mutex; \ - have_mutex = mutex_trylock(the_lock); \ - if (!have_mutex) { \ - mutex_lock(the_lock); \ - mutex_times[index][0][smp_processor_id()]++; \ - } else { \ - mutex_times[index][1][smp_processor_id()]++; \ - } - -#define my_mutex_unlock(index, the_lock) \ - mutex_unlock(the_lock); \ -} while (0) -#endif - -static int page_idx, reset_idx; - -static int target_outstanding_io = 1024; -static int max_outstanding_writes, max_outstanding_reads; - -static struct page *bio_queue_head, *bio_queue_tail; -static atomic_t toi_bio_queue_size; -static DEFINE_SPINLOCK(bio_queue_lock); - -static int free_mem_throttle, throughput_throttle; -int more_readahead = 1; -static struct page *readahead_list_head, *readahead_list_tail; - -static struct page *waiting_on; - -static atomic_t toi_io_in_progress, toi_io_done; -static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); - -int current_stream; -/* Not static, so that the allocators can setup and complete - * writing the header */ -char *toi_writer_buffer; -int toi_writer_buffer_posn; - -static DEFINE_MUTEX(toi_bio_mutex); -static DEFINE_MUTEX(toi_bio_readahead_mutex); - -static struct task_struct *toi_queue_flusher; -static int toi_bio_queue_flush_pages(int dedicated_thread); - -struct toi_module_ops toi_blockwriter_ops; - -struct toi_incremental_image_pointer toi_inc_ptr[2][2]; - -#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ - atomic_read(&toi_bio_queue_size)) - -unsigned long raw_pages_allocd, header_pages_reserved; - -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead); - -/** - * set_free_mem_throttle - set the point where we pause to avoid oom. - * - * Initially, this value is zero, but when we first fail to allocate memory, - * we set it (plus a buffer) and thereafter throttle i/o once that limit is - * reached. - **/ -static void set_free_mem_throttle(void) -{ - int new_throttle = nr_free_buffer_pages() + 256; - - if (new_throttle > free_mem_throttle) - free_mem_throttle = new_throttle; -} - -#define NUM_REASONS 7 -static atomic_t reasons[NUM_REASONS]; -static char *reason_name[NUM_REASONS] = { - "readahead not ready", - "bio allocation", - "synchronous I/O", - "toi_bio_get_new_page", - "memory low", - "readahead buffer allocation", - "throughput_throttle", -}; - -/* User Specified Parameters. */ -unsigned long resume_firstblock; -dev_t resume_dev_t; -struct block_device *resume_block_device; -static atomic_t resume_bdev_open_count; - -struct block_device *header_block_device; - -/** - * toi_open_bdev: Open a bdev at resume time. - * - * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t - * (the user can have resume= pointing at a swap partition/file that isn't - * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the - * header. 
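/* Annotation (not part of the patch): a userspace analogue of the
 * MEASURE_MUTEX_CONTENTION macros above. The fast path is a trylock;
 * only when it fails do we block, and each outcome is counted so the
 * contended/free split can be reported later. pthread names stand in
 * for the kernel's mutex API. */
#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_ulong contended, uncontended;

static void counted_lock(void)
{
	if (pthread_mutex_trylock(&lock) == 0) {
		atomic_fetch_add(&uncontended, 1);	/* got it first try */
	} else {
		pthread_mutex_lock(&lock);		/* had to wait */
		atomic_fetch_add(&contended, 1);
	}
}

static void counted_unlock(void)
{
	pthread_mutex_unlock(&lock);
}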
It will be from a swap partition that was enabled when we hibernated, - * but we don't know it's real index until we read that first page. - * dev_t: The device major/minor. - * display_errs: Whether to try to do this quietly. - * - * We stored a dev_t in the image header. Open the matching device without - * requiring /dev/ in most cases and record the details needed - * to close it later and avoid duplicating work. - */ -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs) -{ - struct block_device *bdev; - dev_t device = default_device; - char buf[32]; - int retried = 0; - -retry: - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = 0; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (!device) { - device = default_device; - printk(KERN_DEBUG "Unable to resolve uuid. Falling back" - " to dev_t.\n"); - } else - printk(KERN_DEBUG "Resolved uuid to device %s.\n", - format_dev_t(buf, device)); - } - - if (!device) { - printk(KERN_ERR "TuxOnIce attempting to open a " - "blank dev_t!\n"); - dump_stack(); - return NULL; - } - bdev = toi_open_by_devnum(device); - - if (IS_ERR(bdev) || !bdev) { - if (!retried) { - retried = 1; - wait_for_device_probe(); - goto retry; - } - if (display_errs) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to get access to block device " - "\"%x\" (error %d).\n Maybe you need " - "to run mknod and/or lvmsetup in an " - "initrd/ramfs?", device, bdev); - return ERR_PTR(-EINVAL); - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "TuxOnIce got bdev %p for dev_t %x.", - bdev, device); - - return bdev; -} - -static void toi_bio_reserve_header_space(unsigned long request) -{ - header_pages_reserved = request; -} - -/** - * do_bio_wait - wait for some TuxOnIce I/O to complete - * @reason: The array index of the reason we're waiting. - * - * Wait for a particular page of I/O if we're after a particular page. - * If we're not after a particular page, wait instead for all in flight - * I/O to be completed or for us to have enough free memory to be able - * to submit more I/O. - * - * If we wait, we also update our statistics regarding why we waited. - **/ -static void do_bio_wait(int reason) -{ - struct page *was_waiting_on = waiting_on; - - /* On SMP, waiting_on can be reset, so we make a copy */ - if (was_waiting_on) { - wait_on_page_locked(was_waiting_on); - atomic_inc(&reasons[reason]); - } else { - atomic_inc(&reasons[reason]); - - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - nr_free_buffer_pages() > free_mem_throttle); - } -} - -/** - * throttle_if_needed - wait for I/O completion if throttle points are reached - * @flags: What to check and how to act. - * - * Check whether we need to wait for some I/O to complete. We always check - * whether we have enough memory available, but may also (depending upon - * @reason) check if the throughput throttle limit has been reached. - **/ -static int throttle_if_needed(int flags) -{ - int free_pages = nr_free_buffer_pages(); - - /* Getting low on memory and I/O is in progress? 
*/ - while (unlikely(free_pages < free_mem_throttle) && - atomic_read(&toi_io_in_progress) && - !test_result_state(TOI_ABORTED)) { - if (!(flags & THROTTLE_WAIT)) - return -ENOMEM; - do_bio_wait(4); - free_pages = nr_free_buffer_pages(); - } - - while (!(flags & MEMORY_ONLY) && throughput_throttle && - TOTAL_OUTSTANDING_IO >= throughput_throttle && - !test_result_state(TOI_ABORTED)) { - int result = toi_bio_queue_flush_pages(0); - if (result) - return result; - atomic_inc(&reasons[6]); - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - TOTAL_OUTSTANDING_IO < throughput_throttle); - } - - return 0; -} - -/** - * update_throughput_throttle - update the raw throughput throttle - * @jif_index: The number of times this function has been called. - * - * This function is called four times per second by the core, and used to limit - * the amount of I/O we submit at once, spreading out our waiting through the - * whole job and letting userui get an opportunity to do its work. - * - * We don't start limiting I/O until 1/4s has gone so that we get a - * decent sample for our initial limit, and keep updating it because - * throughput may vary (on rotating media, eg) with our block number. - * - * We throttle to 1/10s worth of I/O. - **/ -static void update_throughput_throttle(int jif_index) -{ - int done = atomic_read(&toi_io_done); - throughput_throttle = done * 2 / 5 / jif_index; -} - -/** - * toi_finish_all_io - wait for all outstanding i/o to complete - * - * Flush any queued but unsubmitted I/O and wait for it all to complete. - **/ -static int toi_finish_all_io(void) -{ - int result = toi_bio_queue_flush_pages(0); - toi_bio_queue_flusher_should_finish = 1; - wake_up(&toi_io_queue_flusher); - wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO); - return result; -} - -/** - * toi_end_bio - bio completion function. - * @bio: bio that has completed. - * - * Function called by the block driver from interrupt context when I/O is - * completed. If we were writing the page, we want to free it and will have - * set bio->bi_private to the parameter we should use in telling the page - * allocation accounting code what the page was allocated for. If we're - * reading the page, it will be in the singly linked list made from - * page->private pointers. - **/ -static void toi_end_bio(struct bio *bio) -{ - struct page *page = bio->bi_io_vec[0].bv_page; - - BUG_ON(bio->bi_error); - - unlock_page(page); - bio_put(bio); - - if (waiting_on == page) - waiting_on = NULL; - - put_page(page); - - if (bio->bi_private) - toi__free_page((int) ((unsigned long) bio->bi_private) , page); - - bio_put(bio); - - atomic_dec(&toi_io_in_progress); - atomic_inc(&toi_io_done); - - wake_up(&num_in_progress_wait); -} - -/** - * submit - submit BIO request - * @writing: READ or WRITE. - * @dev: The block device we're using. - * @first_block: The first sector we're using. - * @page: The page being used for I/O. - * @free_group: If writing, the group that was used in allocating the page - * and which will be used in freeing the page from the completion - * routine. - * - * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the - * textbook - allocate and initialize the bio. If we're writing, make sure - * the page is marked as dirty. Then submit it and carry on." - * - * If we're just testing the speed of our own code, we fake having done all - * the hard work and all toi_end_bio immediately. 
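/* Annotation (not part of the patch): why update_throughput_throttle()
 * above uses done * 2 / 5 / jif_index for "1/10s worth of I/O". The
 * function runs four times a second, so after jif_index calls the
 * measured rate is done / jif_index pages per quarter second, i.e.
 * 4 * done / jif_index pages per second. A tenth of a second at that
 * rate is (4 / 10) * done / jif_index = done * 2 / 5 / jif_index,
 * which is the cap applied to TOTAL_OUTSTANDING_IO. */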
- **/ -static int submit(int writing, struct block_device *dev, sector_t first_block, - struct page *page, int free_group) -{ - struct bio *bio = NULL; - int cur_outstanding_io, result; - - /* - * Shouldn't throttle if reading - can deadlock in the single - * threaded case as pages are only freed when we use the - * readahead. - */ - if (writing) { - result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); - if (result) - return result; - } - - while (!bio) { - bio = bio_alloc(TOI_ATOMIC_GFP, 1); - if (!bio) { - set_free_mem_throttle(); - do_bio_wait(1); - } - } - - bio->bi_bdev = dev; - bio->bi_iter.bi_sector = first_block; - bio->bi_private = (void *) ((unsigned long) free_group); - bio->bi_end_io = toi_end_bio; - bio_set_flag(bio, BIO_TOI); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", - (unsigned long long) first_block); - bio_put(bio); - return -EFAULT; - } - - bio_get(bio); - - cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); - if (writing) { - if (cur_outstanding_io > max_outstanding_writes) - max_outstanding_writes = cur_outstanding_io; - } else { - if (cur_outstanding_io > max_outstanding_reads) - max_outstanding_reads = cur_outstanding_io; - } - - /* Still read the header! */ - if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { - /* Fake having done the hard work */ - bio->bi_error = 0; - toi_end_bio(bio); - } else - submit_bio(writing | REQ_SYNC, bio); - - return 0; -} - -/** - * toi_do_io: Prepare to do some i/o on a page and submit or batch it. - * - * @writing: Whether reading or writing. - * @bdev: The block device which we're using. - * @block0: The first sector we're reading or writing. - * @page: The page on which I/O is being done. - * @readahead_index: If doing readahead, the index (reset this flag when done). - * @syncio: Whether the i/o is being done synchronously. - * - * Prepare and start a read or write operation. - * - * Note that we always work with our own page. If writing, we might be given a - * compression buffer that will immediately be used to start compressing the - * next page. For reading, we do readahead and therefore don't know the final - * address where the data needs to go. - **/ -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group) -{ - page->private = 0; - - /* Do here so we don't race against toi_bio_get_next_page_read */ - lock_page(page); - - if (is_readahead) { - if (readahead_list_head) - readahead_list_tail->private = (unsigned long) page; - else - readahead_list_head = page; - - readahead_list_tail = page; - } - - /* Done before submitting to avoid races. */ - if (syncio) - waiting_on = page; - - /* Submit the page */ - get_page(page); - - if (submit(writing, bdev, block0, page, free_group)) - return -EFAULT; - - if (syncio) - do_bio_wait(2); - - return 0; -} - -/** - * toi_bdev_page_io - simpler interface to do directly i/o on a single page - * @writing: Whether reading or writing. - * @bdev: Block device on which we're operating. - * @pos: Sector at which page to read or write starts. - * @page: Page to be read/written. - * - * A simple interface to submit a page of I/O and wait for its completion. - * The caller must free the page used. 
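/* Annotation (not part of the patch): a sketch of the high-watermark
 * bookkeeping in submit() above. The in-flight counter is bumped
 * atomically and the post-increment value compared with the recorded
 * maximum. C11 atomics stand in for the kernel's atomic_add_return();
 * as in the original, the maximum itself is debug-only and updated
 * without a lock. */
#include <stdatomic.h>

static atomic_int in_flight;
static int max_in_flight;	/* stats only, racy by design */

static void account_submit(void)
{
	int cur = atomic_fetch_add(&in_flight, 1) + 1;	/* value after add */

	if (cur > max_in_flight)
		max_in_flight = cur;
}

static void account_complete(void)
{
	atomic_fetch_sub(&in_flight, 1);
}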
- **/ -static int toi_bdev_page_io(int writing, struct block_device *bdev, - long pos, struct page *page) -{ - return toi_do_io(writing, bdev, pos, page, 0, 1, 0); -} - -/** - * toi_bio_memory_needed - report the amount of memory needed for block i/o - * - * We want to have at least enough memory so as to have target_outstanding_io - * or more transactions on the fly at once. If we can do more, fine. - **/ -static int toi_bio_memory_needed(void) -{ - return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + - sizeof(struct bio)); -} - -/** - * toi_bio_print_debug_stats - put out debugging info in the buffer provided - * @buffer: A buffer of size @size into which text should be placed. - * @size: The size of @buffer. - * - * Fill a buffer with debugging info. This is used for both our debug_info sysfs - * entry and for recording the same info in dmesg. - **/ -static int toi_bio_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - if (toiActiveAllocator != &toi_blockwriter_ops) { - len = scnprintf(buffer, size, - "- Block I/O inactive.\n"); - return len; - } - - len = scnprintf(buffer, size, "- Block I/O active.\n"); - - len += toi_bio_chains_debug_info(buffer + len, size - len); - - len += scnprintf(buffer + len, size - len, - "- Max outstanding reads %d. Max writes %d.\n", - max_outstanding_reads, max_outstanding_writes); - - len += scnprintf(buffer + len, size - len, - " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", - target_outstanding_io, - PAGE_SIZE, (unsigned int) sizeof(struct request), - (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); - -#ifdef MEASURE_MUTEX_CONTENTION - { - int i; - - len += scnprintf(buffer + len, size - len, - " Mutex contention while reading:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[0][0][i], mutex_times[0][1][i]); - - len += scnprintf(buffer + len, size - len, - " Mutex contention while writing:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[1][0][i], mutex_times[1][1][i]); - - } -#endif - - return len + scnprintf(buffer + len, size - len, - " Free mem throttle point reached %d.\n", free_mem_throttle); -} - -static int total_header_bytes; -static int unowned; - -void debug_broken_header(void) -{ - printk(KERN_DEBUG "Image header too big for size allocated!\n"); - print_toi_header_storage_for_modules(); - printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); - printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); - printk(KERN_DEBUG "Total unowned : %d.\n", unowned); - printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, - DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); - printk(KERN_DEBUG "Space needed now : %ld.\n", - get_header_storage_needed(0)); - dump_block_chains(); - abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); -} - -static int toi_bio_update_previous_inc_img_ptr(int stream) -{ - int result; - char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP); - struct page *page; - struct toi_incremental_image_pointer *prev, *this; - - prev = &toi_inc_ptr[stream][0]; - this = &toi_inc_ptr[stream][1]; - - if (!buffer) { - // We're at the start of writing a pageset. Memory should not be that scarce. 
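/* Annotation (not part of the patch): a runnable sketch of the
 * back-patching this function performs. Each pageset begins with a
 * pointer block; when the next pageset is written, the previous pointer
 * block is read back, its forward link patched, and the block rewritten
 * in place, building a linked list on disk. The in-memory "disk" and
 * all names are illustrative assumptions. */
#include <stdint.h>
#include <string.h>

#define BLK_SIZE 4096
static uint8_t disk[64][BLK_SIZE];	/* toy stand-in for the device */

struct ptr_block {
	uint64_t next_block;		/* 0 = end of chain */
};

static int link_new_pageset(uint64_t prev_blk, uint64_t new_blk)
{
	struct ptr_block pb;

	memcpy(&pb, disk[prev_blk], sizeof(pb));	/* read old block */
	pb.next_block = new_blk;			/* patch the link */
	memcpy(disk[prev_blk], &pb, sizeof(pb));	/* write it back */
	return 0;
}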
- return -ENOMEM; - } - - page = virt_to_page(buffer); - result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0); - - if (result) - goto out; - - memcpy(buffer, (char *) this, sizeof(this->save)); - - result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12); - - // If the IO is successfully submitted (!result), the page will be freed - // asynchronously on completion. -out: - if (result) - toi__free_page(12, virt_to_page(buffer)); - return result; -} - -/** - * toi_rw_init_incremental - incremental image part of setting up to write new section - */ -static int toi_write_init_incremental(int stream) -{ - int result = 0; - - // Remember the location of this block so we can link to it. - toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Update the pointer at the start of the last pageset with the same stream number. - result = toi_bio_update_previous_inc_img_ptr(stream); - if (result) - return result; - - // Move the current to the previous slot. - memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1])); - - // Store a blank pointer at the start of this incremental pageset - memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1])); - result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - if (result) - return result; - - // Serialise extent chains if this is an incremental pageset - return toi_serialise_extent_chains(); -} - -/** - * toi_read_init_incremental - incremental image part of setting up to read new section - */ -static int toi_read_init_incremental(int stream) -{ - int result; - - // Set our position to the start of the next pageset - toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Read the start of the next incremental pageset (if any) - result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - - if (!result) - result = toi_load_extent_chains(); - - return result; -} - -/** - * toi_rw_init - prepare to read or write a stream in the image - * @writing: Whether reading or writing. - * @stream number: Section of the image being processed. - * - * Prepare to read or write a section ('stream') in the image. - **/ -static int toi_rw_init(int writing, int stream_number) -{ - if (stream_number) - toi_extent_state_restore(stream_number); - else - toi_extent_state_goto_start(); - - if (writing) { - reset_idx = 0; - if (!current_stream) - page_idx = 0; - } else { - reset_idx = 1; - } - - atomic_set(&toi_io_done, 0); - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE; - - current_stream = stream_number; - - more_readahead = 1; - - if (test_result_state(TOI_KEPT_IMAGE)) { - int result; - - if (writing) { - result = toi_write_init_incremental(stream_number); - } else { - result = toi_read_init_incremental(stream_number); - } - - if (result) - return result; - } - - return toi_writer_buffer ? 0 : -ENOMEM; -} - -/** - * toi_bio_queue_write - queue a page for writing - * @full_buffer: Pointer to a page to be queued - * - * Add a page to the queue to be submitted. If we're the queue flusher, - * we'll do this once we've dropped toi_bio_mutex, so other threads can - * continue to submit I/O while we're on the slow path doing the actual - * submission. 
- **/
-static void toi_bio_queue_write(char **full_buffer)
-{
-	struct page *page = virt_to_page(*full_buffer);
-	unsigned long flags;
-
-	*full_buffer = NULL;
-	page->private = 0;
-
-	spin_lock_irqsave(&bio_queue_lock, flags);
-	if (!bio_queue_head)
-		bio_queue_head = page;
-	else
-		bio_queue_tail->private = (unsigned long) page;
-
-	bio_queue_tail = page;
-	atomic_inc(&toi_bio_queue_size);
-
-	spin_unlock_irqrestore(&bio_queue_lock, flags);
-	wake_up(&toi_io_queue_flusher);
-}
-
-/**
- * toi_rw_cleanup - Cleanup after i/o.
- * @writing: Whether we were reading or writing.
- *
- * Flush all I/O and clean everything up after reading or writing a
- * section of the image.
- **/
-static int toi_rw_cleanup(int writing)
-{
-	int i, result = 0;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup.");
-	if (writing) {
-		if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED))
-			toi_bio_queue_write(&toi_writer_buffer);
-
-		while (bio_queue_head && !result)
-			result = toi_bio_queue_flush_pages(0);
-
-		if (result)
-			return result;
-
-		if (current_stream == 2)
-			toi_extent_state_save(1);
-		else if (current_stream == 1)
-			toi_extent_state_save(3);
-	}
-
-	result = toi_finish_all_io();
-
-	while (readahead_list_head) {
-		void *next = (void *) readahead_list_head->private;
-		toi__free_page(12, readahead_list_head);
-		readahead_list_head = next;
-	}
-
-	readahead_list_tail = NULL;
-
-	if (!current_stream)
-		return result;
-
-	for (i = 0; i < NUM_REASONS; i++) {
-		if (!atomic_read(&reasons[i]))
-			continue;
-		printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n",
-				reason_name[i], atomic_read(&reasons[i]));
-		atomic_set(&reasons[i], 0);
-	}
-
-	current_stream = 0;
-	return result;
-}
-
-/**
- * toi_start_one_readahead - start one page of readahead
- * @dedicated_thread: Is this a thread dedicated to doing readahead?
- *
- * Start one new page of readahead. If this is being called by a thread
- * whose only job is to submit readahead, don't quit because we failed
- * to allocate a page.
- **/
-static int toi_start_one_readahead(int dedicated_thread)
-{
-	char *buffer = NULL;
-	int oom = 0, result;
-
-	result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0);
-	if (result) {
-		printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result);
-		return result;
-	}
-
-	mutex_lock(&toi_bio_readahead_mutex);
-
-	while (!buffer) {
-		buffer = (char *) toi_get_zeroed_page(12,
-				TOI_ATOMIC_GFP);
-		if (!buffer) {
-			if (oom && !dedicated_thread) {
-				mutex_unlock(&toi_bio_readahead_mutex);
-				printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result);
-				return -ENOMEM;
-			}
-
-			oom = 1;
-			set_free_mem_throttle();
-			do_bio_wait(5);
-		}
-	}
-
-	result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
-	if (result) {
-		printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result);
-	}
-	if (result == -ENOSPC)
-		toi__free_page(12, virt_to_page(buffer));
-	mutex_unlock(&toi_bio_readahead_mutex);
-	if (result) {
-		if (result == -ENOSPC)
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Last readahead page submitted.");
-		else
-			printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n",
-					result);
-	}
-	return result;
-}
-
-/**
- * toi_start_new_readahead - start new readahead
- * @dedicated_thread: Are we dedicated to this task?
- *
- * Start readahead of image pages.
- *
- * We can be called as a thread dedicated to this task (may be helpful on
- * systems with lots of CPUs), in which case we don't exit until there's no
- * more readahead.
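/* Annotation (not part of the patch): a userspace sketch of the
 * intrusive FIFO maintained by toi_bio_queue_write() above. Each queued
 * item carries the "next" link itself (the kernel reuses page->private
 * for this), so queueing allocates nothing - which matters when memory
 * is scarce mid-hibernate. pthread locking stands in for the spinlock. */
#include <pthread.h>
#include <stddef.h>

struct qitem {
	struct qitem *next;
};

static struct qitem *q_head, *q_tail;
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;

static void queue_push(struct qitem *it)
{
	it->next = NULL;
	pthread_mutex_lock(&q_lock);
	if (!q_head)
		q_head = it;		/* first entry */
	else
		q_tail->next = it;	/* append after current tail */
	q_tail = it;
	pthread_mutex_unlock(&q_lock);
}

static struct qitem *queue_pop(void)
{
	struct qitem *it;

	pthread_mutex_lock(&q_lock);
	it = q_head;
	if (it) {
		q_head = it->next;
		if (q_tail == it)
			q_tail = NULL;
	}
	pthread_mutex_unlock(&q_lock);
	return it;
}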
- * - * If this is not called by a dedicated thread, we top up our queue until - * there's no more readahead to submit, we've submitted the number given - * in target_outstanding_io or the number in progress exceeds the target - * outstanding I/O value. - * - * No mutex needed because this is only ever called by the first cpu. - **/ -static int toi_start_new_readahead(int dedicated_thread) -{ - int last_result, num_submitted = 0; - - /* Start a new readahead? */ - if (!more_readahead) - return 0; - - do { - last_result = toi_start_one_readahead(dedicated_thread); - - if (last_result) { - if (last_result == -ENOMEM || last_result == -ENOSPC) - return 0; - - printk(KERN_DEBUG - "Begin read chunk returned %d.\n", - last_result); - } else - num_submitted++; - - } while (more_readahead && !last_result && - (dedicated_thread || - (num_submitted < target_outstanding_io && - atomic_read(&toi_io_in_progress) < target_outstanding_io))); - - return last_result; -} - -/** - * bio_io_flusher - start the dedicated I/O flushing routine - * @writing: Whether we're writing the image. - **/ -static int bio_io_flusher(int writing) -{ - - if (writing) - return toi_bio_queue_flush_pages(1); - else - return toi_start_new_readahead(1); -} - -/** - * toi_bio_get_next_page_read - read a disk page, perhaps with readahead - * @no_readahead: Whether we can use readahead - * - * Read a page from disk, submitting readahead and cleaning up finished i/o - * while we wait for the page we're after. - **/ -static int toi_bio_get_next_page_read(int no_readahead) -{ - char *virt; - struct page *old_readahead_list_head; - - /* - * When reading the second page of the header, we have to - * delay submitting the read until after we've gotten the - * extents out of the first page. - */ - if (unlikely(no_readahead)) { - int result = toi_start_one_readahead(0); - if (result) { - printk(KERN_EMERG "No readahead and toi_start_one_readahead " - "returned non-zero.\n"); - return -EIO; - } - } - - if (unlikely(!readahead_list_head)) { - /* - * If the last page finishes exactly on the page - * boundary, we will be called one extra time and - * have no data to return. In this case, we should - * not BUG(), like we used to! - */ - if (!more_readahead) { - printk(KERN_EMERG "No more readahead.\n"); - return -ENOSPC; - } - if (unlikely(toi_start_one_readahead(0))) { - printk(KERN_EMERG "No readahead and " - "toi_start_one_readahead returned non-zero.\n"); - return -EIO; - } - } - - if (PageLocked(readahead_list_head)) { - waiting_on = readahead_list_head; - do_bio_wait(0); - } - - virt = page_address(readahead_list_head); - memcpy(toi_writer_buffer, virt, PAGE_SIZE); - - mutex_lock(&toi_bio_readahead_mutex); - old_readahead_list_head = readahead_list_head; - readahead_list_head = (struct page *) readahead_list_head->private; - mutex_unlock(&toi_bio_readahead_mutex); - toi__free_page(12, old_readahead_list_head); - return 0; -} - -/** - * toi_bio_queue_flush_pages - flush the queue of pages queued for writing - * @dedicated_thread: Whether we're a dedicated thread - * - * Flush the queue of pages ready to be written to disk. - * - * If we're a dedicated thread, stay in here until told to leave, - * sleeping in wait_event. - * - * The first thread is normally the only one to come in here. Another - * thread can enter this routine too, though, via throttle_if_needed. - * Since that's the case, we must be careful to only have one thread - * doing this work at a time. Otherwise we have a race and could save - * pages out of order. 
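/* Annotation (not part of the patch): the "only one flusher at a time"
 * rule described above, reduced to its core. A static mutex taken with
 * trylock means a second caller returns immediately instead of queueing
 * behind the first, so pages are only ever submitted by one thread, in
 * order. Userspace pthread analogue; the drain loop is a stand-in. */
#include <pthread.h>

static pthread_mutex_t busy = PTHREAD_MUTEX_INITIALIZER;
static int queued;		/* stand-in for the real page queue */

static int flush_queue(void)
{
	if (pthread_mutex_trylock(&busy) != 0)
		return 0;	/* someone else is already flushing */

	while (queued > 0)
		queued--;	/* submit one queued page */

	pthread_mutex_unlock(&busy);
	return 0;
}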
- * - * If an error occurs, free all remaining pages without submitting them - * for I/O. - **/ - -int toi_bio_queue_flush_pages(int dedicated_thread) -{ - unsigned long flags; - int result = 0; - static DEFINE_MUTEX(busy); - - if (!mutex_trylock(&busy)) - return 0; - -top: - spin_lock_irqsave(&bio_queue_lock, flags); - while (bio_queue_head) { - struct page *page = bio_queue_head; - bio_queue_head = (struct page *) page->private; - if (bio_queue_tail == page) - bio_queue_tail = NULL; - atomic_dec(&toi_bio_queue_size); - spin_unlock_irqrestore(&bio_queue_lock, flags); - - /* Don't generate more error messages if already had one */ - if (!result) - result = toi_bio_rw_page(WRITE, page, 0, 11); - /* - * If writing the page failed, don't drop out. - * Flush the rest of the queue too. - */ - if (result) - toi__free_page(11 , page); - spin_lock_irqsave(&bio_queue_lock, flags); - } - spin_unlock_irqrestore(&bio_queue_lock, flags); - - if (dedicated_thread) { - wait_event(toi_io_queue_flusher, bio_queue_head || - toi_bio_queue_flusher_should_finish); - if (likely(!toi_bio_queue_flusher_should_finish)) - goto top; - toi_bio_queue_flusher_should_finish = 0; - } - - mutex_unlock(&busy); - return result; -} - -/** - * toi_bio_get_new_page - get a new page for I/O - * @full_buffer: Pointer to a page to allocate. - **/ -static int toi_bio_get_new_page(char **full_buffer) -{ - int result = throttle_if_needed(THROTTLE_WAIT); - if (result) - return result; - - while (!*full_buffer) { - *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); - if (!*full_buffer) { - set_free_mem_throttle(); - do_bio_wait(3); - } - } - - return 0; -} - -/** - * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O - * @writing: Bool - whether writing (or reading). - * @buffer: The start of the buffer to write or fill. - * @buffer_size: The size of the buffer to write or fill. - * @no_readahead: Don't try to start readhead (when getting extents). - **/ -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead) -{ - int bytes_left = buffer_size, result = 0; - - while (bytes_left) { - char *source_start = buffer + buffer_size - bytes_left; - char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; - int capacity = PAGE_SIZE - toi_writer_buffer_posn; - char *to = writing ? dest_start : source_start; - char *from = writing ? source_start : dest_start; - - if (bytes_left <= capacity) { - memcpy(to, from, bytes_left); - toi_writer_buffer_posn += bytes_left; - return 0; - } - - /* Complete this page and start a new one */ - memcpy(to, from, capacity); - bytes_left -= capacity; - - if (!writing) { - /* - * Perform actual I/O: - * read readahead_list_head into toi_writer_buffer - */ - int result = toi_bio_get_next_page_read(no_readahead); - if (result && bytes_left) { - printk("toi_bio_get_next_page_read " - "returned %d. Expecting to read %d bytes.\n", result, bytes_left); - return result; - } - } else { - toi_bio_queue_write(&toi_writer_buffer); - result = toi_bio_get_new_page(&toi_writer_buffer); - if (result) { - printk(KERN_ERR "toi_bio_get_new_page returned " - "%d.\n", result); - return result; - } - } - - toi_writer_buffer_posn = 0; - toi_cond_pause(0, NULL); - } - - return 0; -} - -/** - * toi_bio_read_page - read a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). 
- * - * Read a (possibly compressed) page from the image, into buffer_page, - * returning its pfn and the buffer size. - **/ -static int toi_bio_read_page(unsigned long *pfn, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int result = 0; - int this_idx; - char *buffer_virt = TOI_MAP(buf_type, buffer_page); - - /* - * Only call start_new_readahead if we don't have a dedicated thread - * and we're the queue flusher. - */ - if (current == toi_queue_flusher && more_readahead && - !test_action_state(TOI_NO_READAHEAD)) { - int result2 = toi_start_new_readahead(0); - if (result2) { - printk(KERN_DEBUG "Queue flusher and " - "toi_start_one_readahead returned non-zero.\n"); - result = -EIO; - goto out; - } - } - - my_mutex_lock(0, &toi_bio_mutex); - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - * We can validly find there's nothing to read in a multithreaded - * situation. - */ - if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) || - toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) || - toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) { - result = -ENODATA; - goto out_unlock; - } - - if (reset_idx) { - page_idx = this_idx; - reset_idx = 0; - } else { - page_idx++; - if (!this_idx) - result = -ENODATA; - else if (page_idx != this_idx) - printk(KERN_ERR "Got page index %d, expected %d.\n", - this_idx, page_idx); - } - -out_unlock: - my_mutex_unlock(0, &toi_bio_mutex); -out: - TOI_UNMAP(buf_type, buffer_page); - return result; -} - -/** - * toi_bio_write_page - write a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used. - * - * Write a (possibly compressed) page to the image from the buffer, together - * with it's index and buffer size. - **/ -static int toi_bio_write_page(unsigned long pfn, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - char *buffer_virt; - int result = 0, result2 = 0; - - if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) - return 0; - - my_mutex_lock(1, &toi_bio_mutex); - - if (test_result_state(TOI_ABORTED)) { - my_mutex_unlock(1, &toi_bio_mutex); - return 0; - } - - buffer_virt = TOI_MAP(buf_type, buffer_page); - page_idx++; - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - */ - if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) || - toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) || - toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) { - printk(KERN_DEBUG "toi_rw_buffer returned non-zero to " - "toi_bio_write_page.\n"); - result = -EIO; - } - - TOI_UNMAP(buf_type, buffer_page); - my_mutex_unlock(1, &toi_bio_mutex); - - if (current == toi_queue_flusher) - result2 = toi_bio_queue_flush_pages(0); - - return result ? result : result2; -} - -/** - * _toi_rw_header_chunk - read or write a portion of the image header - * @writing: Whether reading or writing. - * @owner: The module for which we're writing. - * Used for confirming that modules - * don't use more header space than they asked for. - * @buffer: Address of the data to write. - * @buffer_size: Size of the data buffer. - * @no_readahead: Don't try to start readhead (when getting extents). - * - * Perform PAGE_SIZE I/O. Start readahead if needed. 
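/* Annotation (not part of the patch): a runnable sketch of the
 * coalescing done by toi_rw_buffer() above. Writes of arbitrary size
 * are packed into a page-sized staging buffer that is flushed whenever
 * it fills, so the device only ever sees whole pages. flush_page() is
 * an assumed helper standing in for the bio queue. */
#include <string.h>

#define PG 4096

static char page_buf[PG];
static int page_pos;
static int pages_flushed;

static int flush_page(void)
{
	pages_flushed++;	/* stand-in for queueing page_buf */
	return 0;
}

static int buffered_write(const char *data, int len)
{
	while (len) {
		int space = PG - page_pos;
		int chunk = len < space ? len : space;

		memcpy(page_buf + page_pos, data, chunk);
		page_pos += chunk;
		data += chunk;
		len -= chunk;

		if (page_pos == PG) {	/* page full: push it out */
			int ret = flush_page();

			if (ret)
				return ret;
			page_pos = 0;
		}
	}
	return 0;
}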
- **/ -static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int buffer_size, int no_readahead) -{ - int result = 0; - - if (owner) { - owner->header_used += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: %s : %d bytes (%d/%d) from offset %d.", - owner->name, - buffer_size, owner->header_used, - owner->header_requested, - toi_writer_buffer_posn); - if (owner->header_used > owner->header_requested && writing) { - printk(KERN_EMERG "TuxOnIce module %s is using more " - "header space (%u) than it requested (%u).\n", - owner->name, - owner->header_used, - owner->header_requested); - return buffer_size; - } - } else { - unowned += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: (No owner): %d bytes (%d total so far) from " - "offset %d.", buffer_size, unowned, - toi_writer_buffer_posn); - } - - if (!writing && !no_readahead && more_readahead) { - result = toi_start_new_readahead(0); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " - "returned %d.", result); - } - - if (!result) { - result = toi_rw_buffer(writing, buffer, buffer_size, - no_readahead); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " - "%d.", result); - } - - total_header_bytes += buffer_size; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " - "%d.", result); - return result; -} - -static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -static int toi_rw_header_chunk_noreadahead(int writing, - struct toi_module_ops *owner, char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -/** - * toi_bio_storage_needed - get the amount of storage needed for my fns - **/ -static int toi_bio_storage_needed(void) -{ - return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); -} - -/** - * toi_bio_save_config_info - save block I/O config to image header - * @buf: PAGE_SIZE'd buffer into which data should be saved. - **/ -static int toi_bio_save_config_info(char *buf) -{ - int *ints = (int *) buf; - ints[0] = target_outstanding_io; - return sizeof(int); -} - -/** - * toi_bio_load_config_info - restore block I/O config - * @buf: Data to be reloaded. - * @size: Size of the buffer saved. - **/ -static void toi_bio_load_config_info(char *buf, int size) -{ - int *ints = (int *) buf; - target_outstanding_io = ints[0]; -} - -void close_resume_dev_t(int force) -{ - if (!resume_block_device) - return; - - if (force) - atomic_set(&resume_bdev_open_count, 0); - else - atomic_dec(&resume_bdev_open_count); - - if (!atomic_read(&resume_bdev_open_count)) { - toi_close_bdev(resume_block_device); - resume_block_device = NULL; - } -} - -int open_resume_dev_t(int force, int quiet) -{ - if (force) { - close_resume_dev_t(1); - atomic_set(&resume_bdev_open_count, 1); - } else - atomic_inc(&resume_bdev_open_count); - - if (resume_block_device) - return 0; - - resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); - if (IS_ERR(resume_block_device)) { - if (!quiet) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to open device %x, where" - " the header should be found.", - resume_dev_t); - resume_block_device = NULL; - atomic_set(&resume_bdev_open_count, 0); - return 1; - } - - return 0; -} - -/** - * toi_bio_initialise - initialise bio code at start of some action - * @starting_cycle: Whether starting a hibernation cycle, or just reading or - * writing a sysfs value. 
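/* Annotation (not part of the patch): a sketch of the refcounted
 * open/close pattern used by open_resume_dev_t()/close_resume_dev_t()
 * above. The device is opened on the 0->1 transition and closed on
 * 1->0, so nested users share one handle. C11 atomics stand in for
 * atomic_t; like the original, this assumes open and close calls are
 * otherwise serialised. All names are illustrative. */
#include <stdatomic.h>
#include <stddef.h>

static atomic_int open_count;
static void *handle;
static int dummy;

static void *do_open(void) { return &dummy; }	/* stand-in for the real open */
static void do_close(void *h) { (void) h; }

static int shared_open(void)
{
	if (atomic_fetch_add(&open_count, 1) > 0)
		return 0;	/* already open, just took a reference */
	handle = do_open();
	return handle ? 0 : -1;
}

static void shared_close(void)
{
	if (atomic_fetch_sub(&open_count, 1) == 1 && handle) {
		do_close(handle);	/* last user closes for real */
		handle = NULL;
	}
}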
- **/ -static int toi_bio_initialise(int starting_cycle) -{ - int result; - - if (!starting_cycle || !resume_dev_t) - return 0; - - max_outstanding_writes = 0; - max_outstanding_reads = 0; - current_stream = 0; - toi_queue_flusher = current; -#ifdef MEASURE_MUTEX_CONTENTION - { - int i, j, k; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - for_each_online_cpu(k) - mutex_times[i][j][k] = 0; - } -#endif - result = open_resume_dev_t(0, 1); - - if (result) - return result; - - result = toi_bio_register_storage(); - - if (result) - return result; - - return get_signature_page(); -} - -static unsigned long raw_to_real(unsigned long raw) -{ - unsigned long extra; - - extra = (raw * (sizeof(unsigned long) + sizeof(int)) + - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); - - return raw > extra ? raw - extra : 0; -} - -static unsigned long toi_bio_storage_available(void) -{ - unsigned long sum = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " - "available from %s.", this_module->name); - sum += this_module->bio_allocator_ops->storage_available(); - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " - "pages (%d header pages).", sum, header_pages_reserved); - - return sum > header_pages_reserved ? - raw_to_real(sum - header_pages_reserved) : 0; - -} - -static unsigned long toi_bio_storage_allocated(void) -{ - return raw_pages_allocd > header_pages_reserved ? - raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; -} - -/* - * If we have read part of the image, we might have filled memory with - * data that should be zeroed out. - */ -static void toi_bio_noresume_reset(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); - toi_rw_cleanup(READ); - free_all_bdev_info(); -} - -/** - * toi_bio_cleanup - cleanup after some action - * @finishing_cycle: Whether completing a cycle. - **/ -static void toi_bio_cleanup(int finishing_cycle) -{ - if (!finishing_cycle) - return; - - if (toi_writer_buffer) { - toi_free_page(11, (unsigned long) toi_writer_buffer); - toi_writer_buffer = NULL; - } - - forget_signature_page(); - - if (header_block_device && toi_sig_data && - toi_sig_data->header_dev_t != resume_dev_t) - toi_close_bdev(header_block_device); - - header_block_device = NULL; - - close_resume_dev_t(0); -} - -static int toi_bio_write_header_init(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); - toi_rw_init(WRITE, 0); - toi_writer_buffer_posn = 0; - - /* Info needed to bootstrap goes at the start of the header. - * First we save the positions and devinfo, including the number - * of header pages. Then we save the structs containing data needed - * for reading the header pages back. - * Note that even if header pages take more than one page, when we - * read back the info, we will have restored the location of the - * next header page by the time we go to use it. - */ - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); - result = toi_serialise_extent_chains(); - - if (result) - return result; - - /* - * Signature page hasn't been modified at this point. Write it in - * the header so we can restore it later. 
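/* Annotation (not part of the patch): a worked instance of raw_to_real()
 * above, assuming a 64-bit build with 4 KiB pages (sizeof(unsigned long)
 * == 8, sizeof(int) == 4). Every stored page costs 12 bytes of pfn+size
 * metadata, so the overhead is roughly raw * 12 / 4108 pages. For
 * raw = 100000: extra = (100000 * 12 + 4109) / 4108 = 293 metadata
 * pages, leaving 99707 pages of real payload. */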
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page."); - return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops, - (char *) toi_cur_sig_page, - PAGE_SIZE); -} - -static int toi_bio_write_header_cleanup(void) -{ - int result = 0; - - if (toi_writer_buffer_posn) - toi_bio_queue_write(&toi_writer_buffer); - - result = toi_finish_all_io(); - - unowned = 0; - total_header_bytes = 0; - - /* Set signature to save we have an image */ - if (!result) - result = toi_bio_mark_have_image(); - - return result; -} - -/* - * toi_bio_read_header_init() - * - * Description: - * 1. Attempt to read the device specified with resume=. - * 2. Check the contents of the swap header for our signature. - * 3. Warn, ignore, reset and/or continue as appropriate. - * 4. If continuing, read the toi_swap configuration section - * of the header and set up block device info so we can read - * the rest of the header & image. - * - * Returns: - * May not return if user choose to reboot at a warning. - * -EINVAL if cannot resume at this time. Booting should continue - * normally. - */ - -static int toi_bio_read_header_init(void) -{ - int result = 0; - char buf[32]; - - toi_writer_buffer_posn = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init"); - - if (!toi_sig_data) { - printk(KERN_INFO "toi_bio_read_header_init called when we " - "haven't verified there is an image!\n"); - return -EINVAL; - } - - /* - * If the header is not on the resume_swap_dev_t, get the resume device - * first. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.", - toi_sig_data->header_dev_t); - if (toi_sig_data->have_uuid) { - struct fs_info seek; - dev_t device; - - strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16); - seek.dev_t = toi_sig_data->header_dev_t; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (device) { - printk("Using dev_t %s, returned by blk_lookup_fs_info.\n", - format_dev_t(buf, device)); - toi_sig_data->header_dev_t = device; - } - } - if (toi_sig_data->header_dev_t != resume_dev_t) { - header_block_device = toi_open_bdev(NULL, - toi_sig_data->header_dev_t, 1); - - if (IS_ERR(header_block_device)) - return PTR_ERR(header_block_device); - } else - header_block_device = resume_block_device; - - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - more_readahead = 1; - - /* - * Read toi_swap configuration. - * Headerblock size taken into account already. - */ - result = toi_bio_ops.bdev_page_io(READ, header_block_device, - toi_sig_data->first_header_block, - virt_to_page((unsigned long) toi_writer_buffer)); - if (result) - return result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains."); - result = toi_load_extent_chains(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page."); - toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - if (!toi_orig_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the current" - " image signature.\n"); - return -ENOMEM; - } - - return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops, - (char *) toi_orig_sig_page, - PAGE_SIZE); -} - -static int toi_bio_read_header_cleanup(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup."); - return toi_rw_cleanup(READ); -} - -/* Works only for digits and letters, but small and fast */ -#define TOLOWER(x) ((x) | 0x20) - -/* - * UUID must be 32 chars long. It may have dashes, but nothing - * else. 
- */ -char *uuid_from_commandline(char *commandline) -{ - int low = 0; - char *result = NULL, *output, *ptr; - - if (strncmp(commandline, "UUID=", 5)) - return NULL; - - result = kzalloc(17, GFP_KERNEL); - if (!result) { - printk("Failed to kzalloc UUID text memory.\n"); - return NULL; - } - - ptr = commandline + 5; - output = result; - - while (*ptr && (output - result) < 16) { - if (isxdigit(*ptr)) { - int value = isdigit(*ptr) ? *ptr - '0' : - TOLOWER(*ptr) - 'a' + 10; - if (low) { - *output += value; - output++; - } else { - *output = value << 4; - } - low = !low; - } else if (*ptr != '-') - break; - ptr++; - } - - if ((output - result) < 16 || *ptr) { - printk(KERN_DEBUG "Found resume=UUID=, but the value looks " - "invalid.\n"); - kfree(result); - result = NULL; - } - - return result; -} - -#define retry_if_fails(command) \ -do { \ - command; \ - if (!resume_dev_t && !waited_for_device_probe) { \ - wait_for_device_probe(); \ - command; \ - waited_for_device_probe = 1; \ - } \ -} while(0) - -/** - * try_to_open_resume_device: Try to parse and open resume= - * - * Any "swap:" has been stripped away and we just have the path to deal with. - * We attempt to do name_to_dev_t, open and stat the file. Having opened the - * file, get the struct block_device * to match. - */ -static int try_to_open_resume_device(char *commandline, int quiet) -{ - struct kstat stat; - int error = 0; - char *uuid = uuid_from_commandline(commandline); - int waited_for_device_probe = 0; - - resume_dev_t = MKDEV(0, 0); - - if (!strlen(commandline)) - retry_if_fails(toi_bio_scan_for_image(quiet)); - - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = resume_dev_t; - seek.last_mount_size = 0; - retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); - kfree(uuid); - } - - if (!resume_dev_t) - retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); - - if (!resume_dev_t) { - struct file *file = filp_open(commandline, - O_RDONLY|O_LARGEFILE, 0); - - if (!IS_ERR(file) && file) { - vfs_getattr(&file->f_path, &stat); - filp_close(file, NULL); - } else - error = vfs_stat(commandline, &stat); - if (!error) - resume_dev_t = stat.rdev; - } - - if (!resume_dev_t) { - if (quiet) - return 1; - - if (test_toi_state(TOI_TRYING_TO_RESUME)) - toi_early_boot_message(1, toi_translate_err_default, - "Failed to translate \"%s\" into a device id.\n", - commandline); - else - printk("TuxOnIce: Can't translate \"%s\" into a device " - "id yet.\n", commandline); - return 1; - } - - return open_resume_dev_t(1, quiet); -} - -/* - * Parse Image Location - * - * Attempt to parse a resume= parameter. - * Swap Writer accepts: - * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] - * - * Where: - * DEVNAME is convertable to a dev_t by name_to_dev_t - * FIRSTBLOCK is the location of the first block in the swap file - * (specifying for a swap partition is nonsensical but not prohibited). - * Data is validated by attempting to read a swap header from the - * location given. Failure will result in toi_swap refusing to - * save an image, and a reboot with correct parameters will be - * necessary. 
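/* Annotation (not part of the patch): a compact, slightly stricter
 * standalone variant of the nibble packing in uuid_from_commandline()
 * above. Hex digits are folded into bytes two at a time, dashes are
 * skipped, and anything else fails the parse (the original breaks out
 * instead). Not kernel code; names are illustrative. */
#include <ctype.h>

/* Returns 16 on success (bytes written to out), -1 on bad input. */
static int parse_uuid(const char *s, unsigned char *out)
{
	int nibbles = 0;

	for (; *s && nibbles < 32; s++) {
		int v;

		if (*s == '-')
			continue;		/* dashes are cosmetic */
		if (!isxdigit((unsigned char) *s))
			return -1;
		v = isdigit((unsigned char) *s) ? *s - '0' :
			tolower((unsigned char) *s) - 'a' + 10;
		if (nibbles & 1)
			out[nibbles / 2] |= v;		/* low nibble */
		else
			out[nibbles / 2] = v << 4;	/* high nibble */
		nibbles++;
	}
	return nibbles == 32 ? 16 : -1;		/* must be a full UUID */
}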
- */ -static int toi_bio_parse_sig_location(char *commandline, - int only_allocator, int quiet) -{ - char *thischar, *devstart, *colon = NULL; - int signature_found, result = -EINVAL, temp_result = 0; - - if (strncmp(commandline, "swap:", 5) && - strncmp(commandline, "file:", 5)) { - /* - * Failing swap:, we'll take a simple resume=/dev/hda2, or a - * blank value (scan) but fall through to other allocators - * if /dev/ or UUID= isn't matched. - */ - if (strncmp(commandline, "/dev/", 5) && - strncmp(commandline, "UUID=", 5) && - strlen(commandline)) - return 1; - } else - commandline += 5; - - devstart = commandline; - thischar = commandline; - while ((*thischar != ':') && (*thischar != '@') && - ((thischar - commandline) < 250) && (*thischar)) - thischar++; - - if (*thischar == ':') { - colon = thischar; - *colon = 0; - thischar++; - } - - while ((thischar - commandline) < 250 && *thischar) - thischar++; - - if (colon) { - unsigned long block; - temp_result = kstrtoul(colon + 1, 0, &block); - if (!temp_result) - resume_firstblock = (int) block; - } else - resume_firstblock = 0; - - clear_toi_state(TOI_CAN_HIBERNATE); - clear_toi_state(TOI_CAN_RESUME); - - if (!temp_result) - temp_result = try_to_open_resume_device(devstart, quiet); - - if (colon) - *colon = ':'; - - /* No error if we only scanned */ - if (temp_result) - return strlen(commandline) ? -EINVAL : 1; - - signature_found = toi_bio_image_exists(quiet); - - if (signature_found != -1) { - result = 0; - /* - * TODO: If only file storage, CAN_HIBERNATE should only be - * set if file allocator's target is valid. - */ - set_toi_state(TOI_CAN_HIBERNATE); - set_toi_state(TOI_CAN_RESUME); - } else - if (!quiet) - printk(KERN_ERR "TuxOnIce: Block I/O: No " - "signature found at %s.\n", devstart); - - return result; -} - -static void toi_bio_release_storage(void) -{ - header_pages_reserved = 0; - raw_pages_allocd = 0; - - free_all_bdev_info(); -} - -/* toi_bio_remove_image - * - */ -static int toi_bio_remove_image(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); - - result = toi_bio_restore_original_signature(); - - /* - * We don't do a sanity check here: we want to restore the swap - * whichever version of the kernel made the hibernate image. - * - * We need to write swap, but swap may not be enabled so - * we write the device directly - * - * If we don't have a current_signature_page, we didn't - * read an image header, so don't change anything.
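A user-space sketch of the same split-at-the-colon technique the parser uses (the string "/dev/sda2:682" is hypothetical): temporarily NUL-terminate at the colon, convert the suffix, then put the colon back so the caller's string is untouched.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        char cmdline[] = "/dev/sda2:682";
        char *colon = strchr(cmdline, ':');
        unsigned long firstblock = 0;

        if (colon) {
            *colon = '\0';                        /* split DEVNAME from FIRSTBLOCK */
            firstblock = strtoul(colon + 1, NULL, 0); /* base 0, as kstrtoul above */
        }
        printf("dev=%s firstblock=%lu\n", cmdline, firstblock);
        if (colon)
            *colon = ':';                         /* restore, as the parser does */
        return 0;
    }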
- */ - - toi_bio_release_storage(); - - return result; -} - -struct toi_bio_ops toi_bio_ops = { - .bdev_page_io = toi_bdev_page_io, - .register_storage = toi_register_storage_chain, - .free_storage = toi_bio_release_storage, -}; - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, - 0, 16384, 0, NULL), -}; - -struct toi_module_ops toi_blockwriter_ops = { - .type = WRITER_MODULE, - .name = "block i/o", - .directory = "block_io", - .module = THIS_MODULE, - .memory_needed = toi_bio_memory_needed, - .print_debug_info = toi_bio_print_debug_stats, - .storage_needed = toi_bio_storage_needed, - .save_config_info = toi_bio_save_config_info, - .load_config_info = toi_bio_load_config_info, - .initialise = toi_bio_initialise, - .cleanup = toi_bio_cleanup, - .post_atomic_restore = toi_bio_chains_post_atomic, - - .rw_init = toi_rw_init, - .rw_cleanup = toi_rw_cleanup, - .read_page = toi_bio_read_page, - .write_page = toi_bio_write_page, - .rw_header_chunk = toi_rw_header_chunk, - .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, - .io_flusher = bio_io_flusher, - .update_throughput_throttle = update_throughput_throttle, - .finish_all_io = toi_finish_all_io, - - .noresume_reset = toi_bio_noresume_reset, - .storage_available = toi_bio_storage_available, - .storage_allocated = toi_bio_storage_allocated, - .reserve_header_space = toi_bio_reserve_header_space, - .allocate_storage = toi_bio_allocate_storage, - .free_unused_storage = toi_bio_free_unused_storage, - .image_exists = toi_bio_image_exists, - .mark_resume_attempted = toi_bio_mark_resume_attempted, - .write_header_init = toi_bio_write_header_init, - .write_header_cleanup = toi_bio_write_header_cleanup, - .read_header_init = toi_bio_read_header_init, - .read_header_cleanup = toi_bio_read_header_cleanup, - .get_header_version = toi_bio_get_header_version, - .remove_image = toi_bio_remove_image, - .parse_sig_location = toi_bio_parse_sig_location, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/** - * toi_block_io_load - load time routine for block I/O module - * - * Register block i/o ops and sysfs entries. - **/ -static __init int toi_block_io_load(void) -{ - return toi_register_module(&toi_blockwriter_ops); -} - -late_initcall(toi_block_io_load); diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h deleted file mode 100644 index 5e1964a61..000000000 --- a/kernel/power/tuxonice_bio_internal.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * kernel/power/tuxonice_bio_internal.h - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
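The registration above follows a common pattern: a statically initialised ops table handed to the core once, at late_initcall time. A compilable skeleton of that pattern, with hypothetical names (demo_module_ops, demo_register_module) standing in for the TuxOnIce types:

    /* Skeleton of an ops-table module registration; names are illustrative. */
    struct demo_module_ops {
        const char *name;
        int (*initialise)(int starting_cycle);
        void (*cleanup)(int ending_cycle);
    };

    static int demo_initialise(int starting_cycle) { return 0; }
    static void demo_cleanup(int ending_cycle) { }

    static struct demo_module_ops demo_ops = {
        .name = "demo writer",
        .initialise = demo_initialise,
        .cleanup = demo_cleanup,
    };

    static struct demo_module_ops *registered;

    /* Stands in for toi_register_module(); the real core keeps a list. */
    static int demo_register_module(struct demo_module_ops *ops)
    {
        registered = ops;
        return 0;
    }

    int main(void) { return demo_register_module(&demo_ops); }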
- */ - -/* Extent chains */ -void toi_extent_state_goto_start(void); -void toi_extent_state_save(int slot); -int go_next_page(int writing, int section_barrier); -void toi_extent_state_restore(int slot); -void free_all_bdev_info(void); -int devices_of_same_priority(struct toi_bdev_info *this); -int toi_register_storage_chain(struct toi_bdev_info *new); -int toi_serialise_extent_chains(void); -int toi_load_extent_chains(void); -int toi_bio_rw_page(int writing, struct page *page, int is_readahead, - int free_group); -int toi_bio_restore_original_signature(void); -int toi_bio_devinfo_storage_needed(void); -unsigned long get_headerblock(void); -dev_t get_header_dev_t(void); -struct block_device *get_header_bdev(void); -int toi_bio_allocate_storage(unsigned long request); -void toi_bio_free_unused_storage(void); - -/* Signature functions */ -#define HaveImage "HaveImage" -#define NoImage "TuxOnIce" -#define sig_size (sizeof(HaveImage)) - -struct sig_data { - char sig[sig_size]; - int have_image; - int resumed_before; - - char have_uuid; - char header_uuid[17]; - dev_t header_dev_t; - unsigned long first_header_block; - - /* Repeat the signature to be sure we have a header version */ - char sig2[sig_size]; - int header_version; -}; - -void forget_signature_page(void); -int toi_check_for_signature(void); -int toi_bio_image_exists(int quiet); -int get_signature_page(void); -int toi_bio_mark_resume_attempted(int); -extern char *toi_cur_sig_page; -extern char *toi_orig_sig_page; -int toi_bio_mark_have_image(void); -extern struct sig_data *toi_sig_data; -extern dev_t resume_dev_t; -extern struct block_device *resume_block_device; -extern struct block_device *header_block_device; -extern unsigned long resume_firstblock; - -struct block_device *open_bdev(dev_t device, int display_errs); -extern int current_stream; -extern int more_readahead; -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group); -int get_main_pool_phys_params(void); - -void toi_close_bdev(struct block_device *bdev); -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs); - -extern struct toi_module_ops toi_blockwriter_ops; -void dump_block_chains(void); -void debug_broken_header(void); -extern unsigned long raw_pages_allocd, header_pages_reserved; -int toi_bio_chains_debug_info(char *buffer, int size); -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); -int toi_bio_scan_for_image(int quiet); -int toi_bio_get_header_version(void); - -void close_resume_dev_t(int force); -int open_resume_dev_t(int force, int quiet); - -struct toi_incremental_image_pointer_saved_data { - unsigned long block; - int chain; -}; - -struct toi_incremental_image_pointer { - struct toi_incremental_image_pointer_saved_data save; - struct block_device *bdev; - unsigned long block; -}; - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr); -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr); diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c deleted file mode 100644 index f5418f092..000000000 --- a/kernel/power/tuxonice_bio_signature.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * kernel/power/tuxonice_bio_signature.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. 
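To see how the on-disk signature block is laid out, a user-space replica of struct sig_data can print its field offsets. dev_t is stood in by unsigned int here, so the numbers are illustrative, not authoritative:

    #include <stdio.h>
    #include <stddef.h>

    #define SIG_SIZE sizeof("HaveImage")   /* matches sig_size above */

    /* User-space replica of struct sig_data for layout inspection only. */
    struct sig_data_demo {
        char sig[SIG_SIZE];
        int have_image;
        int resumed_before;
        char have_uuid;
        char header_uuid[17];
        unsigned int header_dev_t;         /* stand-in for dev_t */
        unsigned long first_header_block;
        char sig2[SIG_SIZE];               /* repeated signature */
        int header_version;
    };

    int main(void)
    {
        printf("sig at %zu, sig2 at %zu, total %zu bytes\n",
               offsetof(struct sig_data_demo, sig),
               offsetof(struct sig_data_demo, sig2),
               sizeof(struct sig_data_demo));
        return 0;
    }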
- * - */ - -#include - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -struct sig_data *toi_sig_data; - -/* Structs for examining the swap header page */ - -struct old_sig_data { - dev_t device; - unsigned long sector; - int resume_attempted; - int orig_sig_type; -}; - -union diskpage { - union swap_header swh; /* swh.magic is the only member used */ - struct sig_data sig_data; - struct old_sig_data old_sig_data; -}; - -union p_diskpage { - union diskpage *pointer; - char *ptr; - unsigned long address; -}; - -char *toi_cur_sig_page; -char *toi_orig_sig_page; -int have_image; -int have_old_image; - -int get_signature_page(void) -{ - if (!toi_cur_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Allocating current signature page."); - toi_cur_sig_page = (char *) toi_get_zeroed_page(38, - TOI_ATOMIC_GFP); - if (!toi_cur_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the " - "current image signature.\n"); - return -ENOMEM; - } - - toi_sig_data = (struct sig_data *) toi_cur_sig_page; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," - " sector %d.", - resume_block_device->bd_dev, resume_firstblock); - - return toi_bio_ops.bdev_page_io(READ, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -void forget_signature_page(void) -{ - if (toi_cur_sig_page) { - toi_sig_data = NULL; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" - " (%p).", toi_cur_sig_page); - toi_free_page(38, (unsigned long) toi_cur_sig_page); - toi_cur_sig_page = NULL; - } - - if (toi_orig_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" - " (%p).", toi_orig_sig_page); - toi_free_page(38, (unsigned long) toi_orig_sig_page); - toi_orig_sig_page = NULL; - } -} - -/* - * We need to ensure we use the signature page that's currently on disk, - * so as not to remove the image header. Post-atomic-restore, the orig sig - * page will be empty, so we can use that as our method of knowing that we - * need to load the on-disk signature and not use the non-image sig in - * memory. (We're going to power down after writing the change, so it's safe.)
- */ -int toi_bio_mark_resume_attempted(int flag) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", - flag); - if (!toi_orig_sig_page) { - forget_signature_page(); - get_signature_page(); - } - toi_sig_data->resumed_before = flag; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int toi_bio_mark_have_image(void) -{ - int result = 0; - char buf[32]; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); - memcpy(toi_sig_data->sig, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->have_image = 1; - toi_sig_data->resumed_before = 0; - toi_sig_data->header_dev_t = get_header_dev_t(); - toi_sig_data->have_uuid = 0; - - fs_info = fs_info_from_block_dev(get_header_bdev()); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!result) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - toi_sig_data->have_uuid = 1; - } else - toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " - "dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - - toi_sig_data->first_header_block = get_headerblock(); - have_image = 1; - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block " - "is %d.", toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - - memcpy(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->header_version = TOI_HEADER_VERSION; - - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int remove_old_signature(void) -{ - union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; - char *orig_sig; - char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - int result; - struct block_device *header_bdev; - struct old_sig_data *old_sig_data = - &swap_header_page.pointer->old_sig_data; - - header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); - result = toi_bio_ops.bdev_page_io(READ, header_bdev, - old_sig_data->sector, virt_to_page(header_start)); - - if (result) - goto out; - - /* - * TODO: Get the original contents of the first bytes of the swap - * header page. - */ - if (!old_sig_data->orig_sig_type) - orig_sig = "SWAP-SPACE"; - else - orig_sig = "SWAPSPACE2"; - - memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); - memcpy(swap_header_page.ptr, header_start, 10); - - result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(swap_header_page.ptr)); - -out: - toi_close_bdev(header_bdev); - have_old_image = 0; - toi_free_page(38, (unsigned long) header_start); - return result; -} - -/* - * toi_bio_restore_original_signature - restore the original signature - * - * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. - * It will have the original signature page contents, stored in the image - * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain - * the contents that were loaded when we started the cycle. - */ -int toi_bio_restore_original_signature(void) -{ - char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; - - if (have_old_image) - return remove_old_signature(); - - if (!use) { - printk("toi_bio_restore_original_signature: No signature " - "page loaded.\n"); - return 0; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); - have_image = 0; - toi_sig_data->have_image = 0; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(use)); -} - -/* - * check_for_signature - See whether we have an image. - * - * Returns 0 if no image, 1 if there is one, -1 if indeterminate. - */ -int toi_check_for_signature(void) -{ - union p_diskpage swap_header_page; - int type; - const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; - const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; - char *swap_header; - - if (!toi_cur_sig_page) { - int result = get_signature_page(); - - if (result) - return result; - } - - /* - * Start by looking for the binary header. - */ - if (!memcmp(tuxonice_signature, toi_cur_sig_page, - sizeof(tuxonice_signature))) { - have_image = toi_sig_data->have_image; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " - "Have image is %d.", have_image); - if (have_image) - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " - "%x. First block is %d.", - toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - return toi_sig_data->have_image; - } - - /* - * Failing that, try old file allocator headers. - */ - - if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { - have_image = 1; - return 1; - } - - have_image = 0; - - if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) - return 0; - - /* - * Nope? How about swap? - */ - swap_header_page = (union p_diskpage) toi_cur_sig_page; - swap_header = swap_header_page.pointer->swh.magic.magic; - - /* Normal swapspace? */ - for (type = 0; type < 2; type++) - if (!memcmp(normal_sigs[type], swap_header, - strlen(normal_sigs[type]))) - return 0; - - /* Swsusp or uswsusp? */ - for (type = 0; type < 3; type++) - if (!memcmp(swsusp_sigs[type], swap_header, - strlen(swsusp_sigs[type]))) - return 2; - - /* Old TuxOnIce version? */ - if (!memcmp(tuxonice_signature, swap_header, - sizeof(tuxonice_signature) - 1)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " - "signature."); - have_old_image = 1; - return 3; - } - - return -1; -} - -/* - * Image_exists - * - * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). - */ -int toi_bio_image_exists(int quiet) -{ - int result; - char *msg = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); - - if (!resume_dev_t) { - if (!quiet) - printk(KERN_INFO "Not even trying to read header " - "because resume_dev_t is not set.\n"); - return -1; - } - - if (open_resume_dev_t(0, quiet)) - return -1; - - result = toi_check_for_signature(); - - clear_toi_state(TOI_RESUMED_BEFORE); - if (toi_sig_data->resumed_before) - set_toi_state(TOI_RESUMED_BEFORE); - - if (quiet || result == -ENOMEM) - return result; - - if (result == -1) - msg = "TuxOnIce: Unable to find a signature." 
- " Could you have moved a swap file?\n"; - else if (!result) - msg = "TuxOnIce: No image found.\n"; - else if (result == 1) - msg = "TuxOnIce: Image found.\n"; - else if (result == 2) - msg = "TuxOnIce: uswsusp or swsusp image found.\n"; - else if (result == 3) - msg = "TuxOnIce: Old implementation's signature found.\n"; - - printk(KERN_INFO "%s", msg); - - return result; -} - -int toi_bio_scan_for_image(int quiet) -{ - struct block_device *bdev; - char default_name[255] = ""; - - if (!quiet) - printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " - "signature...\n"); - for (bdev = next_bdev_of_type(NULL, "swap"); bdev; - bdev = next_bdev_of_type(bdev, "swap")) { - int result; - char name[255] = ""; - sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - if (!quiet) - printk(KERN_DEBUG "- Trying %s.\n", name); - resume_block_device = bdev; - resume_dev_t = bdev->bd_dev; - - result = toi_check_for_signature(); - - resume_block_device = NULL; - resume_dev_t = MKDEV(0, 0); - - if (!default_name[0]) - strcpy(default_name, name); - - if (result == 1) { - /* Got one! */ - strcpy(resume_file, name); - next_bdev_of_type(bdev, NULL); - if (!quiet) - printk(KERN_DEBUG " ==> Image found on %s.\n", - resume_file); - return 1; - } - forget_signature_page(); - } - - if (!quiet) - printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); - strcpy(resume_file, default_name); - return 0; -} - -int toi_bio_get_header_version(void) -{ - return (memcmp(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature))) ? - 0 : toi_sig_data->header_version; - -} diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c deleted file mode 100644 index 22bf07a43..000000000 --- a/kernel/power/tuxonice_builtin.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tuxonice_io.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_netlink.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_ui.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_power_off.h" -#include "tuxonice_alloc.h" - -unsigned long toi_bootflags_mask; - -/* - * Highmem related functions (x86 only). - */ - -#ifdef CONFIG_HIGHMEM - -/** - * copyback_high: Restore highmem pages. - * - * Highmem data and pbe lists are/can be stored in highmem. - * The format is slightly different to the lowmem pbe lists - * used for the assembly code: the last pbe in each page is - * a struct page * instead of struct pbe *, pointing to the - * next page where pbes are stored (or NULL if happens to be - * the end of the list). Since we don't want to generate - * unnecessary deltas against swsusp code, we use a cast - * instead of a union. 
- **/ - -static void copyback_high(void) -{ - struct page *pbe_page = (struct page *) restore_highmem_pblist; - struct pbe *this_pbe, *first_pbe; - unsigned long *origpage, *copypage; - int pbe_index = 1; - - if (!pbe_page) - return; - - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - - while (this_pbe) { - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; - - origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); - copypage = kmap_atomic((struct page *) this_pbe->address); - - while (loop >= 0) { - *(origpage + loop) = *(copypage + loop); - loop--; - } - - kunmap_atomic(origpage); - kunmap_atomic(copypage); - - if (!this_pbe->next) - break; - - if (pbe_index < PBES_PER_PAGE) { - this_pbe++; - pbe_index++; - } else { - pbe_page = (struct page *) this_pbe->next; - kunmap_atomic(first_pbe); - if (!pbe_page) - return; - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - pbe_index = 1; - } - } - kunmap_atomic(first_pbe); -} - -#else /* CONFIG_HIGHMEM */ -static void copyback_high(void) { } -#endif - -char toi_wait_for_keypress_dev_console(int timeout) -{ - int fd, this_timeout = 255, orig_kthread = 0; - char key = '\0'; - struct termios t, t_backup; - - /* We should be guaranteed /dev/console exists after populate_rootfs() - * in init/main.c. - */ - fd = sys_open("/dev/console", O_RDONLY, 0); - if (fd < 0) { - printk(KERN_INFO "Couldn't open /dev/console.\n"); - return key; - } - - if (sys_ioctl(fd, TCGETS, (long)&t) < 0) - goto out_close; - - memcpy(&t_backup, &t, sizeof(t)); - - t.c_lflag &= ~(ISIG|ICANON|ECHO); - t.c_cc[VMIN] = 0; - -new_timeout: - if (timeout > 0) { - this_timeout = timeout < 26 ? timeout : 25; - timeout -= this_timeout; - this_timeout *= 10; - } - - t.c_cc[VTIME] = this_timeout; - - if (sys_ioctl(fd, TCSETS, (long)&t) < 0) - goto out_restore; - - if (current->flags & PF_KTHREAD) { - orig_kthread = (current->flags & PF_KTHREAD); - current->flags &= ~PF_KTHREAD; - } - - while (1) { - if (sys_read(fd, &key, 1) <= 0) { - if (timeout) - goto new_timeout; - key = '\0'; - break; - } - key = tolower(key); - if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { - if (key == 'c') { - set_toi_state(TOI_CONTINUE_REQ); - break; - } else if (key == ' ') - break; - } else - break; - } - if (orig_kthread) { - current->flags |= PF_KTHREAD; - } - -out_restore: - sys_ioctl(fd, TCSETS, (long)&t_backup); -out_close: - sys_close(fd); - - return key; -} - -struct toi_boot_kernel_data toi_bkd __nosavedata - __attribute__((aligned(PAGE_SIZE))) = { - MY_BOOT_KERNEL_DATA_VERSION, - 0, -#ifdef CONFIG_TOI_REPLACE_SWSUSP - (1 << TOI_REPLACE_SWSUSP) | -#endif - (1 << TOI_NO_FLUSHER_THREAD) | - (1 << TOI_PAGESET2_FULL), -}; - -struct block_device *toi_open_by_devnum(dev_t dev) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); - return err ? ERR_PTR(err) : bdev; -} - -/** - * toi_close_bdev: Close a swap bdev. - * - * int: The swap entry number to close. 
- */ -void toi_close_bdev(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); -} - -int toi_wait = CONFIG_TOI_DEFAULT_WAIT; -struct toi_core_fns *toi_core_fns; -unsigned long toi_result; -struct pagedir pagedir1 = {1}; -struct toi_cbw **toi_first_cbw; -int toi_next_cbw; - -unsigned long toi_get_nonconflicting_page(void) -{ - return toi_core_fns->get_nonconflicting_page(); -} - -int toi_post_context_save(void) -{ - return toi_core_fns->post_context_save(); -} - -int try_tuxonice_hibernate(void) -{ - if (!toi_core_fns) - return -ENODEV; - - return toi_core_fns->try_hibernate(); -} - -static int num_resume_calls; -#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL -static int ignore_late_initcall = 1; -#else -static int ignore_late_initcall; -#endif - -int toi_translate_err_default = TOI_CONTINUE_REQ; - -void try_tuxonice_resume(void) -{ - if (!hibernation_available()) - return; - - /* Don't let it wrap around eventually */ - if (num_resume_calls < 2) - num_resume_calls++; - - if (num_resume_calls == 1 && ignore_late_initcall) { - printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); - return; - } - - if (toi_core_fns) - toi_core_fns->try_resume(); - else - printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); -} - -int toi_lowlevel_builtin(void) -{ - int error = 0; - - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "Error %d hibernating\n", error); - - /* Restore control flow appears here */ - if (!toi_in_hibernate) { - copyback_high(); - set_toi_state(TOI_NOW_RESUMING); - } - - restore_processor_state(); - return error; -} - -unsigned long toi_compress_bytes_in; -unsigned long toi_compress_bytes_out; - -int toi_in_suspend(void) -{ - return in_suspend; -} - -unsigned long toi_state = ((1 << TOI_BOOT_TIME) | - (1 << TOI_IGNORE_LOGLEVEL) | - (1 << TOI_IO_STOPPED)); - -/* The number of hibernates we have started (some may have been cancelled) */ -unsigned int nr_hibernates; -int toi_running; -__nosavedata int toi_in_hibernate; -__nosavedata struct pbe *restore_highmem_pblist; - -int toi_trace_allocs; - -void toi_read_lock_tasklist(void) -{ - read_lock(&tasklist_lock); -} - -void toi_read_unlock_tasklist(void) -{ - read_unlock(&tasklist_lock); -} - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -int (*toi_flag_zram_disks) (void); - -int toi_do_flag_zram_disks(void) -{ - return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0; -} - -#endif - -/* toi_generate_free_page_map - * - * Description: This routine generates a bitmap of free pages from the - * lists used by the memory manager. We then use the bitmap - * to quickly calculate which pages to save and in which - * pagesets. 
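A minimal user-space sketch of the clear-then-mark pass described above, assuming a toy bitmap rather than the kernel's memory bitmaps: every page starts "not free", then each order-sized block found on a free list marks 1UL << order consecutive pages.

    #include <stdio.h>
    #include <string.h>

    #define NPAGES 64
    static unsigned char free_map[NPAGES / 8];

    static void set_free(unsigned long pfn)
    {
        free_map[pfn / 8] |= 1 << (pfn % 8);
    }

    /* Mark 1UL << order pages starting at pfn, as the loop over
     * free_area[order].free_list[] does above. */
    static void mark_block(unsigned long pfn, int order)
    {
        for (unsigned long j = 0; j < (1UL << order); j++)
            set_free(pfn + j);
    }

    int main(void)
    {
        memset(free_map, 0, sizeof(free_map)); /* the ClearPageNosaveFree pass */
        mark_block(8, 2);                      /* e.g. an order-2 block at pfn 8 */
        mark_block(32, 0);
        for (int i = 0; i < NPAGES; i++)
            if (free_map[i / 8] & (1 << (i % 8)))
                printf("pfn %d free\n", i);
        return 0;
    }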
- */ -void toi_generate_free_page_map(void) -{ - int order, cpu, t; - unsigned long flags, i; - struct zone *zone; - struct list_head *curr; - unsigned long pfn; - struct page *page; - - for_each_populated_zone(zone) { - - if (!zone->spanned_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - - for (i = 0; i < zone->spanned_pages; i++) { - pfn = zone->zone_start_pfn + i; - - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - - ClearPageNosaveFree(page); - } - - for_each_migratetype_order(order, t) { - list_for_each(curr, - &zone->free_area[order].free_list[t]) { - unsigned long j; - - pfn = page_to_pfn(list_entry(curr, struct page, - lru)); - for (j = 0; j < (1UL << order); j++) - SetPageNosaveFree(pfn_to_page(pfn + j)); - } - } - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - struct page *page; - int t; - - for (t = 0; t < MIGRATE_PCPTYPES; t++) - list_for_each_entry(page, &pcp->lists[t], lru) - SetPageNosaveFree(page); - } - - spin_unlock_irqrestore(&zone->lock, flags); - } -} - -/* toi_size_of_free_region - * - * Description: Return the number of pages that are free, beginning with and - * including this one. - */ -int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn) -{ - unsigned long this_pfn = start_pfn, - end_pfn = zone_end_pfn(zone); - - while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) - this_pfn++; - - return this_pfn - start_pfn; -} - -static int __init toi_wait_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) { - if (value < -1 || value > 255) - printk(KERN_INFO "TuxOnIce_wait outside range -1 to " - "255.\n"); - else - toi_wait = value; - } - - return 1; -} -__setup("toi_wait", toi_wait_setup); - -static int __init toi_translate_retry_setup(char *str) -{ - toi_translate_err_default = 0; - return 1; -} -__setup("toi_translate_retry", toi_translate_retry_setup); - -static int __init toi_debug_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_LOGALL); - toi_bootflags_mask |= (1 << TOI_LOGALL); - toi_bkd.toi_debug_state = 255; - toi_bkd.toi_default_console_level = 7; - return 1; -} -__setup("toi_debug_setup", toi_debug_setup); - -static int __init toi_pause_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_PAUSE); - toi_bootflags_mask |= (1 << TOI_PAUSE); - return 1; -} -__setup("toi_pause", toi_pause_setup); - -#ifdef CONFIG_PM_DEBUG -static int __init toi_trace_allocs_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - toi_trace_allocs = value; - - return 1; -} -__setup("toi_trace_allocs", toi_trace_allocs_setup); -#endif - -static int __init toi_ignore_late_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - ignore_late_initcall = value; - - return 1; -} -__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); - -static int __init toi_force_no_multithreaded_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); - toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); - - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); - - return 1; -} -__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); - -#ifdef CONFIG_KGDB -static int __init toi_post_resume_breakpoint_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); - toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); - if 
(sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); - - return 1; -} -__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); -#endif - -static int __init toi_disable_readahead_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); - toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); - - return 1; -} -__setup("toi_no_readahead", toi_disable_readahead_setup); diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h deleted file mode 100644 index 9539818e0..000000000 --- a/kernel/power/tuxonice_builtin.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include - -extern struct toi_core_fns *toi_core_fns; -extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; -extern unsigned int nr_hibernates; -extern int toi_in_hibernate; - -extern __nosavedata struct pbe *restore_highmem_pblist; - -int toi_lowlevel_builtin(void); - -#ifdef CONFIG_HIGHMEM -extern __nosavedata struct zone_data *toi_nosave_zone_list; -extern __nosavedata unsigned long toi_nosave_max_pfn; -#endif - -extern unsigned long toi_get_nonconflicting_page(void); -extern int toi_post_context_save(void); - -extern char toi_wait_for_keypress_dev_console(int timeout); -extern struct block_device *toi_open_by_devnum(dev_t dev); -extern void toi_close_bdev(struct block_device *bdev); -extern int toi_wait; -extern int toi_translate_err_default; -extern int toi_force_no_multithreaded; -extern void toi_read_lock_tasklist(void); -extern void toi_read_unlock_tasklist(void); -extern int toi_in_suspend(void); -extern void toi_generate_free_page_map(void); -extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn); - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -extern int toi_do_flag_zram_disks(void); -#else -#define toi_do_flag_zram_disks() (0) -#endif diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c deleted file mode 100644 index 1c4e10c72..000000000 --- a/kernel/power/tuxonice_checksum.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
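The idea is simple: checksum every pageset-2 page before the atomic copy, re-checksum afterwards, and resave any page whose digest changed. A minimal user-space sketch of that compare-and-resave step, using an FNV-1a stand-in rather than the MD4 this module actually configures:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SZ 4096

    /* Stand-in digest; the module below uses cryptoapi ("md4") instead. */
    static uint32_t page_csum(const unsigned char *p)
    {
        uint32_t h = 2166136261u;                /* FNV-1a */
        for (size_t i = 0; i < PAGE_SZ; i++)
            h = (h ^ p[i]) * 16777619u;
        return h;
    }

    int main(void)
    {
        static unsigned char page[PAGE_SZ];
        uint32_t before = page_csum(page);       /* like tuxonice_calc_checksum() */
        page[100] = 0xff;                        /* page modified while "saving" */
        if (before != page_csum(page))
            printf("resave needed\n");           /* like the SetPageResave() path */
        return 0;
    }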
- */ - -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" - -static struct toi_module_ops toi_checksum_ops; - -/* Constant at the mo, but I might allow tuning later */ -static char toi_checksum_name[32] = "md4"; -/* Bytes per checksum */ -#define CHECKSUM_SIZE (16) - -#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) - -struct cpu_context { - struct crypto_hash *transform; - struct hash_desc desc; - struct scatterlist sg[2]; - char *buf; -}; - -static DEFINE_PER_CPU(struct cpu_context, contexts); -static int pages_allocated; -static unsigned long page_list; - -static int toi_num_resaved; - -static unsigned long this_checksum, next_page; -static int checksum_count; - -static inline int checksum_pages_needed(void) -{ - return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); -} - -/* ---- Local buffer management ---- */ - -/* - * toi_checksum_cleanup - * - * Frees memory allocated for our labours. - */ -static void toi_checksum_cleanup(int ending_cycle) -{ - int cpu; - - if (ending_cycle) { - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_hash(this->transform); - this->transform = NULL; - this->desc.tfm = NULL; - } - - if (this->buf) { - toi_free_page(27, (unsigned long) this->buf); - this->buf = NULL; - } - } - } -} - -/* - * toi_crypto_initialise - * - * Prepare to do some work by allocating buffers and transforms. - * Returns: Int: Zero. Even if we can't set up checksum, we still - * seek to hibernate. - */ -static int toi_checksum_initialise(int starting_cycle) -{ - int cpu; - - if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) - return 0; - - if (!*toi_checksum_name) { - printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - struct page *page; - - this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s checksum algorithm: %ld.\n", - toi_checksum_name, (long) this->transform); - this->transform = NULL; - return 1; - } - - this->desc.tfm = this->transform; - this->desc.flags = 0; - - page = toi_alloc_page(27, GFP_KERNEL); - if (!page) - return 1; - this->buf = page_address(page); - sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); - } - return 0; -} - -/* - * toi_checksum_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_checksum_print_debug_stats(char *buffer, int size) -{ - int len; - - if (!toi_checksum_ops.enabled) - return scnprintf(buffer, size, - "- Checksumming disabled.\n"); - - len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", - toi_checksum_name); - len += scnprintf(buffer + len, size - len, - " %d pages resaved in atomic copy.\n", toi_num_resaved); - return len; -} - -static int toi_checksum_memory_needed(void) -{ - return toi_checksum_ops.enabled ? 
- checksum_pages_needed() << PAGE_SHIFT : 0; -} - -static int toi_checksum_storage_needed(void) -{ - if (toi_checksum_ops.enabled) - return strlen(toi_checksum_name) + sizeof(int) + 1; - else - return 0; -} - -/* - * toi_checksum_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save information needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_checksum_save_config_info(char *buffer) -{ - int namelen = strlen(toi_checksum_name) + 1; - int total_len; - - *((unsigned int *) buffer) = namelen; - strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); - total_len = sizeof(unsigned int) + namelen; - return total_len; -} - -/* toi_checksum_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for dechecksumming the image at - * resume time. - */ -static void toi_checksum_load_config_info(char *buffer, int size) -{ - int namelen; - - namelen = *((unsigned int *) (buffer)); - strncpy(toi_checksum_name, buffer + sizeof(unsigned int), - namelen); - return; -} - -/* - * Free Checksum Memory - */ - -void free_checksum_pages(void) -{ - while (pages_allocated) { - unsigned long next = *((unsigned long *) page_list); - ClearPageNosave(virt_to_page(page_list)); - toi_free_page(15, (unsigned long) page_list); - page_list = next; - pages_allocated--; - } -} - -/* - * Allocate Checksum Memory - */ - -int allocate_checksum_pages(void) -{ - int pages_needed = checksum_pages_needed(); - - if (!toi_checksum_ops.enabled) - return 0; - - while (pages_allocated < pages_needed) { - unsigned long *new_page = - (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); - if (!new_page) { - printk(KERN_ERR "Unable to allocate checksum pages.\n"); - return -ENOMEM; - } - SetPageNosave(virt_to_page(new_page)); - (*new_page) = page_list; - page_list = (unsigned long) new_page; - pages_allocated++; - } - - next_page = (unsigned long) page_list; - checksum_count = 0; - - return 0; -} - -char *tuxonice_get_next_checksum(void) -{ - if (!toi_checksum_ops.enabled) - return NULL; - - if (checksum_count % CHECKSUMS_PER_PAGE) - this_checksum += CHECKSUM_SIZE; - else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - checksum_count++; - return (char *) this_checksum; -} - -int tuxonice_calc_checksum(struct page *page, char *checksum_locn) -{ - char *pa; - int result, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!toi_checksum_ops.enabled) - return 0; - - pa = kmap(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap(page); - result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - checksum_locn); - if (result) - printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " - "returned %d.\n", result); - return result; -} -/* - * Verify checksums - */ - -void check_checksums(void) -{ - int index = 0, cpu = smp_processor_id(); - char current_checksum[CHECKSUM_SIZE]; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - unsigned long pfn; - - if (!toi_checksum_ops.enabled) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); - return; - } - - next_page = (unsigned long) page_list; - - toi_num_resaved = 0; - this_checksum = 0; - - toi_trace_index++; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn !=
BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) { - int ret, resave_needed = false; - char *pa; - struct page *page = pfn_to_page(pfn); - - if (index < checksum_count) { - if (index % CHECKSUMS_PER_PAGE) { - this_checksum += CHECKSUM_SIZE; - } else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - /* Done when IRQs disabled so must be atomic */ - pa = kmap_atomic(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap_atomic(pa); - ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - current_checksum); - - if (ret) { - printk(KERN_INFO "Digest failed. Returned %d.\n", ret); - return; - } - - resave_needed = memcmp(current_checksum, (char *) this_checksum, - CHECKSUM_SIZE); - } else { - resave_needed = true; - } - - if (resave_needed) { - TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed); - SetPageResave(pfn_to_page(pfn)); - toi_num_resaved++; - if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) - set_abort_result(TOI_RESAVE_NEEDED); - } - - index++; - } - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, - NULL), - SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, - TOI_ABORT_ON_RESAVE_NEEDED, 0) -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_checksum_ops = { - .type = MISC_MODULE, - .name = "checksumming", - .directory = "checksum", - .module = THIS_MODULE, - .initialise = toi_checksum_initialise, - .cleanup = toi_checksum_cleanup, - .print_debug_info = toi_checksum_print_debug_stats, - .save_config_info = toi_checksum_save_config_info, - .load_config_info = toi_checksum_load_config_info, - .memory_needed = toi_checksum_memory_needed, - .storage_needed = toi_checksum_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -int toi_checksum_init(void) -{ - int result = toi_register_module(&toi_checksum_ops); - return result; -} - -void toi_checksum_exit(void) -{ - toi_unregister_module(&toi_checksum_ops); -} diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h deleted file mode 100644 index c8196fbb0..000000000 --- a/kernel/power/tuxonice_checksum.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
- */ - -#if defined(CONFIG_TOI_CHECKSUM) -extern int toi_checksum_init(void); -extern void toi_checksum_exit(void); -void check_checksums(void); -int allocate_checksum_pages(void); -void free_checksum_pages(void); -char *tuxonice_get_next_checksum(void); -int tuxonice_calc_checksum(struct page *page, char *checksum_locn); -#else -static inline int toi_checksum_init(void) { return 0; } -static inline void toi_checksum_exit(void) { } -static inline void check_checksums(void) { }; -static inline int allocate_checksum_pages(void) { return 0; }; -static inline void free_checksum_pages(void) { }; -static inline char *tuxonice_get_next_checksum(void) { return NULL; }; -static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn) - { return 0; } -#endif - diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c deleted file mode 100644 index 2873f93c6..000000000 --- a/kernel/power/tuxonice_cluster.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines for cluster hibernation support. - * - * Based on ip autoconfiguration code in net/ipv4/ipconfig.c. - * - * How does it work? - * - * There is no 'master' node that tells everyone else what to do. All nodes - * send messages to the broadcast address/port, maintain a list of peers - * and figure out when to progress to the next step in hibernating or resuming. - * This makes us more fault tolerant when it comes to nodes coming and going - * (which may be more of an issue if we're hibernating when power supplies - * are unreliable). - * - * At boot time, we start a ktuxonice thread that handles communication with - * other nodes. This node maintains a state machine that controls our progress - * through hibernating and resuming, keeping us in step with other nodes. Nodes - * are identified by their hw address. - * - * On startup, the node sends CLUSTER_PING on the configured interface's - * broadcast address, port $toi_cluster_port (see below) and begins to listen - * for other broadcast messages. CLUSTER_PING messages are repeated at - * intervals of 5 minutes, with a random offset to spread traffic out. - * - * A hibernation cycle is initiated from any node via - * - * echo > /sys/power/tuxonice/do_hibernate - * - * and (possibly) the hibernate script. At each step of the process, the node - * completes its work, and waits for all other nodes to signal completion of - * their work (or timeout) before progressing to the next step. - * - * Request/state Action before reply Possible reply Next state - * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP - * HIBERNATE|NACK INIT_0 - * - * PREP prepare_image PREP|ACK IMAGE_WRITE - * PREP|NACK INIT_0 - * ABORT RUNNING - * - * IO write image IO|ACK power off - * ABORT POST_RESUME - * - * (Boot time) check for image IMAGE|ACK RESUME_PREP - * (Note 1) - * IMAGE|NACK (Note 2) - * - * PREP prepare read image PREP|ACK IMAGE_READ - * PREP|NACK (As NACK_IMAGE) - * - * IO read image IO|ACK POST_RESUME - * - * POST_RESUME thaw, post-script RUNNING - * - * INIT_0 init 0 - * - * Other messages: - * - * - PING: Request for all other live nodes to send a PONG. Used at startup to - * announce presence, when a node is suspected dead and periodically, in case - * segments of the network are [un]plugged. - * - * - PONG: Response to a PING. - * - * - ABORT: Request to cancel writing an image.
- * - * - BYE: Notification that this node is shutting down. - * - * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that - * nodes which are slower to start up can get state synchronised. If a node - * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send - * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it - * must invalidate its image (if any) and boot normally. - * - * Note 2: May occur when one node lost power or powered off while others - * hibernated. This node waits for others to complete resuming (ACK_READ) - * before completing its boot, so that it appears as a failed node restarting. - * - * If any node has an image, then it also has a list of nodes that hibernated - * in synchronisation with it. The node will wait for other nodes to appear - * or time out before beginning its restoration. - * - * If a node has no image, it needs to wait, in case other nodes which do have - * an image are going to resume, but are taking longer to announce their - * presence. For this reason, the user can specify a timeout value and a number - * of nodes detected before we just continue. (We might want to assume in a - * cluster of, say, 15 nodes, if 8 others have booted without finding an image, - * the remaining nodes will too. This might help in situations where some nodes - * are much slower to boot, or more subject to hardware failures or the like). - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" - -#if 1 -#define PRINTK(a, b...) do { printk(a, ##b); } while (0) -#else -#define PRINTK(a, b...) do { } while (0) -#endif - -static int loopback_mode; -static int num_local_nodes = 1; -#define MAX_LOCAL_NODES 8 -#define SADDR (loopback_mode ?
b->sid : h->saddr) - -#define MYNAME "TuxOnIce Clustering" - -enum cluster_message { - MSG_ACK = 1, - MSG_NACK = 2, - MSG_PING = 4, - MSG_ABORT = 8, - MSG_BYE = 16, - MSG_HIBERNATE = 32, - MSG_IMAGE = 64, - MSG_IO = 128, - MSG_RUNNING = 256 -}; - -static char *str_message(int message) -{ - switch (message) { - case 4: - return "Ping"; - case 8: - return "Abort"; - case 9: - return "Abort acked"; - case 10: - return "Abort nacked"; - case 16: - return "Bye"; - case 17: - return "Bye acked"; - case 18: - return "Bye nacked"; - case 32: - return "Hibernate request"; - case 33: - return "Hibernate ack"; - case 34: - return "Hibernate nack"; - case 64: - return "Image exists?"; - case 65: - return "Image does exist"; - case 66: - return "No image here"; - case 128: - return "I/O"; - case 129: - return "I/O okay"; - case 130: - return "I/O failed"; - case 256: - return "Running"; - default: - printk(KERN_ERR "Unrecognised message %d.\n", message); - return "Unrecognised message (see dmesg)"; - } -} - -#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) -#define MSG_STATE_MASK (~MSG_ACK_MASK) - -struct node_info { - struct list_head member_list; - wait_queue_head_t member_events; - spinlock_t member_list_lock; - spinlock_t receive_lock; - int peer_count, ignored_peer_count; - struct toi_sysfs_data sysfs_data; - enum cluster_message current_message; -}; - -struct node_info node_array[MAX_LOCAL_NODES]; - -struct cluster_member { - __be32 addr; - enum cluster_message message; - struct list_head list; - int ignore; -}; - -#define toi_cluster_port_send 3501 -#define toi_cluster_port_recv 3502 - -static struct net_device *net_dev; -static struct toi_module_ops toi_cluster_ops; - -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -static struct packet_type toi_cluster_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = toi_recv, -}; - -struct toi_pkt { /* BOOTP packet format */ - struct iphdr iph; /* IP header */ - struct udphdr udph; /* UDP header */ - u8 htype; /* HW address type */ - u8 hlen; /* HW address length */ - __be32 xid; /* Transaction ID */ - __be16 secs; /* Seconds since we started */ - __be16 flags; /* Just what it says */ - u8 hw_addr[16]; /* Sender's HW address */ - u16 message; /* Message */ - unsigned long sid; /* Source ID for loopback testing */ -}; - -static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; - -static int added_pack; - -static int others_have_image; - -/* Key used to allow multiple clusters on the same lan */ -static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY; -static char pre_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; -static char post_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; - -/* List of cluster members */ -static unsigned long continue_delay = 5 * HZ; -static unsigned long cluster_message_timeout = 3 * HZ; - -/* === Membership list === */ - -static void print_member_info(int index) -{ - struct cluster_member *this; - - printk(KERN_INFO "==> Dumping node %d.\n", index); - - list_for_each_entry(this, &node_array[index].member_list, list) - printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", - NIPQUAD(this->addr), - str_message(this->message), - this->ignore ? 
"(Ignored)" : ""); - printk(KERN_INFO "== Done ==\n"); -} - -static struct cluster_member *__find_member(int index, __be32 addr) -{ - struct cluster_member *this; - - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->addr != addr) - continue; - - return this; - } - - return NULL; -} - -static void set_ignore(int index, __be32 addr, struct cluster_member *this) -{ - if (this->ignore) { - PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", - index, NIPQUAD(addr)); - return; - } - - PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", - index, NIPQUAD(addr)); - this->ignore = 1; - node_array[index].ignored_peer_count++; -} - -static int __add_update_member(int index, __be32 addr, int message) -{ - struct cluster_member *this; - - this = __find_member(index, addr); - if (this) { - if (this->message != message) { - this->message = message; - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - PRINTK("Node %d sees node %d.%d.%d.%d now sending " - "%s.\n", index, NIPQUAD(addr), - str_message(message)); - wake_up(&node_array[index].member_events); - } - return 0; - } - - this = (struct cluster_member *) toi_kzalloc(36, - sizeof(struct cluster_member), GFP_KERNEL); - - if (!this) - return -1; - - this->addr = addr; - this->message = message; - this->ignore = 0; - INIT_LIST_HEAD(&this->list); - - node_array[index].peer_count++; - - PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, - NIPQUAD(addr), str_message(message)); - - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - list_add_tail(&this->list, &node_array[index].member_list); - return 1; -} - -static int add_update_member(int index, __be32 addr, int message) -{ - int result; - unsigned long flags; - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - result = __add_update_member(index, addr, message); - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - - print_member_info(index); - - wake_up(&node_array[index].member_events); - - return result; -} - -static void del_member(int index, __be32 addr) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - this = __find_member(index, addr); - - if (this) { - list_del_init(&this->list); - toi_kfree(36, this, sizeof(*this)); - node_array[index].peer_count--; - } - - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -/* === Message transmission === */ - -static void toi_send_if(int message, unsigned long my_id); - -/* - * Process received TOI packet. - */ -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct toi_pkt *b; - struct iphdr *h; - int len, result, index; - unsigned long addr, message, ack; - - /* Perform verifications before taking the lock. 
*/ - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop; - - if (dev != net_dev) - goto drop; - - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - return NET_RX_DROP; - - if (!pskb_may_pull(skb, - sizeof(struct iphdr) + - sizeof(struct udphdr))) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) - goto drop; - - /* Fragments are not supported */ - if (h->frag_off & htons(IP_OFFSET | IP_MF)) { - if (net_ratelimit()) - printk(KERN_ERR "TuxOnIce: Ignoring fragmented " - "cluster message.\n"); - goto drop; - } - - if (skb->len < ntohs(h->tot_len)) - goto drop; - - if (ip_fast_csum((char *) h, h->ihl)) - goto drop; - - if (b->udph.source != htons(toi_cluster_port_send) || - b->udph.dest != htons(toi_cluster_port_recv)) - goto drop; - - if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) - goto drop; - - len = ntohs(b->udph.len) - sizeof(struct udphdr); - - /* Ok the front looks good, make sure we can get at the rest. */ - if (!pskb_may_pull(skb, skb->len)) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - addr = SADDR; - PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", - str_message(b->message), NIPQUAD(addr)); - - message = b->message & MSG_STATE_MASK; - ack = b->message & MSG_ACK_MASK; - - for (index = 0; index < num_local_nodes; index++) { - int new_message = node_array[index].current_message, - old_message = new_message; - - if (index == SADDR || !old_message) { - PRINTK("Ignoring node %d (offline or self).\n", index); - continue; - } - - /* One message at a time, please. */ - spin_lock(&node_array[index].receive_lock); - - result = add_update_member(index, SADDR, b->message); - if (result == -1) { - printk(KERN_INFO "Failed to add new cluster member " - NIPQUAD_FMT ".\n", - NIPQUAD(addr)); - goto drop_unlock; - } - - switch (b->message & MSG_STATE_MASK) { - case MSG_PING: - break; - case MSG_ABORT: - break; - case MSG_BYE: - break; - case MSG_HIBERNATE: - /* Can I hibernate? */ - new_message = MSG_HIBERNATE | - ((index & 1) ? MSG_NACK : MSG_ACK); - break; - case MSG_IMAGE: - /* Can I resume? */ - new_message = MSG_IMAGE | - ((index & 1) ? MSG_NACK : MSG_ACK); - if (new_message != old_message) - printk(KERN_ERR "Setting whether I can resume " - "to %d.\n", new_message); - break; - case MSG_IO: - new_message = MSG_IO | MSG_ACK; - break; - case MSG_RUNNING: - break; - default: - if (net_ratelimit()) - printk(KERN_ERR "Unrecognised TuxOnIce cluster" - " message %d from " NIPQUAD_FMT ".\n", - b->message, NIPQUAD(addr)); - }; - - if (old_message != new_message) { - node_array[index].current_message = new_message; - printk(KERN_INFO ">>> Sending new message for node " - "%d.\n", index); - toi_send_if(new_message, index); - } else if (!ack) { - printk(KERN_INFO ">>> Resending message for node %d.\n", - index); - toi_send_if(new_message, index); - } -drop_unlock: - spin_unlock(&node_array[index].receive_lock); - }; - -drop: - /* Throw the packet out. */ - kfree_skb(skb); - - return 0; -} - -/* - * Send cluster message to single interface. 
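The message word combines one state bit with optional MSG_ACK/MSG_NACK bits, which is why str_message() knows 33 as "Hibernate ack". A small stand-alone decode, mirroring the masking toi_recv() performs (values copied from the enum above):

    #include <stdio.h>

    enum { MSG_ACK = 1, MSG_NACK = 2, MSG_PING = 4, MSG_ABORT = 8,
           MSG_BYE = 16, MSG_HIBERNATE = 32, MSG_IMAGE = 64,
           MSG_IO = 128, MSG_RUNNING = 256 };
    #define MSG_ACK_MASK   (MSG_ACK | MSG_NACK)
    #define MSG_STATE_MASK (~MSG_ACK_MASK)

    int main(void)
    {
        int msg = MSG_HIBERNATE | MSG_ACK;   /* 33, "Hibernate ack" */

        /* Same split as toi_recv(): state bits vs acknowledgement bits. */
        printf("state=%d ack=%d nack=%d\n",
               msg & MSG_STATE_MASK,
               !!(msg & MSG_ACK), !!(msg & MSG_NACK));
        return 0;
    }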
- */ -static void toi_send_if(int message, unsigned long my_id) -{ - struct sk_buff *skb; - struct toi_pkt *b; - int hh_len = LL_RESERVED_SPACE(net_dev); - struct iphdr *h; - - /* Allocate packet */ - skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); - if (!skb) - return; - skb_reserve(skb, hh_len); - b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); - memset(b, 0, sizeof(struct toi_pkt)); - - /* Construct IP header */ - skb_reset_network_header(skb); - h = ip_hdr(skb); - h->version = 4; - h->ihl = 5; - h->tot_len = htons(sizeof(struct toi_pkt)); - h->frag_off = htons(IP_DF); - h->ttl = 64; - h->protocol = IPPROTO_UDP; - h->daddr = htonl(INADDR_BROADCAST); - h->check = ip_fast_csum((unsigned char *) h, h->ihl); - - /* Construct UDP header */ - b->udph.source = htons(toi_cluster_port_send); - b->udph.dest = htons(toi_cluster_port_recv); - b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); - /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ - - /* Construct message */ - b->message = message; - b->sid = my_id; - b->htype = net_dev->type; /* can cause undefined behavior */ - b->hlen = net_dev->addr_len; - memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); - b->secs = htons(3); /* 3 seconds */ - - /* Chain packet down the line... */ - skb->dev = net_dev; - skb->protocol = htons(ETH_P_IP); - if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), - net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || - dev_queue_xmit(skb) < 0) - printk(KERN_INFO "E"); -} - -/* ========================================= */ - -/* kTOICluster */ - -static atomic_t num_cluster_threads; -static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); - -static int kTOICluster(void *data) -{ - unsigned long my_id; - - my_id = atomic_add_return(1, &num_cluster_threads) - 1; - node_array[my_id].current_message = (unsigned long) data; - - PRINTK("kTOICluster daemon %lu starting.\n", my_id); - - current->flags |= PF_NOFREEZE; - - while (node_array[my_id].current_message) { - toi_send_if(node_array[my_id].current_message, my_id); - sleep_on_timeout(&clusterd_events, - cluster_message_timeout); - PRINTK("Link state %lu is %d.\n", my_id, - node_array[my_id].current_message); - } - - toi_send_if(MSG_BYE, my_id); - atomic_dec(&num_cluster_threads); - wake_up(&clusterd_events); - - PRINTK("kTOICluster daemon %lu exiting.\n", my_id); - __set_current_state(TASK_RUNNING); - return 0; -} - -static void kill_clusterd(void) -{ - int i; - - for (i = 0; i < num_local_nodes; i++) { - if (node_array[i].current_message) { - PRINTK("Seeking to kill clusterd %d.\n", i); - node_array[i].current_message = 0; - } - } - wait_event(clusterd_events, - !atomic_read(&num_cluster_threads)); - PRINTK("All cluster daemons have exited.\n"); -} - -static int peers_not_in_message(int index, int message, int precise) -{ - struct cluster_member *this; - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->ignore) - continue; - - PRINTK("Peer %d.%d.%d.%d sending %s. " - "Seeking %s.\n", - NIPQUAD(this->addr), - str_message(this->message), str_message(message)); - if ((precise ? 
this->message : - this->message & MSG_STATE_MASK) != - message) - result++; - } - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - PRINTK("%d peers in sought message.\n", result); - return result; -} - -static void reset_ignored(int index) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) - this->ignore = 0; - node_array[index].ignored_peer_count = 0; - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -static int peers_in_message(int index, int message, int precise) -{ - return node_array[index].peer_count - - node_array[index].ignored_peer_count - - peers_not_in_message(index, message, precise); -} - -static int time_to_continue(int index, unsigned long start, int message) -{ - int first = peers_not_in_message(index, message, 0); - int second = peers_in_message(index, message, 1); - - PRINTK("First part returns %d, second returns %d.\n", first, second); - - if (!first && !second) { - PRINTK("All peers answered message %d.\n", - message); - return 1; - } - - if (time_after(jiffies, start + continue_delay)) { - PRINTK("Timeout reached.\n"); - return 1; - } - - PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, - start + continue_delay); - return 0; -} - -void toi_initiate_cluster_hibernate(void) -{ - int result; - unsigned long start; - - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - if (result) - return; - - toi_send_if(MSG_HIBERNATE, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_HIBERNATE)); - - if (test_action_state(TOI_FREEZER_TEST)) { - toi_send_if(MSG_ABORT, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_RUNNING)); - - do_toi_step(STEP_QUIET_CLEANUP); - return; - } - - toi_send_if(MSG_IO, 0); - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - if (result) - return; - - /* This code runs at resume time too! */ - if (toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); -} - -/* toi_cluster_print_debug_stats - * - * Description: Print information to be recorded for debugging purposes into a - * buffer. - * Arguments: buffer: Pointer to a buffer into which the debug info will be - * printed. - * size: Size of the buffer. - * Returns: Number of characters written to the buffer. - */ -static int toi_cluster_print_debug_stats(char *buffer, int size) -{ - int len; - - if (strlen(toi_cluster_iface)) - len = scnprintf(buffer, size, - "- Cluster interface is '%s'.\n", - toi_cluster_iface); - else - len = scnprintf(buffer, size, - "- Cluster support is disabled.\n"); - return len; -} - -/* cluster_memory_needed - * - * Description: Tell the caller how much memory we need to operate during - * hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_cluster_memory_needed(void) -{ - return 0; -} - -static int toi_cluster_storage_needed(void) -{ - return 1 + strlen(toi_cluster_iface); -} - -/* toi_cluster_save_config_info - * - * Description: Save informaton needed when reloading the image at resume time. - * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. - * Returns: Number of bytes used for saving our data. 
- */
-static int toi_cluster_save_config_info(char *buffer)
-{
-	strcpy(buffer, toi_cluster_iface);
-	return strlen(toi_cluster_iface) + 1;
-}
-
-/* toi_cluster_load_config_info
- *
- * Description: Reload information needed for declustering the image at
- * resume time.
- * Arguments: Buffer: Pointer to the start of the data.
- * Size: Number of bytes that were saved.
- */
-static void toi_cluster_load_config_info(char *buffer, int size)
-{
-	strncpy(toi_cluster_iface, buffer, size);
-	return;
-}
-
-static void cluster_startup(void)
-{
-	int have_image = do_check_can_resume(), i;
-	unsigned long start = jiffies, initial_message;
-	struct task_struct *p;
-
-	initial_message = MSG_IMAGE;
-
-	have_image = 1;
-
-	for (i = 0; i < num_local_nodes; i++) {
-		PRINTK("Starting ktoiclusterd %d.\n", i);
-		p = kthread_create(kTOICluster, (void *) initial_message,
-				"ktoiclusterd/%d", i);
-		if (IS_ERR(p)) {
-			printk(KERN_ERR "Failed to start ktoiclusterd.\n");
-			return;
-		}
-
-		wake_up_process(p);
-	}
-
-	/* Wait for delay or someone else sending first message */
-	wait_event(node_array[0].member_events, time_to_continue(0, start,
-				MSG_IMAGE));
-
-	others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
-
-	printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
-		" %d.\n", have_image ? "" : "don't ", others_have_image);
-
-	if (have_image) {
-		int result;
-
-		/* Start to resume */
-		printk(KERN_INFO " === Starting to resume === \n");
-		node_array[0].current_message = MSG_IO;
-		toi_send_if(MSG_IO, 0);
-
-		/* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
-		result = 0;
-
-		if (!result) {
-			/*
-			 * Atomic restore - we'll come back in the hibernation
-			 * path.
-			 */
-
-			/* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
-			result = 0;
-
-			/* do_toi_step(STEP_QUIET_CLEANUP); */
-		}
-
-		node_array[0].current_message |= MSG_NACK;
-
-		/* For debugging - disable for real life? */
-		wait_event(node_array[0].member_events,
-				time_to_continue(0, start, MSG_IO));
-	}
-
-	if (others_have_image) {
-		/* Wait for them to resume */
-		printk(KERN_INFO "Waiting for other nodes to resume.\n");
-		start = jiffies;
-		wait_event(node_array[0].member_events,
-				time_to_continue(0, start, MSG_RUNNING));
-		if (peers_not_in_message(0, MSG_RUNNING, 0))
-			printk(KERN_INFO "Timed out while waiting for other "
-				"nodes to resume.\n");
-	}
-
-	/* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
-	 * as appropriate.
-	 *
-	 * If we don't have an image:
-	 * - Wait until someone else says they have one, or conditions are met
-	 *   for continuing to boot (n machines or t seconds).
-	 * - If anyone has an image, wait for them to resume before continuing
-	 *   to boot.
-	 *
-	 * If we have an image:
-	 * - Wait until conditions are met before continuing to resume (n
-	 *   machines or t seconds). Send RESUME_PREP and freeze processes.
-	 *   NACK_PREP if freezing fails (shouldn't) and follow logic for
-	 *   us having no image above. On success, wait for [N]ACK_PREP from
-	 *   other machines. Read image (including atomic restore) until done.
-	 *   Wait for ACK_READ from others (should never fail). Thaw processes
-	 *   and do post-resume. (The section after the atomic restore is done
-	 *   via the code for hibernating).
-	 */
-
-	node_array[0].current_message = MSG_RUNNING;
-}
-
-/* toi_cluster_open_iface
- *
- * Description: Prepare to use an interface.
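
cluster_startup() and toi_initiate_cluster_hibernate() both lean on time_to_continue() to implement the "n machines or t seconds" rule sketched in the comment above. A condensed userspace rendering of that rule; the names and the seconds-based clock are stand-ins for the jiffies arithmetic in the original:

#include <stdbool.h>
#include <time.h>

/* Proceed once no peer is still outside the sought state, or once the
 * continue delay has elapsed, whichever comes first. */
static bool ready_to_continue(time_t start, int peers_still_waiting,
                              int continue_delay_secs)
{
        if (!peers_still_waiting)
                return true;                            /* every peer answered */
        return time(NULL) >= start + continue_delay_secs;   /* timed out */
}

int main(void)
{
        time_t start = time(NULL);

        return ready_to_continue(start, 0, 30) ? 0 : 1;  /* all answered */
}
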
- */ - -static int toi_cluster_open_iface(void) -{ - struct net_device *dev; - - rtnl_lock(); - - for_each_netdev(&init_net, dev) { - if (/* dev == &init_net.loopback_dev || */ - strcmp(dev->name, toi_cluster_iface)) - continue; - - net_dev = dev; - break; - } - - rtnl_unlock(); - - if (!net_dev) { - printk(KERN_ERR MYNAME ": Device %s not found.\n", - toi_cluster_iface); - return -ENODEV; - } - - dev_add_pack(&toi_cluster_packet_type); - added_pack = 1; - - loopback_mode = (net_dev == init_net.loopback_dev); - num_local_nodes = loopback_mode ? 8 : 1; - - PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", - loopback_mode ? "on" : "off", num_local_nodes); - - cluster_startup(); - return 0; -} - -/* toi_cluster_close_iface - * - * Description: Stop using an interface. - */ - -static int toi_cluster_close_iface(void) -{ - kill_clusterd(); - if (added_pack) { - dev_remove_pack(&toi_cluster_packet_type); - added_pack = 0; - } - return 0; -} - -static void write_side_effect(void) -{ - if (toi_cluster_ops.enabled) { - toi_cluster_open_iface(); - set_toi_state(TOI_CLUSTER_MODE); - } else { - toi_cluster_close_iface(); - clear_toi_state(TOI_CLUSTER_MODE); - } -} - -static void node_write_side_effect(void) -{ -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, - NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, - write_side_effect), - SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), - SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, - 256, 0, NULL), - SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, - 256, 0, STRING), - SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, - 0) -}; - -/* - * Ops structure. 
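
The sysfs table above wires the "enabled" attribute to write_side_effect(), so storing 1 or 0 immediately brings the cluster interface up or down. A self-contained sketch of that pattern, with stub routines standing in for toi_cluster_open_iface()/toi_cluster_close_iface():

#include <stdio.h>

static int enabled;

static void open_iface(void)  { puts("iface up"); }
static void close_iface(void) { puts("iface down"); }

/* Run after every write to "enabled": make reality match the flag. */
static void write_side_effect(void)
{
        if (enabled)
                open_iface();
        else
                close_iface();
}

int main(void)
{
        enabled = 1;
        write_side_effect();    /* a sysfs store would trigger this */
        return 0;
}
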
- */ - -static struct toi_module_ops toi_cluster_ops = { - .type = FILTER_MODULE, - .name = "Cluster", - .directory = "cluster", - .module = THIS_MODULE, - .memory_needed = toi_cluster_memory_needed, - .print_debug_info = toi_cluster_print_debug_stats, - .save_config_info = toi_cluster_save_config_info, - .load_config_info = toi_cluster_load_config_info, - .storage_needed = toi_cluster_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -#ifdef MODULE -#define INIT static __init -#define EXIT static __exit -#else -#define INIT -#define EXIT -#endif - -INIT int toi_cluster_init(void) -{ - int temp = toi_register_module(&toi_cluster_ops), i; - struct kobject *kobj = toi_cluster_ops.dir_kobj; - - for (i = 0; i < MAX_LOCAL_NODES; i++) { - node_array[i].current_message = 0; - INIT_LIST_HEAD(&node_array[i].member_list); - init_waitqueue_head(&node_array[i].member_events); - spin_lock_init(&node_array[i].member_list_lock); - spin_lock_init(&node_array[i].receive_lock); - - /* Set up sysfs entry */ - node_array[i].sysfs_data.attr.name = toi_kzalloc(8, - sizeof(node_array[i].sysfs_data.attr.name), - GFP_KERNEL); - sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d", - i); - node_array[i].sysfs_data.attr.mode = SYSFS_RW; - node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER; - node_array[i].sysfs_data.flags = 0; - node_array[i].sysfs_data.data.integer.variable = - (int *) &node_array[i].current_message; - node_array[i].sysfs_data.data.integer.minimum = 0; - node_array[i].sysfs_data.data.integer.maximum = INT_MAX; - node_array[i].sysfs_data.write_side_effect = - node_write_side_effect; - toi_register_sysfs_file(kobj, &node_array[i].sysfs_data); - } - - toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0); - - if (toi_cluster_ops.enabled) - toi_cluster_open_iface(); - - return temp; -} - -EXIT void toi_cluster_exit(void) -{ - int i; - toi_cluster_close_iface(); - - for (i = 0; i < MAX_LOCAL_NODES; i++) - toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, - &node_array[i].sysfs_data); - toi_unregister_module(&toi_cluster_ops); -} - -static int __init toi_cluster_iface_setup(char *iface) -{ - toi_cluster_ops.enabled = (*iface && - strcmp(iface, "off")); - - if (toi_cluster_ops.enabled) - strncpy(toi_cluster_iface, iface, strlen(iface)); -} - -__setup("toi_cluster=", toi_cluster_iface_setup); diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h deleted file mode 100644 index 84356b304..000000000 --- a/kernel/power/tuxonice_cluster.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#ifdef CONFIG_TOI_CLUSTER -extern int toi_cluster_init(void); -extern void toi_cluster_exit(void); -extern void toi_initiate_cluster_hibernate(void); -#else -static inline int toi_cluster_init(void) { return 0; } -static inline void toi_cluster_exit(void) { } -static inline void toi_initiate_cluster_hibernate(void) { } -#endif - diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c deleted file mode 100644 index 84b85226d..000000000 --- a/kernel/power/tuxonice_compress.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * kernel/power/compression.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - * This file contains data compression routines for TuxOnIce, - * using cryptoapi. - */ - -#include -#include -#include -#include - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -static int toi_expected_compression; - -static struct toi_module_ops toi_compression_ops; -static struct toi_module_ops *next_driver; - -static char toi_compressor_name[32] = "lzo"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - u8 *page_buffer; - struct crypto_comp *transform; - unsigned int len; - u8 *buffer_start; - u8 *output_buffer; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. - */ -static int toi_compress_crypto_prepare(void) -{ - int cpu; - - if (!*toi_compressor_name) { - printk(KERN_INFO "TuxOnIce: Compression enabled but no " - "compressor name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s compression transform.\n", - toi_compressor_name); - this->transform = NULL; - return 1; - } - - this->page_buffer = - (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); - - if (!this->page_buffer) { - printk(KERN_ERR - "Failed to allocate a page buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - - this->output_buffer = - (char *) vmalloc_32(OUT_BUF_SIZE); - - if (!this->output_buffer) { - printk(KERN_ERR - "Failed to allocate a output buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - } - - return 0; -} - -static int toi_compress_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_comp(this->transform); - this->transform = NULL; - } - - if (this->page_buffer) - toi_free_page(16, (unsigned long) this->page_buffer); - - this->page_buffer = NULL; - - if (this->output_buffer) - vfree(this->output_buffer); - - this->output_buffer = NULL; - } - - return 0; -} - -/* - * toi_compress_init - */ - -static int toi_compress_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_compress_bytes_in = 0; - toi_compress_bytes_out = 0; - - next_driver = toi_get_next_filter(&toi_compression_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_compress_rw_init() - */ - -static int toi_compress_rw_init(int rw, int stream_number) -{ - if (toi_compress_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise compression " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "compressing the image.\n"); - toi_compression_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_compress_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be compressed. - * - * Returns: 0 on success. Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. 
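
This driver drives the kernel's legacy cryptoapi "comp" interface: allocate a transform once per CPU, then push page-sized buffers through it. A kernel-context sketch of the core calls, with the per-CPU caching and most error handling trimmed; the one-shot allocate/free here is purely for illustration, since the real code keeps the transform in a per-CPU context:

#include <linux/crypto.h>
#include <linux/err.h>

static int compress_buffer(const u8 *in, unsigned int in_len,
                           u8 *out, unsigned int *out_len)
{
        struct crypto_comp *tfm = crypto_alloc_comp("lzo", 0, 0);
        int ret;

        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* *out_len is in/out: capacity on entry, bytes produced on exit. */
        ret = crypto_comp_compress(tfm, in, in_len, out, out_len);
        crypto_free_comp(tfm);
        return ret;
}
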
- */ -static int toi_compress_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - - if (ctx->transform) { - - ctx->buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_comp_compress(ctx->transform, - ctx->buffer_start, buf_size, - ctx->output_buffer, &ctx->len); - - TOI_UNMAP(buf_type, buffer_page); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d bytes", - cpu, index, ctx->len); - - if (!ret && ctx->len < buf_size) { /* some compression */ - output_buffer = ctx->output_buffer; - output_len = ctx->len; - out_buf_type = TOI_VIRT; - } - - } - - mutex_lock(&stats_lock); - - toi_compress_bytes_in += buf_size; - toi_compress_bytes_out += output_len; - - mutex_unlock(&stats_lock); - - if (!ret) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_compress_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules and decompress it until the input buffer - * is filled. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_compress_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - unsigned int outlen = PAGE_SIZE; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->transform) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't decompress - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); - - buffer_start = kmap(buffer_page); - - /* Error or uncompressed data */ - if (ret || len == PAGE_SIZE) { - memcpy(buffer_start, ctx->page_buffer, len); - goto out; - } - - ret = crypto_comp_decompress( - ctx->transform, - ctx->page_buffer, - len, buffer_start, &outlen); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d=>%d (%d).", - cpu, *index, len, outlen, ret); - - if (ret) - abort_hibernate(TOI_FAILED_IO, - "Compress_read returned %d.\n", ret); - else if (outlen != PAGE_SIZE) { - abort_hibernate(TOI_FAILED_IO, - "Decompression yielded %d bytes instead of %ld.\n", - outlen, PAGE_SIZE); - printk(KERN_ERR "Decompression yielded %d bytes instead of " - "%ld.\n", outlen, PAGE_SIZE); - ret = -EIO; - *buf_size = outlen; - } -out: - TOI_UNMAP(buf_type, buffer_page); - return ret; -} - -/* - * toi_compress_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_compress_print_debug_stats(char *buffer, int size) -{ - unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, - pages_out = toi_compress_bytes_out >> PAGE_SHIFT; - int len; - - /* Output the compression ratio achieved. 
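
toi_compress_write_page() above only substitutes the compressed buffer when compression actually won, and toi_compress_read_page() correspondingly treats a full-sized record as uncompressed data. A userspace sketch of that decision:

#include <stdio.h>

static const unsigned char *pick_output(const unsigned char *raw,
                                        size_t raw_len,
                                        const unsigned char *packed,
                                        size_t packed_len, size_t *out_len)
{
        if (packed_len < raw_len) {
                *out_len = packed_len;
                return packed;          /* some compression achieved */
        }
        *out_len = raw_len;
        return raw;                     /* store the page uncompressed */
}

int main(void)
{
        unsigned char raw[4096], packed[1200];
        size_t out_len;

        pick_output(raw, sizeof(raw), packed, sizeof(packed), &out_len);
        printf("stored %zu bytes\n", out_len);  /* 1200 */
        return 0;
}
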
*/ - if (*toi_compressor_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_compressor_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (pages_in) - len += scnprintf(buffer+len, size - len, " Compressed " - "%lu bytes into %lu (%ld percent compression).\n", - toi_compress_bytes_in, - toi_compress_bytes_out, - (pages_in - pages_out) * 100 / pages_in); - return len; -} - -/* - * toi_compress_compression_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_compress_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_compress_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_compressor_name) + 1; -} - -/* - * toi_compress_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_compress_save_config_info(char *buffer) -{ - int len = strlen(toi_compressor_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_compress_bytes_in; - offset += sizeof(unsigned long); - *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = toi_expected_compression; - offset += sizeof(int); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_compressor_name, len); - return offset + len; -} - -/* toi_compress_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for decompressing the image at - * resume time. - */ -static void toi_compress_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_compress_bytes_in = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); - offset += sizeof(unsigned long); - toi_expected_compression = *((int *) (buffer + offset)); - offset += sizeof(int); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_compressor_name, buffer + offset, len); -} - -static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->compress_bytes_in = toi_compress_bytes_in; - bkd->compress_bytes_out = toi_compress_bytes_out; -} - -static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_compress_bytes_in = bkd->compress_bytes_in; - toi_compress_bytes_out = bkd->compress_bytes_out; -} - -/* - * toi_expected_compression_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 if the module is disabled. Otherwise the value set by the - * user via our sysfs entry. - */ - -static int toi_compress_expected_ratio(void) -{ - if (!toi_compression_ops.enabled) - return 100; - else - return 100 - toi_expected_compression; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, - 0, 99, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), -}; - -/* - * Ops structure. 
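
toi_compress_save_config_info() and toi_compress_load_config_info() above serialise a small fixed record: two unsigned longs, two ints, then the NUL-terminated compressor name. A userspace re-expression of the same layout using memcpy in place of the original's pointer casts; the function name is illustrative:

#include <stdio.h>
#include <string.h>

static int save_config(char *buf, unsigned long in, unsigned long out,
                       int expected, const char *name)
{
        int off = 0, len = strlen(name) + 1;

        memcpy(buf + off, &in, sizeof(in));             off += sizeof(in);
        memcpy(buf + off, &out, sizeof(out));           off += sizeof(out);
        memcpy(buf + off, &expected, sizeof(expected)); off += sizeof(expected);
        memcpy(buf + off, &len, sizeof(len));           off += sizeof(len);
        memcpy(buf + off, name, len);                   /* name + NUL */
        return off + len;
}

int main(void)
{
        char buf[64];

        printf("%d bytes used\n", save_config(buf, 100, 50, 30, "lzo"));
        return 0;
}
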
- */ -static struct toi_module_ops toi_compression_ops = { - .type = FILTER_MODULE, - .name = "compression", - .directory = "compression", - .module = THIS_MODULE, - .initialise = toi_compress_init, - .memory_needed = toi_compress_memory_needed, - .print_debug_info = toi_compress_print_debug_stats, - .save_config_info = toi_compress_save_config_info, - .load_config_info = toi_compress_load_config_info, - .storage_needed = toi_compress_storage_needed, - .expected_compression = toi_compress_expected_ratio, - - .pre_atomic_restore = toi_compress_pre_atomic_restore, - .post_atomic_restore = toi_compress_post_atomic_restore, - - .rw_init = toi_compress_rw_init, - .rw_cleanup = toi_compress_rw_cleanup, - - .write_page = toi_compress_write_page, - .read_page = toi_compress_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_compress_load(void) -{ - return toi_register_module(&toi_compression_ops); -} - -late_initcall(toi_compress_load); diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c deleted file mode 100644 index eb627915e..000000000 --- a/kernel/power/tuxonice_copy_before_write.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * kernel/power/tuxonice_copy_before_write.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines (apart from the fault handling code) to deal with allocating memory - * for copying pages before they are modified, restoring the contents and getting - * the contents written to disk. - */ - -#include -#include -#include -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states); -#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw)) -#define toi_cbw_pool_size 100 - -static void _toi_free_cbw_data(struct toi_cbw_state *state) -{ - struct toi_cbw *page_ptr, *ptr, *next; - - page_ptr = ptr = state->first; - - while(ptr) { - next = ptr->next; - - if (ptr->virt) { - toi__free_page(40, virt_to_page(ptr->virt)); - } - if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) { - /* Must be on a new page - free the previous one. 
*/ - toi__free_page(40, virt_to_page(page_ptr)); - page_ptr = ptr; - } - ptr = next; - } - - if (page_ptr) { - toi__free_page(40, virt_to_page(page_ptr)); - } - - state->first = state->next = state->last = NULL; - state->size = 0; -} - -void toi_free_cbw_data(void) -{ - int i; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - if (!state->first) - continue; - - state->enabled = 0; - - while (state->active) { - schedule(); - } - - _toi_free_cbw_data(state); - } -} - -static int _toi_allocate_cbw_data(struct toi_cbw_state *state) -{ - while(state->size < toi_cbw_pool_size) { - int i; - struct toi_cbw *ptr; - - ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL); - - if (!ptr) { - return -ENOMEM; - } - - if (!state->first) { - state->first = state->next = state->last = ptr; - } - - for (i = 0; i < CBWS_PER_PAGE; i++) { - struct toi_cbw *cbw = &ptr[i]; - - cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL); - if (!cbw->virt) { - state->size += i; - printk("Out of memory allocating CBW pages.\n"); - return -ENOMEM; - } - - if (cbw == state->first) - continue; - - state->last->next = cbw; - state->last = cbw; - } - - state->size += CBWS_PER_PAGE; - } - - state->enabled = 1; - - return 0; -} - - -int toi_allocate_cbw_data(void) -{ - int i, result; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - result = _toi_allocate_cbw_data(state); - - if (result) - return result; - } - - return 0; -} - -void toi_cbw_restore(void) -{ - if (!toi_keeping_image) - return; - -} - -void toi_cbw_write(void) -{ - if (!toi_keeping_image) - return; - -} - -/** - * toi_cbw_test_read - Test copy before write on one page - * - * Allocate copy before write buffers, then make one page only copy-before-write - * and attempt to write to it. We should then be able to retrieve the original - * version from the cbw buffer and the modified version from the page itself. - */ -static int toi_cbw_test_read(const char *buffer, int count) -{ - unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL); - char *original = "Original contents"; - char *modified = "Modified material"; - struct page *page = virt_to_page(virt); - int i, len = 0, found = 0, pfn = page_to_pfn(page); - - if (!page) { - printk("toi_cbw_test_read: Unable to allocate a page for testing.\n"); - return -ENOMEM; - } - - memcpy((char *) virt, original, strlen(original)); - - if (toi_allocate_cbw_data()) { - printk("toi_cbw_test_read: Unable to allocate cbw data.\n"); - return -ENOMEM; - } - - toi_reset_dirtiness_one(pfn, 0); - - SetPageTOI_CBW(page); - - memcpy((char *) virt, modified, strlen(modified)); - - if (strncmp((char *) virt, modified, strlen(modified))) { - len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n"); - } - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - struct toi_cbw *ptr = state->first, *last_ptr = ptr; - - if (!found) { - while (ptr) { - if (ptr->pfn == pfn) { - found = 1; - if (strncmp(ptr->virt, original, strlen(original))) { - len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n"); - } else { - len += sprintf((char *) buffer + len, "Test passed. 
Buffer changed and original contents preserved.\n"); - } - break; - } - - last_ptr = ptr; - ptr = ptr->next; - } - } - - if (!last_ptr) - len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i); - } - - if (!found) - len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n"); - - toi_free_cbw_data(); - - return len; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read, - NULL, SYSFS_NEEDS_SM_FOR_READ, NULL), -}; - -static struct toi_module_ops toi_cbw_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "copy_before_write debugging", - .directory = "cbw", - .module = THIS_MODULE, - .early = 1, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_cbw_init(void) -{ - int result = toi_register_module(&toi_cbw_ops); - return result; -} diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c deleted file mode 100644 index 522c836ad..000000000 --- a/kernel/power/tuxonice_extent.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * kernel/power/tuxonice_extent.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * These functions encapsulate the manipulation of storage metadata. - */ - -#include -#include "tuxonice_modules.h" -#include "tuxonice_extent.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" - -/** - * toi_get_extent - return a free extent - * - * May fail, returning NULL instead. - **/ -static struct hibernate_extent *toi_get_extent(void) -{ - return (struct hibernate_extent *) toi_kzalloc(2, - sizeof(struct hibernate_extent), TOI_ATOMIC_GFP); -} - -/** - * toi_put_extent_chain - free a chain of extents starting from value 'from' - * @chain: Chain to free. - * - * Note that 'from' is an extent value, and may be part way through an extent. - * In this case, the extent should be truncated (if necessary) and following - * extents freed. - **/ -void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from) -{ - struct hibernate_extent *this; - - this = chain->first; - - while (this) { - struct hibernate_extent *next = this->next; - - // Delete the whole extent? - if (this->start >= from) { - chain->size -= (this->end - this->start + 1); - if (chain->first == this) - chain->first = next; - if (chain->last_touched == this) - chain->last_touched = NULL; - if (chain->current_extent == this) - chain->current_extent = NULL; - toi_kfree(2, this, sizeof(*this)); - chain->num_extents--; - } else if (this->end >= from) { - // Delete part of the extent - chain->size -= (this->end - from + 1); - this->start = from; - } - this = next; - } -} - -/** - * toi_put_extent_chain - free a whole chain of extents - * @chain: Chain to free. - **/ -void toi_put_extent_chain(struct hibernate_extent_chain *chain) -{ - toi_put_extent_chain_from(chain, 0); -} - -/** - * toi_add_to_extent_chain - add an extent to an existing chain - * @chain: Chain to which the extend should be added - * @start: Start of the extent (first physical block) - * @end: End of the extent (last physical block) - * - * The chain information is updated if the insertion is successful. 
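
The interesting case in toi_add_to_extent_chain() is the in-place merge: when the new range begins right after an existing extent, that extent is grown, and a successor that now abuts is folded in rather than left as a separate node. A minimal userspace sketch of just that case; the real code also handles insertion ordering and the last_touched cache:

#include <stdio.h>
#include <stdlib.h>

struct extent {
        unsigned long start, end;
        struct extent *next;
};

static int try_extend(struct extent *cur, unsigned long start,
                      unsigned long end)
{
        if (!cur || cur->end != start - 1)
                return -1;      /* caller allocates a new extent instead */

        cur->end = end;
        if (cur->next && cur->next->start == end + 1) {
                struct extent *n = cur->next;

                cur->end = n->end;      /* merge with the successor */
                cur->next = n->next;
                free(n);
        }
        return 0;
}

int main(void)
{
        struct extent *b = malloc(sizeof(*b));
        struct extent a = { 0, 9, b };

        if (!b)
                return 1;
        b->start = 20; b->end = 29; b->next = NULL;

        try_extend(&a, 10, 19);         /* a becomes 0-29, *b is freed */
        printf("%lu-%lu\n", a.start, a.end);
        return 0;
}
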
- **/ -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end) -{ - struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Adding extent %lu-%lu to chain %p.\n", start, end, chain); - - /* Find the right place in the chain */ - if (chain->last_touched && chain->last_touched->start < start) - cur_ext = chain->last_touched; - else if (chain->first && chain->first->start < start) - cur_ext = chain->first; - - if (cur_ext) { - while (cur_ext->next && cur_ext->next->start < start) - cur_ext = cur_ext->next; - - if (cur_ext->end == (start - 1)) { - struct hibernate_extent *next_ext = cur_ext->next; - cur_ext->end = end; - - /* Merge with the following one? */ - if (next_ext && cur_ext->end + 1 == next_ext->start) { - cur_ext->end = next_ext->end; - cur_ext->next = next_ext->next; - toi_kfree(2, next_ext, sizeof(*next_ext)); - chain->num_extents--; - } - - chain->last_touched = cur_ext; - chain->size += (end - start + 1); - - return 0; - } - } - - new_ext = toi_get_extent(); - if (!new_ext) { - printk(KERN_INFO "Error unable to append a new extent to the " - "chain.\n"); - return -ENOMEM; - } - - chain->num_extents++; - chain->size += (end - start + 1); - new_ext->start = start; - new_ext->end = end; - - chain->last_touched = new_ext; - - if (cur_ext) { - new_ext->next = cur_ext->next; - cur_ext->next = new_ext; - } else { - if (chain->first) - new_ext->next = chain->first; - chain->first = new_ext; - } - - return 0; -} diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h deleted file mode 100644 index aeccf1f5e..000000000 --- a/kernel/power/tuxonice_extent.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_extent.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations related to extents. Extents are - * TuxOnIce's method of storing some of the metadata for the image. - * See tuxonice_extent.c for more info. - * - */ - -#include "tuxonice_modules.h" - -#ifndef EXTENT_H -#define EXTENT_H - -struct hibernate_extent { - unsigned long start, end; - struct hibernate_extent *next; -}; - -struct hibernate_extent_chain { - unsigned long size; /* size of the chain ie sum (max-min+1) */ - int num_extents; - struct hibernate_extent *first, *last_touched; - struct hibernate_extent *current_extent; - unsigned long current_offset; -}; - -/* Simplify iterating through all the values in an extent chain */ -#define toi_extent_for_each(extent_chain, extentpointer, value) \ -if ((extent_chain)->first) \ - for ((extentpointer) = (extent_chain)->first, (value) = \ - (extentpointer)->start; \ - ((extentpointer) && ((extentpointer)->next || (value) <= \ - (extentpointer)->end)); \ - (((value) == (extentpointer)->end) ? \ - ((extentpointer) = (extentpointer)->next, (value) = \ - ((extentpointer) ? (extentpointer)->start : 0)) : \ - (value)++)) - -extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from); -#endif diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c deleted file mode 100644 index baf191211..000000000 --- a/kernel/power/tuxonice_file.c +++ /dev/null @@ -1,484 +0,0 @@ -/* - * kernel/power/tuxonice_file.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of a simple file as a - * backing store. 
It is based upon the swapallocator, and shares the - * same basic working. Here, though, we have nothing to do with - * swapspace, and only one device to worry about. - * - * The user can just - * - * echo TuxOnIce > /path/to/my_file - * - * dd if=/dev/zero bs=1M count= >> /path/to/my_file - * - * and - * - * echo /path/to/my_file > /sys/power/tuxonice/file/target - * - * then put what they find in /sys/power/tuxonice/resume - * as their resume= parameter in lilo.conf (and rerun lilo if using it). - * - * Having done this, they're ready to hibernate and resume. - * - * TODO: - * - File resizing. - */ - -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" - -#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) - -static struct toi_module_ops toi_fileops; - -static struct file *target_file; -static struct block_device *toi_file_target_bdev; -static unsigned long pages_available, pages_allocated; -static char toi_file_target[256]; -static struct inode *target_inode; -static int file_target_priority; -static int used_devt; -static int target_claim; -static dev_t toi_file_dev_t; -static int sig_page_index; - -/* For test_toi_file_target */ -static struct toi_bdev_info *file_chain; - -static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) -{ - int j; - sector_t last = 0; - - for (j = 0; j < dev_info->blocks_per_page; j++) { - sector_t this = bmap(target_inode, - page_num * dev_info->blocks_per_page + j); - - if (!this || (last && (last + 1) != this)) - break; - - last = this; - } - - return j == dev_info->blocks_per_page; -} - -static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) -{ - unsigned long result = 0; - struct block_device *bdev = dev_info->bdev; - int i; - - switch (target_inode->i_mode & S_IFMT) { - case S_IFSOCK: - case S_IFCHR: - case S_IFIFO: /* Socket, Char, Fifo */ - return -1; - case S_IFREG: /* Regular file: current size - holes + free - space on part */ - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { - if (has_contiguous_blocks(dev_info, i)) - result++; - } - break; - case S_IFBLK: /* Block device */ - if (!bdev->bd_disk) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "bdev->bd_disk null."); - return 0; - } - - result = (bdev->bd_part ? - bdev->bd_part->nr_sects : - get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); - } - - - return result; -} - -static int toi_file_register_storage(void) -{ - struct toi_bdev_info *devinfo; - int result = 0; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); - if (!strlen(toi_file_target)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " - "No target filename set."); - return 0; - } - - target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); - toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", - toi_file_target, target_file); - - if (IS_ERR(target_file) || !target_file) { - target_file = NULL; - toi_file_dev_t = name_to_dev_t(toi_file_target); - if (!toi_file_dev_t) { - struct kstat stat; - int error = vfs_stat(toi_file_target, &stat); - printk(KERN_INFO "Open file %s returned %p and " - "name_to_devt failed.\n", - toi_file_target, target_file); - if (error) { - printk(KERN_INFO "Stating the file also failed." 
- " Nothing more we can do.\n"); - return 0; - } else - toi_file_dev_t = stat.rdev; - } - - toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); - if (IS_ERR(toi_file_target_bdev)) { - printk(KERN_INFO "Got a dev_num (%lx) but failed to " - "open it.\n", - (unsigned long) toi_file_dev_t); - toi_file_target_bdev = NULL; - return 0; - } - used_devt = 1; - target_inode = toi_file_target_bdev->bd_inode; - } else - target_inode = target_file->f_mapping->host; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); - if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || - S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { - printk(KERN_INFO "File support works with regular files," - " character files and block devices.\n"); - /* Cleanup routine will undo the above */ - return 0; - } - - if (!used_devt) { - if (S_ISBLK(target_inode->i_mode)) { - toi_file_target_bdev = I_BDEV(target_inode); - if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | - FMODE_READ, NULL)) - target_claim = 1; - } else - toi_file_target_bdev = target_inode->i_sb->s_bdev; - if (!toi_file_target_bdev) { - printk(KERN_INFO "%s is not a valid file allocator " - "target.\n", toi_file_target); - return 0; - } - toi_file_dev_t = toi_file_target_bdev->bd_dev; - } - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); - return -ENOMEM; - } - - devinfo->bdev = toi_file_target_bdev; - devinfo->allocator = &toi_fileops; - devinfo->allocator_index = 0; - - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - /* Unlike swap code, only complain if fs_info_from_block_dev returned - * -ENOMEM. The 'file' might be a full partition, so might validly not - * have an identifiable type, UUID etc. - */ - if (result) - printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", - result); - devinfo->dev_t = toi_file_dev_t; - devinfo->prio = file_target_priority; - devinfo->bmap_shift = target_inode->i_blkbits - 9; - devinfo->blocks_per_page = - (1 << (PAGE_SHIFT - target_inode->i_blkbits)); - sprintf(devinfo->name, "file %s", toi_file_target); - file_chain = devinfo; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " - "shift is %d. Blocks per page %d.", - devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, - devinfo->blocks_per_page); - - /* Keep one aside for the signature */ - pages_available = get_usable_pages(devinfo) - 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " - "pages.", pages_available); - - toi_bio_ops.register_storage(devinfo); - return 0; -} - -static unsigned long toi_file_storage_available(void) -{ - return pages_available; -} - -static int toi_file_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long available = pages_available - pages_allocated; - unsigned long to_add = min(available, request); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " - "is %lu. 
Allocating %lu pages from file.",
-			pages_available, pages_allocated, to_add);
-	pages_allocated += to_add;
-
-	return to_add;
-}
-
-/**
- * __populate_block_list - add an extent to the chain
- * @min: Start of the extent (first physical block = sector)
- * @max: End of the extent (last physical block = sector)
- *
- * If TOI_TEST_BIO is set, print a debug message, outputting the min and max
- * fs block numbers.
- **/
-static int __populate_block_list(struct toi_bdev_info *chain, int min, int max)
-{
-	if (test_action_state(TOI_TEST_BIO))
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.",
-			min << chain->bmap_shift,
-			((max + 1) << chain->bmap_shift) - 1);
-
-	return toi_add_to_extent_chain(&chain->blocks, min, max);
-}
-
-static int get_main_pool_phys_params(struct toi_bdev_info *chain)
-{
-	int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0;
-	unsigned long pages_mapped = 0;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks.");
-
-	if (chain->blocks.first)
-		toi_put_extent_chain(&chain->blocks);
-
-	if (!target_is_normal_file()) {
-		result = (pages_available > 0) ?
-			__populate_block_list(chain, chain->blocks_per_page,
-				(pages_allocated + 1) *
-				chain->blocks_per_page - 1) : 0;
-		return result;
-	}
-
-	/*
-	 * FIXME: We are assuming the first page is contiguous. Is that
-	 * assumption always right?
-	 */
-
-	for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
-		sector_t new_sector;
-
-		if (!has_contiguous_blocks(chain, i))
-			continue;
-
-		if (!have_sig_page) {
-			have_sig_page = 1;
-			sig_page_index = i;
-			continue;
-		}
-
-		pages_mapped++;
-
-		/* Ignore first page - it has the header */
-		if (pages_mapped == 1)
-			continue;
-
-		new_sector = bmap(target_inode, (i * chain->blocks_per_page));
-
-		/*
-		 * I'd love to be able to fill in holes and resize
-		 * files, but not yet...
-		 */
-
-		if (new_sector == extent_max + 1)
-			extent_max += chain->blocks_per_page;
-		else {
-			if (extent_min > -1) {
-				result = __populate_block_list(chain,
-						extent_min, extent_max);
-				if (result)
-					return result;
-			}
-
-			extent_min = new_sector;
-			extent_max = extent_min +
-				chain->blocks_per_page - 1;
-		}
-
-		if (pages_mapped == pages_allocated)
-			break;
-	}
-
-	if (extent_min > -1) {
-		result = __populate_block_list(chain, extent_min, extent_max);
-		if (result)
-			return result;
-	}
-
-	return 0;
-}
-
-static void toi_file_free_storage(struct toi_bdev_info *chain)
-{
-	pages_allocated = 0;
-	file_chain = NULL;
-}
-
-/**
- * toi_file_print_debug_stats - print debug info
- * @buffer: Buffer to populate
- * @size: Size of the buffer
- **/
-static int toi_file_print_debug_stats(char *buffer, int size)
-{
-	int len = scnprintf(buffer, size, "- File Allocator active.\n");
-
-	len += scnprintf(buffer+len, size-len, " Storage available for "
-			"image: %lu pages.\n", pages_available);
-
-	return len;
-}
-
-static void toi_file_cleanup(int finishing_cycle)
-{
-	if (toi_file_target_bdev) {
-		if (target_claim) {
-			blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
-			target_claim = 0;
-		}
-
-		if (used_devt) {
-			blkdev_put(toi_file_target_bdev,
-					FMODE_READ | FMODE_NDELAY);
-			used_devt = 0;
-		}
-		toi_file_target_bdev = NULL;
-		target_inode = NULL;
-	}
-
-	if (target_file) {
-		filp_close(target_file, NULL);
-		target_file = NULL;
-	}
-
-	pages_available = 0;
-}
-
-/**
- * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target
- *
- * Test whether the target file is valid for hibernating.
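
get_main_pool_phys_params() above walks the file page by page, maps each page to its first sector via bmap(), and grows the current extent while the sectors remain contiguous. A userspace sketch of that walk, with a toy page-to-sector mapping standing in for bmap() (eight 512-byte blocks per 4K page):

#include <stdio.h>

typedef unsigned long sector_t;

static sector_t page_to_sector(unsigned long i)
{
        return 8 * i + (i > 2 ? 1000 : 0);      /* toy: a gap after page 2 */
}

int main(void)
{
        unsigned long i;
        sector_t min = page_to_sector(0), max = min + 7;

        for (i = 1; i < 6; i++) {
                sector_t s = page_to_sector(i);

                if (s == max + 1) {
                        max += 8;               /* still contiguous */
                        continue;
                }
                printf("extent %lu-%lu\n", min, max);   /* flush and restart */
                min = s;
                max = s + 7;
        }
        printf("extent %lu-%lu\n", min, max);
        return 0;
}
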
- **/ -static void test_toi_file_target(void) -{ - int result = toi_file_register_storage(); - sector_t sector; - char buf[50]; - struct fs_info *fs_info; - - if (result || !file_chain) - return; - - /* This doesn't mean we're in business. Is any storage available? */ - if (!pages_available) - goto out; - - toi_file_allocate_storage(file_chain, 1); - result = get_main_pool_phys_params(file_chain); - if (result) - goto out; - - - sector = bmap(target_inode, sig_page_index * - file_chain->blocks_per_page) << file_chain->bmap_shift; - - /* Use the uuid, or the dev_t if that fails */ - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (!fs_info || IS_ERR(fs_info)) { - bdevname(toi_file_target_bdev, buf); - sprintf(resume_file, "/dev/%s:%llu", buf, - (unsigned long long) sector); - } else { - int i; - hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); - - /* Remove the spaces */ - for (i = 1; i < 16; i++) { - buf[2 * i] = buf[3 * i]; - buf[2 * i + 1] = buf[3 * i + 1]; - } - buf[32] = 0; - sprintf(resume_file, "UUID=%s:0x%llx", buf, - (unsigned long long) sector); - free_fs_info(fs_info); - } - - toi_attempt_to_parse_resume_device(0); -out: - toi_file_free_storage(file_chain); - toi_bio_ops.free_storage(); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, - SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), - SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), - SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, - 4096, 0, NULL), -}; - -static struct toi_bio_allocator_ops toi_bio_fileops = { - .register_storage = toi_file_register_storage, - .storage_available = toi_file_storage_available, - .allocate_storage = toi_file_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_file_free_storage, -}; - -static struct toi_module_ops toi_fileops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "file storage", - .directory = "file", - .module = THIS_MODULE, - .print_debug_info = toi_file_print_debug_stats, - .cleanup = toi_file_cleanup, - .bio_allocator_ops = &toi_bio_fileops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_file_load(void) -{ - return toi_register_module(&toi_fileops); -} - -late_initcall(toi_file_load); diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c deleted file mode 100644 index 13bb93811..000000000 --- a/kernel/power/tuxonice_highlevel.c +++ /dev/null @@ -1,1414 +0,0 @@ -/* - * kernel/power/tuxonice_highlevel.c - */ -/** \mainpage TuxOnIce. - * - * TuxOnIce provides support for saving and restoring an image of - * system memory to an arbitrary storage device, either on the local computer, - * or across some network. The support is entirely OS based, so TuxOnIce - * works without requiring BIOS, APM or ACPI support. The vast majority of the - * code is also architecture independant, so it should be very easy to port - * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem - * and preemption. Initramfses and initrds are also supported. - * - * TuxOnIce uses a modular design, in which the method of storing the image is - * completely abstracted from the core code, as are transformations on the data - * such as compression and/or encryption (multiple 'modules' can be used to - * provide arbitrary combinations of functionality). 
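
A loose userspace sketch of the filter-module chaining described here: each stage transforms a page and hands it to the next, so compression, encryption and the final writer compose freely. The real interface is the much richer struct toi_module_ops in tuxonice_modules.h; this stand-in keeps only the chaining idea:

#include <stdio.h>

struct filter {
        const char *name;
        int (*write_page)(struct filter *self, char *page);
        struct filter *next;
};

static int passthrough(struct filter *self, char *page)
{
        printf("%s handled page\n", self->name);
        /* Hand the (possibly transformed) page to the next stage. */
        return self->next ? self->next->write_page(self->next, page) : 0;
}

int main(void)
{
        struct filter writer   = { "writer", passthrough, NULL };
        struct filter compress = { "compression", passthrough, &writer };
        char page[4096] = { 0 };

        return compress.write_page(&compress, page);
}
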
The user interface is also - * modular, so that arbitrarily simple or complex interfaces can be used to - * provide anything from debugging information through to eye candy. - * - * \section Copyright - * - * TuxOnIce is released under the GPLv2. - * - * Copyright (C) 1998-2001 Gabor Kuti
- * Copyright (C) 1998,2001,2002 Pavel Machek
- * Copyright (C) 2002-2003 Florent Chabaud
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- * - * \section Credits - * - * Nigel would like to thank the following people for their work: - * - * Bernard Blackham
- * Web page & Wiki administration, some coding. A person without whom - * TuxOnIce would not be where it is. - * - * Michael Frank
- * Extensive testing and help with improving stability. I was constantly - * amazed by the quality and quantity of Michael's help. - * - * Pavel Machek
- * Modifications, defectiveness pointing, being with Gabor at the very - * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and - * 2.5.17. Even though Pavel and I disagree on the direction suspend to - * disk should take, I appreciate the valuable work he did in helping Gabor - * get the concept working. - * - * ..and of course the myriads of TuxOnIce users who have helped diagnose - * and fix bugs, made suggestions on how to improve the code, proofread - * documentation, and donated time and money. - * - * Thanks also to corporate sponsors: - * - * Redhat.Sometime employer from May 2006 (my fault, not Redhat's!). - * - * Cyclades.com. Nigel's employers from Dec 2004 until May 2006, who - * allowed him to work on TuxOnIce and PM related issues on company time. - * - * LinuxFund.org. Sponsored Nigel's work on TuxOnIce for four months Oct - * 2003 to Jan 2004. - * - * LAC Linux. Donated P4 hardware that enabled development and ongoing - * maintenance of SMP and Highmem support. - * - * OSDL. Provided access to various hardware configurations, make - * occasional small donations to the project. - */ - -#include -#include -#include -#include -#include -#include -#include -#include /* for get/set_fs & KERNEL_DS on i386 */ -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_storage.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_cluster.h" - -/*! Pageset metadata. */ -struct pagedir pagedir2 = {2}; - -static mm_segment_t oldfs; -static DEFINE_MUTEX(tuxonice_in_use); -static int block_dump_save; - -int toi_trace_index; - -/* Binary signature if an image is present */ -char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; - -unsigned long boot_kernel_data_buffer; - -static char *result_strings[] = { - "Hibernation was aborted", - "The user requested that we cancel the hibernation", - "No storage was available", - "Insufficient storage was available", - "Freezing filesystems and/or tasks failed", - "A pre-existing image was used", - "We would free memory, but image size limit doesn't allow this", - "Unable to free enough memory to hibernate", - "Unable to obtain the Power Management Semaphore", - "A device suspend/resume returned an error", - "A system device suspend/resume returned an error", - "The extra pages allowance is too small", - "We were unable to successfully prepare an image", - "TuxOnIce module initialisation failed", - "TuxOnIce module cleanup failed", - "I/O errors were encountered", - "Ran out of memory", - "An error was encountered while reading the image", - "Platform preparation failed", - "CPU Hotplugging failed", - "Architecture specific preparation failed", - "Pages needed resaving, but we were told to abort if this happens", - "We can't hibernate at the moment (invalid resume= or filewriter " - "target?)", - "A hibernation preparation notifier chain member cancelled the " - "hibernation", - "Pre-snapshot preparation failed", - "Pre-restore preparation failed", - "Failed to disable usermode helpers", - "Can't resume from alternate image", - "Header reservation too small", - "Device Power Management Preparation failed", -}; - -/** - * toi_finish_anything - cleanup after doing anything - * @hibernate_or_resume: Whether finishing a cycle or attempt at - * 
resuming. - * - * This is our basic clean-up routine, matching start_anything below. We - * call cleanup routines, drop module references and restore process fs and - * cpus allowed masks, together with the global block_dump variable's value. - **/ -void toi_finish_anything(int hibernate_or_resume) -{ - toi_running = 0; - toi_cleanup_modules(hibernate_or_resume); - toi_put_modules(); - if (hibernate_or_resume) { - block_dump = block_dump_save; - set_cpus_allowed_ptr(current, cpu_all_mask); - toi_alloc_print_debug_stats(); - atomic_inc(&snapshot_device_available); - unlock_system_sleep(); - } - - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); -} - -/** - * toi_start_anything - basic initialisation for TuxOnIce - * @toi_or_resume: Whether starting a cycle or attempt at resuming. - * - * Our basic initialisation routine. Take references on modules, use the - * kernel segment, recheck resume= if no active allocator is set, initialise - * modules, save and reset block_dump and ensure we're running on CPU0. - **/ -int toi_start_anything(int hibernate_or_resume) -{ - mutex_lock(&tuxonice_in_use); - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - toi_trace_index = 0; - - if (hibernate_or_resume) { - lock_system_sleep(); - - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - goto snapshotdevice_unavailable; - } - - if (hibernate_or_resume == SYSFS_HIBERNATE) - toi_print_modules(); - - if (toi_get_modules()) { - printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); - goto prehibernate_err; - } - - if (hibernate_or_resume) { - block_dump_save = block_dump; - block_dump = 0; - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - } - - if (toi_initialise_modules_early(hibernate_or_resume)) - goto early_init_err; - - if (!toiActiveAllocator) - toi_attempt_to_parse_resume_device(!hibernate_or_resume); - - if (!toi_initialise_modules_late(hibernate_or_resume)) { - toi_running = 1; /* For the swsusp code we use :< */ - return 0; - } - - toi_cleanup_modules(hibernate_or_resume); -early_init_err: - if (hibernate_or_resume) { - block_dump_save = block_dump; - set_cpus_allowed_ptr(current, cpu_all_mask); - } - toi_put_modules(); -prehibernate_err: - if (hibernate_or_resume) - atomic_inc(&snapshot_device_available); -snapshotdevice_unavailable: - if (hibernate_or_resume) - mutex_unlock(&pm_mutex); - release_super_lock(); - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); - return -EBUSY; -} - -/* - * Nosave page tracking. - * - * Here rather than in prepare_image because we want to do it once only at the - * start of a cycle. - */ - -/** - * mark_nosave_pages - set up our Nosave bitmap - * - * Build a bitmap of Nosave pages from the list. The bitmap allows faster - * use when preparing the image. - **/ -static void mark_nosave_pages(void) -{ - struct nosave_region *region; - - list_for_each_entry(region, &nosave_regions, list) { - unsigned long pfn; - - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - SetPageNosave(pfn_to_page(pfn)); - } - } -} - -/** - * allocate_bitmaps - allocate bitmaps used to record page states - * - * Allocate the bitmaps we use to record the various TuxOnIce related - * page states. 
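
The SNPRINTF() macro defined above appends formatted output to a shared buffer, accumulating the running length in the enclosing scope. A userspace analogue follows; note that the kernel's scnprintf() returns the number of characters actually written, so the original can never overrun, while this snprintf()-based stand-in assumes the buffer is large enough:

#include <stdio.h>

/* Like the kernel macro, this deliberately captures `buffer` and `len`
 * from the enclosing scope. */
#define SNPRINTF(...) do {                                      \
        len += snprintf(buffer + len, sizeof(buffer) - len,     \
                        __VA_ARGS__);                           \
} while (0)

int main(void)
{
        char buffer[128];
        int len = 0;

        SNPRINTF("TuxOnIce debugging info:\n");
        SNPRINTF("- Attempt number : %d\n", 3);
        fputs(buffer, stdout);
        return 0;
}
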
- **/ -static int allocate_bitmaps(void) -{ - if (toi_alloc_bitmap(&pageset1_map) || - toi_alloc_bitmap(&pageset1_copy_map) || - toi_alloc_bitmap(&pageset2_map) || - toi_alloc_bitmap(&io_map) || - toi_alloc_bitmap(&nosave_map) || - toi_alloc_bitmap(&free_map) || - toi_alloc_bitmap(&compare_map) || - toi_alloc_bitmap(&page_resave_map)) - return 1; - - return 0; -} - -/** - * free_bitmaps - free the bitmaps used to record page states - * - * Free the bitmaps allocated above. It is not an error to call - * memory_bm_free on a bitmap that isn't currently allocated. - **/ -static void free_bitmaps(void) -{ - toi_free_bitmap(&pageset1_map); - toi_free_bitmap(&pageset1_copy_map); - toi_free_bitmap(&pageset2_map); - toi_free_bitmap(&io_map); - toi_free_bitmap(&nosave_map); - toi_free_bitmap(&free_map); - toi_free_bitmap(&compare_map); - toi_free_bitmap(&page_resave_map); -} - -/** - * io_MB_per_second - return the number of MB/s read or written - * @write: Whether to return the speed at which we wrote. - * - * Calculate the number of megabytes per second that were read or written. - **/ -static int io_MB_per_second(int write) -{ - return (toi_bkd.toi_io_time[write][1]) ? - MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / - toi_bkd.toi_io_time[write][1] : 0; -} - -#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \ - count - len - 1, ## a); } while (0) - -/** - * get_debug_info - fill a buffer with debugging information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_debug_info(const char *buffer, int count) -{ - int len = 0, i, first_result = 1; - - SNPRINTF("TuxOnIce debugging info:\n"); - SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); - SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); - SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__); - SNPRINTF("- Attempt number : %d\n", nr_hibernates); - SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", - toi_result, - toi_bkd.toi_action, - toi_bkd.toi_debug_state, - toi_bkd.toi_default_console_level, - image_size_limit, - toi_poweroff_method); - SNPRINTF("- Overall expected compression percentage: %d.\n", - 100 - toi_expected_compression_ratio()); - len += toi_print_module_debug_info(((char *) buffer) + len, - count - len - 1); - if (toi_bkd.toi_io_time[0][1]) { - if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { - SNPRINTF("- I/O speed: Write %ld KB/s", - (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld KB/s", - (KB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } else { - SNPRINTF("- I/O speed: Write %ld MB/s", - (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld MB/s", - (MB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } - SNPRINTF(".\n"); - } else - SNPRINTF("- No I/O speed stats available.\n"); - SNPRINTF("- Extra pages : %lu used/%lu.\n", - extra_pd1_pages_used, extra_pd1_pages_allowance); - - for (i = 0; i < TOI_NUM_RESULT_STATES; i++) - if (test_result_state(i)) { - SNPRINTF("%s: %s.\n", first_result ? - "- Result " : - " ", - result_strings[i]); - first_result = 0; - } - if (first_result) - SNPRINTF("- Result : %s.\n", nr_hibernates ? 
- "Succeeded" : - "No hibernation attempts so far"); - return len; -} - -#ifdef CONFIG_TOI_INCREMENTAL -/** - * get_toi_page_state - fill a buffer with page state information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_page_state(const char *buffer, int count) -{ - int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0; - int len = 0; - struct zone *zone; - int allocated_bitmaps = 0; - - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - - if (!free_map) { - BUG_ON(toi_alloc_bitmap(&free_map)); - allocated_bitmaps = 1; - } - - toi_generate_free_page_map(); - - for_each_populated_zone(zone) { - unsigned long loop; - - total += zone->spanned_pages; - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - /* - * If the page gets allocated, it will be need - * saving in an image. - * Don't bother with explicitly removing any - * RO protection applied below. - * We'll SetPageTOI_Dirty(page) if/when it - * gets allocated. - */ - free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageTOI_Untracked(page)) { - untracked++; - } else if (PageTOI_RO(page)) { - ro++; - } else if (PageTOI_Dirty(page)) { - dirty++; - } else { - printk("Page %ld state 'other'.\n", pfn); - other++; - } - } - } - - if (allocated_bitmaps) { - toi_free_bitmap(&free_map); - } - - set_cpus_allowed_ptr(current, cpu_all_mask); - - SNPRINTF("TuxOnIce page breakdown:\n"); - SNPRINTF("- Free : %d\n", free); - SNPRINTF("- Untracked : %d\n", untracked); - SNPRINTF("- Read only : %d\n", ro); - SNPRINTF("- Dirty : %d\n", dirty); - SNPRINTF("- Other : %d\n", other); - SNPRINTF("- Invalid : %d\n", invalid); - SNPRINTF("- Total : %d\n", total); - return len; -} -#endif - -/** - * do_cleanup - cleanup after attempting to hibernate or resume - * @get_debug_info: Whether to allocate and return debugging info. - * - * Cleanup after attempting to hibernate or resume, possibly getting - * debugging info as we do so. 
- **/ -static void do_cleanup(int get_debug_info, int restarting) -{ - int i = 0; - char *buffer = NULL; - - trap_non_toi_io = 0; - - if (get_debug_info) - toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); - - free_checksum_pages(); - - toi_cbw_restore(); - toi_free_cbw_data(); - - if (get_debug_info) - buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); - - if (buffer) - i = get_toi_debug_info(buffer, PAGE_SIZE); - - toi_free_extra_pagedir_memory(); - - pagedir1.size = 0; - pagedir2.size = 0; - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - if (boot_kernel_data_buffer) { - if (!test_toi_state(TOI_BOOT_KERNEL)) - toi_free_page(37, boot_kernel_data_buffer); - boot_kernel_data_buffer = 0; - } - - if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) { - unlock_device_hotplug(); - clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - } - - clear_toi_state(TOI_BOOT_KERNEL); - if (current->flags & PF_SUSPEND_TASK) - thaw_processes(); - - if (!restarting) - toi_stop_other_threads(); - - if (toi_keeping_image && - !test_result_state(TOI_ABORTED)) { - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - "TuxOnIce: Not invalidating the image due " - "to Keep Image or Incremental Image being enabled."); - set_result_state(TOI_KEPT_IMAGE); - - /* - * For an incremental image, free unused storage so - * swap (if any) can be used for normal system operation, - * if so desired. - */ - - toiActiveAllocator->free_unused_storage(); - } else - if (toiActiveAllocator) - toiActiveAllocator->remove_image(); - - free_bitmaps(); - usermodehelper_enable(); - - if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { - pm_notifier_call_chain(PM_POST_HIBERNATION); - clear_toi_state(TOI_NOTIFIERS_PREPARE); - } - - if (buffer && i) { - /* Printk can only handle 1023 bytes, including - * its level mangling. */ - for (i = 0; i < 3; i++) - printk(KERN_ERR "%s", buffer + (1023 * i)); - toi_free_page(20, (unsigned long) buffer); - } - - if (!restarting) - toi_cleanup_console(); - - free_attention_list(); - - if (!restarting) - toi_deactivate_storage(0); - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * check_still_keeping_image - we kept an image; check whether to reuse it. - * - * We enter this routine when we have kept an image. If the user has said they - * want to still keep it, all we need to do is powerdown. If powering down - * means hibernating to ram and the power doesn't run out, we'll return 1. - * If we do power off properly or the battery runs out, we'll resume via the - * normal paths. - * - * If the user has said they want to remove the previously kept image, we - * remove it, and return 0. We'll then store a new image. - **/ -static int check_still_keeping_image(void) -{ - if (toi_keeping_image) { - if (!test_action_state(TOI_INCREMENTAL_IMAGE)) { - printk(KERN_INFO "Image already stored: powering down " - "immediately."); - do_toi_step(STEP_HIBERNATE_POWERDOWN); - return 1; - } - /** - * Incremental image - need to write new part. - * We detect that we're writing an incremental image by looking - * at test_result_state(TOI_KEPT_IMAGE) - **/ - return 0; - } - - printk(KERN_INFO "Invalidating previous image.\n"); - toiActiveAllocator->remove_image(); - - return 0; -} - -/** - * toi_init - prepare to hibernate to disk - * - * Initialise variables & data structures, in preparation for - * hibernating to disk. 
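/*
 * Illustration (editorial sketch, not from the TuxOnIce sources):
 * toi_init() below clears toi_result, a bit mask in which each bit set
 * during a cycle selects one entry of the result_strings[] table defined
 * earlier when the debug info is printed. The pattern in miniature, with
 * hypothetical names:
 */
#include <stdio.h>

enum { RES_ABORTED, RES_NO_STORAGE, RES_OOM, RES_MAX };

static const char *res_strings[RES_MAX] = {
	"Hibernation was aborted",
	"No storage was available",
	"Ran out of memory",
};

static unsigned long result;

#define set_result(bit)  (result |= 1UL << (bit))
#define test_result(bit) (result & (1UL << (bit)))

static void print_results(void)
{
	int i;

	for (i = 0; i < RES_MAX; i++)
		if (test_result(i))
			printf("- Result: %s.\n", res_strings[i]);
}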
- **/ -static int toi_init(int restarting) -{ - int result, i, j; - - toi_result = 0; - - printk(KERN_INFO "Initiating a hibernation cycle.\n"); - - nr_hibernates++; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - toi_bkd.toi_io_time[i][j] = 0; - - if (!test_toi_state(TOI_CAN_HIBERNATE) || - allocate_bitmaps()) - return 1; - - mark_nosave_pages(); - - if (!restarting) - toi_prepare_console(); - - result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (result) { - set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); - return 1; - } - set_toi_state(TOI_NOTIFIERS_PREPARE); - - if (!restarting) { - printk(KERN_ERR "Starting other threads."); - toi_start_other_threads(); - } - - result = usermodehelper_disable(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to disable usermode " - "helpers\n"); - set_result_state(TOI_USERMODE_HELPERS_ERR); - return 1; - } - - boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); - if (!boot_kernel_data_buffer) { - printk(KERN_ERR "TuxOnIce: Failed to allocate " - "boot_kernel_data_buffer.\n"); - set_result_state(TOI_OUT_OF_MEMORY); - return 1; - } - - toi_allocate_cbw_data(); - - return 0; -} - -/** - * can_hibernate - perform basic 'Can we hibernate?' tests - * - * Perform basic tests that must pass if we're going to be able to hibernate: - * Can we get the pm_mutex? Is resume= valid (we need to know where to write - * the image header). - **/ -static int can_hibernate(void) -{ - if (!test_toi_state(TOI_CAN_HIBERNATE)) - toi_attempt_to_parse_resume_device(0); - - if (!test_toi_state(TOI_CAN_HIBERNATE)) { - printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" - "This may be because you haven't put something along " - "the lines of\n\nresume=swap:/dev/hda1\n\n" - "in lilo.conf or equivalent. (Where /dev/hda1 is your " - "swap partition).\n"); - set_abort_result(TOI_CANT_SUSPEND); - return 0; - } - - if (strlen(alt_resume_param)) { - attempt_to_parse_alt_resume_param(); - - if (!strlen(alt_resume_param)) { - printk(KERN_INFO "Alternate resume parameter now " - "invalid. Aborting.\n"); - set_abort_result(TOI_CANT_USE_ALT_RESUME); - return 0; - } - } - - return 1; -} - -/** - * do_post_image_write - having written an image, figure out what to do next - * - * After writing an image, we might load an alternate image or power down. - * Powering down might involve hibernating to ram, in which case we also - * need to handle reloading pageset2. - **/ -static int do_post_image_write(void) -{ - /* If switching images fails, do normal powerdown */ - if (alt_resume_param[0]) - do_toi_step(STEP_RESUME_ALT_IMAGE); - - toi_power_down(); - - barrier(); - mb(); - return 0; -} - -/** - * __save_image - do the hard work of saving the image - * - * High level routine for getting the image saved. The key assumptions made - * are that processes have been frozen and sufficient memory is available. - * - * We also exit through here at resume time, coming back from toi_hibernate - * after the atomic restore. This is the reason for the toi_in_hibernate - * test. 
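/*
 * Illustration (editorial sketch, not from the TuxOnIce sources): the
 * control flow described above, where __save_image() is also the exit
 * path at resume time, is setjmp-like. The snapshot call returns once in
 * the hibernating kernel and a second time, in the same stack frame, in
 * the restored kernel; the two returns are told apart by a flag, as
 * toi_in_hibernate does here. A user-space analogue:
 */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf snapshot;

int main(void)
{
	volatile int in_hibernate = 1;	/* volatile: survives longjmp */

	if (setjmp(snapshot) == 0) {
		/* First return: still "hibernating"; the image is written here. */
		printf("saving image\n");
		in_hibernate = 0;
		longjmp(snapshot, 1);	/* stands in for the atomic restore */
	}

	/* Second return, same frame: the "resumed" path. */
	printf("resumed, in_hibernate=%d\n", in_hibernate);
	return 0;
}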
- **/ -static int __save_image(void) -{ - int temp_result, did_copy = 0; - - toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - " - Final values: %d and %d.", - pagedir1.size, pagedir2.size); - - toi_cond_pause(1, "About to write pagedir2."); - - temp_result = write_pageset(&pagedir2); - - if (temp_result == -1 || test_result_state(TOI_ABORTED)) - return 1; - - toi_cond_pause(1, "About to copy pageset 1."); - - if (test_result_state(TOI_ABORTED)) - return 1; - - toi_deactivate_storage(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - toi_in_hibernate = 1; - - if (toi_go_atomic(PMSG_FREEZE, 1)) - goto Failed; - - temp_result = toi_hibernate(); - -#ifdef CONFIG_KGDB - if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) - kgdb_breakpoint(); -#endif - - if (!temp_result) - did_copy = 1; - - /* We return here at resume time too! */ - toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); - -Failed: - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - /* Resume time? */ - if (!toi_in_hibernate) { - copyback_post(); - return 0; - } - - /* Nope. Hibernating. So, see if we can save the image... */ - - if (temp_result || test_result_state(TOI_ABORTED)) { - if (did_copy) - goto abort_reloading_pagedir_two; - else - return 1; - } - - toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, - NULL); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write pageset1."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); - - temp_result = write_pageset(&pagedir1); - - /* We didn't overwrite any memory, so no reread needs to be done. */ - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - return 1; - - if (temp_result == 1 || test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write header."); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - temp_result = write_image_header(); - - if (!temp_result && !test_result_state(TOI_ABORTED)) - return 0; - -abort_reloading_pagedir_two: - temp_result = read_pageset2(1); - - /* If that failed, we're sunk. Panic! */ - if (temp_result) - panic("Attempt to reload pagedir 2 while aborting " - "a hibernate failed."); - - return 1; -} - -static void map_ps2_pages(int enable) -{ - unsigned long pfn = 0; - - memory_bm_position_reset(pageset2_map); - pfn = memory_bm_next_pfn(pageset2_map, 0); - - while (pfn != BM_END_OF_MAP) { - struct page *page = pfn_to_page(pfn); - kernel_map_pages(page, 1, enable); - pfn = memory_bm_next_pfn(pageset2_map, 0); - } -} - -/** - * do_save_image - save the image and handle the result - * - * Save the prepared image. If we fail or we're in the path returning - * from the atomic restore, cleanup. - **/ -static int do_save_image(void) -{ - int result; - map_ps2_pages(0); - result = __save_image(); - map_ps2_pages(1); - return result; -} - -/** - * do_prepare_image - try to prepare an image - * - * Seek to initialise and prepare an image to be saved. On failure, - * cleanup. 
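/*
 * Illustration (editorial sketch, not from the TuxOnIce sources):
 * map_ps2_pages() above shows the memory-bitmap cursor idiom used all
 * through this file: reset the position, then repeatedly take the next
 * set bit until the end marker comes back. Self-contained toy version:
 */
#include <stdio.h>

#define BITS 256
#define BPL (8 * sizeof(unsigned long))
#define END_OF_MAP ((unsigned long)-1)

static unsigned long map[BITS / BPL];
static unsigned long cursor;

static void bm_position_reset(void) { cursor = 0; }

static unsigned long bm_next_bit(void)
{
	for (; cursor < BITS; cursor++)
		if (map[cursor / BPL] & (1UL << (cursor % BPL)))
			return cursor++;	/* return, then advance past it */
	return END_OF_MAP;
}

int main(void)
{
	unsigned long bit;

	map[0] = 0x11;				/* bits 0 and 4 */
	bm_position_reset();
	while ((bit = bm_next_bit()) != END_OF_MAP)
		printf("bit %lu set\n", bit);
	return 0;
}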
- **/ -static int do_prepare_image(void) -{ - int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - - if (!restarting && toi_activate_storage(0)) - return 1; - - /* - * If kept image and still keeping image and hibernating to RAM, (non - * incremental image case) we will return 1 after hibernating and - * resuming (provided the power doesn't run out. In that case, we skip - * directly to cleaning up and exiting. - */ - - if (!can_hibernate() || - (test_result_state(TOI_KEPT_IMAGE) && - check_still_keeping_image())) - return 1; - - if (toi_init(restarting) || toi_prepare_image() || - test_result_state(TOI_ABORTED)) - return 1; - - trap_non_toi_io = 1; - - return 0; -} - -/** - * do_check_can_resume - find out whether an image has been stored - * - * Read whether an image exists. We use the same routine as the - * image_exists sysfs entry, and just look to see whether the - * first character in the resulting buffer is a '1'. - **/ -int do_check_can_resume(void) -{ - int result = -1; - - if (toi_activate_storage(0)) - return -1; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(1); - - if (toiActiveAllocator) - result = toiActiveAllocator->image_exists(1); - - toi_deactivate_storage(0); - return result; -} - -/** - * do_load_atomic_copy - load the first part of an image, if it exists - * - * Check whether we have an image. If one exists, do sanity checking - * (possibly invalidating the image or even rebooting if the user - * requests that) before loading it into memory in preparation for the - * atomic restore. - * - * If and only if we have an image loaded and ready to restore, we return 1. - **/ -static int do_load_atomic_copy(void) -{ - int read_image_result = 0; - - if (sizeof(swp_entry_t) != sizeof(long)) { - printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size" - " of long. Please report this!\n"); - return 1; - } - - if (!resume_file[0]) - printk(KERN_WARNING "TuxOnIce: " - "You need to use a resume= command line parameter to " - "tell TuxOnIce where to look for an image.\n"); - - toi_activate_storage(0); - - if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) && - !toi_attempt_to_parse_resume_device(0)) { - /* - * Without a usable storage device we can do nothing - - * even if noresume is given - */ - - if (!toiNumAllocators) - printk(KERN_ALERT "TuxOnIce: " - "No storage allocators have been registered.\n"); - else - printk(KERN_ALERT "TuxOnIce: " - "Missing or invalid storage location " - "(resume= parameter). Please correct and " - "rerun lilo (or equivalent) before " - "hibernating.\n"); - toi_deactivate_storage(0); - return 1; - } - - if (allocate_bitmaps()) - return 1; - - read_image_result = read_pageset1(); /* non fatal error ignored */ - - if (test_toi_state(TOI_NORESUME_SPECIFIED)) - clear_toi_state(TOI_NORESUME_SPECIFIED); - - toi_deactivate_storage(0); - - if (read_image_result) - return 1; - - return 0; -} - -/** - * prepare_restore_load_alt_image - save & restore alt image variables - * - * Save and restore the pageset1 maps, when loading an alternate image. 
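/*
 * Editorial aside: do_load_atomic_copy() above checks
 * sizeof(swp_entry_t) != sizeof(long) at run time and warns. An invariant
 * like this can instead be enforced at build time, with C11's
 * static_assert (or the kernel's BUILD_BUG_ON). Sketch only; the typedef
 * here is a stand-in for the real kernel type:
 */
#include <assert.h>

typedef struct { unsigned long val; } swp_entry_t;	/* stand-in */

static_assert(sizeof(swp_entry_t) == sizeof(long),
	      "image format assumes swp_entry_t fits in a long");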
- **/ -static void prepare_restore_load_alt_image(int prepare) -{ - static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; - - if (prepare) { - pageset1_map_save = pageset1_map; - pageset1_map = NULL; - pageset1_copy_map_save = pageset1_copy_map; - pageset1_copy_map = NULL; - set_toi_state(TOI_LOADING_ALT_IMAGE); - toi_reset_alt_image_pageset2_pfn(); - } else { - toi_free_bitmap(&pageset1_map); - pageset1_map = pageset1_map_save; - toi_free_bitmap(&pageset1_copy_map); - pageset1_copy_map = pageset1_copy_map_save; - clear_toi_state(TOI_NOW_RESUMING); - clear_toi_state(TOI_LOADING_ALT_IMAGE); - } -} - -/** - * do_toi_step - perform a step in hibernating or resuming - * - * Perform a step in hibernating or resuming an image. This abstraction - * is in preparation for implementing cluster support, and perhaps replacing - * uswsusp too (haven't looked whether that's possible yet). - **/ -int do_toi_step(int step) -{ - switch (step) { - case STEP_HIBERNATE_PREPARE_IMAGE: - return do_prepare_image(); - case STEP_HIBERNATE_SAVE_IMAGE: - return do_save_image(); - case STEP_HIBERNATE_POWERDOWN: - return do_post_image_write(); - case STEP_RESUME_CAN_RESUME: - return do_check_can_resume(); - case STEP_RESUME_LOAD_PS1: - return do_load_atomic_copy(); - case STEP_RESUME_DO_RESTORE: - /* - * If we succeed, this doesn't return. - * Instead, we return from do_save_image() in the - * hibernated kernel. - */ - return toi_atomic_restore(); - case STEP_RESUME_ALT_IMAGE: - printk(KERN_INFO "Trying to resume alternate image.\n"); - toi_in_hibernate = 0; - save_restore_alt_param(SAVE, NOQUIET); - prepare_restore_load_alt_image(1); - if (!do_check_can_resume()) { - printk(KERN_INFO "Nothing to resume from.\n"); - goto out; - } - if (!do_load_atomic_copy()) - toi_atomic_restore(); - - printk(KERN_INFO "Failed to load image.\n"); -out: - prepare_restore_load_alt_image(0); - save_restore_alt_param(RESTORE, NOQUIET); - break; - case STEP_CLEANUP: - do_cleanup(1, 0); - break; - case STEP_QUIET_CLEANUP: - do_cleanup(0, 0); - break; - } - - return 0; -} - -/* -- Functions for kickstarting a hibernate or resume --- */ - -/** - * toi_try_resume - try to do the steps in resuming - * - * Check if we have an image and if so try to resume. Clear the status - * flags too. - **/ -void toi_try_resume(void) -{ - set_toi_state(TOI_TRYING_TO_RESUME); - resume_attempted = 1; - - current->flags |= PF_MEMALLOC; - toi_start_other_threads(); - - if (do_toi_step(STEP_RESUME_CAN_RESUME) && - !do_toi_step(STEP_RESUME_LOAD_PS1)) - do_toi_step(STEP_RESUME_DO_RESTORE); - - toi_stop_other_threads(); - do_cleanup(0, 0); - - current->flags &= ~PF_MEMALLOC; - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume - * - * Wrapper for when __toi_try_resume is called from swsusp resume path, - * rather than from echo > /sys/power/tuxonice/do_resume. - **/ -static void toi_sys_power_disk_try_resume(void) -{ - resume_attempted = 1; - - /* - * There's a comment in kernel/power/disk.c that indicates - * we should be able to use mutex_lock_nested below. That - * doesn't seem to cut it, though, so let's just turn lockdep - * off for now. 
- */ - lockdep_off(); - - if (toi_start_anything(SYSFS_RESUMING)) - goto out; - - toi_try_resume(); - - /* - * For initramfs, we have to clear the boot time - * flag after trying to resume - */ - clear_toi_state(TOI_BOOT_TIME); - - toi_finish_anything(SYSFS_RESUMING); -out: - lockdep_on(); -} - -/** - * toi_try_hibernate - try to start a hibernation cycle - * - * Start a hibernation cycle, coming in from either - * echo > /sys/power/tuxonice/do_suspend - * - * or - * - * echo disk > /sys/power/state - * - * In the later case, we come in without pm_sem taken; in the - * former, it has been taken. - **/ -int toi_try_hibernate(void) -{ - int result = 0, sys_power_disk = 0, retries = 0; - - if (!mutex_is_locked(&tuxonice_in_use)) { - /* Came in via /sys/power/disk */ - if (toi_start_anything(SYSFS_HIBERNATING)) - return -EBUSY; - sys_power_disk = 1; - } - - current->flags |= PF_MEMALLOC; - - if (test_toi_state(TOI_CLUSTER_MODE)) { - toi_initiate_cluster_hibernate(); - goto out; - } - -prepare: - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - - if (result) - goto out; - - if (test_action_state(TOI_FREEZER_TEST)) - goto out_restore_gfp_mask; - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - - if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) { - if (retries < 2) { - do_cleanup(0, 1); - retries++; - clear_result_state(TOI_ABORTED); - extra_pd1_pages_allowance = extra_pd1_pages_used + 500; - printk(KERN_INFO "Automatically adjusting the extra" - " pages allowance to %ld and restarting.\n", - extra_pd1_pages_allowance); - pm_restore_gfp_mask(); - goto prepare; - } - - printk(KERN_INFO "Adjusted extra pages allowance twice and " - "still couldn't hibernate successfully. Giving up."); - } - - /* This code runs at resume time too! */ - if (!result && toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); - -out_restore_gfp_mask: - pm_restore_gfp_mask(); -out: - do_cleanup(1, 0); - current->flags &= ~PF_MEMALLOC; - - if (sys_power_disk) - toi_finish_anything(SYSFS_HIBERNATING); - - return result; -} - -/* - * channel_no: If !0, -c is added to args (userui). - */ -int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug) -{ - int retval; - static char *envp[] = { - "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; - static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL - }; - char *channel = NULL; - int arg = 0, size; - char test_read[255]; - char *orig_posn = command; - - if (!strlen(orig_posn)) - return 1; - - if (channel_no) { - channel = toi_kzalloc(4, 6, GFP_KERNEL); - if (!channel) { - printk(KERN_INFO "Failed to allocate memory in " - "preparing to launch userspace program.\n"); - return 1; - } - } - - /* Up to 6 args supported */ - while (arg < 6) { - sscanf(orig_posn, "%s", test_read); - size = strlen(test_read); - if (!(size)) - break; - argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP); - strcpy(argv[arg], test_read); - orig_posn += size + 1; - *test_read = 0; - arg++; - } - - if (channel_no) { - sprintf(channel, "-c%d", channel_no); - argv[arg] = channel; - } else - arg--; - - if (debug) { - argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP); - strcpy(argv[arg], "--debug"); - } - - retval = call_usermodehelper(argv[0], argv, envp, wait); - - /* - * If the program reports an error, retval = 256. Don't complain - * about that here. 
- */ - if (retval && retval != 256) - printk(KERN_ERR "Failed to launch userspace program '%s': " - "Error %d\n", command, retval); - - { - int i; - for (i = 0; i < arg; i++) - if (argv[i] && argv[i] != channel) - toi_kfree(5, argv[i], sizeof(*argv[i])); - } - - toi_kfree(4, channel, sizeof(*channel)); - - return retval; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_LONG("extra_pages_allowance", SYSFS_RW, - &extra_pd1_pages_allowance, 0, LONG_MAX, 0), - SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, - image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), - SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_resume_device2), - SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_alt_resume_param), - SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, - NULL), - SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, - TOI_IGNORE_ROOTFS, 0), - SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, - INT_MAX, 0), - SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), - SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_MULTITHREADED_IO, 0), - SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_FLUSHER_THREAD, 0), - SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAGESET2_FULL, 0), - SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), - SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, - TOI_REPLACE_SWSUSP, 0), - SYSFS_STRING("resume_commandline", SYSFS_RW, - toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, - NULL), - SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), - SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, - TOI_FREEZER_TEST, 0), - SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), - SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, - TOI_TEST_FILTER_SPEED, 0), - SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PAGESET2, 0), - SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PS2_IF_UNNEEDED, 0), - SYSFS_STRING("binary_signature", SYSFS_READONLY, - tuxonice_signature, 9, 0, NULL), - SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0, - NULL), -#ifdef CONFIG_KGDB - SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, - TOI_POST_RESUME_BREAKPOINT, 0), -#endif - SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_READAHEAD, 0), - SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action, - TOI_TRACE_DEBUG_ON, 0), -#ifdef CONFIG_TOI_KEEP_IMAGE - SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, - 0), -#endif -#ifdef CONFIG_TOI_INCREMENTAL - SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0, - NULL), - SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action, - TOI_INCREMENTAL_IMAGE, 1), -#endif -}; - -static struct toi_core_fns my_fns = { - .get_nonconflicting_page = __toi_get_nonconflicting_page, - .post_context_save = __toi_post_context_save, - .try_hibernate = toi_try_hibernate, - .try_resume = toi_sys_power_disk_try_resume, -}; - -/** - * core_load - initialisation of TuxOnIce core - * - * Initialise the core, beginning with sysfs. 
Checksum and so on are part of - * the core, but have their own initialisation routines because they either - * aren't compiled in all the time or have their own subdirectories. - **/ -static __init int core_load(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION - " (http://tuxonice.net)\n"); - - if (!hibernation_available()) { - printk(KERN_INFO "TuxOnIce disabled due to request for hibernation" - " to be disabled in this kernel.\n"); - return 1; - } - - if (toi_sysfs_init()) - return 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - toi_core_fns = &my_fns; - - if (toi_alloc_init()) - return 1; - if (toi_checksum_init()) - return 1; - if (toi_usm_init()) - return 1; - if (toi_ui_init()) - return 1; - if (toi_poweroff_init()) - return 1; - if (toi_cluster_init()) - return 1; - if (toi_cbw_init()) - return 1; - - return 0; -} - -late_initcall(core_load); diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c deleted file mode 100644 index a8c5f3660..000000000 --- a/kernel/power/tuxonice_incremental.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * kernel/power/tuxonice_incremental.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines related to storing incremental images - that - * is, retaining an image after an initial cycle and then storing incremental - * changes on subsequent hibernations. - * - * Based in part on on... - * - * Debug helper to dump the current kernel pagetables of the system - * so that we can see what the various memory ranges are set to. - * - * (C) Copyright 2008 Intel Corporation - * - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "tuxonice_pageflags.h" -#include "tuxonice_builtin.h" -#include "power.h" - -int toi_do_incremental_initcall; - -extern void kdb_init(int level); -extern noinline void kgdb_breakpoint(void); - -#undef pr_debug -#if 0 -#define pr_debug(a, b...) do { printk(a, ##b); } while(0) -#else -#define pr_debug(a, b...) do { } while(0) -#endif - -/* Multipliers for offsets within the PTEs */ -#define PTE_LEVEL_MULT (PAGE_SIZE) -#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) -#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) - -/* - * This function gets called on a break in a continuous series - * of PTE entries; the next one is different so we need to - * print what we collected so far. 
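/*
 * Illustration (editorial sketch, not from the TuxOnIce sources): the
 * walkers below descend the four radix levels (pgd -> pud -> pmd -> pte),
 * visiting every page-table page so it can be flagged untracked. The
 * shape of such a walk, reduced to a two-level toy radix tree:
 */
#include <stdio.h>

#define FANOUT 4

struct pte_table { unsigned long pte[FANOUT]; };
struct pgd_table { struct pte_table *entry[FANOUT]; };

/* Stand-in for marking the page that holds a table (SetPageTOI_Untracked). */
static void note_table(const void *table)
{
	printf("table at %p holds pagetable data\n", table);
}

static void walk(const struct pgd_table *pgd)
{
	int i;

	note_table(pgd);			/* the top-level table itself */
	for (i = 0; i < FANOUT; i++)
		if (pgd->entry[i])		/* like the !pgd_none() test */
			note_table(pgd->entry[i]);	/* each leaf table page */
}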
- */ -static void note_page(void *addr) -{ - static struct page *lastpage; - struct page *page; - - page = virt_to_page(addr); - - if (page != lastpage) { - unsigned int level; - pte_t *pte = lookup_address((unsigned long) addr, &level); - struct page *pt_page2 = pte_page(*pte); - //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2)); - SetPageTOI_Untracked(pt_page2); - lastpage = page; - } -} - -static void walk_pte_level(pmd_t addr) -{ - int i; - pte_t *start; - - start = (pte_t *) pmd_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PTE; i++) { - note_page(start); - start++; - } -} - -#if PTRS_PER_PMD > 1 - -static void walk_pmd_level(pud_t addr) -{ - int i; - pmd_t *start; - - start = (pmd_t *) pud_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PMD; i++) { - if (!pmd_none(*start)) { - if (pmd_large(*start) || !pmd_present(*start)) - note_page(start); - else - walk_pte_level(*start); - } else - note_page(start); - start++; - } -} - -#else -#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a))) -#define pud_large(a) pmd_large(__pmd(pud_val(a))) -#define pud_none(a) pmd_none(__pmd(pud_val(a))) -#endif - -#if PTRS_PER_PUD > 1 - -static void walk_pud_level(pgd_t addr) -{ - int i; - pud_t *start; - - start = (pud_t *) pgd_page_vaddr(addr); - - for (i = 0; i < PTRS_PER_PUD; i++) { - if (!pud_none(*start)) { - if (pud_large(*start) || !pud_present(*start)) - note_page(start); - else - walk_pmd_level(*start); - } else - note_page(start); - - start++; - } -} - -#else -#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a))) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) -#endif - -/* - * Not static in the original at the time of writing, so needs renaming here. - */ -static void toi_ptdump_walk_pgd_level(pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_level4_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif - int i; - if (pgd) { - start = pgd; - } - - for (i = 0; i < PTRS_PER_PGD; i++) { - if (!pgd_none(*start)) { - if (pgd_large(*start) || !pgd_present(*start)) - note_page(start); - else - walk_pud_level(*start); - } else - note_page(start); - - start++; - } - - /* Flush out the last page */ - note_page(start); -} - -#ifdef CONFIG_PARAVIRT -extern struct pv_info pv_info; - -static void toi_set_paravirt_ops_untracked(void) { - int i; - - unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)), - pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end)); - //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end); - for (i = pvpfn; i <= pvpfn_end; i++) { - SetPageTOI_Untracked(pfn_to_page(i)); - } -} -#else -#define toi_set_paravirt_ops_untracked() { do { } while(0) } -#endif - -extern void toi_mark_per_cpus_pages_untracked(void); - -void toi_untrack_stack(unsigned long *stack) -{ - int i; - struct page *stack_page = virt_to_page(stack); - - for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) { - pr_debug("Untrack stack page %p.\n", page_address(stack_page + i)); - SetPageTOI_Untracked(stack_page + i); - } -} -void toi_untrack_process(struct task_struct *p) -{ - SetPageTOI_Untracked(virt_to_page(p)); - pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p))); - - toi_untrack_stack(p->stack); -} - -void toi_generate_untracked_map(void) -{ - struct task_struct *p, *t; - struct page *page; - pte_t *pte; - int i; - unsigned int level; - static int been_here = 0; - - if (been_here) - return; - - been_here = 1; - - /* Pagetable 
pages */ - toi_ptdump_walk_pgd_level(NULL); - - /* Printk buffer - not normally needed but can be helpful for debugging. */ - //toi_set_logbuf_untracked(); - - /* Paravirt ops */ - toi_set_paravirt_ops_untracked(); - - /* Task structs and stacks */ - for_each_process_thread(p, t) { - toi_untrack_process(p); - //toi_untrack_stack((unsigned long *) t->thread.sp); - } - - for (i = 0; i < NR_CPUS; i++) { - struct task_struct *idle = idle_task(i); - - if (idle) { - pr_debug("Untrack idle process for CPU %d.\n", i); - toi_untrack_process(idle); - } - - /* IRQ stack */ - pr_debug("Untrack IRQ stack for CPU %d.\n", i); - toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i)); - } - - /* Per CPU data */ - //pr_debug("Untracking per CPU variable pages.\n"); - toi_mark_per_cpus_pages_untracked(); - - /* Init stack - for bringing up secondary CPUs */ - page = virt_to_page(init_stack); - for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) { - SetPageTOI_Untracked(page + i); - } - - pte = lookup_address((unsigned long) &mmu_cr4_features, &level); - SetPageTOI_Untracked(pte_page(*pte)); - SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features)); -} - -/** - * toi_reset_dirtiness_one - mark a page clean and reapply write protection - * @pfn: Page frame number of the page to reset. - * @verbose: Whether to log the change. - */ - -void toi_reset_dirtiness_one(unsigned long pfn, int verbose) -{ - struct page *page = pfn_to_page(pfn); - - /** - * Don't worry about whether the Dirty flag is - * already set. If this is our first call, it - * won't be. - */ - - preempt_disable(); - - ClearPageTOI_Dirty(page); - SetPageTOI_RO(page); - if (verbose) - printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page)); - - set_memory_ro((unsigned long) page_address(page), 1); - - preempt_enable(); -} - -/** - * TuxOnIce's incremental image support works by marking all memory apart from - * the page tables read-only, then in the page-faults that result enabling - * writing if appropriate and flagging the page as dirty. Free pages are also - * marked as dirty and not protected so that if allocated, they will be included - * in the image without further processing. - * - * toi_reset_dirtiness is called when an image exists and incremental images are - * enabled, and each time we resume thereafter. It is not invoked on a fresh boot. - * - * This routine should be called from a single-cpu-running context to avoid races in setting - * page dirty/read only flags. - * - * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it! - * - * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c. - **/ - -int toi_reset_dirtiness(int verbose) -{ - struct zone *zone; - unsigned long loop; - int allocated_map = 0; - - toi_generate_untracked_map(); - - if (!free_map) { - if (toi_alloc_bitmap(&free_map)) - return -ENOMEM; - allocated_map = 1; - } - - toi_generate_free_page_map(); - - pr_debug(KERN_EMERG "Reset dirtiness.\n"); - for_each_populated_zone(zone) { - // 64 bit only. No need to worry about highmem. - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page) || !saveable_page(zone, pfn)) { - continue; - } - - if (PageTOI_Untracked(page)) { - continue; - } -
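/*
 * Illustration (editorial sketch, not from the TuxOnIce sources): the
 * scheme described above, write-protect nearly everything and let each
 * write fault mark its page dirty before lifting the protection, has a
 * classic user-space analogue built on mprotect() and a SIGSEGV handler.
 * Linux-specific in practice; error checks and the dirty bitmap itself
 * are omitted:
 */
#include <signal.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

static void on_write_fault(int sig, siginfo_t *si, void *ctx)
{
	uintptr_t mask = ~(uintptr_t)(getpagesize() - 1);
	void *page = (void *)((uintptr_t)si->si_addr & mask);

	(void)sig; (void)ctx;
	/* Record "page is dirty" here, then let the write retry. */
	mprotect(page, getpagesize(), PROT_READ | PROT_WRITE);
}

static void track_writes(void *buf, size_t len)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = on_write_fault;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);
	mprotect(buf, len, PROT_READ);	/* like SetPageTOI_RO() per page */
}
- /** - * Do we need to (re)protect the page?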
- * If it is already protected (PageTOI_RO), there is - * nothing to do - skip the following. - * If it is marked as dirty (PageTOI_Dirty), it was - * either free and has been allocated or has been - * written to and marked dirty. Reset the dirty flag - * and (re)apply the protection. - */ - if (!PageTOI_RO(page)) { - toi_reset_dirtiness_one(pfn, verbose); - } - } - } - - pr_debug(KERN_EMERG "Done resetting dirtiness.\n"); - - if (allocated_map) { - toi_free_bitmap(&free_map); - } - return 0; -} - -static int toi_reset_dirtiness_initcall(void) -{ - if (toi_do_incremental_initcall) { - pr_info("TuxOnIce: Enabling dirty page tracking.\n"); - toi_reset_dirtiness(0); - } - return 1; -} -extern void toi_generate_untracked_map(void); - -// Leave early_initcall for pages to register untracked sections. -early_initcall(toi_reset_dirtiness_initcall); - -static int __init toi_incremental_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value) && value) - toi_do_incremental_initcall = value; - - return 1; -} -__setup("toi_incremental_initcall", toi_incremental_initcall_setup); diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c deleted file mode 100644 index 2db934350..000000000 --- a/kernel/power/tuxonice_io.c +++ /dev/null @@ -1,1936 +0,0 @@ -/* - * kernel/power/tuxonice_io.c - * - * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001,2002 Pavel Machek - * Copyright (C) 2002-2003 Florent Chabaud - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_storage.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_extent.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_builtin.h" -#include "tuxonice_checksum.h" -#include "tuxonice_alloc.h" -char alt_resume_param[256]; - -/* Version read from image header at resume */ -static int toi_image_header_version; - -#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ - if (likely(toi_image_header_version >= VERS)) \ - if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ - (char *) &VAR, sizeof(VAR))) { \ - abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ - ERR_ACT; \ - } \ -} while(0) \ - -/* Variables shared between threads and updated under the mutex */ -static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; -static int io_index, io_nextupdate, io_pc, io_pc_step; -static DEFINE_MUTEX(io_mutex); -static DEFINE_PER_CPU(struct page *, last_sought); -static DEFINE_PER_CPU(struct page *, last_high_page); -static DEFINE_PER_CPU(char *, checksum_locn); -static DEFINE_PER_CPU(struct pbe *, last_low_page); -static atomic_t io_count; -atomic_t toi_io_workers; - -static int using_flusher; - -DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); - -int toi_bio_queue_flusher_should_finish; - -int toi_max_workers; - -static char *image_version_error = "The image header version is newer than " \ - "this kernel supports."; - -struct toi_module_ops *first_filter; - -static atomic_t toi_num_other_threads; -static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); -enum toi_worker_commands { - TOI_IO_WORKER_STOP, - TOI_IO_WORKER_RUN, - TOI_IO_WORKER_EXIT -}; -static enum 
toi_worker_commands toi_worker_command; - -/** - * toi_attempt_to_parse_resume_device - determine if we can hibernate - * - * Can we hibernate, using the current resume= parameter? - **/ -int toi_attempt_to_parse_resume_device(int quiet) -{ - struct list_head *Allocator; - struct toi_module_ops *thisAllocator; - int result, returning = 0; - - if (toi_activate_storage(0)) - return 0; - - toiActiveAllocator = NULL; - clear_toi_state(TOI_RESUME_DEVICE_OK); - clear_toi_state(TOI_CAN_RESUME); - clear_result_state(TOI_ABORTED); - - if (!toiNumAllocators) { - if (!quiet) - printk(KERN_INFO "TuxOnIce: No storage allocators have " - "been registered. Hibernating will be " - "disabled.\n"); - goto cleanup; - } - - list_for_each(Allocator, &toiAllocators) { - thisAllocator = list_entry(Allocator, struct toi_module_ops, - type_list); - - /* - * Not sure why you'd want to disable an allocator, but - * we should honour the flag if we're providing it - */ - if (!thisAllocator->enabled) - continue; - - result = thisAllocator->parse_sig_location( - resume_file, (toiNumAllocators == 1), - quiet); - - switch (result) { - case -EINVAL: - /* For this allocator, but not a valid - * configuration. Error already printed. */ - goto cleanup; - - case 0: - /* For this allocator and valid. */ - toiActiveAllocator = thisAllocator; - - set_toi_state(TOI_RESUME_DEVICE_OK); - set_toi_state(TOI_CAN_RESUME); - returning = 1; - goto cleanup; - } - } - if (!quiet) - printk(KERN_INFO "TuxOnIce: No matching enabled allocator " - "found. Resuming disabled.\n"); -cleanup: - toi_deactivate_storage(0); - return returning; -} - -void attempt_to_parse_resume_device2(void) -{ - toi_prepare_usm(); - toi_attempt_to_parse_resume_device(0); - toi_cleanup_usm(); -} - -void save_restore_alt_param(int replace, int quiet) -{ - static char resume_param_save[255]; - static unsigned long toi_state_save; - - if (replace) { - toi_state_save = toi_state; - strcpy(resume_param_save, resume_file); - strcpy(resume_file, alt_resume_param); - } else { - strcpy(resume_file, resume_param_save); - toi_state = toi_state_save; - } - toi_attempt_to_parse_resume_device(quiet); -} - -void attempt_to_parse_alt_resume_param(void) -{ - int ok = 0; - - /* Temporarily set resume_param to the poweroff value */ - if (!strlen(alt_resume_param)) - return; - - printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); - save_restore_alt_param(SAVE, NOQUIET); - if (test_toi_state(TOI_CAN_RESUME)) - ok = 1; - - printk(KERN_INFO "=== Done ===\n"); - save_restore_alt_param(RESTORE, QUIET); - - /* If not ok, clear the string */ - if (ok) - return; - - printk(KERN_INFO "Can't resume from that location; clearing " - "alt_resume_param.\n"); - alt_resume_param[0] = '\0'; -} - -/** - * noresume_reset_modules - reset data structures in case of non resuming - * - * When we read the start of an image, modules (and especially the - * active allocator) might need to reset data structures if we - * decide to remove the image rather than resuming from it. - **/ -static void noresume_reset_modules(void) -{ - struct toi_module_ops *this_filter; - - list_for_each_entry(this_filter, &toi_filters, type_list) - if (this_filter->noresume_reset) - this_filter->noresume_reset(); - - if (toiActiveAllocator && toiActiveAllocator->noresume_reset) - toiActiveAllocator->noresume_reset(); -} - -/** - * fill_toi_header - fill the hibernate header structure - * @struct toi_header: Header data structure to be filled. 
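/*
 * Illustration (editorial sketch, not from the TuxOnIce sources):
 * toi_attempt_to_parse_resume_device() above probes each registered
 * allocator until one accepts the resume= string, and treats -EINVAL as
 * "right backend, bad configuration". The shape of that first-match
 * probe, reduced to function pointers with hypothetical names:
 */
#include <errno.h>
#include <stddef.h>

struct backend {
	const char *name;
	int (*parse)(const char *cfg);	/* 0 = mine, -EINVAL = mine but bad */
};

static const struct backend *pick_backend(const struct backend *b, size_t n,
					  const char *cfg)
{
	size_t i;

	for (i = 0; i < n; i++) {
		int ret = b[i].parse(cfg);

		if (ret == 0)
			return &b[i];	/* first match wins */
		if (ret == -EINVAL)
			return NULL;	/* right backend, bad config: stop */
	}
	return NULL;			/* nobody claimed the config */
}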
- **/ -static int fill_toi_header(struct toi_header *sh) -{ - int i, error; - - error = init_header((struct swsusp_info *) sh); - if (error) - return error; - - sh->pagedir = pagedir1; - sh->pageset_2_size = pagedir2.size; - sh->param0 = toi_result; - sh->param1 = toi_bkd.toi_action; - sh->param2 = toi_bkd.toi_debug_state; - sh->param3 = toi_bkd.toi_default_console_level; - sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev; - for (i = 0; i < 4; i++) - sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2]; - sh->bkd = boot_kernel_data_buffer; - return 0; -} - -/** - * rw_init_modules - initialise modules - * @rw: Whether we are reading or writing an image. - * @which: Section of the image being processed. - * - * Iterate over modules, preparing the ones that will be used to read or write - * data. - **/ -static int rw_init_modules(int rw, int which) -{ - struct toi_module_ops *this_module; - /* Initialise page transformers */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_init && this_module->rw_init(rw, which)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialize the %s filter.", - this_module->name); - return 1; - } - } - - /* Initialise allocator */ - if (toiActiveAllocator->rw_init(rw, which)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise the allocator."); - return 1; - } - - /* Initialise other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_init && this_module->rw_init(rw, which)) { - set_abort_result(TOI_FAILED_MODULE_INIT); - printk(KERN_INFO "Setting aborted flag due to module " - "init failure.\n"); - return 1; - } - } - - return 0; -} - -/** - * rw_cleanup_modules - cleanup modules - * @rw: Whether we are reading or writing an image. - * - * Cleanup components after reading or writing a set of pages. - * Only the allocator may fail.
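/*
 * Illustration (editorial sketch, not from the TuxOnIce sources):
 * rw_init_modules() above and rw_cleanup_modules() below treat each
 * module as an ops table with optional hooks. Init stops at the first
 * failure; cleanup visits every module and ORs the results so one bad
 * module cannot hide another's error. In miniature:
 */
struct mod_ops {
	int enabled;
	int (*rw_init)(int rw, int which);	/* optional: may be NULL */
	int (*rw_cleanup)(int rw);		/* optional: may be NULL */
};

static int init_all(struct mod_ops *m, int n, int rw, int which)
{
	int i;

	for (i = 0; i < n; i++)
		if (m[i].enabled && m[i].rw_init && m[i].rw_init(rw, which))
			return 1;		/* first failure aborts */
	return 0;
}

static int cleanup_all(struct mod_ops *m, int n, int rw)
{
	int i, result = 0;

	for (i = 0; i < n; i++)
		if (m[i].enabled && m[i].rw_cleanup)
			result |= m[i].rw_cleanup(rw);	/* keep going */
	return result;
}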
- **/ -static int rw_cleanup_modules(int rw) -{ - struct toi_module_ops *this_module; - int result = 0; - - /* Cleanup other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - /* Flush data and cleanup */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - result |= toiActiveAllocator->rw_cleanup(rw); - - return result; -} - -static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) -{ - int index, min, max; - struct page *high_page = NULL, - **my_last_high_page = raw_cpu_ptr(&last_high_page), - **my_last_sought = raw_cpu_ptr(&last_sought); - struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page); - void *compare; - - if (is_high) { - if (*my_last_sought && *my_last_high_page && - *my_last_sought < orig_page) - high_page = *my_last_high_page; - else - high_page = (struct page *) restore_highmem_pblist; - this = (struct pbe *) kmap(high_page); - compare = orig_page; - } else { - if (*my_last_sought && *my_last_low_page && - *my_last_sought < orig_page) - this = *my_last_low_page; - else - this = restore_pblist; - compare = page_address(orig_page); - } - - *my_last_sought = orig_page; - - /* Locate page containing pbe */ - while (this[PBES_PER_PAGE - 1].next && - this[PBES_PER_PAGE - 1].orig_address < compare) { - if (is_high) { - struct page *next_high_page = (struct page *) - this[PBES_PER_PAGE - 1].next; - kunmap(high_page); - this = kmap(next_high_page); - high_page = next_high_page; - } else - this = this[PBES_PER_PAGE - 1].next; - } - - /* Do a binary search within the page */ - min = 0; - max = PBES_PER_PAGE; - index = PBES_PER_PAGE / 2; - while (max - min) { - if (!this[index].orig_address || - this[index].orig_address > compare) - max = index; - else if (this[index].orig_address == compare) { - if (is_high) { - struct page *page = this[index].address; - *my_last_high_page = high_page; - kunmap(high_page); - return page; - } - *my_last_low_page = this; - return virt_to_page(this[index].address); - } else - min = index; - index = ((max + min) / 2); - }; - - if (is_high) - kunmap(high_page); - - abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" - " orig page %p. This[min].orig_address=%p.\n", orig_page, - this[index].orig_address); - return NULL; -} - -/** - * write_next_page - write the next page in a pageset - * @data_pfn: The pfn where the next data to write is located. - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn number to write in the image (where the data belongs). - * - * Get the pfn of the next page to write, map the page if necessary and do the - * write. - **/ -static int write_next_page(unsigned long *data_pfn, int *my_io_index, - unsigned long *write_pfn) -{ - struct page *page; - char **my_checksum_locn = raw_cpu_ptr(&checksum_locn); - int result = 0, was_present; - - *data_pfn = memory_bm_next_pfn(io_map, 0); - - /* Another thread could have beaten us to it. 
*/ - if (*data_pfn == BM_END_OF_MAP) { - if (atomic_read(&io_count)) { - printk(KERN_INFO "Ran out of pfns but io_count is " - "still %d.\n", atomic_read(&io_count)); - BUG(); - } - mutex_unlock(&io_mutex); - return -ENODATA; - } - - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - memory_bm_clear_bit(io_map, 0, *data_pfn); - page = pfn_to_page(*data_pfn); - - was_present = kernel_page_present(page); - if (!was_present) - kernel_map_pages(page, 1, 1); - - if (io_pageset == 1) - *write_pfn = memory_bm_next_pfn(pageset1_map, 0); - else { - *write_pfn = *data_pfn; - *my_checksum_locn = tuxonice_get_next_checksum(); - } - - TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index); - - mutex_unlock(&io_mutex); - - if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) - return 1; - - result = first_filter->write_page(*write_pfn, TOI_PAGE, page, - PAGE_SIZE); - - if (!was_present) - kernel_map_pages(page, 1, 0); - - return result; -} - -/** - * read_next_page - read the next page in a pageset - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn in which the data belongs. - * - * Read a page of the image into our buffer. It can happen (here and in the - * write routine) that threads don't get run until after other CPUs have done - * all the work. This was the cause of the long standing issue with - * occasionally getting -ENODATA errors at the end of reading the image. We - * therefore need to check there's actually a page to read before trying to - * retrieve one. - **/ - -static int read_next_page(int *my_io_index, unsigned long *write_pfn, - struct page *buffer) -{ - unsigned int buf_size = PAGE_SIZE; - unsigned long left = atomic_read(&io_count); - - if (!left) - return -ENODATA; - - /* Start off assuming the page we read isn't resaved */ - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - mutex_unlock(&io_mutex); - - /* - * Are we aborting? If so, don't submit any more I/O as - * resetting the resume_attempted flag (from ui.c) will - * clear the bdev flags, making this thread oops. - */ - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - atomic_dec(&toi_io_workers); - if (!atomic_read(&toi_io_workers)) { - /* - * So we can be sure we'll have memory for - * marking that we haven't resumed. - */ - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - - /* - * See toi_bio_read_page in tuxonice_bio.c: - * read the next page in the image. - */ - return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); -} - -static void use_read_page(unsigned long write_pfn, struct page *buffer) -{ - struct page *final_page = pfn_to_page(write_pfn), - *copy_page = final_page; - char *virt, *buffer_virt; - int was_present, cpu = smp_processor_id(); - unsigned long idx = 0; - - if (io_pageset == 1 && (!pageset1_copy_map || - !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) { - int is_high = PageHighMem(final_page); - copy_page = copy_page_from_orig_page(is_high ? 
(void *) write_pfn : final_page, is_high); - } - - if (!memory_bm_test_bit(io_map, cpu, write_pfn)) { - int test = !memory_bm_test_bit(io_map, cpu, write_pfn); - toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test); - mutex_lock(&io_mutex); - idx = atomic_add_return(1, &io_count); - mutex_unlock(&io_mutex); - return; - } - - virt = kmap(copy_page); - buffer_virt = kmap(buffer); - was_present = kernel_page_present(copy_page); - if (!was_present) - kernel_map_pages(copy_page, 1, 1); - memcpy(virt, buffer_virt, PAGE_SIZE); - if (!was_present) - kernel_map_pages(copy_page, 1, 0); - kunmap(copy_page); - kunmap(buffer); - memory_bm_clear_bit(io_map, cpu, write_pfn); - TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset); -} - -static unsigned long status_update(int writing, unsigned long done, - unsigned long ticks) -{ - int cs_index = writing ? 0 : 1; - unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; - unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); - unsigned long pgs_per_s, estimate = 0, pages_left; - - if (msec) { - pages_left = io_barmax - done; - pgs_per_s = 1000 * done / msec; - if (pgs_per_s) - estimate = DIV_ROUND_UP(pages_left, pgs_per_s); - } - - if (estimate && ticks > HZ / 2) - return toi_update_status(done, io_barmax, - " %d/%d MB (%lu sec left)", - MB(done+1), MB(io_barmax), estimate); - - return toi_update_status(done, io_barmax, " %d/%d MB", - MB(done+1), MB(io_barmax)); -} - -/** - * worker_rw_loop - main loop to read/write pages - * - * The main I/O loop for reading or writing pages. The io_map bitmap is used to - * track the pages to read/write. - * If we are reading, the pages are loaded to their final (mapped) pfn. - * Data is non zero iff this is a thread started via start_other_threads. - * In that case, we stay in here until told to quit. - **/ -static int worker_rw_loop(void *data) -{ - unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, - jif_index = 1, start_time = jiffies, thread_num; - int result = 0, my_io_index = 0, last_worker; - struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); - cpumask_var_t orig_mask; - - if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { - printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); - result = -ENOMEM; - goto out; - } - - cpumask_copy(orig_mask, tsk_cpus_allowed(current)); - - current->flags |= PF_NOFREEZE; - -top: - mutex_lock(&io_mutex); - thread_num = atomic_read(&toi_io_workers); - - cpumask_copy(tsk_cpus_allowed(current), orig_mask); - schedule(); - - atomic_inc(&toi_io_workers); - - while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && - !(io_write && test_result_state(TOI_ABORTED)) && - toi_worker_command == TOI_IO_WORKER_RUN) { - if (!thread_num && jiffies > next_jiffies) { - next_jiffies += HZ / 4; - if (toiActiveAllocator->update_throughput_throttle) - toiActiveAllocator->update_throughput_throttle( - jif_index); - jif_index++; - } - - /* - * What page to use? If reading, don't know yet which page's - * data will be read, so always use the buffer. If writing, - * use the copy (Pageset1) or original page (Pageset2), but - * always write the pfn of the original page. - */ - if (io_write) - result = write_next_page(&data_pfn, &my_io_index, - &write_pfn); - else /* Reading */ - result = read_next_page(&my_io_index, &write_pfn, - buffer); - - if (result) { - mutex_lock(&io_mutex); - /* Nothing to do? 
*/ - if (result == -ENODATA) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Thread %d has no more work.", - smp_processor_id()); - break; - } - - io_result = result; - - if (io_write) { - printk(KERN_INFO "Write chunk returned %d.\n", - result); - abort_hibernate(TOI_FAILED_IO, - "Failed to write a chunk of the " - "image."); - break; - } - - if (io_pageset == 1) { - printk(KERN_ERR "\nBreaking out of I/O loop " - "because of result code %d.\n", result); - break; - } - panic("Read chunk returned (%d)", result); - } - - /* - * Discard reads of resaved pages while reading ps2 - * and unwanted pages while rereading ps2 when aborting. - */ - if (!io_write) { - if (!PageResave(pfn_to_page(write_pfn))) - use_read_page(write_pfn, buffer); - else { - mutex_lock(&io_mutex); - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Resaved %ld.", write_pfn); - atomic_inc(&io_count); - mutex_unlock(&io_mutex); - } - } - - if (!thread_num) { - if(my_io_index + io_base > io_nextupdate) - io_nextupdate = status_update(io_write, - my_io_index + io_base, - jiffies - start_time); - - if (my_io_index > io_pc) { - printk(KERN_CONT "...%d%%", 20 * io_pc_step); - io_pc_step++; - io_pc = io_finish_at * io_pc_step / 5; - } - } - - toi_cond_pause(0, NULL); - - /* - * Subtle: If there's less I/O still to be done than threads - * running, quit. This stops us doing I/O beyond the end of - * the image when reading. - * - * Possible race condition. Two threads could do the test at - * the same time; one should exit and one should continue. - * Therefore we take the mutex before comparing and exiting. - */ - - mutex_lock(&io_mutex); - } - - last_worker = atomic_dec_and_test(&toi_io_workers); - toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); - mutex_unlock(&io_mutex); - - if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { - /* Were we the last thread and we're using a flusher thread? */ - if (last_worker && using_flusher) { - toiActiveAllocator->finish_all_io(); - } - /* First, if we're doing I/O, wait for it to finish */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); - /* Then wait to be told what to do next */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); - if (toi_worker_command == TOI_IO_WORKER_RUN) - goto top; - } - - if (thread_num) - atomic_dec(&toi_num_other_threads); - -out: - toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); - toi__free_page(28, buffer); - free_cpumask_var(orig_mask); - - return result; -} - -int toi_start_other_threads(void) -{ - int cpu; - struct task_struct *p; - int to_start = (toi_max_workers ? 
toi_max_workers : num_online_cpus()) - 1; - unsigned long num_started = 0; - - if (test_action_state(TOI_NO_MULTITHREADED_IO)) - return 0; - - toi_worker_command = TOI_IO_WORKER_STOP; - - for_each_online_cpu(cpu) { - if (num_started == to_start) - break; - - if (cpu == smp_processor_id()) - continue; - - p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, - cpu_to_node(cpu), "ktoi_io/%d", cpu); - if (IS_ERR(p)) { - printk(KERN_ERR "ktoi_io for %i failed\n", cpu); - continue; - } - kthread_bind(p, cpu); - p->flags |= PF_MEMALLOC; - wake_up_process(p); - num_started++; - atomic_inc(&toi_num_other_threads); - } - - toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); - return num_started; -} - -void toi_stop_other_threads(void) -{ - toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); - toi_worker_command = TOI_IO_WORKER_EXIT; - wake_up(&toi_worker_wait_queue); -} - -/** - * do_rw_loop - main highlevel function for reading or writing pages - * - * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. - **/ -static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, - int base, int barmax, int pageset) -{ - int index = 0, cpu, result = 0, workers_started; - unsigned long pfn, next; - - first_filter = toi_get_next_filter(NULL); - - if (!finish_at) - return 0; - - io_write = write; - io_finish_at = finish_at; - io_base = base; - io_barmax = barmax; - io_pageset = pageset; - io_index = 0; - io_pc = io_finish_at / 5; - io_pc_step = 1; - io_result = 0; - io_nextupdate = base + 1; - toi_bio_queue_flusher_should_finish = 0; - - for_each_online_cpu(cpu) { - per_cpu(last_sought, cpu) = NULL; - per_cpu(last_low_page, cpu) = NULL; - per_cpu(last_high_page, cpu) = NULL; - } - - /* Ensure all bits clear */ - memory_bm_clear(io_map); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - - BUG_ON(next != BM_END_OF_MAP); - - /* Set the bits for the pages to write */ - memory_bm_position_reset(pageflags); - - pfn = memory_bm_next_pfn(pageflags, 0); - toi_trace_index++; - - while (pfn != BM_END_OF_MAP && index < finish_at) { - TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at); - memory_bm_set_bit(io_map, 0, pfn); - pfn = memory_bm_next_pfn(pageflags, 0); - index++; - } - - BUG_ON(next != BM_END_OF_MAP || index < finish_at); - - memory_bm_position_reset(io_map); - toi_trace_index++; - - atomic_set(&io_count, finish_at); - - memory_bm_position_reset(pageset1_map); - - mutex_lock(&io_mutex); - - clear_toi_state(TOI_IO_STOPPED); - - using_flusher = (atomic_read(&toi_num_other_threads) && - toiActiveAllocator->io_flusher && - !test_action_state(TOI_NO_FLUSHER_THREAD)); - - workers_started = atomic_read(&toi_num_other_threads); - - memory_bm_position_reset(io_map); - memory_bm_position_reset(pageset1_copy_map); - - toi_worker_command = TOI_IO_WORKER_RUN; - wake_up(&toi_worker_wait_queue); - - mutex_unlock(&io_mutex); - - if (using_flusher) - result = toiActiveAllocator->io_flusher(write); - else - worker_rw_loop(NULL); - - while (atomic_read(&toi_io_workers)) - schedule(); - - printk(KERN_CONT "\n"); - - toi_worker_command = TOI_IO_WORKER_STOP; - wake_up(&toi_worker_wait_queue); - - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - if (!atomic_read(&toi_io_workers)) { - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - set_toi_state(TOI_IO_STOPPED); - - if (!io_result && !result && !test_result_state(TOI_ABORTED)) { - unsigned long 
next; - - toi_update_status(io_base + io_finish_at, io_barmax, - " %d/%d MB ", - MB(io_base + io_finish_at), MB(io_barmax)); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - if (next != BM_END_OF_MAP) { - printk(KERN_INFO "Finished I/O loop but still work to " - "do?\nFinish at = %d. io_count = %d.\n", - finish_at, atomic_read(&io_count)); - printk(KERN_INFO "I/O bitmap still records work to do." - "%ld.\n", next); - BUG(); - do { - cpu_relax(); - } while (0); - } - } - - return io_result ? io_result : result; -} - -/** - * write_pageset - write a pageset to disk. - * @pagedir: Which pagedir to write. - * - * Returns: - * Zero on success or -1 on failure. - **/ -int write_pageset(struct pagedir *pagedir) -{ - int finish_at, base = 0; - int barmax = pagedir1.size + pagedir2.size; - long error = 0; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - /* - * Even if there is nothing to read or write, the allocator - * may need the init/cleanup for it's housekeeping. (eg: - * Pageset1 may start where pageset2 ends when writing). - */ - finish_at = pagedir->size; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Writing kernel & process data..."); - base = pagedir2.size; - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - pageflags = pageset1_map; - else - pageflags = pageset1_copy_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(WRITE, pagedir->id)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise modules for writing."); - error = 1; - } - - if (!error) - error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(WRITE) && !error) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after writing."); - error = 1; - } - - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[0][0] += finish_at, - toi_bkd.toi_io_time[0][1] += (end_time - start_time); - } - - return error; -} - -/** - * read_pageset - highlevel function to read a pageset from disk - * @pagedir: pageset to read - * @overwrittenpagesonly: Whether to read the whole pageset or - * only part of it. - * - * Returns: - * Zero on success or -1 on failure. 
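The heart of worker_rw_loop() above is the lock-free claiming of work: each thread takes a page index by atomically decrementing io_count, with no lock held around the claim itself. A standalone userspace sketch of that arithmetic, using pthreads and C11 atomics as stand-ins for the kernel primitives (FINISH_AT and the two-thread setup are invented):

#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>

#define FINISH_AT 10

static atomic_int io_count = FINISH_AT;

static void *worker(void *arg)
{
        (void)arg;
        for (;;) {
                /* atomic_fetch_sub returns the old value; the kernel's
                 * atomic_sub_return returns the new one, hence the -1 */
                int left = atomic_fetch_sub(&io_count, 1);
                int my_io_index;

                if (left <= 0)
                        break;          /* overshot the end: no work left */
                my_io_index = FINISH_AT - (left - 1);
                printf("claimed index %d\n", my_io_index);
        }
        return NULL;
}

int main(void)
{
        pthread_t t[2];
        int i;

        for (i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        return 0;
}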
- **/ -static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) -{ - int result = 0, base = 0; - int finish_at = pagedir->size; - int barmax = pagedir1.size + pagedir2.size; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Reading kernel & process data..."); - pageflags = pageset1_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); - if (overwrittenpagesonly) { - barmax = min(pagedir1.size, pagedir2.size); - finish_at = min(pagedir1.size, pagedir2.size); - } else - base = pagedir1.size; - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(READ, pagedir->id)) { - toiActiveAllocator->remove_image(); - result = 1; - } else - result = do_rw_loop(READ, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(READ) && !result) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after reading."); - result = 1; - } - - /* Statistics */ - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[1][0] += finish_at, - toi_bkd.toi_io_time[1][1] += (end_time - start_time); - } - - return result; -} - -/** - * write_module_configs - store the modules configuration - * - * The configuration for each module is stored in the image header. - * Returns: Int - * Zero on success, Error value otherwise. - **/ -static int write_module_configs(void) -{ - struct toi_module_ops *this_module; - char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); - int len, index = 1; - struct toi_module_header toi_module_header; - - if (!buffer) { - printk(KERN_INFO "Failed to allocate a buffer for saving " - "module configuration info.\n"); - return -ENOMEM; - } - - /* - * We have to know which data goes with which module, so we at - * least write a length of zero for a module. Note that we are - * also assuming every module's config data takes <= PAGE_SIZE. - */ - - /* For each module (in registration order) */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->storage_needed || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - - /* Get the data from the module */ - len = 0; - if (this_module->save_config_info) - len = this_module->save_config_info(buffer); - - /* Save the details of the module */ - toi_module_header.enabled = this_module->enabled; - toi_module_header.type = this_module->type; - toi_module_header.index = index++; - strncpy(toi_module_header.name, this_module->name, - sizeof(toi_module_header.name)); - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - /* Save the size of the data and any data returned */ - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &len, sizeof(int)); - if (len) - toiActiveAllocator->rw_header_chunk( - WRITE, this_module, buffer, len); - } - - /* Write a blank header to terminate the list */ - toi_module_header.name[0] = '\0'; - toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_module_header, sizeof(toi_module_header)); - - toi_free_page(22, (unsigned long) buffer); - return 0; -} - -/** - * read_one_module_config - read and configure one module - * - * Read the configuration for one module, and configure the module - * to match if it is loaded. - * - * Returns: Int - * Zero on success, Error value otherwise. 
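write_module_configs() above serialises one [module header][length][payload] record per module and terminates the stream with a blank header, and read_module_configs() walks the same stream back. A minimal userspace sketch of that record format, with a shortened stand-in for struct toi_module_header and an invented "compressor" module:

#include <stdio.h>
#include <string.h>

struct rec_header { char name[30]; int data_length; };

/* Append one [header][len][payload] record, as write_module_configs() does */
static char *put_record(char *p, const char *name, const void *data, int len)
{
        struct rec_header h = { .data_length = len };

        strncpy(h.name, name, sizeof(h.name) - 1);
        memcpy(p, &h, sizeof(h)); p += sizeof(h);
        memcpy(p, &len, sizeof(len)); p += sizeof(len);
        memcpy(p, data, len); p += len;
        return p;
}

int main(void)
{
        char buf[256], *p = buf;
        struct rec_header end = { .name = "" };

        p = put_record(p, "compressor", "lzo", 4);
        memcpy(p, &end, sizeof(end));   /* blank header terminates the list */

        /* Walk the stream back, as the read side does */
        for (p = buf;;) {
                struct rec_header h; int len;

                memcpy(&h, p, sizeof(h)); p += sizeof(h);
                if (!h.name[0])
                        break;
                memcpy(&len, p, sizeof(len)); p += sizeof(len);
                printf("%s: %d byte(s): %s\n", h.name, len, p);
                p += len;
        }
        return 0;
}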
- **/ -static int read_one_module_config(struct toi_module_header *header) -{ - struct toi_module_ops *this_module; - int result, len; - char *buffer; - - /* Find the module */ - this_module = toi_find_module_given_name(header->name); - - if (!this_module) { - if (header->enabled) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "It looks like we need module %s for reading " - "the image but it hasn't been registered.\n", - header->name); - if (!(test_toi_state(TOI_CONTINUE_REQ))) - return -EINVAL; - } else - printk(KERN_INFO "Module %s configuration data found, " - "but the module hasn't registered. Looks like " - "it was disabled, so we're ignoring its data.", - header->name); - } - - /* Get the length of the data (if any) */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, - sizeof(int)); - if (result) { - printk(KERN_ERR "Failed to read the length of the module %s's" - " configuration data.\n", - header->name); - return -EINVAL; - } - - /* Read any data and pass to the module (if we found one) */ - if (!len) - return 0; - - buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); - - if (!buffer) { - printk(KERN_ERR "Failed to allocate a buffer for reloading " - "module configuration info.\n"); - return -ENOMEM; - } - - toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); - - if (!this_module) - goto out; - - if (!this_module->save_config_info) - printk(KERN_ERR "Huh? Module %s appears to have a " - "save_config_info, but not a load_config_info " - "function!\n", this_module->name); - else - this_module->load_config_info(buffer, len); - - /* - * Now move this module to the tail of its lists. This will put it in - * order. Any new modules will end up at the top of the lists. They - * should have been set to disabled when loaded (people will - * normally not edit an initrd to load a new module and then hibernate - * without using it!). - */ - - toi_move_module_tail(this_module); - - this_module->enabled = header->enabled; - -out: - toi_free_page(23, (unsigned long) buffer); - return 0; -} - -/** - * read_module_configs - reload module configurations from the image header. - * - * Returns: Int - * Zero on success or an error code. - **/ -static int read_module_configs(void) -{ - int result = 0; - struct toi_module_header toi_module_header; - struct toi_module_ops *this_module; - - /* All modules are initially disabled. That way, if we have a module - * loaded now that wasn't loaded when we hibernated, it won't be used - * in trying to read the data. - */ - list_for_each_entry(this_module, &toi_modules, module_list) - this_module->enabled = 0; - - /* Get the first module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - if (result) { - printk(KERN_ERR "Failed to read the next module header.\n"); - return -EINVAL; - } - - /* For each module (in registration order) */ - while (toi_module_header.name[0]) { - result = read_one_module_config(&toi_module_header); - - if (result) - return -EINVAL; - - /* Get the next module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - if (result) { - printk(KERN_ERR "Failed to read the next module " - "header.\n"); - return -EINVAL; - } - } - - return 0; -} - -static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) -{ - return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 
0 : 1; -} - -int fs_info_space_needed(int reset) -{ - static int last_result = 0; - const struct super_block *sb; - int result = sizeof(int); - - if (!last_result || reset) { - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - result += 16 + sizeof(dev_t) + sizeof(int) + - fs->last_mount_size; - free_fs_info(fs); - } - last_result = result; - } - return result; -} - -static int fs_info_num_to_save(void) -{ - const struct super_block *sb; - int to_save = 0; - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - to_save++; - free_fs_info(fs); - } - - return to_save; -} - -static int fs_info_save(void) -{ - const struct super_block *sb; - int to_save = fs_info_num_to_save(); - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" - " to save."); - return -EIO; - } - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) { - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - &fs->uuid[0], 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->dev_t, sizeof(dev_t))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write dev_t."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->last_mount_size, sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write last mount length."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - fs->last_mount, fs->last_mount_size)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - } - free_fs_info(fs); - } - return 0; -} - -static int fs_info_load_and_check_one(void) -{ - char uuid[16], *last_mount; - int result = 0, ln; - dev_t dev_t; - struct block_device *dev; - struct fs_info *fs_info, seek; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); - return -EIO; - } - - read_if_version(3, dev_t, "uuid dev_t field", return -EIO); - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount size."); - return -EIO; - } - - last_mount = kzalloc(ln, GFP_KERNEL); - - if (!last_mount) - return -ENOMEM; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount timestamp."); - result = -EIO; - goto out_lmt; - } - - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = dev_t; - seek.last_mount_size = ln; - seek.last_mount = last_mount; - dev_t = blk_lookup_fs_info(&seek); - if (!dev_t) - goto out_lmt; - - dev = toi_open_by_devnum(dev_t); - - fs_info = fs_info_from_block_dev(dev); - if (fs_info && !IS_ERR(fs_info)) { - if (ln != fs_info->last_mount_size) { - printk(KERN_EMERG "Found matching uuid but last mount " - "time lengths differ?! 
" - "(%d vs %d).\n", ln, - fs_info->last_mount_size); - result = -EINVAL; - } else { - char buf[BDEVNAME_SIZE]; - result = !!memcmp(fs_info->last_mount, last_mount, ln); - if (result) - printk(KERN_EMERG "Last mount time for %s has " - "changed!\n", bdevname(dev, buf)); - } - } - toi_close_bdev(dev); - free_fs_info(fs_info); -out_lmt: - kfree(last_mount); - return result; -} - -static int fs_info_load_and_check(void) -{ - int to_do, result = 0; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " - "to load."); - return -EIO; - } - - while(to_do--) - result |= fs_info_load_and_check_one(); - - return result; -} - -/** - * write_image_header - write the image header after write the image proper - * - * Returns: Int - * Zero on success, error value otherwise. - **/ -int write_image_header(void) -{ - int ret; - int total = pagedir1.size + pagedir2.size+2; - char *header_buffer = NULL; - - /* Now prepare to write the header */ - ret = toiActiveAllocator->write_header_init(); - if (ret) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Active allocator's write_header_init" - " function failed."); - goto write_image_header_abort; - } - - /* Get a buffer */ - header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP); - if (!header_buffer) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Out of memory when trying to get page for header!"); - goto write_image_header_abort; - } - - /* Write hibernate header */ - if (fill_toi_header((struct toi_header *) header_buffer)) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to fill header information!"); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - header_buffer, sizeof(struct toi_header))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to write header info."); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_max_workers, sizeof(toi_max_workers))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to number of workers to use."); - goto write_image_header_abort; - } - - /* Write filesystem info */ - if (fs_info_save()) - goto write_image_header_abort; - - /* Write module configurations */ - ret = write_module_configs(); - if (ret) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write module configs."); - goto write_image_header_abort; - } - - if (memory_bm_write(pageset1_map, - toiActiveAllocator->rw_header_chunk)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write bitmaps."); - goto write_image_header_abort; - } - - /* Flush data and let allocator cleanup */ - if (toiActiveAllocator->write_header_cleanup()) { - abort_hibernate(TOI_FAILED_IO, - "Failed to cleanup writing header."); - goto write_image_header_abort_no_cleanup; - } - - if (test_result_state(TOI_ABORTED)) - goto write_image_header_abort_no_cleanup; - - toi_update_status(total, total, NULL); - -out: - if (header_buffer) - toi_free_page(24, (unsigned long) header_buffer); - return ret; - -write_image_header_abort: - toiActiveAllocator->write_header_cleanup(); -write_image_header_abort_no_cleanup: - ret = -1; - goto out; -} - -/** - * sanity_check - check the header - * @sh: the header which was saved at hibernate time. - * - * Perform a few checks, seeking to ensure that the kernel being - * booted matches the one hibernated. They need to match so we can - * be _sure_ things will work. It is not absolutely impossible for - * resuming from a different kernel to work, just not assured. 
- **/ -static char *sanity_check(struct toi_header *sh) -{ - char *reason = check_image_kernel((struct swsusp_info *) sh); - - if (reason) - return reason; - - if (!test_action_state(TOI_IGNORE_ROOTFS)) { - const struct super_block *sb; - list_for_each_entry(sb, &super_blocks, s_list) { - if ((!(sb->s_flags & MS_RDONLY)) && - (sb->s_type->fs_flags & FS_REQUIRES_DEV)) - return "Device backed fs has been mounted " - "rw prior to resume or initrd/ramfs " - "is mounted rw."; - } - } - - return NULL; -} - -static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); - -#define FREEZE_IN_PROGRESS (~0) - -static int freeze_result; - -static void do_freeze(struct work_struct *dummy) -{ - freeze_result = freeze_processes(); - wake_up(&freeze_wait); - trap_non_toi_io = 1; -} - -static DECLARE_WORK(freeze_work, do_freeze); - -/** - * __read_pageset1 - test for the existence of an image and attempt to load it - * - * Returns: Int - * Zero if image found and pageset1 successfully loaded. - * Error if no image found or loaded. - **/ -static int __read_pageset1(void) -{ - int i, result = 0; - char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), - *sanity_error = NULL; - struct toi_header *toi_header; - - if (!header_buffer) { - printk(KERN_INFO "Unable to allocate a page for reading the " - "signature.\n"); - return -ENOMEM; - } - - /* Check for an image */ - result = toiActiveAllocator->image_exists(1); - if (result == 3) { - result = -ENODATA; - toi_early_boot_message(1, 0, "The signature from an older " - "version of TuxOnIce has been detected."); - goto out_remove_image; - } - - if (result != 1) { - result = -ENODATA; - noresume_reset_modules(); - printk(KERN_INFO "TuxOnIce: No image found.\n"); - goto out; - } - - /* - * Prepare the active allocator for reading the image header. The - * activate allocator might read its own configuration. - * - * NB: This call may never return because there might be a signature - * for a different image such that we warn the user and they choose - * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the - * location of the image might be unavailable if it was stored on a - * network connection). - */ - - result = toiActiveAllocator->read_header_init(); - if (result) { - printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " - "image header.\n"); - goto out_remove_image; - } - - /* Check for noresume command line option */ - if (test_toi_state(TOI_NORESUME_SPECIFIED)) { - printk(KERN_INFO "TuxOnIce: Noresume on command line. 
Removed " - "image.\n"); - goto out_remove_image; - } - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) { - toi_early_boot_message(1, 0, NULL); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Tried to resume before: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - clear_toi_state(TOI_CONTINUE_REQ); - - toi_image_header_version = toiActiveAllocator->get_header_version(); - - if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { - toi_early_boot_message(1, 0, image_version_error); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Header version too new: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - /* Read hibernate header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - header_buffer, sizeof(struct toi_header)); - if (result < 0) { - printk(KERN_ERR "TuxOnIce: Failed to read the image " - "signature.\n"); - goto out_remove_image; - } - - toi_header = (struct toi_header *) header_buffer; - - /* - * NB: This call may also result in a reboot rather than returning. - */ - - sanity_error = sanity_check(toi_header); - if (sanity_error) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - sanity_error); - printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); - goto out_remove_image; - } - - /* - * We have an image and it looks like it will load okay. - * - * Get metadata from header. Don't override commandline parameters. - * - * We don't need to save the image size limit because it's not used - * during resume and will be restored with the image anyway. - */ - - memcpy((char *) &pagedir1, - (char *) &toi_header->pagedir, sizeof(pagedir1)); - toi_result = toi_header->param0; - if (!toi_bkd.toi_debug_state) { - toi_bkd.toi_action = - (toi_header->param1 & ~toi_bootflags_mask) | - (toi_bkd.toi_action & toi_bootflags_mask); - toi_bkd.toi_debug_state = toi_header->param2; - toi_bkd.toi_default_console_level = toi_header->param3; - } - clear_toi_state(TOI_IGNORE_LOGLEVEL); - pagedir2.size = toi_header->pageset_2_size; - for (i = 0; i < 4; i++) - toi_bkd.toi_io_time[i/2][i%2] = - toi_header->io_time[i/2][i%2]; - - set_toi_state(TOI_BOOT_KERNEL); - boot_kernel_data_buffer = toi_header->bkd; - - read_if_version(1, toi_max_workers, "TuxOnIce max workers", - goto out_remove_image); - - /* Read filesystem info */ - if (fs_info_load_and_check()) { - printk(KERN_EMERG "TuxOnIce: File system mount time checks " - "failed. Refusing to corrupt your filesystems!\n"); - goto out_remove_image; - } - - /* Read module configurations */ - result = read_module_configs(); - if (result) { - pagedir1.size = 0; - pagedir2.size = 0; - printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " - "configurations.\n"); - clear_action_state(TOI_KEEP_IMAGE); - goto out_remove_image; - } - - toi_prepare_console(); - - set_toi_state(TOI_NOW_RESUMING); - - result = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (result) - goto out_notifier_call_chain;; - - if (usermodehelper_disable()) - goto out_enable_usermodehelper; - - current->flags |= PF_NOFREEZE; - freeze_result = FREEZE_IN_PROGRESS; - - schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); - - toi_cond_pause(1, "About to read original pageset1 locations."); - - /* - * See _toi_rw_header_chunk in tuxonice_bio.c: - * Initialize pageset1_map by reading the map from the image. 
- */ - if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk)) - goto out_thaw; - - /* - * See toi_rw_cleanup in tuxonice_bio.c: - * Clean up after reading the header. - */ - result = toiActiveAllocator->read_header_cleanup(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the " - "image header.\n"); - goto out_thaw; - } - - toi_cond_pause(1, "About to read pagedir."); - - /* - * Get the addresses of pages into which we will load the kernel to - * be copied back and check if they conflict with the ones we are using. - */ - if (toi_get_pageset1_load_addresses()) { - printk(KERN_INFO "TuxOnIce: Failed to get load addresses for " - "pageset1.\n"); - goto out_thaw; - } - - /* Read the original kernel back */ - toi_cond_pause(1, "About to read pageset 1."); - - /* Given the pagemap, read back the data from disk */ - if (read_pageset(&pagedir1, 0)) { - toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1."); - result = -EIO; - goto out_thaw; - } - - toi_cond_pause(1, "About to restore original kernel."); - result = 0; - - if (!toi_keeping_image && - toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(1); - - wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); -out: - current->flags &= ~PF_NOFREEZE; - toi_free_page(25, (unsigned long) header_buffer); - return result; - -out_thaw: - wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); - trap_non_toi_io = 0; - thaw_processes(); -out_enable_usermodehelper: - usermodehelper_enable(); -out_notifier_call_chain: - pm_notifier_call_chain(PM_POST_RESTORE); - toi_cleanup_console(); -out_remove_image: - result = -EINVAL; - if (!toi_keeping_image) - toiActiveAllocator->remove_image(); - toiActiveAllocator->read_header_cleanup(); - noresume_reset_modules(); - goto out; -} - -/** - * read_pageset1 - highlevel function to read the saved pages - * - * Attempt to read the header and pageset1 of a hibernate image. - * Handle the outcome, complaining where appropriate. - **/ -int read_pageset1(void) -{ - int error; - - error = __read_pageset1(); - - if (error && error != -ENODATA && error != -EINVAL && - !test_result_state(TOI_ABORTED)) - abort_hibernate(TOI_IMAGE_ERROR, - "TuxOnIce: Error %d resuming\n", error); - - return error; -} - -/** - * get_have_image_data - check the image header - **/ -static char *get_have_image_data(void) -{ - char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP); - struct toi_header *toi_header; - - if (!output_buffer) { - printk(KERN_INFO "Output buffer null.\n"); - return NULL; - } - - /* Check for an image */ - if (!toiActiveAllocator->image_exists(1) || - toiActiveAllocator->read_header_init() || - toiActiveAllocator->rw_header_chunk(READ, NULL, - output_buffer, sizeof(struct toi_header))) { - sprintf(output_buffer, "0\n"); - /* - * From an initrd/ramfs, catting have_image and - * getting a result of 0 is sufficient. - */ - clear_toi_state(TOI_BOOT_TIME); - goto out; - } - - toi_header = (struct toi_header *) output_buffer; - - sprintf(output_buffer, "1\n%s\n%s\n", - toi_header->uts.machine, - toi_header->uts.version); - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) - strcat(output_buffer, "Resumed before.\n"); - -out: - noresume_reset_modules(); - return output_buffer; -} - -/** - * read_pageset2 - read second part of the image - * @overwrittenpagesonly: Read only pages which would have been - * verwritten by pageset1? 
- * - * Read in part or all of pageset2 of an image, depending upon - * whether we are hibernating and have only overwritten a portion - * with pageset1 pages, or are resuming and need to read them - * all. - * - * Returns: Int - * Zero if no error, otherwise the error value. - **/ -int read_pageset2(int overwrittenpagesonly) -{ - int result = 0; - - if (!pagedir2.size) - return 0; - - result = read_pageset(&pagedir2, overwrittenpagesonly); - - toi_cond_pause(1, "Pagedir 2 read."); - - return result; -} - -/** - * image_exists_read - has an image been found? - * @page: Output buffer - * - * Store 0 or 1 in page, depending on whether an image is found. - * Incoming buffer is PAGE_SIZE and result is guaranteed - * to be far less than that, so we don't worry about - * overflow. - **/ -int image_exists_read(const char *page, int count) -{ - int len = 0; - char *result; - - if (toi_activate_storage(0)) - return count; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(0); - - if (!toiActiveAllocator) { - len = sprintf((char *) page, "-1\n"); - } else { - result = get_have_image_data(); - if (result) { - len = sprintf((char *) page, "%s", result); - toi_free_page(26, (unsigned long) result); - } - } - - toi_deactivate_storage(0); - - return len; -} - -/** - * image_exists_write - invalidate an image if one exists - **/ -int image_exists_write(const char *buffer, int count) -{ - if (toi_activate_storage(0)) - return count; - - if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) - toiActiveAllocator->remove_image(); - - toi_deactivate_storage(0); - - clear_result_state(TOI_KEPT_IMAGE); - - return count; -} diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h deleted file mode 100644 index 7d4d83f40..000000000 --- a/kernel/power/tuxonice_io.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * kernel/power/tuxonice_io.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. 
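image_exists_read() above follows the usual shape of these handlers: format a small, bounded reply into the caller's page buffer and return its length, with the magnitude guaranteed to stay far below PAGE_SIZE. A userspace sketch of the same shape (have_image_read() and its arguments are invented):

#include <stdio.h>

#define PAGE_SIZE 4096

/* Format a bounded answer into the caller's page, return its length */
static int have_image_read(char *page, int have_image,
                           const char *machine, const char *version)
{
        if (have_image < 0)
                return sprintf(page, "-1\n");   /* no active allocator */
        if (!have_image)
                return sprintf(page, "0\n");
        return snprintf(page, PAGE_SIZE, "1\n%s\n%s\n", machine, version);
}

int main(void)
{
        char page[PAGE_SIZE];
        int len = have_image_read(page, 1, "x86_64", "#1 SMP");

        fwrite(page, 1, len, stdout);
        return 0;
}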
- *
- */
-
-#include <linux/utsname.h>
-#include "tuxonice_pagedir.h"
-
-/* Non-module data saved in our image header */
-struct toi_header {
-	/*
-	 * Mirror struct swsusp_info, but without
-	 * the page aligned attribute
-	 */
-	struct new_utsname uts;
-	u32 version_code;
-	unsigned long num_physpages;
-	int cpus;
-	unsigned long image_pages;
-	unsigned long pages;
-	unsigned long size;
-
-	/* Our own data */
-	unsigned long orig_mem_free;
-	int page_size;
-	int pageset_2_size;
-	int param0;
-	int param1;
-	int param2;
-	int param3;
-	int progress0;
-	int progress1;
-	int progress2;
-	int progress3;
-	int io_time[2][2];
-	struct pagedir pagedir;
-	dev_t root_fs;
-	unsigned long bkd; /* Boot kernel data locn */
-};
-
-extern int write_pageset(struct pagedir *pagedir);
-extern int write_image_header(void);
-extern int read_pageset1(void);
-extern int read_pageset2(int overwrittenpagesonly);
-
-extern int toi_attempt_to_parse_resume_device(int quiet);
-extern void attempt_to_parse_resume_device2(void);
-extern void attempt_to_parse_alt_resume_param(void);
-int image_exists_read(const char *page, int count);
-int image_exists_write(const char *buffer, int count);
-extern void save_restore_alt_param(int replace, int quiet);
-extern atomic_t toi_io_workers;
-
-/* Args to save_restore_alt_param */
-#define RESTORE 0
-#define SAVE 1
-
-#define NOQUIET 0
-#define QUIET 1
-
-extern wait_queue_head_t toi_io_queue_flusher;
-extern int toi_bio_queue_flusher_should_finish;
-
-int fs_info_space_needed(int reset);
-
-extern int toi_max_workers;
diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
deleted file mode 100644
index a203c8fb9..000000000
--- a/kernel/power/tuxonice_modules.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- * kernel/power/tuxonice_modules.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_ui.h"
-
-LIST_HEAD(toi_filters);
-LIST_HEAD(toiAllocators);
-
-LIST_HEAD(toi_modules);
-
-struct toi_module_ops *toiActiveAllocator;
-
-static int toi_num_filters;
-int toiNumAllocators, toi_num_modules;
-
-/*
- * toi_header_storage_for_modules
- *
- * Returns the amount of space needed to store configuration
- * data needed by the modules prior to copying back the original
- * kernel. We can exclude data for pageset2 because it will be
- * available anyway once the kernel is copied back.
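The comment in struct toi_header above notes that it mirrors struct swsusp_info minus the page-aligned attribute, so the header packs tightly into the image stream. The effect of that attribute is easy to demonstrate in isolation (the struct names here are invented):

#include <stdio.h>

struct payload { long a; int b; };

/* Page-aligned variant, in the style of the kernel's swsusp_info */
struct aligned_hdr { struct payload p; } __attribute__((aligned(4096)));

/* Unaligned mirror, the approach struct toi_header takes */
struct mirror_hdr { struct payload p; };

int main(void)
{
        printf("aligned: %zu bytes, mirror: %zu bytes\n",
               sizeof(struct aligned_hdr), sizeof(struct mirror_hdr));
        return 0;
}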
- */ -long toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - } - } - - /* One more for the empty terminator */ - return bytes + sizeof(struct toi_module_header); -} - -void print_toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - printk(KERN_DEBUG "Header storage:\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", - this_module->name, - this_module->header_used, this); - } - } - - printk(KERN_DEBUG "+ empty terminator : %zu.\n", - sizeof(struct toi_module_header)); - printk(KERN_DEBUG " ====\n"); - printk(KERN_DEBUG " %zu\n", - bytes + sizeof(struct toi_module_header)); -} - -/* - * toi_memory_for_modules - * - * Returns the amount of memory requested by modules for - * doing their work during the cycle. - */ - -long toi_memory_for_modules(int print_parts) -{ - long bytes = 0, result; - struct toi_module_ops *this_module; - - if (print_parts) - printk(KERN_INFO "Memory for modules:\n===================\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - int this; - if (!this_module->enabled) - continue; - if (this_module->memory_needed) { - this = this_module->memory_needed(); - if (print_parts) - printk(KERN_INFO "%10d bytes (%5ld pages) for " - "module '%s'.\n", this, - DIV_ROUND_UP(this, PAGE_SIZE), - this_module->name); - bytes += this; - } - } - - result = DIV_ROUND_UP(bytes, PAGE_SIZE); - if (print_parts) - printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); - - return result; -} - -/* - * toi_expected_compression_ratio - * - * Returns the compression ratio expected when saving the image. 
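toi_memory_for_modules() above sums per-module byte requests and converts the total to whole pages with DIV_ROUND_UP(). The same arithmetic in a standalone sketch (the request figures are invented):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define PAGE_SIZE 4096

int main(void)
{
        long requests[] = { 6000, 123, 40960 };  /* bytes per module */
        long bytes = 0;
        unsigned i;

        for (i = 0; i < sizeof(requests) / sizeof(requests[0]); i++)
                bytes += requests[i];

        printf(" => %ld bytes, %ld pages.\n", bytes,
               DIV_ROUND_UP(bytes, PAGE_SIZE));
        return 0;
}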
- */ - -int toi_expected_compression_ratio(void) -{ - int ratio = 100; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->expected_compression) - ratio = ratio * this_module->expected_compression() - / 100; - } - - return ratio; -} - -/* toi_find_module_given_dir - * Functionality : Return a module (if found), given a pointer - * to its directory name - */ - -static struct toi_module_ops *toi_find_module_given_dir(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->directory)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* toi_find_module_given_name - * Functionality : Return a module (if found), given a pointer - * to its name - */ - -struct toi_module_ops *toi_find_module_given_name(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->name)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* - * toi_print_module_debug_info - * Functionality : Get debugging info from modules into a buffer. - */ -int toi_print_module_debug_info(char *buffer, int buffer_size) -{ - struct toi_module_ops *this_module; - int len = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->print_debug_info) { - int result; - result = this_module->print_debug_info(buffer + len, - buffer_size - len); - len += result; - } - } - - /* Ensure null terminated */ - buffer[buffer_size] = 0; - - return len; -} - -/* - * toi_register_module - * - * Register a module. - */ -int toi_register_module(struct toi_module_ops *module) -{ - int i; - struct kobject *kobj; - - if (!hibernation_available()) - return -ENODEV; - - module->enabled = 1; - - if (toi_find_module_given_name(module->name)) { - printk(KERN_INFO "TuxOnIce: Trying to load module %s," - " which is already registered.\n", - module->name); - return -EBUSY; - } - - switch (module->type) { - case FILTER_MODULE: - list_add_tail(&module->type_list, &toi_filters); - toi_num_filters++; - break; - case WRITER_MODULE: - list_add_tail(&module->type_list, &toiAllocators); - toiNumAllocators++; - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return -EINVAL; - } - list_add_tail(&module->module_list, &toi_modules); - toi_num_modules++; - - if ((!module->directory && !module->shared_directory) || - !module->sysfs_data || !module->num_sysfs_entries) - return 0; - - /* - * Modules may share a directory, but those with shared_dir - * set must be loaded (via symbol dependencies) after parents - * and unloaded beforehand. 
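toi_expected_compression_ratio() above composes per-module estimates multiplicatively: each enabled filter scales the running percentage. A sketch with invented stage figures:

#include <stdio.h>

int main(void)
{
        /* e.g. compressor expects 50%, another filter expects 90% */
        int stages[] = { 50, 90 };
        int ratio = 100, i;

        for (i = 0; i < 2; i++)
                ratio = ratio * stages[i] / 100;

        printf("expected image size: %d%% of input\n", ratio); /* 45 */
        return 0;
}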
- */ - if (module->shared_directory) { - struct toi_module_ops *shared = - toi_find_module_given_dir(module->shared_directory); - if (!shared) { - printk(KERN_ERR "TuxOnIce: Module %s wants to share " - "%s's directory but %s isn't loaded.\n", - module->name, module->shared_directory, - module->shared_directory); - toi_unregister_module(module); - return -ENODEV; - } - kobj = shared->dir_kobj; - } else { - if (!strncmp(module->directory, "[ROOT]", 6)) - kobj = tuxonice_kobj; - else - kobj = make_toi_sysdir(module->directory); - } - module->dir_kobj = kobj; - for (i = 0; i < module->num_sysfs_entries; i++) { - int result = toi_register_sysfs_file(kobj, - &module->sysfs_data[i]); - if (result) - return result; - } - return 0; -} - -/* - * toi_unregister_module - * - * Remove a module. - */ -void toi_unregister_module(struct toi_module_ops *module) -{ - int i; - - if (module->dir_kobj) - for (i = 0; i < module->num_sysfs_entries; i++) - toi_unregister_sysfs_file(module->dir_kobj, - &module->sysfs_data[i]); - - if (!module->shared_directory && module->directory && - strncmp(module->directory, "[ROOT]", 6)) - remove_toi_sysdir(module->dir_kobj); - - switch (module->type) { - case FILTER_MODULE: - list_del(&module->type_list); - toi_num_filters--; - break; - case WRITER_MODULE: - list_del(&module->type_list); - toiNumAllocators--; - if (toiActiveAllocator == module) { - toiActiveAllocator = NULL; - clear_toi_state(TOI_CAN_RESUME); - clear_toi_state(TOI_CAN_HIBERNATE); - } - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - list_del(&module->module_list); - toi_num_modules--; -} - -/* - * toi_move_module_tail - * - * Rearrange modules when reloading the config. - */ -void toi_move_module_tail(struct toi_module_ops *module) -{ - switch (module->type) { - case FILTER_MODULE: - if (toi_num_filters > 1) - list_move_tail(&module->type_list, &toi_filters); - break; - case WRITER_MODULE: - if (toiNumAllocators > 1) - list_move_tail(&module->type_list, &toiAllocators); - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - if ((toi_num_filters + toiNumAllocators) > 1) - list_move_tail(&module->module_list, &toi_modules); -} - -/* - * toi_initialise_modules - * - * Get ready to do some work! - */ -int toi_initialise_modules(int starting_cycle, int early) -{ - struct toi_module_ops *this_module; - int result; - - list_for_each_entry(this_module, &toi_modules, module_list) { - this_module->header_requested = 0; - this_module->header_used = 0; - if (!this_module->enabled) - continue; - if (this_module->early != early) - continue; - if (this_module->initialise) { - result = this_module->initialise(starting_cycle); - if (result) { - toi_cleanup_modules(starting_cycle); - return result; - } - this_module->initialised = 1; - } - } - - return 0; -} - -/* - * toi_cleanup_modules - * - * Tell modules the work is done. 
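toi_initialise_modules() above unwinds by calling toi_cleanup_modules() as soon as any module's initialise() fails, so earlier modules are never left half set up. The shape of that pattern in a standalone sketch (the module names and init functions are invented):

#include <stdio.h>

struct mod { const char *name; int initialised; int (*init)(void); };

static int ok_init(void)  { return 0; }
static int bad_init(void) { return -1; }

/* Undo every module that made it through init, as the cleanup pass does */
static void cleanup_all(struct mod *m, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (m[i].initialised) {
                        printf("cleanup %s\n", m[i].name);
                        m[i].initialised = 0;
                }
}

static int init_all(struct mod *m, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (m[i].init()) {
                        cleanup_all(m, n);   /* unwind on first failure */
                        return -1;
                }
                m[i].initialised = 1;
        }
        return 0;
}

int main(void)
{
        struct mod mods[] = { {"ui", 0, ok_init}, {"swap", 0, bad_init} };

        printf("init_all: %d\n", init_all(mods, 2));
        return 0;
}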
- */ -void toi_cleanup_modules(int finishing_cycle) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->initialised) - continue; - if (this_module->cleanup) - this_module->cleanup(finishing_cycle); - this_module->initialised = 0; - } -} - -/* - * toi_pre_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->pre_atomic_restore) - this_module->pre_atomic_restore(bkd); - } -} - -/* - * toi_post_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->post_atomic_restore) - this_module->post_atomic_restore(bkd); - } -} - -/* - * toi_get_next_filter - * - * Get the next filter in the pipeline. - */ -struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) -{ - struct toi_module_ops *last_filter = NULL, *this_filter = NULL; - - list_for_each_entry(this_filter, &toi_filters, type_list) { - if (!this_filter->enabled) - continue; - if ((last_filter == filter_sought) || (!filter_sought)) - return this_filter; - last_filter = this_filter; - } - - return toiActiveAllocator; -} - -/** - * toi_show_modules: Printk what support is loaded. - */ -void toi_print_modules(void) -{ - struct toi_module_ops *this_module; - int prev = 0; - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->type == MISC_HIDDEN_MODULE) - continue; - printk("%s %s%s%s", prev ? "," : "", - this_module->enabled ? "" : "[", - this_module->name, - this_module->enabled ? "" : "]"); - prev = 1; - } - - printk(".\n"); -} - -/* toi_get_modules - * - * Take a reference to modules so they can't go away under us. - */ - -int toi_get_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - struct toi_module_ops *this_module2; - - if (try_module_get(this_module->module)) - continue; - - /* Failed! Reverse gets and return error */ - list_for_each_entry(this_module2, &toi_modules, - module_list) { - if (this_module == this_module2) - return -EINVAL; - module_put(this_module2->module); - } - } - return 0; -} - -/* toi_put_modules - * - * Release our references to modules we used. - */ - -void toi_put_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) - module_put(this_module->module); -} diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h deleted file mode 100644 index 44f10abb9..000000000 --- a/kernel/power/tuxonice_modules.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * kernel/power/tuxonice_modules.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations for modules. Modules are additions to - * TuxOnIce that provide facilities such as image compression or - * encryption, backends for storage of the image and user interfaces. 
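toi_get_next_filter() above walks the filter list and returns the first enabled entry after the one passed in (or the first enabled entry when given NULL), falling back to the active allocator at the end of the chain. A sketch of that lookup over an array instead of a kernel list, without the allocator fallback (filter names are invented):

#include <stdio.h>

struct filter { const char *name; int enabled; };

static struct filter *next_filter(struct filter *f, int n,
                                  struct filter *after)
{
        struct filter *last = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (!f[i].enabled)
                        continue;
                if (last == after || !after)
                        return &f[i];
                last = &f[i];
        }
        return NULL;
}

int main(void)
{
        struct filter chain[] = {
                { "checksum", 1 }, { "compress", 0 }, { "crypto", 1 },
        };
        struct filter *first = next_filter(chain, 3, NULL);
        struct filter *second = next_filter(chain, 3, first);

        printf("%s -> %s\n", first->name, second->name);
        return 0;
}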
- * - */ - -#ifndef TOI_MODULES_H -#define TOI_MODULES_H - -/* This is the maximum size we store in the image header for a module name */ -#define TOI_MAX_MODULE_NAME_LENGTH 30 - -struct toi_boot_kernel_data; - -/* Per-module metadata */ -struct toi_module_header { - char name[TOI_MAX_MODULE_NAME_LENGTH]; - int enabled; - int type; - int index; - int data_length; - unsigned long signature; -}; - -enum { - FILTER_MODULE, - WRITER_MODULE, - BIO_ALLOCATOR_MODULE, - MISC_MODULE, - MISC_HIDDEN_MODULE, -}; - -enum { - TOI_ASYNC, - TOI_SYNC -}; - -enum { - TOI_VIRT, - TOI_PAGE, -}; - -#define TOI_MAP(type, addr) \ - (type == TOI_PAGE ? kmap(addr) : addr) - -#define TOI_UNMAP(type, addr) \ - do { \ - if (type == TOI_PAGE) \ - kunmap(addr); \ - } while(0) - -struct toi_module_ops { - /* Functions common to all modules */ - int type; - char *name; - char *directory; - char *shared_directory; - struct kobject *dir_kobj; - struct module *module; - int enabled, early, initialised; - struct list_head module_list; - - /* List of filters or allocators */ - struct list_head list, type_list; - - /* - * Requirements for memory and storage in - * the image header.. - */ - int (*memory_needed) (void); - int (*storage_needed) (void); - - int header_requested, header_used; - - int (*expected_compression) (void); - - /* - * Debug info - */ - int (*print_debug_info) (char *buffer, int size); - int (*save_config_info) (char *buffer); - void (*load_config_info) (char *buffer, int len); - - /* - * Initialise & cleanup - general routines called - * at the start and end of a cycle. - */ - int (*initialise) (int starting_cycle); - void (*cleanup) (int finishing_cycle); - - void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); - void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); - - /* - * Calls for allocating storage (allocators only). - * - * Header space is requested separately and cannot fail, but the - * reservation is only applied when main storage is allocated. - * The header space reservation is thus always set prior to - * requesting the allocation of storage - and prior to querying - * how much storage is available. - */ - - unsigned long (*storage_available) (void); - void (*reserve_header_space) (unsigned long space_requested); - int (*register_storage) (void); - int (*allocate_storage) (unsigned long space_requested); - unsigned long (*storage_allocated) (void); - void (*free_unused_storage) (void); - - /* - * Routines used in image I/O. 
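struct toi_module_ops above is a classic ops table: every module is a struct instance whose behaviour lives in function pointers, and the core only ever calls through the table. A miniature version of the same shape (mini_ops and the null-writer module are invented):

#include <stdio.h>

struct mini_ops {
        const char *name;
        int (*storage_needed)(void);
        int (*write_page)(unsigned long index, void *buf, unsigned int size);
};

static int null_storage_needed(void) { return 0; }

static int null_write_page(unsigned long index, void *buf, unsigned int size)
{
        (void)buf;      /* payload ignored in this sketch */
        printf("pretending to write page %lu (%u bytes)\n", index, size);
        return 0;
}

static struct mini_ops null_writer = {
        .name           = "null-writer",
        .storage_needed = null_storage_needed,
        .write_page     = null_write_page,
};

int main(void)
{
        char page[16] = "hello";

        printf("%s needs %d header bytes\n", null_writer.name,
               null_writer.storage_needed());
        return null_writer.write_page(1, page, sizeof(page));
}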
- */ - int (*rw_init) (int rw, int stream_number); - int (*rw_cleanup) (int rw); - int (*write_page) (unsigned long index, int buf_type, void *buf, - unsigned int buf_size); - int (*read_page) (unsigned long *index, int buf_type, void *buf, - unsigned int *buf_size); - int (*io_flusher) (int rw); - - /* Reset module if image exists but reading aborted */ - void (*noresume_reset) (void); - - /* Read and write the metadata */ - int (*write_header_init) (void); - int (*write_header_cleanup) (void); - - int (*read_header_init) (void); - int (*read_header_cleanup) (void); - - /* To be called after read_header_init */ - int (*get_header_version) (void); - - int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, - char *buffer_start, int buffer_size); - - int (*rw_header_chunk_noreadahead) (int rw, - struct toi_module_ops *owner, char *buffer_start, - int buffer_size); - - /* Attempt to parse an image location */ - int (*parse_sig_location) (char *buffer, int only_writer, int quiet); - - /* Throttle I/O according to throughput */ - void (*update_throughput_throttle) (int jif_index); - - /* Flush outstanding I/O */ - int (*finish_all_io) (void); - - /* Determine whether image exists that we can restore */ - int (*image_exists) (int quiet); - - /* Mark the image as having tried to resume */ - int (*mark_resume_attempted) (int); - - /* Destroy image if one exists */ - int (*remove_image) (void); - - /* Sysfs Data */ - struct toi_sysfs_data *sysfs_data; - int num_sysfs_entries; - - /* Block I/O allocator */ - struct toi_bio_allocator_ops *bio_allocator_ops; -}; - -extern int toi_num_modules, toiNumAllocators; - -extern struct toi_module_ops *toiActiveAllocator; -extern struct list_head toi_filters, toiAllocators, toi_modules; - -extern void toi_prepare_console_modules(void); -extern void toi_cleanup_console_modules(void); - -extern struct toi_module_ops *toi_find_module_given_name(char *name); -extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); - -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_move_module_tail(struct toi_module_ops *module); - -extern long toi_header_storage_for_modules(void); -extern long toi_memory_for_modules(int print_parts); -extern void print_toi_header_storage_for_modules(void); -extern int toi_expected_compression_ratio(void); - -extern int toi_print_module_debug_info(char *buffer, int buffer_size); -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_unregister_module(struct toi_module_ops *module); - -extern int toi_initialise_modules(int starting_cycle, int early); -#define toi_initialise_modules_early(starting) \ - toi_initialise_modules(starting, 1) -#define toi_initialise_modules_late(starting) \ - toi_initialise_modules(starting, 0) -extern void toi_cleanup_modules(int finishing_cycle); - -extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); -extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); - -extern void toi_print_modules(void); - -int toi_get_modules(void); -void toi_put_modules(void); -#endif diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c deleted file mode 100644 index 78bd31b05..000000000 --- a/kernel/power/tuxonice_netlink.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
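tuxonice_netlink.c, which begins above, keeps an emergency pool of preallocated skbs so messages can still be sent when alloc_skb() fails under memory pressure (see toi_fill_skb_pool() and toi_get_skb(), which follow). The pattern in a standalone sketch with plain malloc (struct buf and the pool sizes are invented):

#include <stdio.h>
#include <stdlib.h>

struct buf { struct buf *next; char data[128]; };

static struct buf *pool;
static int pool_level;
static const int pool_limit = 4;

/* Top the pool up while memory is plentiful */
static void fill_pool(void)
{
        while (pool_level < pool_limit) {
                struct buf *b = malloc(sizeof(*b));

                if (!b)
                        break;
                b->next = pool;
                pool = b;
                pool_level++;
        }
}

/* Try a fresh allocation first; fall back to the stash on failure */
static struct buf *get_buf(int pretend_oom)
{
        struct buf *b = pretend_oom ? NULL : malloc(sizeof(*b));

        if (b)
                return b;
        b = pool;
        if (b) {
                pool = b->next;
                b->next = NULL;
                pool_level--;
        }
        return b;
}

int main(void)
{
        fill_pool();
        printf("under OOM, got %s buffer (pool now %d)\n",
               get_buf(1) ? "an emergency" : "no", pool_level);
        return 0;
}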
- * - * Functions for communicating with a userspace helper via netlink. - */ - -#include -#include -#include -#include "tuxonice_netlink.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct user_helper_data *uhd_list; - -/* - * Refill our pool of SKBs for use in emergencies (eg, when eating memory and - * none can be allocated). - */ -static void toi_fill_skb_pool(struct user_helper_data *uhd) -{ - while (uhd->pool_level < uhd->pool_limit) { - struct sk_buff *new_skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (!new_skb) - break; - - new_skb->next = uhd->emerg_skbs; - uhd->emerg_skbs = new_skb; - uhd->pool_level++; - } -} - -/* - * Try to allocate a single skb. If we can't get one, try to use one from - * our pool. - */ -static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) -{ - struct sk_buff *skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (skb) - return skb; - - skb = uhd->emerg_skbs; - if (skb) { - uhd->pool_level--; - uhd->emerg_skbs = skb->next; - skb->next = NULL; - } - - return skb; -} - -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *dest; - struct task_struct *t; - - if (uhd->pid == -1) - return; - - if (uhd->debug) - printk(KERN_ERR "toi_send_netlink_message: Send " - "message type %d.\n", type); - - skb = toi_get_skb(uhd); - if (!skb) { - printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); - return; - } - - nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); - uhd->sock_seq++; - - dest = NLMSG_DATA(nlh); - if (params && len > 0) - memcpy(dest, params, len); - - netlink_unicast(uhd->nl, skb, uhd->pid, 0); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - if (uhd->pid > -1) - printk(KERN_INFO "Hmm. Can't find the userspace task" - " %d.\n", uhd->pid); - return; - } - wake_up_process(t); - toi_read_unlock_tasklist(); - - yield(); -} - -static void send_whether_debugging(struct user_helper_data *uhd) -{ - static u8 is_debugging = 1; - - toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, - &is_debugging, sizeof(u8)); -} - -/* - * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we - * are hibernating. - */ -static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) -{ - struct task_struct *t; - - if (uhd->debug) - printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", - pid); - return -EINVAL; - } - - t->flags |= PF_NOFREEZE; - - toi_read_unlock_tasklist(); - uhd->pid = pid; - - toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); - - return 0; -} - -/* - * Called when the userspace process has informed us that it's ready to roll. - */ -static int nl_ready(struct user_helper_data *uhd, u32 version) -{ - if (version != uhd->interface_version) { - printk(KERN_INFO "%s userspace process using invalid interface" - " version (%d - kernel wants %d). 
Trying to " - "continue without it.\n", - uhd->name, version, uhd->interface_version); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - - complete(&uhd->wait_for_process); - - return 0; -} - -void toi_netlink_close_complete(struct user_helper_data *uhd) -{ - if (uhd->nl) { - netlink_kernel_release(uhd->nl); - uhd->nl = NULL; - } - - while (uhd->emerg_skbs) { - struct sk_buff *next = uhd->emerg_skbs->next; - kfree_skb(uhd->emerg_skbs); - uhd->emerg_skbs = next; - } - - uhd->pid = -1; -} - -static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, - struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type = nlh->nlmsg_type; - int *data; - int err; - - if (uhd->debug) - printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", - type); - - /* Let the more specific handler go first. It returns - * 1 for valid messages that it doesn't know. */ - err = uhd->rcv_msg(skb, nlh); - if (err != 1) - return err; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { - printk(KERN_INFO "Received extra nofreeze me requests.\n"); - return -EBUSY; - } - - data = NLMSG_DATA(nlh); - - switch (type) { - case NETLINK_MSG_NOFREEZE_ME: - return nl_set_nofreeze(uhd, nlh->nlmsg_pid); - case NETLINK_MSG_GET_DEBUGGING: - send_whether_debugging(uhd); - return 0; - case NETLINK_MSG_READY: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { - printk(KERN_INFO "Invalid ready mesage.\n"); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - return nl_ready(uhd, (u32) *data); - case NETLINK_MSG_CLEANUP: - toi_netlink_close_complete(uhd); - return 0; - } - - return -EINVAL; -} - -static void toi_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - struct user_helper_data *uhd = uhd_list; - - while (uhd && uhd->netlink_id != skb->sk->sk_protocol) - uhd = uhd->next; - - if (!uhd) - return; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - - err = toi_nl_gen_rcv_msg(uhd, skb, nlh); - if (err) - netlink_ack(skb, nlh, err); - else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } -} - -static int netlink_prepare(struct user_helper_data *uhd) -{ - struct netlink_kernel_cfg cfg = { - .groups = 0, - .input = toi_user_rcv_skb, - }; - - uhd->next = uhd_list; - uhd_list = uhd; - - uhd->sock_seq = 0x42c0ffee; - uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); - if (!uhd->nl) { - printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", - uhd->name); - return -ENOMEM; - } - - toi_fill_skb_pool(uhd); - - return 0; -} - -void toi_netlink_close(struct user_helper_data *uhd) -{ - struct task_struct *t; - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (t) - t->flags &= ~PF_NOFREEZE; - toi_read_unlock_tasklist(); - - toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); -} -int toi_netlink_setup(struct user_helper_data *uhd) -{ - /* In case userui didn't cleanup properly on us */ - toi_netlink_close_complete(uhd); - - if (netlink_prepare(uhd) < 0) { - printk(KERN_INFO "Netlink prepare failed.\n"); - return 1; - } - - if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, - UMH_WAIT_EXEC, uhd->debug) < 0) { - printk(KERN_INFO "Launch userspace program failed.\n"); - toi_netlink_close_complete(uhd); - return 1; 
- } - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); - - if (uhd->pid == -1) { - printk(KERN_INFO "%s: Failed to contact userspace process.\n", - uhd->name); - toi_netlink_close_complete(uhd); - return 1; - } - - return 0; -} diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h deleted file mode 100644 index 6613c8eaa..000000000 --- a/kernel/power/tuxonice_netlink.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for functions for communicating with a userspace helper - * via netlink. - */ - -#include -#include - -#define NETLINK_MSG_BASE 0x10 - -#define NETLINK_MSG_READY 0x10 -#define NETLINK_MSG_NOFREEZE_ME 0x16 -#define NETLINK_MSG_GET_DEBUGGING 0x19 -#define NETLINK_MSG_CLEANUP 0x24 -#define NETLINK_MSG_NOFREEZE_ACK 0x27 -#define NETLINK_MSG_IS_DEBUGGING 0x28 - -struct user_helper_data { - int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); - void (*not_ready) (void); - struct sock *nl; - u32 sock_seq; - pid_t pid; - char *comm; - char program[256]; - int pool_level; - int pool_limit; - struct sk_buff *emerg_skbs; - int skb_size; - int netlink_id; - char *name; - struct user_helper_data *next; - struct completion wait_for_process; - u32 interface_version; - int must_init; - int debug; -}; - -#ifdef CONFIG_NET -int toi_netlink_setup(struct user_helper_data *uhd); -void toi_netlink_close(struct user_helper_data *uhd); -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len); -void toi_netlink_close_complete(struct user_helper_data *uhd); -#else -static inline int toi_netlink_setup(struct user_helper_data *uhd) -{ - return 0; -} - -static inline void toi_netlink_close(struct user_helper_data *uhd) { }; -static inline void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) { }; -static inline void toi_netlink_close_complete(struct user_helper_data *uhd) - { }; -#endif diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c deleted file mode 100644 index d469f3d2d..000000000 --- a/kernel/power/tuxonice_pagedir.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.c - * - * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001,2002 Pavel Machek - * Copyright (C) 2002-2003 Florent Chabaud - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for handling pagesets. - * Note that pbes aren't actually stored as such. They're stored as - * bitmaps and extents. 
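The emergency skb pool above (toi_fill_skb_pool()/toi_get_skb()) is a pattern worth seeing in isolation: keep a small reserve of preallocated buffers, chained through their own storage, and fall back to it only when the normal allocator fails. The following is a minimal userspace sketch of that idea, not TuxOnIce code; every name in it is illustrative.

#include <stdio.h>
#include <stdlib.h>

/* Reserve buffers are chained through their own first bytes, much as
 * the skb pool chains sk_buffs via skb->next. */
struct reserve {
        struct reserve *next;
        char payload[4096];
};

static struct reserve *pool;
static int pool_level;
static const int pool_limit = 4;

/* Top up the reserve while memory is still plentiful. */
static void fill_pool(void)
{
        while (pool_level < pool_limit) {
                struct reserve *r = malloc(sizeof(*r));

                if (!r)
                        break;          /* can't refill now; try later */
                r->next = pool;
                pool = r;
                pool_level++;
        }
}

/* Normal allocation first; dip into the reserve only on failure. */
static struct reserve *get_buffer(void)
{
        struct reserve *r = malloc(sizeof(*r));

        if (r)
                return r;
        r = pool;
        if (r) {
                pool = r->next;
                r->next = NULL;
                pool_level--;
        }
        return r;       /* may still be NULL if the reserve is empty */
}

int main(void)
{
        struct reserve *r;

        fill_pool();
        r = get_buffer();
        printf("got %p, %d reserve buffers left\n", (void *)r, pool_level);
        free(r);
        return 0;
}

The refill happens opportunistically, at exactly the moments when plain allocation would also have succeeded; the reserve only pays off under the memory pressure the hibernation code expects.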
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include "tuxonice_pageflags.h" -#include "tuxonice_ui.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" - -static int ptoi_pfn; -static struct pbe *this_low_pbe; -static struct pbe **last_low_pbe_ptr; - -void toi_reset_alt_image_pageset2_pfn(void) -{ - memory_bm_position_reset(pageset2_map); -} - -static struct page *first_conflicting_page; - -/* - * free_conflicting_pages - */ - -static void free_conflicting_pages(void) -{ - while (first_conflicting_page) { - struct page *next = - *((struct page **) kmap(first_conflicting_page)); - kunmap(first_conflicting_page); - toi__free_page(29, first_conflicting_page); - first_conflicting_page = next; - } -} - -/* __toi_get_nonconflicting_page - * - * Description: Gets order zero pages that won't be overwritten - * while copying the original pages. - */ - -struct page *___toi_get_nonconflicting_page(int can_be_highmem) -{ - struct page *page; - gfp_t flags = TOI_ATOMIC_GFP; - if (can_be_highmem) - flags |= __GFP_HIGHMEM; - - - if (test_toi_state(TOI_LOADING_ALT_IMAGE) && - pageset2_map && ptoi_pfn) { - do { - ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0); - if (ptoi_pfn != BM_END_OF_MAP) { - page = pfn_to_page(ptoi_pfn); - if (!PagePageset1(page) && - (can_be_highmem || !PageHighMem(page))) - return page; - } - } while (ptoi_pfn); - } - - do { - page = toi_alloc_page(29, flags | __GFP_ZERO); - if (!page) { - printk(KERN_INFO "Failed to get nonconflicting " - "page.\n"); - return NULL; - } - if (PagePageset1(page)) { - struct page **next = (struct page **) kmap(page); - *next = first_conflicting_page; - first_conflicting_page = page; - kunmap(page); - } - } while (PagePageset1(page)); - - return page; -} - -unsigned long __toi_get_nonconflicting_page(void) -{ - struct page *page = ___toi_get_nonconflicting_page(0); - return page ? (unsigned long) page_address(page) : 0; -} - -static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, - int highmem) -{ - if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) - + 2 * sizeof(struct pbe)) > PAGE_SIZE) { - struct page *new_page = - ___toi_get_nonconflicting_page(highmem); - if (!new_page) - return ERR_PTR(-ENOMEM); - this_pbe = (struct pbe *) kmap(new_page); - memset(this_pbe, 0, PAGE_SIZE); - *page_ptr = new_page; - } else - this_pbe++; - - return this_pbe; -} - -/** - * get_pageset1_load_addresses - generate pbes for conflicting pages - * - * We check here that pagedir & pages it points to won't collide - * with pages where we're going to restore from the loaded pages - * later. - * - * Returns: - * Zero on success, one if couldn't find enough pages (shouldn't - * happen). 
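___toi_get_nonconflicting_page() above relies on a neat trick: a conflicting page must not be freed straight away (the very next allocation would likely hand it back), so rejects are parked on a list threaded through the pages' own contents and released by free_conflicting_pages() only once a safe page has been found. A hedged userspace analogue follows; conflicts() is merely a stand-in for the PagePageset1() test.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for PagePageset1(): pretend the first three allocations
 * land on "conflicting" pages. */
static int conflicts(void *p)
{
        static int n;

        (void)p;
        return n++ < 3;
}

static void *rejected;          /* list threaded through the blocks */

static void *get_nonconflicting(size_t size)
{
        void *p;

        for (;;) {
                p = malloc(size);
                if (!p)
                        return NULL;
                if (!conflicts(p))
                        return p;
                /* Park the reject: its own first bytes hold the next
                 * pointer, so no extra bookkeeping memory is needed. */
                *(void **)p = rejected;
                rejected = p;
        }
}

static void free_rejected(void)
{
        while (rejected) {
                void *next = *(void **)rejected;

                free(rejected);
                rejected = next;
        }
}

int main(void)
{
        void *p = get_nonconflicting(4096);

        printf("usable block at %p\n", p);
        free_rejected();
        free(p);
        return 0;
}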
- **/ -int toi_get_pageset1_load_addresses(void) -{ - int pfn, highallocd = 0, lowallocd = 0; - int low_needed = pagedir1.size - get_highmem_size(pagedir1); - int high_needed = get_highmem_size(pagedir1); - int low_pages_for_highmem = 0; - gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; - struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, - *low_pbe_page, *last_low_pbe_page = NULL; - struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, - *this_high_pbe = NULL; - unsigned long orig_low_pfn, orig_high_pfn; - int high_pbes_done = 0, low_pbes_done = 0; - int low_direct = 0, high_direct = 0, result = 0, i; - int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; - - toi_trace_index++; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - last_low_pbe_ptr = &restore_pblist; - - /* First, allocate pages for the start of our pbe lists. */ - if (high_needed) { - high_pbe_page = ___toi_get_nonconflicting_page(1); - if (!high_pbe_page) { - result = -ENOMEM; - goto out; - } - this_high_pbe = (struct pbe *) kmap(high_pbe_page); - memset(this_high_pbe, 0, PAGE_SIZE); - } - - low_pbe_page = ___toi_get_nonconflicting_page(0); - if (!low_pbe_page) { - result = -ENOMEM; - goto out; - } - this_low_pbe = (struct pbe *) page_address(low_pbe_page); - - /* - * Next, allocate the number of pages we need. - */ - - i = low_needed + high_needed; - - do { - int is_high; - - if (i == low_needed) - flags &= ~__GFP_HIGHMEM; - - page = toi_alloc_page(30, flags); - BUG_ON(!page); - - SetPagePageset1Copy(page); - is_high = PageHighMem(page); - - if (PagePageset1(page)) { - if (is_high) - high_direct++; - else - low_direct++; - } else { - if (is_high) - highallocd++; - else - lowallocd++; - } - } while (--i); - - high_needed -= high_direct; - low_needed -= low_direct; - - /* - * Do we need to use some lowmem pages for the copies of highmem - * pages? - */ - if (high_needed > highallocd) { - low_pages_for_highmem = high_needed - highallocd; - high_needed -= low_pages_for_highmem; - low_needed += low_pages_for_highmem; - } - - /* - * Now generate our pbes (which will be used for the atomic restore), - * and free unneeded pages. - */ - memory_bm_position_reset(pageset1_copy_map); - for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) { - int is_high; - page = pfn_to_page(pfn); - is_high = PageHighMem(page); - - if (PagePageset1(page)) - continue; - - /* Nope. We're going to use this page. Add a pbe. 
*/ - if (is_high || low_pages_for_highmem) { - struct page *orig_page; - high_pbes_done++; - if (!is_high) - low_pages_for_highmem--; - do { - orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_high_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_high_pfn); - } while (!PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_high_pbe->orig_address = (void *) orig_high_pfn; - this_high_pbe->address = page; - this_high_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", - high_page, high_offset, page, orig_high_pfn, orig_page); - if (last_high_pbe_page != high_pbe_page) { - *last_high_pbe_ptr = - (struct pbe *) high_pbe_page; - if (last_high_pbe_page) { - kunmap(last_high_pbe_page); - high_page++; - high_offset = 0; - } else - high_offset++; - last_high_pbe_page = high_pbe_page; - } else { - *last_high_pbe_ptr = this_high_pbe; - high_offset++; - } - last_high_pbe_ptr = &this_high_pbe->next; - this_high_pbe = get_next_pbe(&high_pbe_page, - this_high_pbe, 1); - if (IS_ERR(this_high_pbe)) { - printk(KERN_INFO - "This high pbe is an error.\n"); - return -ENOMEM; - } - } else { - struct page *orig_page; - low_pbes_done++; - do { - orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_low_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_low_pfn); - } while (PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_low_pbe->orig_address = page_address(orig_page); - this_low_pbe->address = page_address(page); - this_low_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", - low_page, low_offset, this_low_pbe->orig_address, - orig_low_pfn, this_low_pbe->address); - TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address); - *last_low_pbe_ptr = this_low_pbe; - last_low_pbe_ptr = &this_low_pbe->next; - this_low_pbe = get_next_pbe(&low_pbe_page, - this_low_pbe, 0); - if (low_pbe_page != last_low_pbe_page) { - if (last_low_pbe_page) { - low_page++; - low_offset = 0; - } else { - low_offset++; - } - last_low_pbe_page = low_pbe_page; - } else - low_offset++; - if (IS_ERR(this_low_pbe)) { - printk(KERN_INFO "this_low_pbe is an error.\n"); - return -ENOMEM; - } - } - } - - if (high_pbe_page) - kunmap(high_pbe_page); - - if (last_high_pbe_page != high_pbe_page) { - if (last_high_pbe_page) - kunmap(last_high_pbe_page); - toi__free_page(29, high_pbe_page); - } - - free_conflicting_pages(); - -out: - return result; -} - -int add_boot_kernel_data_pbe(void) -{ - this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); - if (!this_low_pbe->address) { - printk(KERN_INFO "Failed to get bkd atomic restore buffer."); - return -ENOMEM; - } - - toi_bkd.size = sizeof(toi_bkd); - memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); - - *last_low_pbe_ptr = this_low_pbe; - this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; - this_low_pbe->next = NULL; - return 0; -} diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h deleted file mode 100644 index 046535918..000000000 --- a/kernel/power/tuxonice_pagedir.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for routines for handling pagesets. 
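A pbe pairs a loaded copy of a page with the address the data must eventually return to, and the point of all the careful placement above is that the final restore can walk the chain and copy every page home without overwriting anything it still needs. Here is a simplified sketch of that copy-back walk, under the simplifying assumption of plain pointers rather than pfns and kmap():

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096

struct pbe {
        void *orig_address;     /* where the data lived before the image */
        void *address;          /* where the loaded copy currently sits */
        struct pbe *next;
};

/* Copy every saved page back over its original location. This is the
 * step that must not overwrite any page it still has to read, which is
 * why the pbe pages themselves had to be "nonconflicting". */
static void copy_back(struct pbe *restore_pblist)
{
        struct pbe *p;

        for (p = restore_pblist; p; p = p->next)
                memcpy(p->orig_address, p->address, PAGE_SZ);
}

int main(void)
{
        static char orig[PAGE_SZ], copy[PAGE_SZ] = "saved contents";
        struct pbe one = { orig, copy, NULL };

        copy_back(&one);
        printf("%s\n", orig);
        return 0;
}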
- */ - -#ifndef KERNEL_POWER_PAGEDIR_H -#define KERNEL_POWER_PAGEDIR_H - -/* Pagedir - * - * Contains the metadata for a set of pages saved in the image. - */ - -struct pagedir { - int id; - unsigned long size; -#ifdef CONFIG_HIGHMEM - unsigned long size_high; -#endif -}; - -#ifdef CONFIG_HIGHMEM -#define get_highmem_size(pagedir) (pagedir.size_high) -#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) -#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) -#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) -#else -#define get_highmem_size(pagedir) (0) -#define set_highmem_size(pagedir, sz) do { } while (0) -#define inc_highmem_size(pagedir) do { } while (0) -#define get_lowmem_size(pagedir) (pagedir.size) -#endif - -extern struct pagedir pagedir1, pagedir2; - -extern void toi_copy_pageset1(void); - -extern int toi_get_pageset1_load_addresses(void); - -extern unsigned long __toi_get_nonconflicting_page(void); -struct page *___toi_get_nonconflicting_page(int can_be_highmem); - -extern void toi_reset_alt_image_pageset2_pfn(void); -extern int add_boot_kernel_data_pbe(void); -#endif diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c deleted file mode 100644 index 0fe92edd7..000000000 --- a/kernel/power/tuxonice_pageflags.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for serialising and relocating pageflags in which we - * store our image metadata. - */ - -#include "tuxonice_pageflags.h" -#include "power.h" - -int toi_pageflags_space_needed(void) -{ - return memory_bm_space_needed(pageset1_map); -} diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h deleted file mode 100644 index ddeeaf1e7..000000000 --- a/kernel/power/tuxonice_pageflags.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- */ - -#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H -#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H - -struct memory_bitmap; -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -void memory_bm_clear(struct memory_bitmap *bm); - -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index); -void memory_bm_position_reset(struct memory_bitmap *bm); -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -int toi_alloc_bitmap(struct memory_bitmap **bm); -void toi_free_bitmap(struct memory_bitmap **bm); -void memory_bm_clear(struct memory_bitmap *bm); -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); - -struct toi_module_ops; -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_space_needed(struct memory_bitmap *bm); - -extern struct memory_bitmap *pageset1_map; -extern struct memory_bitmap *pageset1_copy_map; -extern struct memory_bitmap *pageset2_map; -extern struct memory_bitmap *page_resave_map; -extern struct memory_bitmap *io_map; -extern struct memory_bitmap *nosave_map; -extern struct memory_bitmap *free_map; -extern struct memory_bitmap *compare_map; - -#define PagePageset1(page) \ - (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1(page) \ - (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1(page) \ - (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset1Copy(page) \ - (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1Copy(page) \ - (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1Copy(page) \ - (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset2(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset2(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset2(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageWasRW(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPageWasRW(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageWasRW(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageResave(page) (page_resave_map ? 
\ - memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageResave(page) \ - (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageResave(page) \ - (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosave(page) (nosave_map ? \ - memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosave(page) \ - (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosave(page) \ - (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosaveFree(page) (free_map ? \ - memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosaveFree(page) \ - (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosaveFree(page) \ - (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page))) - -#define PageCompareChanged(page) (compare_map ? \ - memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageCompareChanged(page) \ - (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageCompareChanged(page) \ - (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page))) - -extern void save_pageflags(struct memory_bitmap *pagemap); -extern int load_pageflags(struct memory_bitmap *pagemap); -extern int toi_pageflags_space_needed(void); -#endif diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c deleted file mode 100644 index 7c78773cf..000000000 --- a/kernel/power/tuxonice_power_off.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for powering down. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" - -unsigned long toi_poweroff_method; /* 0 - Kernel power off */ - -static int wake_delay; -static char lid_state_file[256], wake_alarm_dir[256]; -static struct file *lid_file, *alarm_file, *epoch_file; -static int post_wake_state = -1; - -static int did_suspend_to_both; - -/* - * __toi_power_down - * Functionality : Powers down or reboots the computer once the image - * has been written to disk. - * Key Assumptions : Able to reboot/power down via code called or that - * the warning emitted if the calls fail will be visible - * to the user (ie printk resumes devices). - */ - -static void __toi_power_down(int method) -{ - int error; - - toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : - "Powering down."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (test_action_state(TOI_REBOOT)) - kernel_restart(NULL); - - switch (method) { - case 0: - break; - case 3: - /* - * Re-read the overwritten part of pageset2 to make post-resume - * faster. - */ - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. 
" - "Try rebooting."); - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (!error) { - pm_restore_gfp_mask(); - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - pm_restrict_gfp_mask(); - if (!error) - did_suspend_to_both = 1; - } - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - - /* Success - we're now post-resume-from-ram */ - if (did_suspend_to_both) - return; - - /* Failed to suspend to ram - do normal power off */ - break; - case 4: - /* - * If succeeds, doesn't return. If fails, do a simple - * powerdown. - */ - hibernation_platform_enter(); - break; - case 5: - /* Historic entry only now */ - break; - } - - if (method && method != 5) - toi_cond_pause(1, - "Falling back to alternate power off method."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (pm_power_off) - kernel_power_off(); - kernel_halt(); - toi_cond_pause(1, "Powerdown failed."); - while (1) - cpu_relax(); - -out: - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. Try rebooting."); - return; -} - -#define CLOSE_FILE(file) \ - if (file) { \ - filp_close(file, NULL); file = NULL; \ - } - -static void powerdown_cleanup(int toi_or_resume) -{ - if (!toi_or_resume) - return; - - CLOSE_FILE(lid_file); - CLOSE_FILE(alarm_file); - CLOSE_FILE(epoch_file); -} - -static void open_file(char *format, char *arg, struct file **var, int mode, - char *desc) -{ - char buf[256]; - - if (strlen(arg)) { - sprintf(buf, format, arg); - *var = filp_open(buf, mode, 0); - if (IS_ERR(*var) || !*var) { - printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", - desc, buf, *var); - *var = NULL; - } - } -} - -static int powerdown_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - did_suspend_to_both = 0; - - open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, - O_RDONLY, "lid"); - - if (strlen(wake_alarm_dir)) { - open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, - &alarm_file, O_WRONLY, "alarm"); - - open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, - &epoch_file, O_RDONLY, "epoch"); - } - - return 0; -} - -static int lid_closed(void) -{ - char array[25]; - ssize_t size; - loff_t pos = 0; - - if (!lid_file) - return 0; - - size = vfs_read(lid_file, (char __user *) array, 25, &pos); - if ((int) size < 1) { - printk(KERN_INFO "Failed to read lid state file (%d).\n", - (int) size); - return 0; - } - - if (!strcmp(array, "state: closed\n")) - return 1; - - return 0; -} - -static void write_alarm_file(int value) -{ - ssize_t size; - char buf[40]; - loff_t pos = 0; - - if (!alarm_file) - return; - - sprintf(buf, "%d\n", value); - - size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); - - if (size < 0) - printk(KERN_INFO "Error %d writing alarm value %s.\n", - (int) size, buf); -} - -/** - * toi_check_resleep: See whether to powerdown again after waking. - * - * After waking, check whether we should powerdown again in a (usually - * different) way. We only do this if the lid switch is still closed. - */ -void toi_check_resleep(void) -{ - /* We only return if we suspended to ram and woke. 
*/ - if (lid_closed() && post_wake_state >= 0) - __toi_power_down(post_wake_state); -} - -void toi_power_down(void) -{ - if (alarm_file && wake_delay) { - char array[25]; - loff_t pos = 0; - size_t size = vfs_read(epoch_file, (char __user *) array, 25, - &pos); - - if (((int) size) < 1) - printk(KERN_INFO "Failed to read epoch file (%d).\n", - (int) size); - else { - unsigned long since_epoch; - if (!kstrtoul(array, 0, &since_epoch)) { - /* Clear any wakeup time. */ - write_alarm_file(0); - - /* Set new wakeup time. */ - write_alarm_file(since_epoch + wake_delay); - } - } - } - - __toi_power_down(toi_poweroff_method); - - toi_check_resleep(); -} - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_ACPI) - SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), - SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), - SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), - SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, - NULL), - SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), - SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, - 0, 0, 0, NULL) -#endif -}; - -static struct toi_module_ops powerdown_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "poweroff", - .initialise = powerdown_init, - .cleanup = powerdown_cleanup, - .directory = "[ROOT]", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_poweroff_init(void) -{ - return toi_register_module(&powerdown_ops); -} - -void toi_poweroff_exit(void) -{ - toi_unregister_module(&powerdown_ops); -} diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h deleted file mode 100644 index 6e1d8bb39..000000000 --- a/kernel/power/tuxonice_power_off.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for the powering down. - */ - -int toi_pm_state_finish(void); -void toi_power_down(void); -extern unsigned long toi_poweroff_method; -int toi_poweroff_init(void); -void toi_poweroff_exit(void); -void toi_check_resleep(void); - -extern int platform_begin(int platform_mode); -extern int platform_pre_snapshot(int platform_mode); -extern void platform_leave(int platform_mode); -extern void platform_end(int platform_mode); -extern void platform_finish(int platform_mode); -extern int platform_pre_restore(int platform_mode); -extern void platform_restore_cleanup(int platform_mode); diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c deleted file mode 100644 index df37cc805..000000000 --- a/kernel/power/tuxonice_prepare_image.c +++ /dev/null @@ -1,1089 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * We need to eat memory until we can: - * 1. Perform the save without changing anything (RAM_NEEDED < #pages) - * 2. Fit it all in available space (toiActiveAllocator->available_space() >= - * main_storage_needed()) - * 3. Reload the pagedir and pageset1 to places that don't collide with their - * final destinations, not knowing to what extent the resumed kernel will - * overlap with the one loaded at boot time. 
I think the resumed kernel - * should overlap completely, but I don't want to rely on this as it is - * an unproven assumption. We therefore assume there will be no overlap at - * all (worse case). - * 4. Meet the user's requested limit (if any) on the size of the image. - * The limit is in MB, so pages/256 (assuming 4K pages). - * - */ - -#include -#include -#include -#include -#include -#include - -#include "tuxonice_pageflags.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_checksum.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_builtin.h" - -static unsigned long num_nosave, main_storage_allocated, storage_limit, - header_storage_needed; -unsigned long extra_pd1_pages_allowance = - CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; -long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; -static int no_ps2_needed; - -struct attention_list { - struct task_struct *task; - struct attention_list *next; -}; - -static struct attention_list *attention_list; - -#define PAGESET1 0 -#define PAGESET2 1 - -void free_attention_list(void) -{ - struct attention_list *last = NULL; - - while (attention_list) { - last = attention_list; - attention_list = attention_list->next; - toi_kfree(6, last, sizeof(*last)); - } -} - -static int build_attention_list(void) -{ - int i, task_count = 0; - struct task_struct *p; - struct attention_list *next; - - /* - * Count all userspace process (with task->mm) marked PF_NOFREEZE. - */ - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) - task_count++; - toi_read_unlock_tasklist(); - - /* - * Allocate attention list structs. - */ - for (i = 0; i < task_count; i++) { - struct attention_list *this = - toi_kzalloc(6, sizeof(struct attention_list), - TOI_WAIT_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate slab for " - "attention list.\n"); - free_attention_list(); - return 1; - } - this->next = NULL; - if (attention_list) - this->next = attention_list; - attention_list = this; - } - - next = attention_list; - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) { - next->task = p; - next = next->next; - } - toi_read_unlock_tasklist(); - return 0; -} - -static void pageset2_full(void) -{ - struct zone *zone; - struct page *page; - unsigned long flags; - int i; - - toi_trace_index++; - - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lru_lock, flags); - for_each_lru(i) { - if (!zone_page_state(zone, NR_LRU_BASE + i)) - continue; - - list_for_each_entry(page, &zone->lruvec.lists[i], lru) { - struct address_space *mapping; - - mapping = page_mapping(page); - if (!mapping || !mapping->host || - !(mapping->host->i_flags & S_ATOMIC_COPY)) { - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified."); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full."); - SetPagePageset2(page); - } - } - } - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - } -} - -/* - * toi_mark_task_as_pageset - * Functionality : Marks all the saveable pages belonging to a given process - * as belonging to a particular pageset. 
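build_attention_list() above uses a common kernel idiom: allocation may sleep, so it must not happen under the tasklist lock. The list is therefore taken in two passes, counting under the lock, allocating with the lock dropped, then walking again to record the pointers. A userspace sketch of the same two-pass snapshot, with a mutex standing in for the tasklist lock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static const int items[] = { 1, 2, 3, 4, 5 };   /* the "task list" */
#define N_ITEMS 5

static int wanted(int v)        /* stand-in for the PF_NOFREEZE test */
{
        return v & 1;
}

int main(void)
{
        int i, count = 0, *snap;

        /* Pass 1: count matches under the lock; no allocation here. */
        pthread_mutex_lock(&list_lock);
        for (i = 0; i < N_ITEMS; i++)
                if (wanted(items[i]))
                        count++;
        pthread_mutex_unlock(&list_lock);

        /* Allocate with the lock dropped: allocation may block. */
        snap = malloc(count * sizeof(*snap));
        if (!snap)
                return 1;

        /* Pass 2: retake the lock and record the matches. */
        pthread_mutex_lock(&list_lock);
        for (i = 0, count = 0; i < N_ITEMS; i++)
                if (wanted(items[i]))
                        snap[count++] = items[i];
        pthread_mutex_unlock(&list_lock);

        for (i = 0; i < count; i++)
                printf("%d ", snap[i]);
        printf("\n");
        free(snap);
        return 0;
}

The window between the two passes is tolerable here for the reason the code itself gives: the PF_NOFREEZE tasks of interest are tied to hibernation and will not go away mid-cycle.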
- */ - -static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) -{ - struct vm_area_struct *vma; - struct mm_struct *mm; - - mm = t->active_mm; - - if (!mm || !mm->mmap) - return; - - toi_trace_index++; - - if (!irqs_disabled()) - down_read(&mm->mmap_sem); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long posn; - - if (!vma->vm_start || - vma->vm_flags & VM_PFNMAP) - continue; - - for (posn = vma->vm_start; posn < vma->vm_end; - posn += PAGE_SIZE) { - struct page *page = follow_page(vma, posn, 0); - struct address_space *mapping; - - if (!page || !pfn_valid(page_to_pfn(page))) - continue; - - mapping = page_mapping(page); - if (mapping && mapping->host && - mapping->host->i_flags & S_ATOMIC_COPY && pageset2) - continue; - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2); - continue; - } - - if (pageset2) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1"); - SetPagePageset2(page); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2"); - ClearPagePageset2(page); - SetPagePageset1(page); - } - } - } - - if (!irqs_disabled()) - up_read(&mm->mmap_sem); -} - -static void mark_tasks(int pageset) -{ - struct task_struct *p; - - toi_read_lock_tasklist(); - for_each_process(p) { - if (!p->mm) - continue; - - if (p->flags & PF_KTHREAD) - continue; - - toi_mark_task_as_pageset(p, pageset); - } - toi_read_unlock_tasklist(); - -} - -/* mark_pages_for_pageset2 - * - * Description: Mark unshared pages in processes not needed for hibernate as - * being able to be written out in a separate pagedir. - * HighMem pages are simply marked as pageset2. They won't be - * needed during hibernate. - */ - -static void toi_mark_pages_for_pageset2(void) -{ - struct attention_list *this = attention_list; - - memory_bm_clear(pageset2_map); - - if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) - return; - - if (test_action_state(TOI_PAGESET2_FULL)) - pageset2_full(); - else - mark_tasks(PAGESET2); - - /* - * Because the tasks in attention_list are ones related to hibernating, - * we know that they won't go away under us. - */ - - while (this) { - if (!test_result_state(TOI_ABORTED)) - toi_mark_task_as_pageset(this->task, PAGESET1); - this = this->next; - } -} - -/* - * The atomic copy of pageset1 is stored in pageset2 pages. - * But if pageset1 is larger (normally only just after boot), - * we need to allocate extra pages to store the atomic copy. - * The following data struct and functions are used to handle - * the allocation and freeing of that memory. - */ - -static unsigned long extra_pages_allocated; - -struct extras { - struct page *page; - int order; - struct extras *next; -}; - -static struct extras *extras_list; - -/* toi_free_extra_pagedir_memory - * - * Description: Free previously allocated extra pagedir memory. - */ -void toi_free_extra_pagedir_memory(void) -{ - /* Free allocated pages */ - while (extras_list) { - struct extras *this = extras_list; - int i; - - extras_list = this->next; - - for (i = 0; i < (1 << this->order); i++) - ClearPageNosave(this->page + i); - - toi_free_pages(9, this->page, this->order); - toi_kfree(7, this, sizeof(*this)); - } - - extra_pages_allocated = 0; -} - -/* toi_allocate_extra_pagedir_memory - * - * Description: Allocate memory for making the atomic copy of pagedir1 in the - * case where it is bigger than pagedir2. - * Arguments: int num_to_alloc: Number of extra pages needed. - * Result: int. 
Number of extra pages we now have allocated. - */ -static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) -{ - int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; - gfp_t flags = TOI_ATOMIC_GFP; - - if (num_to_alloc < 1) - return 0; - - order = fls(num_to_alloc); - if (order >= MAX_ORDER) - order = MAX_ORDER - 1; - - while (num_to_alloc) { - struct page *newpage; - unsigned long virt; - struct extras *extras_entry; - - while ((1 << order) > num_to_alloc) - order--; - - extras_entry = (struct extras *) toi_kzalloc(7, - sizeof(struct extras), TOI_ATOMIC_GFP); - - if (!extras_entry) - return extra_pages_allocated; - - virt = toi_get_free_pages(9, flags, order); - while (!virt && order) { - order--; - virt = toi_get_free_pages(9, flags, order); - } - - if (!virt) { - toi_kfree(7, extras_entry, sizeof(*extras_entry)); - return extra_pages_allocated; - } - - newpage = virt_to_page(virt); - - extras_entry->page = newpage; - extras_entry->order = order; - extras_entry->next = extras_list; - - extras_list = extras_entry; - - for (j = 0; j < (1 << order); j++) { - SetPageNosave(newpage + j); - SetPagePageset1Copy(newpage + j); - } - - extra_pages_allocated += (1 << order); - num_to_alloc -= (1 << order); - } - - return extra_pages_allocated; -} - -/* - * real_nr_free_pages: Count pcp pages for a zone type or all zones - * (-1 for all, otherwise zone_idx() result desired). - */ -unsigned long real_nr_free_pages(unsigned long zone_idx_mask) -{ - struct zone *zone; - int result = 0, cpu; - - /* PCP lists */ - for_each_populated_zone(zone) { - if (!(zone_idx_mask & (1 << zone_idx(zone)))) - continue; - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - result += pcp->count; - } - - result += zone_page_state(zone, NR_FREE_PAGES); - } - return result; -} - -/* - * Discover how much extra memory will be required by the drivers - * when they're asked to hibernate. We can then ensure that amount - * of memory is available when we really want it. - */ -static void get_extra_pd1_allowance(void) -{ - unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; - - toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); - - if (toi_go_atomic(PMSG_FREEZE, 1)) - return; - - final = real_nr_free_pages(all_zones_mask); - toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); - - extra_pd1_pages_allowance = (orig_num_free > final) ? - orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : - MIN_EXTRA_PAGES_ALLOWANCE; -} - -/* - * Amount of storage needed, possibly taking into account the - * expected compression ratio and possibly also ignoring our - * allowance for extra pages. - */ -static unsigned long main_storage_needed(int use_ecr, - int ignore_extra_pd1_allow) -{ - return (pagedir1.size + pagedir2.size + - (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * - (use_ecr ? toi_expected_compression_ratio() : 100) / 100; -} - -/* - * Storage needed for the image header, in bytes until the return. - * - * fs_info_space_needed is saved in a static variable unless we - * explicitly want to reset the value (done at the start of a cycle) - * as it requires memory allocation that may result in a hang if we're - * also trying to free memory. 
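toi_allocate_extra_pagedir_memory() above asks for the largest power-of-two block that still fits the remaining need and steps the order down on failure, so a fragmented system degrades into many small allocations rather than failing outright. The same descending-order loop, rendered as a standalone userspace sketch:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SZ   4096
#define MAX_ORDER 11    /* mirrors the kernel's traditional default */

/* Allocate `pages` pages in power-of-two chunks, largest first,
 * stepping the order down whenever an allocation fails. Returns how
 * many pages were obtained. */
static int alloc_in_chunks(int pages)
{
        int order = 0, allocated = 0;

        while ((1 << (order + 1)) <= pages && order + 1 < MAX_ORDER)
                order++;        /* biggest order that fits the request */

        while (pages > 0) {
                void *block;

                while ((1 << order) > pages && order > 0)
                        order--;        /* don't over-allocate */

                block = malloc((size_t)PAGE_SZ << order);
                if (!block) {
                        if (!order)
                                return allocated;  /* truly out of memory */
                        order--;        /* retry with a smaller chunk */
                        continue;
                }
                /* A real version records `block` so it can be freed
                 * later; the sketch releases it immediately. */
                allocated += 1 << order;
                pages -= 1 << order;
                free(block);
        }
        return allocated;
}

int main(void)
{
        printf("got %d pages\n", alloc_in_chunks(1000));
        return 0;
}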
- */ -unsigned long get_header_storage_needed(int reset) -{ - unsigned long bytes = sizeof(struct toi_header) + - toi_header_storage_for_modules() + - toi_pageflags_space_needed() + - fs_info_space_needed(0); - - return DIV_ROUND_UP(bytes, PAGE_SIZE); -} - -/* - * When freeing memory, pages from either pageset might be freed. - * - * When seeking to free memory to be able to hibernate, for every ps1 page - * freed, we need 2 less pages for the atomic copy because there is one less - * page to copy and one more page into which data can be copied. - * - * Freeing ps2 pages saves us nothing directly. No more memory is available - * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but - * that's too much work to figure out. - * - * => ps1_to_free functions - * - * Of course if we just want to reduce the image size, because of storage - * limitations or an image size limit either ps will do. - * - * => any_to_free function - */ - -static unsigned long lowpages_usable_for_highmem_copy(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return available > needed ? available - needed : 0; -} - -static unsigned long highpages_ps1_to_free(void) -{ - unsigned long need = get_highmem_size(pagedir1), - available = get_highmem_size(pagedir2) + - real_nr_free_high_pages() + - lowpages_usable_for_highmem_copy(); - - return need > available ? DIV_ROUND_UP(need - available, 2) : 0; -} - -static unsigned long lowpages_ps1_to_free(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0; -} - -static unsigned long current_image_size(void) -{ - return pagedir1.size + pagedir2.size + header_storage_needed; -} - -static unsigned long storage_still_required(void) -{ - unsigned long needed = main_storage_needed(1, 1); - return needed > storage_limit ? needed - storage_limit : 0; -} - -static unsigned long ram_still_required(void) -{ - unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + - 2 * extra_pd1_pages_allowance, - available = real_nr_free_low_pages() + extra_pages_allocated; - return needed > available ? needed - available : 0; -} - -unsigned long any_to_free(int use_image_size_limit) -{ - int use_soft_limit = use_image_size_limit && image_size_limit > 0; - unsigned long current_size = current_image_size(), - soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, - to_free = use_soft_limit ? (current_size > soft_limit ? - current_size - soft_limit : 0) : 0, - storage_limit = storage_still_required(), - ram_limit = ram_still_required(), - first_max = max(to_free, storage_limit); - - return max(first_max, ram_limit); -} - -static int need_pageset2(void) -{ - return (real_nr_free_low_pages() + extra_pages_allocated - - 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - - toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; -} - -/* amount_needed - * - * Calculates the amount by which the image size needs to be reduced to meet - * our constraints. 
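The DIV_ROUND_UP(need - available, 2) in the ps1_to_free helpers above encodes the observation from the earlier comment: freeing one pageset1 page helps twice, since there is one page fewer to copy and one more page into which data can be copied. A quick numerical check of that arithmetic:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long needed = 1000, available = 993;
        /* The shortfall is 7 pages, but each freed pageset1 page both
         * shrinks `needed` and grows `available`, so 4 frees suffice. */
        unsigned long to_free = DIV_ROUND_UP(needed - available, 2);

        printf("free %lu: needed %lu vs available %lu\n",
               to_free, needed - to_free, available + to_free);
        return 0;
}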
- */ -static unsigned long amount_needed(int use_image_size_limit) -{ - return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), - any_to_free(use_image_size_limit)); -} - -static int image_not_ready(int use_image_size_limit) -{ - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Amount still needed (%lu) > 0:%u," - " Storage allocd: %lu < %lu: %u.\n", - amount_needed(use_image_size_limit), - (amount_needed(use_image_size_limit) > 0), - main_storage_allocated, - main_storage_needed(1, 1), - main_storage_allocated < main_storage_needed(1, 1)); - - toi_cond_pause(0, NULL); - - return (amount_needed(use_image_size_limit) > 0) || - main_storage_allocated < main_storage_needed(1, 1); -} - -static void display_failure_reason(int tries_exceeded) -{ - unsigned long storage_required = storage_still_required(), - ram_required = ram_still_required(), - high_ps1 = highpages_ps1_to_free(), - low_ps1 = lowpages_ps1_to_free(); - - printk(KERN_INFO "Failed to prepare the image because...\n"); - - if (!storage_limit) { - printk(KERN_INFO "- You need some storage available to be " - "able to hibernate.\n"); - return; - } - - if (tries_exceeded) - printk(KERN_INFO "- The maximum number of iterations was " - "reached without successfully preparing the " - "image.\n"); - - if (storage_required) { - printk(KERN_INFO " - We need at least %lu pages of storage " - "(ignoring the header), but only have %lu.\n", - main_storage_needed(1, 1), - main_storage_allocated); - set_abort_result(TOI_INSUFFICIENT_STORAGE); - } - - if (ram_required) { - printk(KERN_INFO " - We need %lu more free pages of low " - "memory.\n", ram_required); - printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); - printk(KERN_INFO " + Reqd. by modules : %8lu\n", - toi_memory_for_modules(0)); - printk(KERN_INFO " + 2 * extra allow : %8lu\n", - 2 * extra_pd1_pages_allowance); - printk(KERN_INFO " - Currently free : %8lu\n", - real_nr_free_low_pages()); - printk(KERN_INFO " - Pages allocd : %8lu\n", - extra_pages_allocated); - printk(KERN_INFO " : ========\n"); - printk(KERN_INFO " Still needed : %8lu\n", - ram_required); - - /* Print breakdown of memory needed for modules */ - toi_memory_for_modules(1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (high_ps1) { - printk(KERN_INFO "- We need to free %lu highmem pageset 1 " - "pages.\n", high_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (low_ps1) { - printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " - "pages.\n", low_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } -} - -static void display_stats(int always, int sub_extra_pd1_allow) -{ - char buffer[255]; - snprintf(buffer, 254, - "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " - "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). " - "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", - - /* Free */ - real_nr_free_pages(all_zones_mask), - real_nr_free_low_pages(), - - /* Sets */ - pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), - pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), - - /* Nosave */ - num_nosave, extra_pages_allocated, - num_nosave - extra_pages_allocated, - - /* Storage */ - main_storage_allocated, - storage_limit, - main_storage_needed(1, sub_extra_pd1_allow), - main_storage_needed(1, 1), - - /* Needed */ - lowpages_ps1_to_free(), highpages_ps1_to_free(), - any_to_free(1), - MIN_FREE_RAM, toi_memory_for_modules(0), - extra_pd1_pages_allowance, - image_size_limit, - - need_pageset2() ? 
"yes" : "no"); - - if (always) - printk("%s", buffer); - else - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); -} - -/* flag_image_pages - * - * This routine generates our lists of pages to be stored in each - * pageset. Since we store the data using extents, and adding new - * extents might allocate a new extent page, this routine may well - * be called more than once. - */ -static void flag_image_pages(int atomic_copy) -{ - int num_free = 0, num_unmodified = 0; - unsigned long loop; - struct zone *zone; - - pagedir1.size = 0; - pagedir2.size = 0; - - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - num_nosave = 0; - toi_trace_index++; - - memory_bm_clear(pageset1_map); - - toi_generate_free_page_map(); - - /* - * Pages not to be saved are marked Nosave irrespective of being - * reserved. - */ - for_each_populated_zone(zone) { - int highmem = is_highmem(zone); - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - TOI_TRACE_DEBUG(pfn, "_Flag Invalid"); - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - unsigned long y; - for (y = pfn; y < pfn + chunk_size; y++) { - page = pfn_to_page(y); - TOI_TRACE_DEBUG(y, "_Flag Free"); - ClearPagePageset1(page); - ClearPagePageset2(page); - } - num_free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page)) { - char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave"; - TOI_TRACE_DEBUG(pfn, "_Flag %s", desc); - num_nosave++; - continue; - } - - page = highmem ? saveable_highmem_page(zone, pfn) : - saveable_page(zone, pfn); - - if (!page) { - TOI_TRACE_DEBUG(pfn, "_Flag Nosave2"); - num_nosave++; - continue; - } - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(pfn, "_Unmodified"); - num_unmodified++; - continue; - } - - if (PagePageset2(page)) { - pagedir2.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS2"); - if (PageHighMem(page)) - inc_highmem_size(pagedir2); - else - SetPagePageset1Copy(page); - if (PageResave(page)) { - SetPagePageset1(page); - ClearPagePageset1Copy(page); - pagedir1.size++; - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } else { - pagedir1.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS1"); - SetPagePageset1(page); - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } - } - - if (!atomic_copy) - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, - "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" - " + Unmodified (%d) + NumFree (%d) = %d.\n", - pagedir1.size, pagedir2.size, num_nosave, num_unmodified, - num_free, pagedir1.size + pagedir2.size + num_nosave + num_free); -} - -void toi_recalculate_image_contents(int atomic_copy) -{ - memory_bm_clear(pageset1_map); - if (!atomic_copy) { - unsigned long pfn; - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); - pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) - ClearPagePageset1Copy(pfn_to_page(pfn)); - /* Need to call this before getting pageset1_size! 
*/ - toi_mark_pages_for_pageset2(); - } - memory_bm_position_reset(pageset2_map); - flag_image_pages(atomic_copy); - - if (!atomic_copy) { - storage_limit = toiActiveAllocator->storage_available(); - display_stats(0, 0); - } -} - -int try_allocate_extra_memory(void) -{ - unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance - - get_lowmem_size(pagedir2); - if (wanted > extra_pages_allocated) { - unsigned long got = toi_allocate_extra_pagedir_memory(wanted); - if (wanted < got) { - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Want %d extra pages for pageset1, got %d.\n", - wanted, got); - return 1; - } - } - return 0; -} - -/* update_image - * - * Allocate [more] memory and storage for the image. - */ -static void update_image(int ps2_recalc) -{ - int old_header_req; - unsigned long seek; - - if (try_allocate_extra_memory()) - return; - - if (ps2_recalc) - goto recalc; - - thaw_kernel_threads(); - - /* - * Allocate remaining storage space, if possible, up to the - * maximum we know we'll need. It's okay to allocate the - * maximum if the writer is the swapwriter, but - * we don't want to grab all available space on an NFS share. - * We therefore ignore the expected compression ratio here, - * thereby trying to allocate the maximum image size we could - * need (assuming compression doesn't expand the image), but - * don't complain if we can't get the full amount we're after. - */ - - do { - int result; - - old_header_req = header_storage_needed; - toiActiveAllocator->reserve_header_space(header_storage_needed); - - /* How much storage is free with the reservation applied? */ - storage_limit = toiActiveAllocator->storage_available(); - seek = min(storage_limit, main_storage_needed(0, 0)); - - result = toiActiveAllocator->allocate_storage(seek); - if (result) - printk("Failed to allocate storage (%d).\n", result); - - main_storage_allocated = - toiActiveAllocator->storage_allocated(); - - /* Need more header because more storage allocated? */ - header_storage_needed = get_header_storage_needed(0); - - } while (header_storage_needed > old_header_req); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - -recalc: - toi_recalculate_image_contents(0); -} - -/* attempt_to_freeze - * - * Try to freeze processes. - */ - -static int attempt_to_freeze(void) -{ - int result; - - /* Stop processes before checking again */ - toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing " - "filesystems."); - result = freeze_processes(); - - if (result) - set_abort_result(TOI_FREEZING_FAILED); - - result = freeze_kernel_threads(); - - if (result) - set_abort_result(TOI_FREEZING_FAILED); - - return result; -} - -/* eat_memory - * - * Try to free some memory, either to meet hard or soft constraints on the image - * characteristics. - * - * Hard constraints: - * - Pageset1 must be < half of memory; - * - We must have enough memory free at resume time to have pageset1 - * be able to be loaded in pages that don't conflict with where it has to - * be restored. - * Soft constraints - * - User specificied image size limit. - */ -static void eat_memory(void) -{ - unsigned long amount_wanted = 0; - int did_eat_memory = 0; - - /* - * Note that if we have enough storage space and enough free memory, we - * may exit without eating anything. We give up when the last 10 - * iterations ate no extra pages because we're not going to get much - * more anyway, but the few pages we get will take a lot of time. 
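The comment above describes a progress-bounded retry loop: keep reclaiming while it helps, and stop once a run of passes frees nothing, because further passes cost time without shrinking the image. Its skeleton looks like the following sketch, where reclaim_some() is a hypothetical stand-in for shrink_memory_mask():

#include <stdio.h>

/* Hypothetical reclaim step: returns the pages it managed to free. */
static unsigned long reclaim_some(void)
{
        static unsigned long budget = 37;
        unsigned long got = budget > 10 ? 10 : budget;

        budget -= got;
        return got;
}

int main(void)
{
        unsigned long wanted = 100, freed = 0;
        int barren = 0;         /* consecutive passes with no progress */

        while (freed < wanted && barren < 10) {
                unsigned long got = reclaim_some();

                if (got)
                        barren = 0;
                else
                        barren++;       /* stop after 10 empty passes */
                freed += got;
        }
        printf("freed %lu of %lu pages\n", freed, wanted);
        return 0;
}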
- * - * We freeze processes before beginning, and then unfreeze them if we - * need to eat memory until we think we have enough. If our attempts - * to freeze fail, we give up and abort. - */ - - amount_wanted = amount_needed(1); - - switch (image_size_limit) { - case -1: /* Don't eat any memory */ - if (amount_wanted > 0) { - set_abort_result(TOI_WOULD_EAT_MEMORY); - return; - } - break; - case -2: /* Free caches only */ - drop_pagecache(); - toi_recalculate_image_contents(0); - amount_wanted = amount_needed(1); - break; - default: - break; - } - - if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) && - image_size_limit != -1) { - unsigned long request = amount_wanted; - unsigned long high_req = max(highpages_ps1_to_free(), - any_to_free(1)); - unsigned long low_req = lowpages_ps1_to_free(); - unsigned long got = 0; - - toi_prepare_status(CLEAR_BAR, - "Seeking to free %ldMB of memory.", - MB(amount_wanted)); - - thaw_kernel_threads(); - - /* - * Ask for too many because shrink_memory_mask doesn't - * currently return enough most of the time. - */ - - if (low_req) - got = shrink_memory_mask(low_req, GFP_KERNEL); - if (high_req) - shrink_memory_mask(high_req - got, GFP_HIGHUSER); - - did_eat_memory = 1; - - toi_recalculate_image_contents(0); - - amount_wanted = amount_needed(1); - - printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &" - " %ld pages from anywhere, got %ld.\n", - high_req, low_req, - request - amount_wanted); - - toi_cond_pause(0, NULL); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - } - - if (did_eat_memory) - toi_recalculate_image_contents(0); -} - -/* toi_prepare_image - * - * Entry point to the whole image preparation section. - * - * We do four things: - * - Freeze processes; - * - Ensure image size constraints are met; - * - Complete all the preparation for saving the image, - * including allocation of storage. The only memory - * that should be needed when we're finished is that - * for actually storing the image (and we know how - * much is needed for that because the modules tell - * us). - * - Make sure that all dirty buffers are written out. - */ -#define MAX_TRIES 2 -int toi_prepare_image(void) -{ - int result = 1, tries = 1; - - main_storage_allocated = 0; - - // Force recalculation of the amount of header storage needed for fs info. - fs_info_space_needed(1); - - no_ps2_needed = 0; - - if (attempt_to_freeze()) - return 1; - - lock_device_hotplug(); - set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - - if (!extra_pd1_pages_allowance) - get_extra_pd1_allowance(); - - storage_limit = toiActiveAllocator->storage_available(); - - if (!storage_limit) { - printk(KERN_INFO "No storage available. Didn't try to prepare " - "an image.\n"); - display_failure_reason(0); - set_abort_result(TOI_NOSTORAGE_AVAILABLE); - return 1; - } - - if (build_attention_list()) { - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - return 1; - } - - toi_recalculate_image_contents(0); - - do { - toi_prepare_status(CLEAR_BAR, - "Preparing Image. Try %d.", tries); - - eat_memory(); - - if (test_result_state(TOI_ABORTED)) - break; - - update_image(0); - - tries++; - - } while (image_not_ready(1) && tries <= MAX_TRIES && - !test_result_state(TOI_ABORTED)); - - result = image_not_ready(0); - - /* TODO: Handle case where need to remove existing image and resave - * instead of adding to incremental image. 
*/ - - if (!test_result_state(TOI_ABORTED)) { - if (result) { - display_stats(1, 0); - display_failure_reason(tries > MAX_TRIES); - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - } else { - /* Pageset 2 needed? */ - if (!need_pageset2() && - test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { - no_ps2_needed = 1; - toi_recalculate_image_contents(0); - update_image(1); - } - - toi_cond_pause(1, "Image preparation complete."); - } - } - - return result ? result : allocate_checksum_pages(); -} diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h deleted file mode 100644 index f7f2b695c..000000000 --- a/kernel/power/tuxonice_prepare_image.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include - -extern int toi_prepare_image(void); -extern void toi_recalculate_image_contents(int storage_available); -extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); -extern long image_size_limit; -extern void toi_free_extra_pagedir_memory(void); -extern unsigned long extra_pd1_pages_allowance; -extern void free_attention_list(void); - -#define MIN_FREE_RAM 100 -#define MIN_EXTRA_PAGES_ALLOWANCE 500 - -#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) -#ifdef CONFIG_HIGHMEM -#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ - (1 << ZONE_HIGHMEM))) -#else -#define real_nr_free_high_pages() (0) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) - -/* For eat_memory function */ -#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) -#endif - -unsigned long get_header_storage_needed(int reset); -unsigned long any_to_free(int use_image_size_limit); -int try_allocate_extra_memory(void); diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c deleted file mode 100644 index 5bc56d3a1..000000000 --- a/kernel/power/tuxonice_prune.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * kernel/power/tuxonice_prune.c - * - * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file implements a TuxOnIce module that seeks to prune the - * amount of data written to disk. It builds a table of hashes - * of the uncompressed data, and writes the pfn of the previous page - * with the same contents instead of repeating the data when a match - * is found. - */ - -#include -#include -#include -#include -#include -#include - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -/* - * We never write a page bigger than PAGE_SIZE, so use a large number - * to indicate that data is a PFN. - */ -#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100) - -static unsigned long toi_pruned_pages; - -static struct toi_module_ops toi_prune_ops; -static struct toi_module_ops *next_driver; - -static char toi_prune_hash_algo_name[32] = "sha1"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - struct shash_desc desc; - char *digest; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. 
- */ -static int toi_prune_crypto_prepare(void) -{ - int cpu, ret, digestsize; - - if (!*toi_prune_hash_algo_name) { - printk(KERN_INFO "TuxOnIce: Pruning enabled but no " - "hash algorithm set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0); - if (IS_ERR(this->desc.tfm)) { - printk(KERN_INFO "TuxOnIce: Failed to allocate the " - "%s prune hash algorithm.\n", - toi_prune_hash_algo_name); - this->desc.tfm = NULL; - return 1; - } - - if (!digestsize) - digestsize = crypto_shash_digestsize(this->desc.tfm); - - this->digest = kmalloc(digestsize, GFP_KERNEL); - if (!this->digest) { - printk(KERN_INFO "TuxOnIce: Failed to allocate space " - "for digest output.\n"); - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - } - - this->desc.flags = 0; - - ret = crypto_shash_init(&this->desc); - if (ret < 0) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s prune hash algorithm.\n", - toi_prune_hash_algo_name); - kfree(this->digest); - this->digest = NULL; - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - return 1; - } - } - - return 0; -} - -static int toi_prune_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->desc.tfm) { - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - } - - if (this->digest) { - kfree(this->digest); - this->digest = NULL; - } - } - - return 0; -} - -/* - * toi_prune_init - */ - -static int toi_prune_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_pruned_pages = 0; - - next_driver = toi_get_next_filter(&toi_prune_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_prune_rw_init() - */ - -static int toi_prune_rw_init(int rw, int stream_number) -{ - if (toi_prune_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise prune " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "pruning the image.\n"); - toi_prune_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_prune_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be checked. - * - * Returns: 0 on success. Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. 
- */ -static int toi_prune_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(), write_data = 1; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - void *buffer_start; - u32 buf[4]; - - if (ctx->desc.tfm) { - - buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, &ctx->digest); - if (ret) { - printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret); - } else { - mutex_lock(&stats_lock); - - toi_pruned_pages++; - - mutex_unlock(&stats_lock); - - } - - TOI_UNMAP(buf_type, buffer_page); - } - - if (write_data) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - else - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_prune_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules or from a previously loaded page and - * fill the input buffer. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_prune_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->desc.tfm) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't handle - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, buf_type, buffer_page, &len); - - if (len == PRUNE_DATA_IS_PFN) { - buffer_start = kmap(buffer_page); - } - - return ret; -} - -/* - * toi_prune_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_prune_print_debug_stats(char *buffer, int size) -{ - int len; - - /* Output the number of pages pruned. */ - if (*toi_prune_hash_algo_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_prune_hash_algo_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (toi_pruned_pages) - len += scnprintf(buffer+len, size - len, " Pruned " - "%lu pages).\n", - toi_pruned_pages); - return len; -} - -/* - * toi_prune_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_prune_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_prune_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_prune_hash_algo_name) + 1; -} - -/* - * toi_prune_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. 
- */ -static int toi_prune_save_config_info(char *buffer) -{ - int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_pruned_pages; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_prune_hash_algo_name, len); - return offset + len; -} - -/* toi_prune_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for passing back to the - * resumed kernel. - */ -static void toi_prune_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_pruned_pages = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_prune_hash_algo_name, buffer + offset, len); -} - -static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->pruned_pages = toi_pruned_pages; -} - -static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_pruned_pages = bkd->pruned_pages; -} - -/* - * toi_expected_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 - we have no idea how many pages will be pruned. - */ - -static int toi_prune_expected_ratio(void) -{ - return 100; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_prune_ops = { - .type = FILTER_MODULE, - .name = "prune", - .directory = "prune", - .module = THIS_MODULE, - .initialise = toi_prune_init, - .memory_needed = toi_prune_memory_needed, - .print_debug_info = toi_prune_print_debug_stats, - .save_config_info = toi_prune_save_config_info, - .load_config_info = toi_prune_load_config_info, - .storage_needed = toi_prune_storage_needed, - .expected_compression = toi_prune_expected_ratio, - - .pre_atomic_restore = toi_prune_pre_atomic_restore, - .post_atomic_restore = toi_prune_post_atomic_restore, - - .rw_init = toi_prune_rw_init, - .rw_cleanup = toi_prune_rw_cleanup, - - .write_page = toi_prune_write_page, - .read_page = toi_prune_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_prune_load(void) -{ - return toi_register_module(&toi_prune_ops); -} - -late_initcall(toi_prune_load); diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c deleted file mode 100644 index d8539c275..000000000 --- a/kernel/power/tuxonice_storage.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * kernel/power/tuxonice_storage.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for talking to a userspace program that manages storage. 
- * - * The kernel side: - * - starts the userspace program; - * - sends messages telling it when to open and close the connection; - * - tells it when to quit; - * - * The user space side: - * - passes messages regarding status; - * - */ - -#include -#include - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_netlink.h" -#include "tuxonice_storage.h" -#include "tuxonice_ui.h" - -static struct user_helper_data usm_helper_data; -static struct toi_module_ops usm_ops; -static int message_received, usm_prepare_count; -static int storage_manager_last_action, storage_manager_action; - -static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USM_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) - return -EBUSY; - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USM_MSG_SUCCESS: - case USM_MSG_FAILED: - message_received = type; - complete(&usm_helper_data.wait_for_process); - break; - default: - printk(KERN_INFO "Storage manager doesn't recognise " - "message %d.\n", type); - } - - return 1; -} - -#ifdef CONFIG_NET -static int activations; - -int toi_activate_storage(int force) -{ - int tries = 1; - - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations++; - - if (activations > 1 && !force) - return 0; - - while ((!message_received || message_received == USM_MSG_FAILED) && - tries < 2) { - toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " - "%d.\n", tries); - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_CONNECT, - NULL, 0); - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&usm_helper_data.wait_for_process, - 2*HZ); - - tries++; - } - - return 0; -} - -int toi_deactivate_storage(int force) -{ - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations--; - - if (activations && !force) - return 0; - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_DISCONNECT, - NULL, 0); - - wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); - - if (!message_received || message_received == USM_MSG_FAILED) { - printk(KERN_INFO "Returning failure disconnecting storage.\n"); - return 1; - } - - return 0; -} -#endif - -static void storage_manager_simulate(void) -{ - printk(KERN_INFO "--- Storage manager simulate ---\n"); - toi_prepare_usm(); - schedule(); - printk(KERN_INFO "--- Activate storage 1 ---\n"); - toi_activate_storage(1); - schedule(); - printk(KERN_INFO "--- Deactivate storage 1 ---\n"); - toi_deactivate_storage(1); - schedule(); - printk(KERN_INFO "--- Cleanup usm ---\n"); - toi_cleanup_usm(); - schedule(); - printk(KERN_INFO "--- Storage manager simulate ends ---\n"); -} - -static int usm_storage_needed(void) -{ - return sizeof(int) + strlen(usm_helper_data.program) + 1; -} - -static int usm_save_config_info(char *buf) -{ - int len = strlen(usm_helper_data.program); - memcpy(buf, usm_helper_data.program, len + 1); - return sizeof(int) + len + 1; -} - 
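
A note on the layout used here: usm_load_config_info() below reads an int length from the start of the saved buffer and copies the program path from buf + sizeof(int), while usm_save_config_info() above writes the path at offset 0 and only accounts for the leading int in its return value, so the two sides disagree about where the string lives. The following is a minimal standalone sketch of the length-prefixed layout that the loader and usm_storage_needed() expect; the helper names and the example path are hypothetical, and a fixed-size array stands in for the image-header buffer.

#include <stdio.h>
#include <string.h>

/* 255 = the 254-character "program" sysfs limit plus a trailing NUL. */
#define USM_PROGRAM_MAX 255

/* Hypothetical save helper: length first, then the NUL-terminated path. */
static int usm_save_sketch(char *buf, const char *program)
{
    int len = (int) strlen(program) + 1;        /* include the NUL */

    memcpy(buf, &len, sizeof(int));
    memcpy(buf + sizeof(int), program, len);
    return sizeof(int) + len;                   /* header bytes consumed */
}

/* Hypothetical load helper: mirrors what usm_load_config_info() reads. */
static void usm_load_sketch(const char *buf, char *program)
{
    int len;

    memcpy(&len, buf, sizeof(int));
    if (len > USM_PROGRAM_MAX)                  /* don't trust a stored length blindly */
        len = USM_PROGRAM_MAX;
    memcpy(program, buf + sizeof(int), len);
    program[USM_PROGRAM_MAX - 1] = '\0';
}

int main(void)
{
    char header[sizeof(int) + USM_PROGRAM_MAX];
    char program[USM_PROGRAM_MAX] = "";

    usm_save_sketch(header, "/sbin/example-usm");   /* example path only */
    usm_load_sketch(header, program);
    printf("round-tripped: %s\n", program);
    return 0;
}

Whichever layout is chosen, both sides must agree byte for byte: the buffer is written into the image header at hibernate time and parsed again by the kernel that resumes, which would otherwise reload a garbage program path.
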
-static void usm_load_config_info(char *buf, int size) -{ - /* Don't load the saved path if one has already been set */ - if (usm_helper_data.program[0]) - return; - - memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); -} - -static int usm_memory_needed(void) -{ - /* ball park figure of 32 pages */ - return 32 * PAGE_SIZE; -} - -/* toi_prepare_usm - */ -int toi_prepare_usm(void) -{ - usm_prepare_count++; - - if (usm_prepare_count > 1 || !usm_ops.enabled) - return 0; - - usm_helper_data.pid = -1; - - if (!*usm_helper_data.program) - return 0; - - toi_netlink_setup(&usm_helper_data); - - if (usm_helper_data.pid == -1) - printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" - " start it.\n"); - - toi_activate_storage(0); - - return usm_helper_data.pid != -1; -} - -void toi_cleanup_usm(void) -{ - usm_prepare_count--; - - if (usm_helper_data.pid > -1 && !usm_prepare_count) { - toi_deactivate_storage(0); - toi_netlink_close(&usm_helper_data); - } -} - -static void storage_manager_activate(void) -{ - if (storage_manager_action == storage_manager_last_action) - return; - - if (storage_manager_action) - toi_prepare_usm(); - else - toi_cleanup_usm(); - - storage_manager_last_action = storage_manager_action; -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), - SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, - NULL), - SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, - 0, storage_manager_activate) -}; - -static struct toi_module_ops usm_ops = { - .type = MISC_MODULE, - .name = "usm", - .directory = "storage_manager", - .module = THIS_MODULE, - .storage_needed = usm_storage_needed, - .save_config_info = usm_save_config_info, - .load_config_info = usm_load_config_info, - .memory_needed = usm_memory_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* toi_usm_sysfs_init - * Description: Boot time initialisation for user interface. - */ -int toi_usm_init(void) -{ - usm_helper_data.nl = NULL; - usm_helper_data.program[0] = '\0'; - usm_helper_data.pid = -1; - usm_helper_data.skb_size = 0; - usm_helper_data.pool_limit = 6; - usm_helper_data.netlink_id = NETLINK_TOI_USM; - usm_helper_data.name = "userspace storage manager"; - usm_helper_data.rcv_msg = usm_user_rcv_msg; - usm_helper_data.interface_version = 2; - usm_helper_data.must_init = 0; - init_completion(&usm_helper_data.wait_for_process); - - return toi_register_module(&usm_ops); -} - -void toi_usm_exit(void) -{ - toi_netlink_close_complete(&usm_helper_data); - toi_unregister_module(&usm_ops); -} diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h deleted file mode 100644 index 0189c8888..000000000 --- a/kernel/power/tuxonice_storage.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_storage.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- */ - -#ifdef CONFIG_NET -int toi_prepare_usm(void); -void toi_cleanup_usm(void); - -int toi_activate_storage(int force); -int toi_deactivate_storage(int force); -extern int toi_usm_init(void); -extern void toi_usm_exit(void); -#else -static inline int toi_usm_init(void) { return 0; } -static inline void toi_usm_exit(void) { } - -static inline int toi_activate_storage(int force) -{ - return 0; -} - -static inline int toi_deactivate_storage(int force) -{ - return 0; -} - -static inline int toi_prepare_usm(void) { return 0; } -static inline void toi_cleanup_usm(void) { } -#endif - -enum { - USM_MSG_BASE = 0x10, - - /* Kernel -> Userspace */ - USM_MSG_CONNECT = 0x30, - USM_MSG_DISCONNECT = 0x31, - USM_MSG_SUCCESS = 0x40, - USM_MSG_FAILED = 0x41, - - USM_MSG_MAX, -}; diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c deleted file mode 100644 index 9f555c932..000000000 --- a/kernel/power/tuxonice_swap.c +++ /dev/null @@ -1,474 +0,0 @@ -/* - * kernel/power/tuxonice_swap.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of swap space as a - * backing store. - */ - -#include -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_extent.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct toi_module_ops toi_swapops; - -/* For swapfile automatically swapon/off'd. */ -static char swapfilename[255] = ""; -static int toi_swapon_status; - -/* Swap Pages */ -static unsigned long swap_allocated; - -static struct sysinfo swapinfo; - -static int is_ram_backed(struct swap_info_struct *si) -{ - if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || - !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) - return 1; - - return 0; -} - -/** - * enable_swapfile: Swapon the user specified swapfile prior to hibernating. - * - * Activate the given swapfile if it wasn't already enabled. Remember whether - * we really did swapon it for swapoffing later. - */ -static void enable_swapfile(void) -{ - int activateswapresult = -EINVAL; - - if (swapfilename[0]) { - /* Attempt to swap on with maximum priority */ - activateswapresult = sys_swapon(swapfilename, 0xFFFF); - if (activateswapresult && activateswapresult != -EBUSY) - printk(KERN_ERR "TuxOnIce: The swapfile/partition " - "specified by /sys/power/tuxonice/swap/swapfile" - " (%s) could not be turned on (error %d). " - "Attempting to continue.\n", - swapfilename, activateswapresult); - if (!activateswapresult) - toi_swapon_status = 1; - } -} - -/** - * disable_swapfile: Swapoff any file swaponed at the start of the cycle. - * - * If we did successfully swapon a file at the start of the cycle, swapoff - * it now (finishing up). 
- */ -static void disable_swapfile(void) -{ - if (!toi_swapon_status) - return; - - sys_swapoff(swapfilename); - toi_swapon_status = 0; -} - -static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, - unsigned long start, unsigned long end) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " - "chain %p.", start << chain->bmap_shift, - end << chain->bmap_shift, chain); - - return toi_add_to_extent_chain(&chain->blocks, start, end); -} - - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long address, extent_min = 0, extent_max = 0; - int empty = 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " - "chain %d.", chain->allocator_index); - - if (!chain->allocations.first) - return 0; - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - toi_extent_for_each(&chain->allocations, extentpointer, address) { - swp_entry_t swap_address = (swp_entry_t) { address }; - struct block_device *bdev; - sector_t new_sector = map_swap_entry(swap_address, &bdev); - - if (empty) { - empty = 0; - extent_min = extent_max = new_sector; - continue; - } - - if (new_sector == extent_max + 1) { - extent_max++; - continue; - } - - if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block " - "chains.\n"); - return -ENOMEM; - } - - extent_min = new_sector; - extent_max = new_sector; - } - - if (!empty && - add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block chains.\n"); - return -ENOMEM; - } - - return 0; -} - -/* - * Like si_swapinfo, except that we don't include ram backed swap (compcache!) - * and don't need to use the spinlocks (userspace is stopped when this - * function is called). - */ -void si_swapinfo_no_compcache(void) -{ - unsigned int i; - - si_swapinfo(&swapinfo); - swapinfo.freeswap = 0; - swapinfo.totalswap = 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { - swapinfo.totalswap += si->inuse_pages; - swapinfo.freeswap += si->pages - si->inuse_pages; - } - } -} -/* - * We can't just remember the value from allocation time, because other - * processes might have allocated swap in the mean time. 
- */ -static unsigned long toi_swap_storage_available(void) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); - si_swapinfo_no_compcache(); - return swapinfo.freeswap + swap_allocated; -} - -static int toi_swap_initialise(int starting_cycle) -{ - if (!starting_cycle) - return 0; - - enable_swapfile(); - return 0; -} - -static void toi_swap_cleanup(int ending_cycle) -{ - if (!ending_cycle) - return; - - disable_swapfile(); -} - -static void toi_swap_free_storage(struct toi_bdev_info *chain) -{ - /* Free swap entries */ - struct hibernate_extent *extentpointer; - unsigned long extentvalue; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", - chain); - - swap_allocated -= chain->allocations.size; - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) - swap_free((swp_entry_t) { extentvalue }); - - toi_put_extent_chain(&chain->allocations); -} - -static void free_swap_range(unsigned long min, unsigned long max) -{ - int j; - - for (j = min; j <= max; j++) - swap_free((swp_entry_t) { j }); - swap_allocated -= (max - min + 1); -} - -/* - * Allocation of a single swap type. Swap priorities are handled at the higher - * level. - */ -static int toi_swap_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long gotten = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" - " allocate %lu pages from device %d.", request, - chain->allocator_index); - - while (gotten < request) { - swp_entry_t start, end; - if (0) { - /* Broken at the moment for SSDs */ - get_swap_range_of_type(chain->allocator_index, &start, &end, - request - gotten + 1); - } else { - start = end = get_swap_page_of_type(chain->allocator_index); - } - if (start.val) { - int added = end.val - start.val + 1; - if (toi_add_to_extent_chain(&chain->allocations, - start.val, end.val)) { - printk(KERN_INFO "Failed to allocate extent for " - "%lu-%lu.\n", start.val, end.val); - free_swap_range(start.val, end.val); - break; - } - gotten += added; - swap_allocated += added; - } else - break; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); - return gotten; -} - -static int toi_swap_register_storage(void) -{ - int i, result = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - struct toi_bdev_info *devinfo; - unsigned char *p; - unsigned char buf[256]; - struct fs_info *fs_info; - - if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) - continue; - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), - GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate devinfo struct for swap " - "device %d.\n", i); - return -ENOMEM; - } - - devinfo->bdev = si->bdev; - devinfo->allocator = &toi_swapops; - devinfo->allocator_index = i; - - fs_info = fs_info_from_block_dev(si->bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!fs_info) - printk("fs_info from block dev returned %d.\n", result); - devinfo->dev_t = si->bdev->bd_dev; - devinfo->prio = si->prio; - devinfo->bmap_shift = 3; - devinfo->blocks_per_page = 1; - - p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); - sprintf(devinfo->name, "swap on %s", p); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" - " Device %d (%lx), prio %d.", i, - (unsigned long) devinfo->dev_t, devinfo->prio); 
- toi_bio_ops.register_storage(devinfo); - } - - return 0; -} - -static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long extentvalue; - unsigned long i = 0, first_freed = 0; - - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) { - i++; - if (i > used) { - swap_free((swp_entry_t) { extentvalue }); - if (!first_freed) - first_freed = extentvalue; - } - } - - return first_freed; -} - -/* - * workspace_size - * - * Description: - * Returns the number of bytes of RAM needed for this - * code to do its work. (Used when calculating whether - * we have enough memory to be able to hibernate & resume). - * - */ -static int toi_swap_memory_needed(void) -{ - return 1; -} - -/* - * Print debug info - * - * Description: - */ -static int toi_swap_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); - if (swapfilename[0]) - len += scnprintf(buffer+len, size-len, - " Attempting to automatically swapon: %s.\n", - swapfilename); - - si_swapinfo_no_compcache(); - - len += scnprintf(buffer+len, size-len, - " Swap available for image: %lu pages.\n", - swapinfo.freeswap + swap_allocated); - - return len; -} - -static int header_locations_read_sysfs(const char *page, int count) -{ - int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; - struct inode *swapf = NULL; - int zone; - char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); - char *path, *output = (char *) page; - int path_len; - - if (!page) - return 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - - if (!si || !(si->flags & SWP_WRITEOK)) - continue; - - if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { - haveswap = 1; - if (!printedpartitionsmessage) { - len += sprintf(output + len, - "For swap partitions, simply use the " - "format: resume=swap:/dev/hda1.\n"); - printedpartitionsmessage = 1; - } - } else { - path_len = 0; - - path = d_path(&si->swap_file->f_path, path_page, - PAGE_SIZE); - path_len = snprintf(path_page, PAGE_SIZE, "%s", path); - - haveswap = 1; - swapf = si->swap_file->f_mapping->host; - zone = bmap(swapf, 0); - if (!zone) { - len += sprintf(output + len, - "Swapfile %s has been corrupted. 
Reuse" - " mkswap on it and try again.\n", - path_page); - } else { - char name_buffer[BDEVNAME_SIZE]; - len += sprintf(output + len, - "For swapfile `%s`," - " use resume=swap:/dev/%s:0x%x.\n", - path_page, - bdevname(si->bdev, name_buffer), - zone << (swapf->i_blkbits - 9)); - } - } - } - - if (!haveswap) - len = sprintf(output, "You need to turn on swap partitions " - "before examining this file.\n"); - - toi_free_page(10, (unsigned long) path_page); - return len; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), - SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, - header_locations_read_sysfs, NULL, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, - attempt_to_parse_resume_device2), -}; - -static struct toi_bio_allocator_ops toi_bio_swapops = { - .register_storage = toi_swap_register_storage, - .storage_available = toi_swap_storage_available, - .allocate_storage = toi_swap_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_swap_free_storage, - .free_unused_storage = toi_swap_free_unused_storage, -}; - -static struct toi_module_ops toi_swapops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "swap storage", - .directory = "swap", - .module = THIS_MODULE, - .memory_needed = toi_swap_memory_needed, - .print_debug_info = toi_swap_print_debug_stats, - .initialise = toi_swap_initialise, - .cleanup = toi_swap_cleanup, - .bio_allocator_ops = &toi_bio_swapops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_swap_load(void) -{ - return toi_register_module(&toi_swapops); -} - -late_initcall(toi_swap_load); diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c deleted file mode 100644 index 77f36dbeb..000000000 --- a/kernel/power/tuxonice_sysfs.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.c - * - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains support for sysfs entries for tuning TuxOnIce. - * - * We have a generic handler that deals with the most common cases, and - * hooks for special handlers to use. - */ - -#include - -#include "tuxonice_sysfs.h" -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_alloc.h" - -static int toi_sysfs_initialised; - -static void toi_initialise_sysfs(void); - -static struct toi_sysfs_data sysfs_params[]; - -#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) - -static void toi_main_wrapper(void) -{ - toi_try_hibernate(); -} - -static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - int len = 0; - int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; - - if (full_prep && toi_start_anything(0)) - return -EBUSY; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - len = (sysfs_data->data.special.read_sysfs) ? 
- (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) - : 0; - break; - case TOI_SYSFS_DATA_BIT: - len = sprintf(page, "%d\n", - -test_bit(sysfs_data->data.bit.bit, - sysfs_data->data.bit.bit_vector)); - break; - case TOI_SYSFS_DATA_INTEGER: - len = sprintf(page, "%d\n", - *(sysfs_data->data.integer.variable)); - break; - case TOI_SYSFS_DATA_LONG: - len = sprintf(page, "%ld\n", - *(sysfs_data->data.a_long.variable)); - break; - case TOI_SYSFS_DATA_UL: - len = sprintf(page, "%lu\n", - *(sysfs_data->data.ul.variable)); - break; - case TOI_SYSFS_DATA_STRING: - len = sprintf(page, "%s\n", - sysfs_data->data.string.variable); - break; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_cleanup_usm(); - - if (full_prep) - toi_finish_anything(0); - - return len; -} - -#define BOUND(_variable, _type) do { \ - if (*_variable < sysfs_data->data._type.minimum) \ - *_variable = sysfs_data->data._type.minimum; \ - else if (*_variable > sysfs_data->data._type.maximum) \ - *_variable = sysfs_data->data._type.maximum; \ -} while (0) - -static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, - const char *my_buf, size_t count) -{ - int assigned_temp_buffer = 0, result = count; - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - - if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) - return -EBUSY; - - ((char *) my_buf)[count] = 0; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - if (sysfs_data->data.special.write_sysfs) - result = (sysfs_data->data.special.write_sysfs)(my_buf, - count); - break; - case TOI_SYSFS_DATA_BIT: - { - unsigned long value; - result = kstrtoul(my_buf, 0, &value); - if (result) - break; - if (value) - set_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - else - clear_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - } - break; - case TOI_SYSFS_DATA_INTEGER: - { - long temp; - result = kstrtol(my_buf, 0, &temp); - if (result) - break; - *(sysfs_data->data.integer.variable) = (int) temp; - BOUND(sysfs_data->data.integer.variable, integer); - break; - } - case TOI_SYSFS_DATA_LONG: - { - long *variable = - sysfs_data->data.a_long.variable; - result = kstrtol(my_buf, 0, variable); - if (result) - break; - BOUND(variable, a_long); - break; - } - case TOI_SYSFS_DATA_UL: - { - unsigned long *variable = - sysfs_data->data.ul.variable; - result = kstrtoul(my_buf, 0, variable); - if (result) - break; - BOUND(variable, ul); - break; - } - break; - case TOI_SYSFS_DATA_STRING: - { - int copy_len = count; - char *variable = - sysfs_data->data.string.variable; - - if (sysfs_data->data.string.max_length && - (copy_len > sysfs_data->data.string.max_length)) - copy_len = sysfs_data->data.string.max_length; - - if (!variable) { - variable = (char *) toi_get_zeroed_page(31, - TOI_ATOMIC_GFP); - sysfs_data->data.string.variable = variable; - assigned_temp_buffer = 1; - } - strncpy(variable, my_buf, copy_len); - if (copy_len && my_buf[copy_len - 1] == '\n') - variable[count - 1] = 0; - variable[count] = 0; - } - break; - } - - if (!result) - result = count; - - /* Side effect routine? 
*/ - if (result == count && sysfs_data->write_side_effect) - sysfs_data->write_side_effect(); - - /* Free temporary buffers */ - if (assigned_temp_buffer) { - toi_free_page(31, - (unsigned long) sysfs_data->data.string.variable); - sysfs_data->data.string.variable = NULL; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_cleanup_usm(); - - toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); - - return result; -} - -static struct sysfs_ops toi_sysfs_ops = { - .show = &toi_attr_show, - .store = &toi_attr_store, -}; - -static struct kobj_type toi_ktype = { - .sysfs_ops = &toi_sysfs_ops, -}; - -struct kobject *tuxonice_kobj; - -/* Non-module sysfs entries. - * - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_HIBERNATING, toi_main_wrapper), - SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_RESUMING, toi_try_resume) -}; - -void remove_toi_sysdir(struct kobject *kobj) -{ - if (!kobj) - return; - - kobject_put(kobj); -} - -struct kobject *make_toi_sysdir(char *name) -{ - struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); - - if (!kobj) { - printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " - "dir!\n"); - return NULL; - } - - kobj->ktype = &toi_ktype; - - return kobj; -} - -/* toi_register_sysfs_file - * - * Helper for registering a new /sysfs/tuxonice entry. - */ - -int toi_register_sysfs_file( - struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - int result; - - if (!toi_sysfs_initialised) - toi_initialise_sysfs(); - - result = sysfs_create_file(kobj, &toi_sysfs_data->attr); - if (result) - printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " - "returned %d.\n", - toi_sysfs_data->attr.name, result); - kobj->ktype = &toi_ktype; - - return result; -} - -/* toi_unregister_sysfs_file - * - * Helper for removing unwanted /sys/power/tuxonice entries. - * - */ -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - sysfs_remove_file(kobj, &toi_sysfs_data->attr); -} - -void toi_cleanup_sysfs(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (!toi_sysfs_initialised) - return; - - for (i = 0; i < numfiles; i++) - toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - kobject_put(tuxonice_kobj); - toi_sysfs_initialised = 0; -} - -/* toi_initialise_sysfs - * - * Initialise the /sysfs/tuxonice directory. 
- */ - -static void toi_initialise_sysfs(void) -{ - int i; - int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (toi_sysfs_initialised) - return; - - /* Make our TuxOnIce directory a child of /sys/power */ - tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); - if (!tuxonice_kobj) - return; - - toi_sysfs_initialised = 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); -} - -int toi_sysfs_init(void) -{ - toi_initialise_sysfs(); - return 0; -} - -void toi_sysfs_exit(void) -{ - toi_cleanup_sysfs(); -} diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h deleted file mode 100644 index 1de954ce1..000000000 --- a/kernel/power/tuxonice_sysfs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#include - -struct toi_sysfs_data { - struct attribute attr; - int type; - int flags; - union { - struct { - unsigned long *bit_vector; - int bit; - } bit; - struct { - int *variable; - int minimum; - int maximum; - } integer; - struct { - long *variable; - long minimum; - long maximum; - } a_long; - struct { - unsigned long *variable; - unsigned long minimum; - unsigned long maximum; - } ul; - struct { - char *variable; - int max_length; - } string; - struct { - int (*read_sysfs) (const char *buffer, int count); - int (*write_sysfs) (const char *buffer, int count); - void *data; - } special; - } data; - - /* Side effects routine. Used, eg, for reparsing the - * resume= entry when it changes */ - void (*write_side_effect) (void); - struct list_head sysfs_data_list; -}; - -enum { - TOI_SYSFS_DATA_NONE = 1, - TOI_SYSFS_DATA_CUSTOM, - TOI_SYSFS_DATA_BIT, - TOI_SYSFS_DATA_INTEGER, - TOI_SYSFS_DATA_UL, - TOI_SYSFS_DATA_LONG, - TOI_SYSFS_DATA_STRING -}; - -#define SYSFS_WRITEONLY 0200 -#define SYSFS_READONLY 0444 -#define SYSFS_RW 0644 - -#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_BIT, \ - .flags = _flags, \ - .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } - -#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_INTEGER, \ - .flags = _flags, \ - .data = { .integer = { .variable = _int, .minimum = _min, \ - .maximum = _max } }, \ - .write_side_effect = _wse } - -#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_UL, \ - .flags = _flags, \ - .data = { .ul = { .variable = _ul, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_LONG, \ - .flags = _flags, \ - .data = { .a_long = { .variable = _long, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_STRING, \ - .flags = _flags, \ - .data = { .string = { .variable = _string, .max_length = _max_len } }, \ - .write_side_effect = _wse } - -#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_CUSTOM, \ - .flags = _flags, \ - .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \ - 
.write_side_effect = _wse } - -#define SYSFS_NONE(_name, _wse) { \ - .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ - .type = TOI_SYSFS_DATA_NONE, \ - .write_side_effect = _wse, \ -} - -/* Flags */ -#define SYSFS_NEEDS_SM_FOR_READ 1 -#define SYSFS_NEEDS_SM_FOR_WRITE 2 -#define SYSFS_HIBERNATE 4 -#define SYSFS_RESUME 8 -#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) -#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_NEEDS_SM_FOR_BOTH \ - (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) - -int toi_register_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); - -extern struct kobject *tuxonice_kobj; - -struct kobject *make_toi_sysdir(char *name); -void remove_toi_sysdir(struct kobject *obj); -extern void toi_cleanup_sysfs(void); - -extern int toi_sysfs_init(void); -extern void toi_sysfs_exit(void); diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c deleted file mode 100644 index 76152f3ff..000000000 --- a/kernel/power/tuxonice_ui.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * kernel/power/tuxonice_ui.c - * - * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001,2002 Pavel Machek - * Copyright (C) 2002-2003 Florent Chabaud - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" -#include "tuxonice_builtin.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ -struct ui_ops *toi_current_ui; - -/** - * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress, either from userui or /dev/console if userui isn't - * available. The non-userui path is particularly for at boot-time, prior - * to userui being started, when we have an important warning to give to - * the user. - */ -static char toi_wait_for_keypress(int timeout) -{ - if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) - return ' '; - - return toi_wait_for_keypress_dev_console(timeout); -} - -/* toi_early_boot_message() - * Description: Handle errors early in the process of booting. - * The user may press C to continue booting, perhaps - * invalidating the image, or space to reboot. - * This works from either the serial console or normally - * attached keyboard. - * - * Note that we come in here from init, while the kernel is - * locked. If we want to get events from the serial console, - * we need to temporarily unlock the kernel. - * - * toi_early_boot_message may also be called post-boot. - * In this case, it simply printks the message and returns. - * - * Arguments: int Whether we are able to erase the image. - * int default_answer. What to do when we timeout. 
This - * will normally be continue, but the user might - * provide command line options (__setup) to override - * particular cases. - * Char *. Pointer to a string explaining why we're moaning. - */ - -#define say(message, a...) printk(KERN_EMERG message, ##a) - -void toi_early_boot_message(int message_detail, int default_answer, - char *warning_reason, ...) -{ -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - unsigned long orig_state = get_toi_state(), continue_req = 0; - unsigned long orig_loglevel = console_loglevel; - int can_ask = 1; -#else - int can_ask = 0; -#endif - - va_list args; - int printed_len; - - if (!toi_wait) { - set_toi_state(TOI_CONTINUE_REQ); - can_ask = 0; - } - - if (warning_reason) { - va_start(args, warning_reason); - printed_len = vsnprintf(local_printf_buf, - sizeof(local_printf_buf), - warning_reason, - args); - va_end(args); - } - - if (!test_toi_state(TOI_BOOT_TIME)) { - printk("TuxOnIce: %s\n", local_printf_buf); - return; - } - - if (!can_ask) { - continue_req = !!default_answer; - goto post_ask; - } - -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - console_loglevel = 7; - - say("=== TuxOnIce ===\n\n"); - if (warning_reason) { - say("BIG FAT WARNING!! %s\n\n", local_printf_buf); - switch (message_detail) { - case 0: - say("If you continue booting, note that any image WILL" - "NOT BE REMOVED.\nTuxOnIce is unable to do so " - "because the appropriate modules aren't\n" - "loaded. You should manually remove the image " - "to avoid any\npossibility of corrupting your " - "filesystem(s) later.\n"); - break; - case 1: - say("If you want to use the current TuxOnIce image, " - "reboot and try\nagain with the same kernel " - "that you hibernated from. If you want\n" - "to forget that image, continue and the image " - "will be erased.\n"); - break; - } - say("Press SPACE to reboot or C to continue booting with " - "this kernel\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", - toi_wait, - default_answer == TOI_CONTINUE_REQ ? - "continue booting" : "reboot"); - } else { - say("BIG FAT WARNING!!\n\n" - "You have tried to resume from this image before.\n" - "If it failed once, it may well fail again.\n" - "Would you like to remove the image and boot " - "normally?\nThis will be equivalent to entering " - "noresume on the\nkernel command line.\n\n" - "Press SPACE to remove the image or C to continue " - "resuming.\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", toi_wait, - !!default_answer ? - "continue resuming" : "remove the image"); - } - console_loglevel = orig_loglevel; - - set_toi_state(TOI_SANITY_CHECK_PROMPT); - clear_toi_state(TOI_CONTINUE_REQ); - - if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ - continue_req = !!default_answer; - else - continue_req = test_toi_state(TOI_CONTINUE_REQ); - -#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ - -post_ask: - if ((warning_reason) && (!continue_req)) - kernel_restart(NULL); - - restore_toi_state(orig_state); - if (continue_req) - set_toi_state(TOI_CONTINUE_REQ); -} - -#undef say - -/* - * User interface specific /sys/power/tuxonice entries. 
- */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_INT("default_console_level", SYSFS_RW, - &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), - SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, - 1 << 30, 0), - SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, - 0) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "printk ui", - .directory = "user_interface", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_register_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui) { - printk(KERN_INFO "Only one TuxOnIce user interface module can " - "be loaded at a time."); - return -EBUSY; - } - - toi_current_ui = this_ui; - - return 0; -} - -void toi_remove_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui != this_ui) - return; - - toi_current_ui = NULL; -} - -/* toi_console_sysfs_init - * Description: Boot time initialisation for user interface. - */ - -int toi_ui_init(void) -{ - return toi_register_module(&userui_ops); -} - -void toi_ui_exit(void) -{ - toi_unregister_module(&userui_ops); -} diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h deleted file mode 100644 index 4934e3a91..000000000 --- a/kernel/power/tuxonice_ui.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * kernel/power/tuxonice_ui.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - */ - -enum { - DONT_CLEAR_BAR, - CLEAR_BAR -}; - -enum { - /* Userspace -> Kernel */ - USERUI_MSG_ABORT = 0x11, - USERUI_MSG_SET_STATE = 0x12, - USERUI_MSG_GET_STATE = 0x13, - USERUI_MSG_GET_DEBUG_STATE = 0x14, - USERUI_MSG_SET_DEBUG_STATE = 0x15, - USERUI_MSG_SPACE = 0x18, - USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, - USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, - USERUI_MSG_GET_LOGLEVEL = 0x1C, - USERUI_MSG_SET_LOGLEVEL = 0x1D, - USERUI_MSG_PRINTK = 0x1E, - - /* Kernel -> Userspace */ - USERUI_MSG_MESSAGE = 0x21, - USERUI_MSG_PROGRESS = 0x22, - USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, - - USERUI_MSG_MAX, -}; - -struct userui_msg_params { - u32 a, b, c, d; - char text[255]; -}; - -struct ui_ops { - char (*wait_for_key) (int timeout); - u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); - void (*prepare_status) (int clearbar, const char *fmt, ...); - void (*cond_pause) (int pause, char *message); - void (*abort)(int result_code, const char *fmt, ...); - void (*prepare)(void); - void (*cleanup)(void); - void (*message)(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...); -}; - -extern struct ui_ops *toi_current_ui; - -#define toi_update_status(val, max, fmt, args...) \ - (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \ - max) - -#define toi_prepare_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->prepare)(); \ - } while (0) - -#define toi_cleanup_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->cleanup)(); \ - } while (0) - -#define abort_hibernate(result, fmt, args...) \ - do { if (toi_current_ui) \ - (toi_current_ui->abort)(result, fmt, ##args); \ - else { \ - set_abort_result(result); \ - } \ - } while (0) - -#define toi_cond_pause(pause, message) \ - do { if (toi_current_ui) \ - (toi_current_ui->cond_pause)(pause, message); \ - } while (0) - -#define toi_prepare_status(clear, fmt, args...) 
\ - do { if (toi_current_ui) \ - (toi_current_ui->prepare_status)(clear, fmt, ##args); \ - else \ - printk(KERN_INFO fmt "%s", ##args, "\n"); \ - } while (0) - -#define toi_message(sn, lev, log, fmt, a...) \ -do { \ - if (toi_current_ui && (!sn || test_debug_state(sn))) \ - toi_current_ui->message(sn, lev, log, fmt, ##a); \ -} while (0) - -__exit void toi_ui_cleanup(void); -extern int toi_ui_init(void); -extern void toi_ui_exit(void); -extern int toi_register_ui_ops(struct ui_ops *this_ui); -extern void toi_remove_ui_ops(struct ui_ops *this_ui); diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c deleted file mode 100644 index 6aa5ac3eb..000000000 --- a/kernel/power/tuxonice_userui.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * kernel/power/user_ui.c - * - * Copyright (C) 2005-2007 Bernard Blackham - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ - -static struct user_helper_data ui_helper_data; -static struct toi_module_ops userui_ops; -static int orig_kmsg; - -static char lastheader[512]; -static int lastheader_message_len; -static int ui_helper_changed; /* Used at resume-time so don't overwrite value - set from initrd/ramfs. */ - -/* Number of distinct progress amounts that userspace can display */ -static int progress_granularity = 30; - -static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); -static int userui_wait_should_wake; - -#define toi_stop_waiting_for_userui_key() \ -{ \ - userui_wait_should_wake = true; \ - wake_up_interruptible(&userui_wait_for_key); \ -} - -/** - * ui_nl_set_state - Update toi_action based on a message from userui. - * - * @n: The bit (1 << bit) to set. - */ -static void ui_nl_set_state(int n) -{ - /* Only let them change certain settings */ - static const u32 toi_action_mask = - (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | - (1 << TOI_LOGALL) | - (1 << TOI_SINGLESTEP) | - (1 << TOI_PAUSE_NEAR_PAGESET_END); - static unsigned long new_action; - - new_action = (toi_bkd.toi_action & (~toi_action_mask)) | - (n & toi_action_mask); - - printk(KERN_DEBUG "n is %x. Action flags being changed from %lx " - "to %lx.", n, toi_bkd.toi_action, new_action); - toi_bkd.toi_action = new_action; - - if (!test_action_state(TOI_PAUSE) && - !test_action_state(TOI_SINGLESTEP)) - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_post_atomic_restore - Tell userui that atomic restore just happened. - * - * Tell userui that atomic restore just occured, so that it can do things like - * redrawing the screen, re-getting settings and so on. 
- */ -static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); -} - -/** - * userui_storage_needed - Report how much memory in image header is needed. - */ -static int userui_storage_needed(void) -{ - return sizeof(ui_helper_data.program) + 1 + sizeof(int); -} - -/** - * userui_save_config_info - Fill buffer with config info for image header. - * - * @buf: Buffer into which to put the config info we want to save. - */ -static int userui_save_config_info(char *buf) -{ - *((int *) buf) = progress_granularity; - memcpy(buf + sizeof(int), ui_helper_data.program, - sizeof(ui_helper_data.program)); - return sizeof(ui_helper_data.program) + sizeof(int) + 1; -} - -/** - * userui_load_config_info - Restore config info from buffer. - * - * @buf: Buffer containing header info loaded. - * @size: Size of data loaded for this module. - */ -static void userui_load_config_info(char *buf, int size) -{ - progress_granularity = *((int *) buf); - size -= sizeof(int); - - /* Don't load the saved path if one has already been set */ - if (ui_helper_changed) - return; - - if (size > sizeof(ui_helper_data.program)) - size = sizeof(ui_helper_data.program); - - memcpy(ui_helper_data.program, buf + sizeof(int), size); - ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; -} - -/** - * set_ui_program_set: Record that userui program was changed. - * - * Side effect routine for when the userui program is set. In an initrd or - * ramfs, the user may set a location for the userui program. If this happens, - * we don't want to reload the value that was saved in the image header. This - * routine allows us to flag that we shouldn't restore the program name from - * the image header. - */ -static void set_ui_program_set(void) -{ - ui_helper_changed = 1; -} - -/** - * userui_memory_needed - Tell core how much memory to reserve for us. - */ -static int userui_memory_needed(void) -{ - /* ball park figure of 128 pages */ - return 128 * PAGE_SIZE; -} - -/** - * userui_update_status - Update the progress bar and (if on) in-bar message. - * - * @value: Current progress percentage numerator. - * @maximum: Current progress percentage denominator. - * @fmt: Message to be displayed in the middle of the progress bar. - * - * Note that a NULL message does not mean that any previous message is erased! - * For that, you need toi_prepare_status with clearbar on. - * - * Returns an unsigned long, being the next numerator (as determined by the - * maximum and progress granularity) where status needs to be updated. - * This is to reduce unnecessary calls to update_status. - */ -static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) -{ - static u32 last_step = 9999; - struct userui_msg_params msg; - u32 this_step, next_update; - int bitshift; - - if (ui_helper_data.pid == -1) - return 0; - - if ((!maximum) || (!progress_granularity)) - return maximum; - - if (value < 0) - value = 0; - - if (value > maximum) - value = maximum; - - /* Try to avoid math problems - we can't do 64 bit math here - * (and shouldn't need it - anyone got screen resolution - * of 65536 pixels or more?) 
*/ - bitshift = fls(maximum) - 16; - if (bitshift > 0) { - u32 temp_maximum = maximum >> bitshift; - u32 temp_value = value >> bitshift; - this_step = (u32) - (temp_value * progress_granularity / temp_maximum); - next_update = (((this_step + 1) * temp_maximum / - progress_granularity) + 1) << bitshift; - } else { - this_step = (u32) (value * progress_granularity / maximum); - next_update = ((this_step + 1) * maximum / - progress_granularity) + 1; - } - - if (this_step == last_step) - return next_update; - - memset(&msg, 0, sizeof(msg)); - - msg.a = this_step; - msg.b = progress_granularity; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, - &msg, sizeof(msg)); - last_step = this_step; - - return next_update; -} - -/** - * userui_message - Display a message without necessarily logging it. - * - * @section: Type of message. Messages can be filtered by type. - * @level: Degree of importance of the message. Lower values = higher priority. - * @normally_logged: Whether logged even if log_everything is off. - * @fmt: Message (and parameters). - * - * This function is intended to do the same job as printk, but without normally - * logging what is printed. The point is to be able to get debugging info on - * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" - * - * It may be called from an interrupt context - can't sleep! - */ -static void userui_message(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...) -{ - struct userui_msg_params msg; - - if ((level) && (level > console_loglevel)) - return; - - memset(&msg, 0, sizeof(msg)); - - msg.a = section; - msg.b = level; - msg.c = normally_logged; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - if (test_action_state(TOI_LOGALL)) - printk(KERN_INFO "%s\n", msg.text); - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, - &msg, sizeof(msg)); -} - -/** - * wait_for_key_via_userui - Wait for userui to receive a keypress. - */ -static void wait_for_key_via_userui(void) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&userui_wait_for_key, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake); - userui_wait_should_wake = false; - - set_current_state(TASK_RUNNING); - remove_wait_queue(&userui_wait_for_key, &wait); -} - -/** - * userui_prepare_status - Display high level messages. - * - * @clearbar: Whether to clear the progress bar. - * @fmt...: New message for the title. - * - * Prepare the 'nice display', drawing the header and version, along with the - * current action and perhaps also resetting the progress bar. - */ -static void userui_prepare_status(int clearbar, const char *fmt, ...) -{ - va_list args; - - if (fmt) { - va_start(args, fmt); - lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); - va_end(args); - } - - if (clearbar) - toi_update_status(0, 1, NULL); - - if (ui_helper_data.pid == -1) - printk(KERN_EMERG "%s\n", lastheader); - else - toi_message(0, TOI_STATUS, 1, lastheader, NULL); -} - -/** - * toi_wait_for_keypress - Wait for keypress via userui. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress from userui. - * - * FIXME: Implement timeout? 
- */ -static char userui_wait_for_keypress(int timeout) -{ - char key = '\0'; - - if (ui_helper_data.pid != -1) { - wait_for_key_via_userui(); - key = ' '; - } - - return key; -} - -/** - * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. - * - * @result_code: Reason why we're aborting (1 << bit). - * @fmt: Message to display if telling the user what's going on. - * - * Abort a cycle. If this wasn't at the user's request (and we're displaying - * output), tell the user why and wait for them to acknowledge the message. - */ -static void userui_abort_hibernate(int result_code, const char *fmt, ...) -{ - va_list args; - int printed_len = 0; - - set_result_state(result_code); - - if (test_result_state(TOI_ABORTED)) - return; - - set_result_state(TOI_ABORTED); - - if (test_result_state(TOI_ABORT_REQUESTED)) - return; - - va_start(args, fmt); - printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), - fmt, args); - va_end(args); - if (ui_helper_data.pid != -1) - printed_len = sprintf(local_printf_buf + printed_len, - " (Press SPACE to continue)"); - - toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); - - if (ui_helper_data.pid != -1) - userui_wait_for_keypress(0); -} - -/** - * request_abort_hibernate - Abort hibernating or resuming at user request. - * - * Handle the user requesting the cancellation of a hibernation or resume by - * pressing escape. - */ -static void request_abort_hibernate(void) -{ - if (test_result_state(TOI_ABORT_REQUESTED) || - !test_action_state(TOI_CAN_CANCEL)) - return; - - if (test_toi_state(TOI_NOW_RESUMING)) { - toi_prepare_status(CLEAR_BAR, "Escape pressed. " - "Powering down again."); - set_toi_state(TOI_STOP_RESUME); - while (!test_toi_state(TOI_IO_STOPPED)) - schedule(); - if (toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(0); - toi_power_down(); - } - - toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" - " ABORTING HIBERNATION ---"); - set_abort_result(TOI_ABORT_REQUESTED); - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_user_rcv_msg - Receive a netlink message from userui. - * - * @skb: skb received. - * @nlh: Netlink header received. 
- */ -static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USERUI_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { - printk(KERN_INFO "Got NOFREEZE_ME request when " - "ui_helper_data.pid is %d.\n", ui_helper_data.pid); - return -EBUSY; - } - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USERUI_MSG_ABORT: - request_abort_hibernate(); - return 0; - case USERUI_MSG_GET_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_STATE, &toi_bkd.toi_action, - sizeof(toi_bkd.toi_action)); - return 0; - case USERUI_MSG_GET_DEBUG_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_DEBUG_STATE, - &toi_bkd.toi_debug_state, - sizeof(toi_bkd.toi_debug_state)); - return 0; - case USERUI_MSG_SET_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - ui_nl_set_state(*data); - return 0; - case USERUI_MSG_SET_DEBUG_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_debug_state = (*data); - return 0; - case USERUI_MSG_SPACE: - toi_stop_waiting_for_userui_key(); - return 0; - case USERUI_MSG_GET_POWERDOWN_METHOD: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_POWERDOWN_METHOD, - &toi_poweroff_method, - sizeof(toi_poweroff_method)); - return 0; - case USERUI_MSG_SET_POWERDOWN_METHOD: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) - return -EINVAL; - toi_poweroff_method = (unsigned long)(*data); - return 0; - case USERUI_MSG_GET_LOGLEVEL: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_LOGLEVEL, - &toi_bkd.toi_default_console_level, - sizeof(toi_bkd.toi_default_console_level)); - return 0; - case USERUI_MSG_SET_LOGLEVEL: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_default_console_level = (*data); - return 0; - case USERUI_MSG_PRINTK: - printk(KERN_INFO "%s", (char *) data); - return 0; - } - - /* Unhandled here */ - return 1; -} - -/** - * userui_cond_pause - Possibly pause at user request. - * - * @pause: Whether to pause or just display the message. - * @message: Message to display at the start of pausing. - * - * Potentially pause and wait for the user to tell us to continue. We normally - * only pause when @pause is set. While paused, the user can do things like - * changing the loglevel, toggling the display of debugging sections and such - * like. - */ -static void userui_cond_pause(int pause, char *message) -{ - int displayed_message = 0, last_key = 0; - - while (last_key != 32 && - ui_helper_data.pid != -1 && - ((test_action_state(TOI_PAUSE) && pause) || - (test_action_state(TOI_SINGLESTEP)))) { - if (!displayed_message) { - toi_prepare_status(DONT_CLEAR_BAR, - "%s Press SPACE to continue.%s", - message ? message : "", - (test_action_state(TOI_SINGLESTEP)) ? - " Single step on." : ""); - displayed_message = 1; - } - last_key = userui_wait_for_keypress(0); - } - schedule(); -} - -/** - * userui_prepare_console - Prepare the console for use. - * - * Prepare a console for use, saving current kmsg settings and attempting to - * start userui. Console loglevel changes are handled by userui. 
- */ -static void userui_prepare_console(void) -{ - orig_kmsg = vt_kmsg_redirect(fg_console + 1); - - ui_helper_data.pid = -1; - - if (!userui_ops.enabled) { - printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); - return; - } - - if (*ui_helper_data.program) - toi_netlink_setup(&ui_helper_data); - else - printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); -} - -/** - * userui_cleanup_console - Cleanup after a cycle. - * - * Tell userui to cleanup, and restore kmsg_redirect to its original value. - */ - -static void userui_cleanup_console(void) -{ - if (ui_helper_data.pid > -1) - toi_netlink_close(&ui_helper_data); - - vt_kmsg_redirect(orig_kmsg); -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, - TOI_CAN_CANCEL, 0), - SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAUSE, 0), - SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), - SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, - 2048, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, - set_ui_program_set), - SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_MODULE, - .name = "userui", - .shared_directory = "user_interface", - .module = THIS_MODULE, - .storage_needed = userui_storage_needed, - .save_config_info = userui_save_config_info, - .load_config_info = userui_load_config_info, - .memory_needed = userui_memory_needed, - .post_atomic_restore = userui_post_atomic_restore, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -static struct ui_ops my_ui_ops = { - .update_status = userui_update_status, - .message = userui_message, - .prepare_status = userui_prepare_status, - .abort = userui_abort_hibernate, - .cond_pause = userui_cond_pause, - .prepare = userui_prepare_console, - .cleanup = userui_cleanup_console, - .wait_for_key = userui_wait_for_keypress, -}; - -/** - * toi_user_ui_init - Boot time initialisation for user interface. - * - * Invoked from the core init routine. 
- */ -static __init int toi_user_ui_init(void) -{ - int result; - - ui_helper_data.nl = NULL; - strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); - ui_helper_data.pid = -1; - ui_helper_data.skb_size = sizeof(struct userui_msg_params); - ui_helper_data.pool_limit = 6; - ui_helper_data.netlink_id = NETLINK_TOI_USERUI; - ui_helper_data.name = "userspace ui"; - ui_helper_data.rcv_msg = userui_user_rcv_msg; - ui_helper_data.interface_version = 8; - ui_helper_data.must_init = 0; - ui_helper_data.not_ready = userui_cleanup_console; - init_completion(&ui_helper_data.wait_for_process); - result = toi_register_module(&userui_ops); - if (!result) { - result = toi_register_ui_ops(&my_ui_ops); - if (result) - toi_unregister_module(&userui_ops); - } - - return result; -} - -late_initcall(toi_user_ui_init); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8c1204fa3..a787aa942 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,8 +15,8 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -ifdef CONFIG_SCHED_BFS -obj-y += bfs.o clock.o +ifdef CONFIG_SCHED_MUQSS +obj-y += MuQSS.o clock.o else obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c new file mode 100644 index 000000000..6a656ad4b --- /dev/null +++ b/kernel/sched/MuQSS.c @@ -0,0 +1,8247 @@ +/* + * kernel/sched/MuQSS.c, was kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz + * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes + * a whole lot of those previous things. + * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS + * scheduler by Con Kolivas. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef CONFIG_PARAVIRT +#include +#endif + +#include "cpupri.h" +#include "../workqueue_internal.h" +#include "../smpboot.h" + +#define CREATE_TRACE_POINTS +#include + +#include "MuQSS.h" + +#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) +#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ + (policy) == SCHED_RR) +#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) + +#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) +#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) +#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) + +#define is_iso_policy(policy) ((policy) == SCHED_ISO) +#define iso_task(p) unlikely(is_iso_policy((p)->policy)) +#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) + +#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) + +#define ISO_PERIOD (5 * HZ) + +#define STOP_PRIO (MAX_RT_PRIO - 1) + +/* + * Some helpers for converting to/from various scales. Use shifts to get + * approximate multiples of ten for less overhead. + */ +#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ)) +#define JIFFY_NS (1073741824 / HZ) +#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) +#define HALF_JIFFY_NS (1073741824 / HZ / 2) +#define HALF_JIFFY_US (1048576 / HZ / 2) +#define MS_TO_NS(TIME) ((TIME) << 20) +#define MS_TO_US(TIME) ((TIME) << 10) +#define NS_TO_MS(TIME) ((TIME) >> 20) +#define NS_TO_US(TIME) ((TIME) >> 10) + +#define RESCHED_US (100) /* Reschedule if less than this many μs left */ + +void print_scheduler_version(void) +{ + printk(KERN_INFO "MuQSS CPU scheduler v0.114 by Con Kolivas.\n"); +} + +/* + * This is the time all tasks within the same priority round robin. + * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. + * Tunable via /proc interface. + */ +#ifdef CONFIG_PCK_INTERACTIVE +int rr_interval __read_mostly = 3; +#else +int rr_interval __read_mostly = 6; +#endif + +/* Tunable to choose whether to prioritise latency or throughput, simple + * binary yes or no */ + +int sched_interactive __read_mostly = 1; + +/* + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks + * are allowed to run five seconds as real time tasks. This is the total over + * all online cpus. + */ +#ifdef CONFIG_PCK_INTERACTIVE +int sched_iso_cpu __read_mostly = 25; +#else +int sched_iso_cpu __read_mostly = 70; +#endif + +/* + * The relative length of deadline for each priority(nice) level. + */ +static int prio_ratios[NICE_WIDTH] __read_mostly; + +/* + * The quota handed out to tasks of all priority levels when refilling their + * time_slice. + */ +static inline int timeslice(void) +{ + return MS_TO_US(rr_interval); +} + +/* + * The global runqueue data that all CPUs work off. Contains either atomic + * variables and a cpu bitmap set atomically. 
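+ *
+ * Illustrative note (an editor's addition, not part of the original
+ * patch): the practical effect is that writers such as inc_qnr() and
+ * dec_qnr() further down do a bare atomic_inc()/atomic_dec() on
+ * grq.qnr, and readers such as queued_notrunning() a bare
+ * atomic_read(), so no BFS-style global runqueue lock is ever taken
+ * around this structure.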
+ */ +struct global_rq { +#ifdef CONFIG_SMP + atomic_t nr_running ____cacheline_aligned_in_smp; + atomic_t nr_uninterruptible ____cacheline_aligned_in_smp; + atomic64_t nr_switches ____cacheline_aligned_in_smp; + atomic_t qnr ____cacheline_aligned_in_smp; /* queued not running */ +#else + atomic_t nr_running ____cacheline_aligned; + atomic_t nr_uninterruptible ____cacheline_aligned; + atomic64_t nr_switches ____cacheline_aligned; + atomic_t qnr ____cacheline_aligned; /* queued not running */ +#endif +#ifdef CONFIG_SMP + cpumask_t cpu_idle_map; +#endif +}; + +#ifdef CONFIG_SMP +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; +}; + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +#endif /* CONFIG_SMP */ + +/* There can be only one */ +#ifdef CONFIG_SMP +static struct global_rq grq ____cacheline_aligned_in_smp; +#else +static struct global_rq grq ____cacheline_aligned; +#endif + +static DEFINE_MUTEX(sched_hotcpu_mutex); + +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_SMP +struct rq *cpu_rq(int cpu) +{ + return &per_cpu(runqueues, (cpu)); +} +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +/* + * sched_domains_mutex serialises calls to init_sched_domains, + * detach_destroy_domains and partition_sched_domains. + */ +DEFINE_MUTEX(sched_domains_mutex); + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} +#else +struct rq *uprq; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +static inline int cpu_of(struct rq *rq) +{ + return rq->cpu; +} +#else /* CONFIG_SMP */ +static inline int cpu_of(struct rq *rq) +{ + return 0; +} +#endif + +#include "stats.h" + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + +/* + * All common locking functions performed on rq->lock. rq->clock is local to + * the CPU accessing it so it can be modified just with interrupts disabled + * when we're not updating niffies. + * Looking up task_rq must be done under rq->lock to be safe. + */ +static void update_rq_clock_task(struct rq *rq, s64 delta); + +static inline void update_rq_clock(struct rq *rq) +{ + s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + + if (unlikely(delta < 0)) + return; + rq->clock += delta; + update_rq_clock_task(rq, delta); +} + +/* + * Niffies are a globally increasing nanosecond counter. 
They're only used by + * update_load_avg and time_slice_expired, however deadlines are based on them + * across CPUs. Update them whenever we will call one of those functions, and + * synchronise them across CPUs whenever we hold both runqueue locks. + */ +static inline void update_clocks(struct rq *rq) +{ + s64 ndiff, minndiff; + long jdiff; + + update_rq_clock(rq); + ndiff = rq->clock - rq->old_clock; + rq->old_clock = rq->clock; + jdiff = jiffies - rq->last_jiffy; + + /* Subtract any niffies added by balancing with other rqs */ + ndiff -= rq->niffies - rq->last_niffy; + minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; + if (minndiff < 0) + minndiff = 0; + ndiff = max(ndiff, minndiff); + rq->niffies += ndiff; + rq->last_niffy = rq->niffies; + if (jdiff) { + rq->last_jiffy += jdiff; + rq->last_jiffy_niffies = rq->niffies; + } +} + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} + +static inline void rq_lock(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); +} + +static inline int rq_trylock(struct rq *rq) + __acquires(rq->lock) +{ + return raw_spin_trylock(&rq->lock); +} + +static inline void rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline struct rq *this_rq_lock(void) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; +} + +/* + * Any time we have two runqueues locked we use that as an opportunity to + * synchronise niffies to the highest value as idle ticks may have artificially + * kept niffies low on one CPU and the truth can only be later. + */ +static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) +{ + if (rq1->niffies > rq2->niffies) + rq2->niffies = rq1->niffies; + else + rq1->niffies = rq2->niffies; +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ + +/* For when we know rq1 != rq2 */ +static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } +} + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else + __double_rq_lock(rq1, rq2); + synchronise_niffies(rq1, rq2); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
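+ *
+ * Usage sketch (an editor's illustration, not part of the original
+ * patch), showing the interrupt handling the caller owes:
+ *
+ *	local_irq_disable();
+ *	double_rq_lock(rq1, rq2);
+ *	...	(work on both runqueues, niffies now synchronised)
+ *	double_rq_unlock(rq1, rq2);
+ *	local_irq_enable();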
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* Must be sure rq1 != rq2 and irqs are disabled */ +static inline void lock_second_rq(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (unlikely(!raw_spin_trylock(&rq2->lock))) { + raw_spin_unlock(&rq1->lock); + __double_rq_lock(rq1, rq2); + } + synchronise_niffies(rq1, rq2); +} + +static inline void lock_all_rqs(void) +{ + int cpu; + + preempt_disable(); + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_lock(&rq->lock); + } +} + +static inline void unlock_all_rqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_unlock(&rq->lock); + } + preempt_enable(); +} + +/* Specially nest trylock an rq */ +static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) +{ + if (unlikely(!do_raw_spin_trylock(&rq->lock))) + return false; + spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); + synchronise_niffies(this_rq, rq); + return true; +} + +/* Unlock a specially nested trylocked rq */ +static inline void unlock_rq(struct rq *rq) +{ + spin_release(&rq->lock.dep_map, 1, _RET_IP_); + do_raw_spin_unlock(&rq->lock); +} + +static inline void rq_lock_irq(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); +} + +static inline void rq_unlock_irq(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock_irq(&rq->lock); +} + +static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, *flags); +} + +static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) + __releases(rq->lock) +{ + raw_spin_unlock_irqrestore(&rq->lock, *flags); +} + +static inline struct rq +*task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + while (42) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } + return rq; +} + +static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + while (42) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + } + return rq; +} + +static inline void __task_rq_unlock(struct rq *rq) +{ + rq_unlock(rq); +} + +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, mask) \ + ({ \ + typeof(ptr) _ptr = (ptr); \ + typeof(mask) _mask = (mask); \ + typeof(*_ptr) _old, _val = *_ptr; \ + \ + for (;;) { \ + _old = cmpxchg(_ptr, _val, _val | _mask); \ + if (_old == _val) \ + break; \ + _val = _old; \ + } \ + _old; \ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * 
spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif +#endif + +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + struct wake_q_node *node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means + * its already queued (either by us or someone else) and will get the + * wakeup due to that. + * + * This cmpxchg() implies a full barrier, which pairs with the write + * barrier implied by the wakeup in wake_up_q(). + */ + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + return; + + get_task_struct(task); + + /* + * The head is context local, there can be no concurrency. + */ + *head->lastp = node; + head->lastp = &node->next; +} + +void wake_up_q(struct wake_q_head *head) +{ + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + + task = container_of(node, struct task_struct, wake_q); + BUG_ON(!task); + /* task can safely be re-inserted now */ + node = node->next; + task->wake_q.next = NULL; + + /* + * wake_up_process() implies a wmb() to pair with the queueing + * in wake_q_add() so as not to miss wakeups. + */ + wake_up_process(task); + put_task_struct(task); + } +} + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ + next->on_cpu = 1; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +void resched_task(struct task_struct *p) +{ + int cpu; +#ifdef CONFIG_LOCKDEP + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); +#endif + if (test_tsk_need_resched(p)) + return; + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); + set_preempt_need_resched(); + return; + } + + if (set_nr_and_not_polling(p)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + +/* + * A task that is not running or queued will not have a node set. + * A task that is queued but not running will have a node set. + * A task that is currently running will have ->on_cpu set but no node set. + */ +static inline bool task_queued(struct task_struct *p) +{ + return !skiplist_node_empty(&p->node); +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +static inline void resched_if_idle(struct rq *rq); + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. 
+ * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + +#ifdef CONFIG_SMP + /* + * If prev was marked as migrating to another CPU in return_task, drop + * the local runqueue lock but leave interrupts disabled and grab the + * remote lock we're migrating it to before enabling them. + */ + if (unlikely(task_on_rq_migrating(prev))) { + sched_info_dequeued(rq, prev); + /* + * We move the ownership of prev to the new cpu now. ttwu can't + * activate prev to the wrong cpu since it has to grab this + * runqueue in ttwu_remote. + */ + task_thread_info(prev)->cpu = prev->wake_cpu; + raw_spin_unlock(&rq->lock); + + raw_spin_lock(&prev->pi_lock); + rq = __task_rq_lock(prev); + /* Check that someone else hasn't already queued prev */ + if (likely(!task_queued(prev))) { + enqueue_task(rq, prev, 0); + prev->on_rq = TASK_ON_RQ_QUEUED; + /* Wake up the CPU if it's not already running */ + resched_if_idle(rq); + } + raw_spin_unlock(&prev->pi_lock); + } +#endif + raw_spin_unlock_irq(&rq->lock); +} + +static inline bool deadline_before(u64 deadline, u64 time) +{ + return (deadline < time); +} + +/* + * Deadline is "now" in niffies + (offset by priority). Setting the deadline + * is the key to everything. It distributes cpu fairly amongst tasks of the + * same nice value, it proportions cpu according to nice level, it means the + * task that last woke up the longest ago has the earliest deadline, thus + * ensuring that interactive tasks get low latency on wake up. The CPU + * proportion works out to the square of the virtual deadline difference, so + * this equation will give nice 19 3% CPU compared to nice 0. + */ +static inline u64 prio_deadline_diff(int user_prio) +{ + return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); +} + +static inline u64 task_deadline_diff(struct task_struct *p) +{ + return prio_deadline_diff(TASK_USER_PRIO(p)); +} + +static inline u64 static_deadline_diff(int static_prio) +{ + return prio_deadline_diff(USER_PRIO(static_prio)); +} + +static inline int longest_deadline_diff(void) +{ + return prio_deadline_diff(39); +} + +static inline int ms_longest_deadline_diff(void) +{ + return NS_TO_MS(longest_deadline_diff()); +} + +static inline int rq_load(struct rq *rq) +{ + return rq->sl->entries + !rq_idle(rq); +} + +static inline bool rq_local(struct rq *rq); + +/* + * Update the load average for feeding into cpu frequency governors. Use a + * rough estimate of a rolling average with ~ time constant of 32ms. + * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 + * Make sure a call to update_clocks has been made before calling this to get + * an updated rq->niffies. 
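+ *
+ * Worked example (an editor's illustration with assumed numbers, not
+ * part of the original patch): after a 32768us (~32ms) gap,
+ * us_interval = 32768, so the old average loses
+ * 32768 * 5 / 262144 = 5/8 of its value (exactly the 80/128 ~ 0.63
+ * weighting above), and the fresh curload * curload *
+ * SCHED_CAPACITY_SCALE contribution is folded in with the same 5/8
+ * weight.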
+ */ +static void update_load_avg(struct rq *rq) +{ + /* rq clock can go backwards so skip update if that happens */ + if (likely(rq->clock > rq->load_update)) { + unsigned long us_interval = (rq->clock - rq->load_update) >> 10; + long load, curload = rq_load(rq); + + load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); + if (unlikely(load < 0)) + load = 0; + load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; + rq->load_avg = load; + } else + return; + + rq->load_update = rq->clock; + if (likely(rq_local(rq))) + cpufreq_trigger(rq->niffies, rq->load_avg); +} + +/* + * Removing from the runqueue. Enter with rq locked. Deleting a task + * from the skip list is done via the stored node reference in the task struct + * and does not require a full look up. Thus it occurs in O(k) time where k + * is the "level" of the list the task was stored at - usually < 4, max 8. + */ +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +{ + skiplist_delete(rq->sl, &p->node); + rq->best_key = rq->node.next[0]->key; + update_clocks(rq); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(task_rq(p), p); + update_load_avg(rq); +} + +#ifdef CONFIG_PREEMPT_RCU +static bool rcu_read_critical(struct task_struct *p) +{ + return p->rcu_read_unlock_special.b.blocked; +} +#else /* CONFIG_PREEMPT_RCU */ +#define rcu_read_critical(p) (false) +#endif /* CONFIG_PREEMPT_RCU */ + +/* + * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as + * an idle task, we ensure none of the following conditions are met. + */ +static bool idleprio_suitable(struct task_struct *p) +{ + return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && + !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); +} + +/* + * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check + * that the iso_refractory flag is not set. + */ +static inline bool isoprio_suitable(struct rq *rq) +{ + return !rq->iso_refractory; +} + +/* + * Adding to the runqueue. Enter with rq locked. + */ +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +{ + unsigned int randseed; + u64 sl_id; + + if (!rt_task(p)) { + /* Check it hasn't gotten rt from PI */ + if ((idleprio_task(p) && idleprio_suitable(p)) || + (iso_task(p) && isoprio_suitable(rq))) + p->prio = p->normal_prio; + else + p->prio = NORMAL_PRIO; + } + /* + * The sl_id key passed to the skiplist generates a sorted list. + * Realtime and sched iso tasks run FIFO so they only need be sorted + * according to priority. The skiplist will put tasks of the same + * key inserted later in FIFO order. Tasks of sched normal, batch + * and idleprio are sorted according to their deadlines. Idleprio + * tasks are offset by an impossibly large deadline value ensuring + * they get sorted into last positions, but still according to their + * own deadlines. This creates a "landscape" of skiplists running + * from priority 0 realtime in first place to the lowest priority + * idleprio tasks last. Skiplist insertion is an O(log n) process. + */ + if (p->prio <= ISO_PRIO) + sl_id = p->prio; + else { + sl_id = p->deadline; + if (idleprio_task(p)) { + if (p->prio == IDLE_PRIO) + sl_id |= 0xF000000000000000; + else + sl_id += longest_deadline_diff(); + } + } + /* + * Some architectures don't have better than microsecond resolution + * so mask out ~microseconds as the random seed for skiplist insertion. 
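+ *
+ * For example (an editor's note, assuming a clock with at least
+ * microsecond resolution): niffies counts nanoseconds, so
+ * (rq->niffies >> 10) advances roughly once per microsecond, and each
+ * enqueue typically sees a fresh 32-bit seed for the skiplist level
+ * generator.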
+ */
+	update_clocks(rq);
+	if (!(flags & ENQUEUE_RESTORE))
+		sched_info_queued(rq, p);
+	randseed = (rq->niffies >> 10) & 0xFFFFFFFF;
+	skiplist_insert(rq->sl, &p->node, sl_id, p, randseed);
+	rq->best_key = rq->node.next[0]->key;
+	update_load_avg(rq);
+}
+
+/*
+ * Returns the relative length of a task's deadline compared to the shortest
+ * deadline, which is that of nice -20.
+ */
+static inline int task_prio_ratio(struct task_struct *p)
+{
+	return prio_ratios[TASK_USER_PRIO(p)];
+}
+
+/*
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
+ * length. CPU distribution is handled by giving different deadlines to
+ * tasks of different priorities. Use 128 as the base value for fast shifts.
+ */
+static inline int task_timeslice(struct task_struct *p)
+{
+	return (rr_interval * task_prio_ratio(p) / 128);
+}
+
+/*
+ * qnr is the "queued but not running" count, which is the total number of
+ * tasks on the global runqueue list waiting for cpu time but not actually
+ * currently running on a cpu.
+ */
+static inline void inc_qnr(void)
+{
+	atomic_inc(&grq.qnr);
+}
+
+static inline void dec_qnr(void)
+{
+	atomic_dec(&grq.qnr);
+}
+
+static inline int queued_notrunning(void)
+{
+	return atomic_read(&grq.qnr);
+}
+
+#ifdef CONFIG_SMP
+/* Entered with rq locked */
+static inline void resched_if_idle(struct rq *rq)
+{
+	if (rq_idle(rq))
+		resched_task(rq->curr);
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+	return (rq->cpu == smp_processor_id());
+}
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and, if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues are done locklessly, as the occasional
+ * wrong value would be harmless. */
+static int best_smt_bias(struct rq *this_rq)
+{
+	int other_cpu, best_bias = 0;
+
+	for_each_cpu(other_cpu, &this_rq->thread_mask) {
+		struct rq *rq = cpu_rq(other_cpu);
+
+		if (rq_idle(rq))
+			continue;
+		if (unlikely(!rq->online))
+			continue;
+		if (!rq->rq_mm)
+			continue;
+		if (likely(rq->rq_smt_bias > best_bias))
+			best_bias = rq->rq_smt_bias;
+	}
+	return best_bias;
+}
+
+static int task_prio_bias(struct task_struct *p)
+{
+	if (rt_task(p))
+		return 1 << 30;
+	else if (task_running_iso(p))
+		return 1 << 29;
+	else if (task_running_idle(p))
+		return 0;
+	return MAX_PRIO - p->static_prio;
+}
+
+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq)
+{
+	return true;
+}
+
+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule;
+
+/* We've already decided p can run on this CPU; now test whether it shouldn't
+ * for SMT nice reasons.
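+ *
+ * Worked example (an editor's illustration with assumed nice values,
+ * MAX_PRIO = 140): a nice 0 task has
+ * task_prio_bias() = MAX_PRIO - 120 = 20, so against a sibling running
+ * a nice -10 task (bias 30) it fails the task_bias >= best_bias test
+ * and only runs when the 25% dither check (best_bias % 4 == 1) lets it
+ * through.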
*/ +static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) +{ + int best_bias, task_bias; + + /* Kernel threads always run */ + if (unlikely(!p->mm)) + return true; + if (rt_task(p)) + return true; + if (!idleprio_suitable(p)) + return true; + best_bias = best_smt_bias(this_rq); + /* The smt siblings are all idle or running IDLEPRIO */ + if (best_bias < 1) + return true; + task_bias = task_prio_bias(p); + if (task_bias < 1) + return false; + if (task_bias >= best_bias) + return true; + /* Dither 25% cpu of normal tasks regardless of nice difference */ + if (best_bias % 4 == 1) + return true; + /* Sorry, you lose */ + return false; +} +#else /* CONFIG_SMT_NICE */ +#define smt_schedule(p, this_rq) (true) +#endif /* CONFIG_SMT_NICE */ + +static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) +{ + set_bit(cpu, (volatile unsigned long *)cpumask); +} + +/* + * The cpu_idle_map stores a bitmap of all the CPUs currently idle to + * allow easy lookup of whether any suitable idle CPUs are available. + * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the + * idle_cpus variable than to do a full bitmask check when we are busy. The + * bits are set atomically but read locklessly as occasional false positive / + * negative is harmless. + */ +static inline void set_cpuidle_map(int cpu) +{ + if (likely(cpu_online(cpu))) + atomic_set_cpu(cpu, &grq.cpu_idle_map); +} + +static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) +{ + clear_bit(cpu, (volatile unsigned long *)cpumask); +} + +static inline void clear_cpuidle_map(int cpu) +{ + atomic_clear_cpu(cpu, &grq.cpu_idle_map); +} + +static bool suitable_idle_cpus(struct task_struct *p) +{ + return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); +} + +/* + * Resched current on rq. We don't know if rq is local to this CPU nor if it + * is locked so we do not use an intermediate variable for the task to avoid + * having it dereferenced. + */ +static void resched_curr(struct rq *rq) +{ + int cpu; + + if (test_tsk_need_resched(rq->curr)) + return; + + rq->preempt = rq->curr; + cpu = rq->cpu; + + /* We're doing this without holding the rq lock if it's not task_rq */ + + if (cpu == smp_processor_id()) { + set_tsk_need_resched(rq->curr); + set_preempt_need_resched(); + return; + } + + if (set_nr_and_not_polling(rq->curr)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + +#define CPUIDLE_DIFF_THREAD (1) +#define CPUIDLE_DIFF_CORE (2) +#define CPUIDLE_CACHE_BUSY (4) +#define CPUIDLE_DIFF_CPU (8) +#define CPUIDLE_THREAD_BUSY (16) +#define CPUIDLE_DIFF_NODE (32) + +/* + * The best idle CPU is chosen according to the CPUIDLE ranking above where the + * lowest value would give the most suitable CPU to schedule p onto next. The + * order works out to be the following: + * + * Same thread, idle or busy cache, idle or busy threads + * Other core, same cache, idle or busy cache, idle threads. + * Same node, other CPU, idle cache, idle threads. + * Same node, other CPU, busy cache, idle threads. + * Other core, same cache, busy threads. + * Same node, other CPU, busy threads. + * Other node, other CPU, idle cache, idle threads. + * Other node, other CPU, busy cache, idle threads. + * Other node, other CPU, busy threads. 
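+ *
+ * Numeric illustration (an editor's note, not from the original patch):
+ * a candidate sharing this core whose sibling threads are busy scores
+ * CPUIDLE_DIFF_THREAD | CPUIDLE_THREAD_BUSY = 17, while an idle
+ * candidate on another core behind the same cache, with idle cache and
+ * threads, scores only CPUIDLE_DIFF_CORE = 2, so the other core wins
+ * exactly as the ordering above states.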
+ */ +static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) +{ + int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | + CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | + CPUIDLE_DIFF_THREAD; + int cpu_tmp; + + if (cpumask_test_cpu(best_cpu, tmpmask)) + goto out; + + for_each_cpu(cpu_tmp, tmpmask) { + int ranking, locality; + struct rq *tmp_rq; + + ranking = 0; + tmp_rq = cpu_rq(cpu_tmp); + + locality = rq->cpu_locality[cpu_tmp]; +#ifdef CONFIG_NUMA + if (locality > 3) + ranking |= CPUIDLE_DIFF_NODE; + else +#endif + if (locality > 2) + ranking |= CPUIDLE_DIFF_CPU; +#ifdef CONFIG_SCHED_MC + else if (locality == 2) + ranking |= CPUIDLE_DIFF_CORE; + else if (!(tmp_rq->cache_idle(tmp_rq))) + ranking |= CPUIDLE_CACHE_BUSY; +#endif +#ifdef CONFIG_SCHED_SMT + if (locality == 1) + ranking |= CPUIDLE_DIFF_THREAD; + if (!(tmp_rq->siblings_idle(tmp_rq))) + ranking |= CPUIDLE_THREAD_BUSY; +#endif + if (ranking < best_ranking) { + best_cpu = cpu_tmp; + best_ranking = ranking; + } + } +out: + return best_cpu; +} + +bool cpus_share_cache(int this_cpu, int that_cpu) +{ + struct rq *this_rq = cpu_rq(this_cpu); + + return (this_rq->cpu_locality[that_cpu] < 3); +} + +/* As per resched_curr but only will resched idle task */ +static inline void resched_idle(struct rq *rq) +{ + if (test_tsk_need_resched(rq->idle)) + return; + + rq->preempt = rq->idle; + + set_tsk_need_resched(rq->idle); + + if (rq_local(rq)) { + set_preempt_need_resched(); + return; + } + + smp_send_reschedule(rq->cpu); +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) +{ + cpumask_t tmpmask; + struct rq *rq; + int best_cpu; + + cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); + best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); + rq = cpu_rq(best_cpu); + if (!smt_schedule(p, rq)) + return NULL; + resched_idle(rq); + return rq; +} + +static inline void resched_suitable_idle(struct task_struct *p) +{ + if (suitable_idle_cpus(p)) + resched_best_idle(p, task_cpu(p)); +} + +static inline struct rq *rq_order(struct rq *rq, int cpu) +{ + return rq->rq_order[cpu]; +} +#else /* CONFIG_SMP */ +static inline void set_cpuidle_map(int cpu) +{ +} + +static inline void clear_cpuidle_map(int cpu) +{ +} + +static inline bool suitable_idle_cpus(struct task_struct *p) +{ + return uprq->curr == uprq->idle; +} + +static inline void resched_suitable_idle(struct task_struct *p) +{ +} + +static inline void resched_curr(struct rq *rq) +{ + resched_task(rq->curr); +} + +static inline void resched_if_idle(struct rq *rq) +{ +} + +static inline bool rq_local(struct rq *rq) +{ + return true; +} + +static inline struct rq *rq_order(struct rq *rq, int cpu) +{ + return rq; +} + +static inline bool smt_schedule(struct task_struct *p, struct rq *rq) +{ + return true; +} +#endif /* CONFIG_SMP */ + +static inline int normal_prio(struct task_struct *p) +{ + if (has_rt_policy(p)) + return MAX_RT_PRIO - 1 - p->rt_priority; + if (idleprio_task(p)) + return IDLE_PRIO; + if (iso_task(p)) + return ISO_PRIO; + return NORMAL_PRIO; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks as it will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. 
Otherwise, update priority
+ * to the normal priority:
+ */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with rq locked.
+ */
+static void activate_task(struct task_struct *p, struct rq *rq)
+{
+	resched_if_idle(rq);
+
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+				     (rq->niffies - p->last_ran) >> 20);
+	}
+
+	p->prio = effective_prio(p);
+	if (task_contributes_to_load(p))
+		atomic_dec(&grq.nr_uninterruptible);
+
+	enqueue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	atomic_inc(&grq.nr_running);
+	inc_qnr();
+}
+
+/*
+ * deactivate_task - If it's running, it's not on the runqueue and we can just
+ * decrement the nr_running. Enter with rq locked.
+ */
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
+{
+	if (task_contributes_to_load(p))
+		atomic_inc(&grq.nr_uninterruptible);
+
+	p->on_rq = 0;
+	atomic_dec(&grq.nr_running);
+	sched_info_dequeued(rq, p);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	struct rq *rq = task_rq(p);
+	bool queued;
+
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold either p->pi_lock or rq->lock, when changing
+	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+	 *
+	 * Furthermore, all task_rq users should acquire both locks, see
+	 * task_rq_lock().
+	 */
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
+	if (p->wake_cpu == cpu)
+		return;
+	trace_sched_migrate_task(p, cpu);
+	perf_event_task_migrate(p);
+
+	/*
+	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+	 * successfully executed on another CPU. We must ensure that updates of
+	 * per-task data have been completed by this moment.
+	 */
+	smp_wmb();
+
+	if (task_running(rq, p)) {
+		/*
+		 * We should only be calling this on a running task if we're
+		 * holding rq lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+
+		/*
+		 * We can't change the task_thread_info cpu on a running task
+		 * as p will still be protected by the rq lock of the cpu it
+		 * is still running on, so we set the wake_cpu for it to be
+		 * lazily updated once off the cpu.
+		 */
+		p->wake_cpu = cpu;
+		return;
+	}
+
+	if ((queued = task_queued(p)))
+		dequeue_task(rq, p, 0);
+	task_thread_info(p)->cpu = p->wake_cpu = cpu;
+	if (queued)
+		enqueue_task(cpu_rq(cpu), p, 0);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Move a task off the runqueue and take it to a cpu where it will
+ * become the running task.
+ */
+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
+{
+	struct rq *p_rq = task_rq(p);
+
+	dequeue_task(p_rq, p, DEQUEUE_SAVE);
+	if (p_rq != rq) {
+		sched_info_dequeued(p_rq, p);
+		sched_info_queued(rq, p);
+	}
+	set_task_cpu(p, cpu);
+	dec_qnr();
+}
+
+/*
+ * Returns a descheduling task to the runqueue unless it is being
+ * deactivated.
+ */
+static inline void return_task(struct task_struct *p, struct rq *rq,
+			       int cpu, bool deactivate)
+{
+	if (deactivate)
+		deactivate_task(p, rq);
+	else {
+		inc_qnr();
+#ifdef CONFIG_SMP
+		/*
+		 * set_task_cpu was called on the running task that doesn't
+		 * want to deactivate, so it has to be enqueued to a different
+		 * CPU and we need its lock. Tag it to be moved as the
Tag it to be moved with as the + * lock is dropped in finish_lock_switch. + */ + if (unlikely(p->wake_cpu != cpu)) + p->on_rq = TASK_ON_RQ_MIGRATING; + else +#endif + enqueue_task(rq, p, ENQUEUE_RESTORE); + } +} + +/* Enter with rq lock held. We know p is on the local cpu */ +static inline void __set_tsk_resched(struct task_struct *p) +{ + set_tsk_need_resched(p); + set_preempt_need_resched(); +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + * + * Return: 1 if the task is currently executing. 0 otherwise. + */ +inline int task_curr(const struct task_struct *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +#ifdef CONFIG_SMP +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ + int running, queued; + unsigned long flags; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since this will return false + * if the runqueue has changed and p is actually now + * running somewhere else! + */ + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + trace_sched_wait_task(p); + running = task_running(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &flags); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(queued)) { + ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! 
+ */ + break; + } + + return ncsw; +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesn't have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. + */ +void kick_process(struct task_struct *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kick_process); +#endif + +/* + * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the + * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or + * between themselves, they cooperatively multitask. An idle rq scores as + * prio PRIO_LIMIT so it is always preempted. + */ +static inline bool +can_preempt(struct task_struct *p, int prio, u64 deadline) +{ + /* Better static priority RT task or better policy preemption */ + if (p->prio < prio) + return true; + if (p->prio > prio) + return false; + if (p->policy == SCHED_BATCH) + return false; + /* SCHED_NORMAL and ISO will preempt based on deadline */ + if (!deadline_before(p->deadline, deadline)) + return false; + return true; +} + +#ifdef CONFIG_SMP +/* + * Check to see if p can run on cpu, and if not, whether there are any online + * CPUs it can run on instead. + */ +static inline bool needs_other_cpu(struct task_struct *p, int cpu) +{ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) + return true; + return false; +} +#define cpu_online_map (*(cpumask_t *)cpu_online_mask) + +static void try_preempt(struct task_struct *p, struct rq *this_rq) +{ + int i, this_entries = rq_load(this_rq); + cpumask_t tmp; + + if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) + return; + + /* IDLEPRIO tasks never preempt anything but idle */ + if (p->policy == SCHED_IDLEPRIO) + return; + + cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); + + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *rq = this_rq->rq_order[i]; + + if (!cpumask_test_cpu(rq->cpu, &tmp)) + continue; + + if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) + continue; + if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { + resched_curr(rq); + return; + } + } +} + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check); +#else /* CONFIG_SMP */ +static inline bool needs_other_cpu(struct task_struct *p, int cpu) +{ + return false; +} + +static void try_preempt(struct task_struct *p, struct rq *this_rq) +{ + if (p->policy == SCHED_IDLEPRIO) + return; + if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) + resched_curr(uprq); +} + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} +#endif /* CONFIG_SMP */ + +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x04 /* internal use, task got migrated */ + +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +{ +#ifdef CONFIG_SCHEDSTATS + struct rq *rq = this_rq(); + +#ifdef CONFIG_SMP + int 
this_cpu = smp_processor_id(); + + if (cpu == this_cpu) + schedstat_inc(rq, ttwu_local); + else { + struct sched_domain *sd; + + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + rcu_read_unlock(); + } + +#endif /* CONFIG_SMP */ + + schedstat_inc(rq, ttwu_count); +#endif /* CONFIG_SCHEDSTATS */ +} + +static inline void ttwu_activate(struct rq *rq, struct task_struct *p) +{ + activate_task(p, rq); + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/* + * Mark the task runnable and perform wakeup-preemption. + */ +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption if there are no idle cpus, + * instead waiting for current to deschedule. + */ + if (wake_flags & WF_SYNC) + resched_suitable_idle(p); + else + try_preempt(p, rq); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ + lockdep_assert_held(&rq->lock); + +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + atomic_dec(&grq.nr_uninterruptible); +#endif + + ttwu_activate(rq, p); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (likely(task_on_rq_queued(p))) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +#ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + +void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; + unsigned long flags; + + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + + while (llist) { + int wake_flags = 0; + + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); + + ttwu_do_activate(rq, p, wake_flags); + } + + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void scheduler_ipi(void) +{ + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); + + if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) + return; + + /* + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. 
+ */ + irq_enter(); + sched_ttwu_pending(); + irq_exit(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +{ + struct rq *rq = cpu_rq(cpu); + + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } +} + +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + rq_lock_irqsave(rq, &flags); + if (likely(is_idle_task(rq->curr))) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + rq_unlock_irqrestore(rq, &flags); + } + +out: + rcu_read_unlock(); +} + +static int valid_task_cpu(struct task_struct *p) +{ + cpumask_t valid_mask; + + if (p->flags & PF_KTHREAD) + cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_online_mask); + else + cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask); + + if (unlikely(!cpumask_weight(&valid_mask))) { + /* Hotplug boot threads do this before the CPU is up */ + WARN_ON(sched_smp_initialized); + return cpumask_any(tsk_cpus_allowed(p)); + } + return cpumask_any(&valid_mask); +} + +/* + * For a task that's just being woken up we have a valuable balancing + * opportunity so choose the nearest cache most lightly loaded runqueue. + * Entered with rq locked and returns with the chosen runqueue locked. + */ +static inline int select_best_cpu(struct task_struct *p) +{ + unsigned int idlest = ~0U; + struct rq *rq = NULL; + int i; + + if (suitable_idle_cpus(p)) { + int cpu = task_cpu(p); + + if (unlikely(needs_other_cpu(p, cpu))) + cpu = valid_task_cpu(p); + rq = resched_best_idle(p, cpu); + if (likely(rq)) + return rq->cpu; + } + + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = task_rq(p)->rq_order[i]; + int entries; + + if (!other_rq->online) + continue; + if (needs_other_cpu(p, other_rq->cpu)) + continue; + entries = rq_load(other_rq); + if (entries >= idlest) + continue; + idlest = entries; + rq = other_rq; + } + if (unlikely(!rq)) + return smp_processor_id(); + return rq->cpu; +} +#else /* CONFIG_SMP */ +static int valid_task_cpu(struct task_struct *p) +{ + return 0; +} + +static inline int select_best_cpu(struct task_struct *p) +{ + return 0; +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) +{ + return NULL; +} +#endif /* CONFIG_SMP */ + +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) +{ + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) + if (!cpus_share_cache(smp_processor_id(), cpu)) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ + ttwu_queue_remote(p, cpu, wake_flags); + return; + } +#endif + rq_lock(rq); + ttwu_do_activate(rq, p, wake_flags); + rq_unlock(rq); +} + +/*** + * try_to_wake_up - wake up a thread + * @p: the thread to be awakened + * @state: the mask of task states that can be woken + * @wake_flags: wake modifier flags (WF_*) + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * Return: %true if @p was woken up, %false if it was already running. + * or @state didn't match @p's state. 
+ */ +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +{ + unsigned long flags; + int cpu, success = 0; + + /* + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller cannot be + * reordered with p->state check below. This pairs with mb() in + * set_current_state() the waiting thread does. + */ + smp_mb__before_spinlock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); + /* state is a volatile long, why, I don't understand */ + if (!((unsigned int)p->state & state)) + goto out; + + trace_sched_waking(p); + + success = 1; /* we're going to change ->state */ + cpu = task_cpu(p); + + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] p->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; + +#ifdef CONFIG_SMP + /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. + */ + smp_rmb(); + + /* + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until it's done referencing the task. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); + + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + + cpu = select_best_cpu(p); + if (task_cpu(p) != cpu) + set_task_cpu(p, cpu); +#endif /* CONFIG_SMP */ + + ttwu_queue(p, cpu, wake_flags); +stat: + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); +out: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return success; +} + +/** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not already there. The caller must + * ensure that rq is locked and that @p is not the current task. + * rq stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task.
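+ * + * (Lock order is p->pi_lock before rq->lock, which is why the slow path + * must drop the rq lock before taking pi_lock and then retake both in + * that order.)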
+ */ + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + + if (!(p->state & TASK_NORMAL)) + goto out; + + trace_sched_waking(p); + + if (!task_on_rq_queued(p)) + ttwu_activate(rq, p); + + ttwu_do_wakeup(rq, p, 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); +} + +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +int wake_up_process(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_NORMAL, 0); +} +EXPORT_SYMBOL(wake_up_process); + +int wake_up_state(struct task_struct *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +static void time_slice_expired(struct task_struct *p, struct rq *rq); + +/* + * Perform scheduler-related setup for a newly forked process p. + * p is forked by current. + */ +int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) +{ + unsigned long flags; + int cpu = get_cpu(); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + /* + * We mark the process as NEW here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_NEW; + + /* Should be reset in fork.c but done here for ease of MuQSS patching */ + p->on_cpu = + p->on_rq = + p->utime = + p->stime = + p->utimescaled = + p->stimescaled = + p->sched_time = + p->stime_ns = + p->utime_ns = 0; + skiplist_node_init(&p->node); + + /* + * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + p->policy = SCHED_NORMAL; + p->normal_prio = normal_prio(p); + } + + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; + } + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } + + /* + * Silence PROVE_RCU.
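+ * Holding pi_lock across set_task_cpu() here appears to serve only to + * satisfy its locking assertions; the freshly forked task is not yet + * visible to any other CPU.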
+ */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +#ifdef CONFIG_SCHED_INFO + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif + init_task_preempt_count(p); + + put_cpu(); + return 0; +} + +#ifdef CONFIG_SCHEDSTATS + +DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; + +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. + */ + if (!strcmp(str, "enable")) { + __sched_schedstats = true; + ret = 1; + } else if (!strcmp(str, "disable")) { + __sched_schedstats = false; + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void wake_up_new_task(struct task_struct *p) +{ + struct task_struct *parent, *rq_curr; + struct rq *rq, *new_rq; + unsigned long flags; + + parent = p->parent; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; + /* Task_rq can't change yet on a new task */ + new_rq = rq = task_rq(p); + if (unlikely(needs_other_cpu(p, task_cpu(p)))) { + set_task_cpu(p, valid_task_cpu(p)); + new_rq = task_rq(p); + } + + double_rq_lock(rq, new_rq); + update_clocks(rq); + rq_curr = rq->curr; + + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = rq_curr->normal_prio; + + activate_task(p, rq); + trace_sched_wakeup_new(p); + + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. If it's negative, it won't + * matter since that's the same as being 0. rq->rq_deadline is only + * modified within schedule() so it is always equal to + * current->deadline. 
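+ * + * For example, a parent with 6ms of slice left keeps 3ms and the child + * starts with the other 3ms, leaving the total outstanding unchanged.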
+ */ + p->last_ran = rq_curr->last_ran; + if (likely(rq_curr->policy != SCHED_FIFO)) { + rq_curr->time_slice /= 2; + if (unlikely(rq_curr->time_slice < RESCHED_US)) { + /* + * Forking task has run out of timeslice. Reschedule it and + * start its child with a new time slice and deadline. The + * child will end up running first because its deadline will + * be slightly earlier. + */ + rq_curr->time_slice = 0; + __set_tsk_resched(rq_curr); + time_slice_expired(p, new_rq); + if (suitable_idle_cpus(p)) + resched_best_idle(p, task_cpu(p)); + else if (unlikely(rq != new_rq)) + try_preempt(p, new_rq); + } else { + p->time_slice = rq_curr->time_slice; + if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + __set_tsk_resched(rq_curr); + } else + try_preempt(p, new_rq); + } + } else { + time_slice_expired(p, new_rq); + try_preempt(p, new_rq); + } + double_rq_unlock(rq, new_rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; + +void preempt_notifier_inc(void) +{ + static_key_slow_inc(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_inc); + +void preempt_notifier_dec(void) +{ + static_key_slow_dec(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_dec); + +/** + * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + if (!static_key_false(&preempt_notifier_key)) + WARN(1, "registering preempt_notifier while notifiers disabled\n"); + + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is *not* safe to call from within a preemption notifier. 
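+ * (It would delete the entry out from under the hlist walk in the + * fire_sched_{in,out}_preempt_notifiers() paths.)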
+ */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + if (static_key_false(&preempt_notifier_key)) + __fire_sched_in_preempt_notifiers(curr); +} + +static void +__fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +static __always_inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + if (static_key_false(&preempt_notifier_key)) + __fire_sched_out_preempt_notifiers(curr, next); +} + +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + sched_info_switch(rq, prev, next); + perf_event_task_sched_out(prev, next); + fire_sched_out_preempt_notifiers(prev, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. + */ +static struct rq *finish_task_switch(struct task_struct *prev) + __releases(rq->lock) +{ + struct rq *rq = this_rq(); + struct mm_struct *mm = rq->prev_mm; + long prev_state; + + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. 
+ */ + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, + "corrupted preempt_count: %s/%d/0x%x\n", + current->comm, current->pid, preempt_count())) + preempt_count_set(FORK_PREEMPT_COUNT); + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could race with its RUNNING -> DEAD + * transition, resulting in a double drop. + */ + prev_state = prev->state; + vtime_task_switch(prev); + perf_event_task_sched_in(prev, current); + finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); + + fire_sched_in_preempt_notifiers(current); + if (mm) + mmdrop(mm); + if (unlikely(prev_state == TASK_DEAD)) { + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); + put_task_struct(prev); + } + return rq; +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage __visible void schedule_tail(struct task_struct *prev) + __releases(rq->lock) +{ + struct rq *rq; + + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ + + rq = finish_task_switch(prev); + preempt_enable(); + + if (current->set_child_tid) + put_user(task_pid_vnr(current), current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new thread's register state. + */ +static __always_inline struct rq * +context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + struct mm_struct *mm, *oldmm; + + prepare_task_switch(rq, prev, next); + + mm = next->mm; + oldmm = prev->active_mm; + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. + */ + arch_start_context_switch(prev); + + if (!mm) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm_irqs_off(oldmm, mm, next); + + if (!prev->mm) { + prev->active_mm = NULL; + rq->prev_mm = oldmm; + } + /* + * The runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + barrier(); + + return finish_task_switch(prev); +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, total number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + return atomic_read(&grq.nr_running); +} + +static unsigned long nr_uninterruptible(void) +{ + return atomic_read(&grq.nr_uninterruptible); +} + +/* + * Check if only the current task is running on the cpu.
+ * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) + */ +bool single_task_running(void) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + + if (rq_load(rq) == 1) + return true; + else + return false; +} +EXPORT_SYMBOL(single_task_running); + +unsigned long long nr_context_switches(void) +{ + return (unsigned long long)atomic64_read(&grq.nr_switches); +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_possible_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +unsigned long nr_iowait_cpu(int cpu) +{ + struct rq *this = cpu_rq(cpu); + return atomic_read(&this->nr_iowait); +} + +unsigned long nr_active(void) +{ + return nr_running() + nr_uninterruptible(); +} + +/* + * I/O wait is the number of running or queued tasks with their ->rq pointer + * set to this cpu as being the CPU they're more likely to run on. + */ +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *rq = this_rq(); + + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq_load(rq); +} + +/* Variables and functions for calc_load */ +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +/* + * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. + */ +void calc_global_load(unsigned long ticks) +{ + long active; + + if (time_before(jiffies, calc_load_update)) + return; + active = nr_active() * FIXED_1; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update = jiffies + LOAD_FREQ; +} + +DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); + +EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. 
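+ * + * On 32-bit, irq_time_read() additionally uses a seqcount so that a + * reader never sees a torn 64-bit value.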
+ */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void irqtime_account_irq(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to the ksoftirqd thread + * in that case, so as not to confuse the scheduler with a special task + * that does not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(irqtime_account_irq); + +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compiler should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight misattribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops.
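+ * + * e.g. if 3ms of irq time arrived while clock_task was only due to + * advance 2ms, we consume 2ms of it now and leave the remaining 1ms + * for the next update.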
+ */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_key_false((¶virt_steal_rq_enabled))) { + s64 steal = paravirt_steal_clock(cpu_of(rq)); + + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + rq->prev_steal_time_rq += steal; + + delta -= steal; + } +#endif + rq->clock_task += delta; +} + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static void irqtime_account_hi_si(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + u64 latest_ns; + + latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)); + if (latest_ns > cpustat[CPUTIME_IRQ]) + cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy; + + latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)); + if (latest_ns > cpustat[CPUTIME_SOFTIRQ]) + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy; +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +static inline void irqtime_account_hi_si(void) +{ +} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_key_false(¶virt_steal_enabled)) { + u64 steal; + cputime_t steal_ct; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + + account_steal_time(steal_ct); + return steal_ct; + } +#endif + return false; +} + +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + cputime_t utime, stime; + struct task_struct *t; + unsigned int seq, nextseq; + unsigned long flags; + + rcu_read_lock(); + /* Attempt a lockless read on the first round. */ + nextseq = 0; + do { + seq = nextseq; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + rcu_read_unlock(); +} + +/* + * On each tick, add the number of nanoseconds to the unbanked variables and + * once one tick's worth has accumulated, account it allowing for accurate + * sub-tick accounting and totals. 
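+ * + * e.g. at HZ=1000 (JIFFY_NS of 1,000,000), two successive 600,000ns + * idle periods bank one jiffy of CPUTIME_IDLE and carry the remaining + * 200,000ns forward.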
+ */ +static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; + + if (atomic_read(&rq->nr_iowait) > 0) { + rq->iowait_ns += ns; + if (rq->iowait_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->iowait_ns); + cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks; + rq->iowait_ns %= JIFFY_NS; + } + } else { + rq->idle_ns += ns; + if (rq->idle_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->idle_ns); + cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks; + rq->idle_ns %= JIFFY_NS; + } + } + acct_update_integrals(idle); +} + +static void pc_system_time(struct rq *rq, struct task_struct *p, + int hardirq_offset, unsigned long ns) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; + + p->stime_ns += ns; + if (p->stime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->stime_ns); + p->stime_ns %= JIFFY_NS; + p->stime += (__force u64)cputime_one_jiffy * ticks; + p->stimescaled += one_jiffy_scaled * ticks; + account_group_system_time(p, cputime_one_jiffy * ticks); + } + p->sched_time += ns; + account_group_exec_runtime(p, ns); + + if (hardirq_count() - hardirq_offset) { + rq->irq_ns += ns; + if (rq->irq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->irq_ns); + cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->irq_ns %= JIFFY_NS; + } + } else if (in_serving_softirq()) { + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; + } + } else { + rq->system_ns += ns; + if (rq->system_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->system_ns); + cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks; + rq->system_ns %= JIFFY_NS; + } + } + acct_update_integrals(p); +} + +static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; + + p->utime_ns += ns; + if (p->utime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->utime_ns); + p->utime_ns %= JIFFY_NS; + p->utime += (__force u64)cputime_one_jiffy * ticks; + p->utimescaled += one_jiffy_scaled * ticks; + account_group_user_time(p, cputime_one_jiffy * ticks); + } + p->sched_time += ns; + account_group_exec_runtime(p, ns); + + if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + */ + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; + } + } + + if (task_nice(p) > 0 || idleprio_task(p)) { + rq->nice_ns += ns; + if (rq->nice_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->nice_ns); + cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks; + rq->nice_ns %= JIFFY_NS; + } + } else { + rq->user_ns += ns; + if (rq->user_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->user_ns); + cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks; + rq->user_ns %= JIFFY_NS; + } + } + acct_update_integrals(p); +} + +/* + * This is called on clock ticks. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + * CPU scheduler quota accounting is also performed here in microseconds. 
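+ * With the default 6ms rr_interval a full slice is only 6000us, so the + * microsecond-based time_slice arithmetic fits comfortably in an int.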
+ */ +static void +update_cpu_clock_tick(struct rq *rq, struct task_struct *p) +{ + s64 account_ns = rq->niffies - p->last_ran; + struct task_struct *idle = rq->idle; + + if (steal_account_process_tick()) + goto ts_account; + + /* Accurate tick timekeeping */ + if (user_mode(get_irq_regs())) + pc_user_time(rq, p, account_ns); + else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { + pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); + } else + pc_idle_time(rq, idle, account_ns); + + if (sched_clock_irqtime) + irqtime_account_hi_si(); + +ts_account: + /* time_slice accounting is done in usecs to avoid overflow on 32bit */ + if (p->policy != SCHED_FIFO && p != idle) + p->time_slice -= NS_TO_US(account_ns); + + p->last_ran = rq->niffies; +} + +/* + * This is called on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + * CPU scheduler quota accounting is also performed here in microseconds. + */ +static void +update_cpu_clock_switch(struct rq *rq, struct task_struct *p) +{ + s64 account_ns = rq->niffies - p->last_ran; + struct task_struct *idle = rq->idle; + + /* Accurate subtick timekeeping */ + if (p != idle) + pc_user_time(rq, p, account_ns); + else + pc_idle_time(rq, idle, account_ns); + + /* time_slice accounting is done in usecs to avoid overflow on 32bit */ + if (p->policy != SCHED_FIFO && p != idle) + p->time_slice -= NS_TO_US(account_ns); +} + +/* + * Return any ns on the sched_clock that have not yet been accounted in + * @p in case that task is currently running. + * + * Called with task_rq_lock(p) held. + */ +static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ + u64 ns = 0; + + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (p == rq->curr && task_on_rq_queued(p)) { + update_clocks(rq); + ns = rq->niffies - p->last_ran; + } + + return ns; +} + +/* + * Return accounted runtime for the task. + * Return separately the current's pending runtime that have not been + * accounted yet. + * + */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns; + +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) + /* + * 64-bit doesn't need locks to atomically read a 64bit value. + * So we have a optimization chance when the task's delta_exec is 0. + * Reading ->on_cpu is racy, but this is ok. + * + * If we race with it leaving cpu, we'll take a lock. So we're correct. + * If we race with it entering cpu, unaccounted time is 0. This is + * indistinguishable from the read occurring a few cycles earlier. + * If we see ->on_cpu without ->on_rq, the task is leaving, and has + * been accounted, so we're correct here as well. + */ + if (!p->on_cpu || !task_on_rq_queued(p)) + return tsk_seruntime(p); +#endif + + rq = task_rq_lock(p, &flags); + ns = p->sched_time + do_task_delta_exec(p, rq); + task_rq_unlock(rq, p, &flags); + + return ns; +} + +/* Compatibility crap */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ +} + +void account_idle_time(cputime_t cputime) +{ +} + +/* + * Account guest cpu time to a process. 
+ * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + /* Add guest time to process. */ + p->utime += (__force u64)cputime; + p->utimescaled += (__force u64)cputime_scaled; + account_group_user_time(p, cputime); + p->gtime += (__force u64)cputime; + + /* Add guest time to cpustat. */ + if (task_nice(p) > 0) { + cpustat[CPUTIME_NICE] += (__force u64)cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime; + } else { + cpustat[CPUTIME_USER] += (__force u64)cputime; + cpustat[CPUTIME_GUEST] += (__force u64)cputime; + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + /* Add system time to process. */ + p->stime += (__force u64)cputime; + p->stimescaled += (__force u64)cputime_scaled; + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 += (__force u64)cputime; + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * This is for guest only now. + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) + account_guest_time(p, cputime, cputime_scaled); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + cpustat[CPUTIME_STEAL] += (__force u64)cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +static void account_idle_times(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += (__force u64)cputime; + else + cpustat[CPUTIME_IDLE] += (__force u64)cputime; +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + +void account_process_tick(struct task_struct *p, int user_tick) +{ +} + +/* + * Account multiple ticks of steal time. + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of idle ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_times(jiffies_to_cputime(ticks)); +} +#endif + +/* + * Functions to test for when SCHED_ISO tasks have used their allocated + * quota as real time scheduling and convert them back to SCHED_NORMAL.
All + * data is modified only by the local runqueue during scheduler_tick with + * interrupts disabled. + */ + +/* + * Test if SCHED_ISO tasks have run longer than their alloted period as RT + * tasks and set the refractory flag if necessary. There is 10% hysteresis + * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a + * slow division. + */ +static inline void iso_tick(struct rq *rq) +{ + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + rq->iso_ticks += 100; + if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { + rq->iso_refractory = true; + if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) + rq->iso_ticks = ISO_PERIOD * 100; + } +} + +/* No SCHED_ISO task was running so decrease rq->iso_ticks */ +static inline void no_iso_tick(struct rq *rq, int ticks) +{ + if (rq->iso_ticks > 0 || rq->iso_refractory) { + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; + if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { + rq->iso_refractory = false; + if (unlikely(rq->iso_ticks < 0)) + rq->iso_ticks = 0; + } + } +} + +/* This manages tasks that have run out of timeslice during a scheduler_tick */ +static void task_running_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + /* + * If a SCHED_ISO task is running we increment the iso_ticks. In + * order to prevent SCHED_ISO tasks from causing starvation in the + * presence of true RT tasks we account those as iso_ticks as well. + */ + if (rt_task(p) || task_running_iso(p)) + iso_tick(rq); + else + no_iso_tick(rq, 1); + + /* SCHED_FIFO tasks never run out of timeslice. */ + if (p->policy == SCHED_FIFO) + return; + + if (iso_task(p)) { + if (task_running_iso(p)) { + if (rq->iso_refractory) { + /* + * SCHED_ISO task is running as RT and limit + * has been hit. Force it to reschedule as + * SCHED_NORMAL by zeroing its time_slice + */ + p->time_slice = 0; + } + } else if (!rq->iso_refractory) { + /* Can now run again ISO. Reschedule to pick up prio */ + goto out_resched; + } + } + + /* + * Tasks that were scheduled in the first half of a tick are not + * allowed to run into the 2nd half of the next tick if they will + * run out of time slice in the interim. Otherwise, if they have + * less than RESCHED_US μs of time slice left they will be rescheduled. + */ + if (p->time_slice - rq->dither >= RESCHED_US) + return; +out_resched: + rq_lock(rq); + __set_tsk_resched(p); + rq_unlock(rq); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +void scheduler_tick(void) +{ + int cpu __maybe_unused = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + sched_clock_tick(); + update_rq_clock(rq); + update_load_avg(rq); + update_cpu_clock_tick(rq, rq->curr); + if (!rq_idle(rq)) + task_running_tick(rq); + else + no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy); + rq->last_scheduler_tick = rq->last_jiffy; + rq->last_tick = rq->clock; + perf_event_task_tick(); +} + +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. 
+ */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} + +void preempt_count_add(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; +#endif + __preempt_count_add(val); +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Spinlock count overflowing soon? + */ + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); +#endif + preempt_latency_start(val); +} +EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); + +/* + * If the value passed in equals the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + +void preempt_count_sub(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + + preempt_latency_stop(val); + __preempt_count_sub(val); +} +EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub); + +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } +#endif + +/* + * The time_slice is only refilled when it is empty and that is when we set a + * new deadline. Make sure update_clocks has been called recently to update + * rq->niffies. + */ +static void time_slice_expired(struct task_struct *p, struct rq *rq) +{ + p->time_slice = timeslice(); + p->deadline = rq->niffies + task_deadline_diff(p); +#ifdef CONFIG_SMT_NICE + if (!p->mm) + p->smt_bias = 0; + else if (rt_task(p)) + p->smt_bias = 1 << 30; + else if (task_running_iso(p)) + p->smt_bias = 1 << 29; + else if (idleprio_task(p)) { + if (task_running_idle(p)) + p->smt_bias = 0; + else + p->smt_bias = 1; + } else if (--p->smt_bias < 1) + p->smt_bias = MAX_PRIO - p->static_prio; +#endif +} + +/* + * Timeslices below RESCHED_US are considered as good as expired as there's no + * point rescheduling when there's so little time left. SCHED_BATCH tasks + * have been flagged as not latency sensitive and likely to be fully CPU + * bound so every time they're rescheduled they have their time_slice + * refilled, but get a new later deadline to have little effect on + * SCHED_NORMAL tasks. + */ +static inline void check_deadline(struct task_struct *p, struct rq *rq) +{ + if (p->time_slice < RESCHED_US || batch_task(p)) + time_slice_expired(p, rq); +} + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/* + * Task selection with skiplists is a simple matter of picking off the first + * task in the sorted list, an O(1) operation. The lookup is amortised O(1) + * being bound to the number of processors. + * + * Runqueues are selectively locked based on their unlocked data and then + * unlocked if not needed. At most 3 locks will be held at any time and are + * released as soon as they're no longer needed. All balancing between CPUs + * is thus done here in an extremely simple first-come, best-fit manner. + * + * This iterates over runqueues in cache locality order. In interactive mode + * it iterates over all CPUs and finds the task with the best key/deadline.
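+ * (Keys sort ascending, so the lowest key denotes the most entitled + * task and wins.)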
+ * In non-interactive mode it will only take a task if it's from the current + * runqueue or a runqueue with more tasks than the current one with a better + * key/deadline. + */ +static inline struct +task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) +{ + struct task_struct *edt = idle; + struct rq *locked = NULL; + int i, best_entries = 0; + u64 best_key = ~0ULL; + + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = rq_order(rq, i); + int entries = other_rq->sl->entries; + struct task_struct *p; + u64 key; + + /* + * Check for queued entres lockless first. The local runqueue + * is locked so entries will always be accurate. + */ + if (!sched_interactive) { + if (entries <= best_entries) + continue; + } else if (!entries) + continue; + + /* if (i) implies other_rq != rq */ + if (i) { + /* Check for best id queued lockless first */ + if (other_rq->best_key >= best_key) + continue; + + if (unlikely(!trylock_rq(rq, other_rq))) + continue; + + /* Need to reevaluate entries after locking */ + entries = other_rq->sl->entries; + if (unlikely(!entries)) { + unlock_rq(other_rq); + continue; + } + } + key = other_rq->node.next[0]->key; + /* Reevaluate key after locking */ + if (unlikely(key >= best_key)) { + if (i) + unlock_rq(other_rq); + continue; + } + + p = other_rq->node.next[0]->value; + if (!smt_schedule(p, rq)) { + if (i) + unlock_rq(other_rq); + continue; + } + + /* Make sure affinity is ok */ + if (i) { + if (needs_other_cpu(p, cpu)) { + unlock_rq(other_rq); + continue; + } + if (locked) + unlock_rq(locked); + locked = other_rq; + } + + best_entries = entries; + best_key = key; + edt = p; + } + + if (likely(edt != idle)) + take_task(rq, cpu, edt); + + if (locked) + unlock_rq(locked); + + return edt; +} + +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ + if (oops_in_progress) + return; + + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); + + debug_show_held_locks(prev); + print_modules(); + if (irqs_disabled()) + print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ +#ifdef CONFIG_SCHED_STACK_END_CHECK + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); +#endif + + if (unlikely(in_atomic_preempt_off())) { + __schedule_bug(prev); + preempt_count_set(PREEMPT_DISABLED); + } + rcu_sleep_check(); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + + schedstat_inc(this_rq(), sched_count); +} + +/* + * The currently running task's information is all stored in rq local data + * which is only modified by the local CPU. 
+ */ +static inline void set_rq_task(struct rq *rq, struct task_struct *p) +{ + rq->rq_deadline = p->deadline; + rq->rq_prio = p->prio; +#ifdef CONFIG_SMT_NICE + rq->rq_mm = p->mm; + rq->rq_smt_bias = p->smt_bias; +#endif +} + +#ifdef CONFIG_SMT_NICE +static void check_no_siblings(struct rq __maybe_unused *this_rq) {} +static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} +static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; +static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; + +/* Iterate over smt siblings when we've scheduled a process on cpu and decide + * whether they should continue running or be descheduled. */ +static void check_smt_siblings(struct rq *this_rq) +{ + int other_cpu; + + for_each_cpu(other_cpu, &this_rq->thread_mask) { + struct task_struct *p; + struct rq *rq; + + rq = cpu_rq(other_cpu); + if (rq_idle(rq)) + continue; + if (unlikely(!rq->online)) + continue; + p = rq->curr; + if (!smt_schedule(p, this_rq)) { + set_tsk_need_resched(p); + smp_send_reschedule(other_cpu); + } + } +} + +static void wake_smt_siblings(struct rq *this_rq) +{ + int other_cpu; + + if (!queued_notrunning()) + return; + + for_each_cpu(other_cpu, &this_rq->thread_mask) { + struct rq *rq; + + rq = cpu_rq(other_cpu); + if (unlikely(!rq->online)) + continue; + if (rq_idle(rq)) { + struct task_struct *p = rq->curr; + + set_tsk_need_resched(p); + smp_send_reschedule(other_cpu); + } + } +} +#else +static void check_siblings(struct rq __maybe_unused *this_rq) {} +static void wake_siblings(struct rq __maybe_unused *this_rq) {} +#endif + +/* + * schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space + * + * WARNING: must be called with preemption disabled! + */ +static void __sched notrace __schedule(bool preempt) +{ + struct task_struct *prev, *next, *idle; + unsigned long *switch_count; + bool deactivate = false; + struct rq *rq; + u64 niffies; + int cpu; + + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + prev = rq->curr; + idle = rq->idle; + + /* + * do_exit() calls schedule() with preemption disabled as an exception; + * however we must fix that up, otherwise the next task will see an + * inconsistent (higher) preempt count. + * + * It also avoids the below schedule_debug() test from complaining + * about this. 
+ */ + if (unlikely(prev->state == TASK_DEAD)) + preempt_enable_no_resched_notrace(); + + schedule_debug(prev); + + local_irq_disable(); + rcu_note_context_switch(); + + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller to avoid the race with signal_wake_up(). + */ + smp_mb__before_spinlock(); + rq_lock(rq); +#ifdef CONFIG_SMP + if (rq->preempt) { + /* + * Make sure resched_curr hasn't triggered a preemption + * locklessly on a task that has since scheduled away. Spurious + * wakeup of idle is okay though. + */ + if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { + rq->preempt = NULL; + clear_preempt_need_resched(); + rq_unlock_irq(rq); + return; + } + rq->preempt = NULL; + } +#endif + + switch_count = &prev->nivcsw; + if (!preempt && prev->state) { + if (unlikely(signal_pending_state(prev->state, prev))) { + prev->state = TASK_RUNNING; + } else { + deactivate = true; + prev->on_rq = 0; + + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev); + if (to_wakeup) { + /* This shouldn't happen, but does */ + if (WARN_ONCE((to_wakeup == prev), "Waking up prev as worker\n")) + deactivate = false; + else + try_to_wake_up_local(to_wakeup); + } + } + } + switch_count = &prev->nvcsw; + } + + /* + * Store the niffy value here for use by the next task's last_ran + * below to avoid losing niffies due to update_clocks being called + * again after this point. + */ + update_clocks(rq); + niffies = rq->niffies; + update_cpu_clock_switch(rq, prev); + if (rq->clock - rq->last_tick > HALF_JIFFY_NS) + rq->dither = 0; + else + rq->dither = HALF_JIFFY_US; + + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + + if (idle != prev) { + check_deadline(prev, rq); + return_task(prev, rq, cpu, deactivate); + } + + if (unlikely(!queued_notrunning())) { + next = idle; + schedstat_inc(rq, sched_goidle); + set_cpuidle_map(cpu); + update_load_avg(rq); + } else { + next = earliest_deadline_task(rq, cpu, idle); + if (likely(next->prio != PRIO_LIMIT)) + clear_cpuidle_map(cpu); + else { + set_cpuidle_map(cpu); + update_load_avg(rq); + } + } + + set_rq_task(rq, next); + next->last_ran = niffies; + + if (likely(prev != next)) { + /* + * Don't reschedule an idle task or deactivated tasks + */ + if (prev != idle && !deactivate) + resched_suitable_idle(prev); + if (next != idle) + check_siblings(rq); + else + wake_siblings(rq); + atomic64_inc(&grq.nr_switches); + rq->curr = next; + ++*switch_count; + + trace_sched_switch(preempt, prev, next); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + } else { + check_siblings(rq); + rq_unlock_irq(rq); + } +} + +static inline void sched_submit_work(struct task_struct *tsk) +{ + if (!tsk->state || tsk_is_pi_blocked(tsk) || + preempt_count() || + signal_pending_state(tsk->state, tsk)) + return; + + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. 
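+	 *
+	 * A minimal sketch of the hang this avoids (illustrative only,
+	 * not a call chain from this file):
+	 *
+	 *	blk_start_plug(&plug);
+	 *	submit_bio(bio);	<- sits on the plug list, unissued
+	 *	mutex_lock(&lock);	<- sleeps; without the flush below
+	 *				   the bio is never issued, and the
+	 *				   lock holder may be waiting on it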
+ */ + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} + +asmlinkage __visible void __sched schedule(void) +{ + struct task_struct *tsk = current; + + sched_submit_work(tsk); + do { + preempt_disable(); + __schedule(false); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} + +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +asmlinkage __visible void __sched schedule_user(void) +{ + /* + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. + * + * NB: There are buggy callers of this function. Ideally we + * should warn if prev_state != IN_USER, but that will trigger + * too frequently to make sense yet. + */ + enum ctx_state prev_state = exception_enter(); + schedule(); + exception_exit(prev_state); +} +#endif + +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + sched_preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + +static void __sched notrace preempt_schedule_common(void) +{ + do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ + preempt_disable_notrace(); + preempt_latency_start(1); + __schedule(true); + preempt_latency_stop(1); + preempt_enable_no_resched_notrace(); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + } while (need_resched()); +} + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage __visible void __sched notrace preempt_schedule(void) +{ + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (likely(!preemptible())) + return; + + preempt_schedule_common(); +} +NOKPROBE_SYMBOL(preempt_schedule); +EXPORT_SYMBOL(preempt_schedule); + +/** + * preempt_schedule_notrace - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. 
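+ *
+ * One possible path, sketched for illustration:
+ *
+ *	traced_function()
+ *	  preempt_enable_notrace()	<- NEED_RESCHED is set
+ *	    preempt_schedule_notrace()
+ *	      exception_enter()		<- leave user context state
+ *	      __schedule(true)
+ *	      exception_exit()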
+ */ +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ + preempt_disable_notrace(); + preempt_latency_start(1); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(true); + exception_exit(prev_ctx); + + preempt_latency_stop(1); + preempt_enable_no_resched_notrace(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_notrace); + +#endif /* CONFIG_PREEMPT */ + +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. + */ +asmlinkage __visible void __sched preempt_schedule_irq(void) +{ + enum ctx_state prev_state; + + /* Catch callers which need to be fixed */ + BUG_ON(preempt_count() || !irqs_disabled()); + + prev_state = exception_enter(); + + do { + preempt_disable(); + local_irq_enable(); + __schedule(true); + local_irq_disable(); + sched_preempt_enable_no_resched(); + } while (need_resched()); + + exception_exit(prev_state); +} + +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, + void *key) +{ + return try_to_wake_up(curr->private, mode, wake_flags); +} +EXPORT_SYMBOL(default_wake_function); + +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. + */ +void rt_mutex_setprio(struct task_struct *p, int prio) +{ + struct rq *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = __task_rq_lock(p); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. 
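+	 *
+	 * For contrast, the ordinary inheritance sequence this function
+	 * serves looks roughly like (sketch):
+	 *
+	 *	low-prio task L holds an rt_mutex
+	 *	high-prio task H blocks on that mutex
+	 *	  -> rt_mutex_setprio(L, H->prio)		(boost L)
+	 *	L unlocks the mutex
+	 *	  -> rt_mutex_setprio(L, L->normal_prio)	(deboost L)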
+ */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + + trace_sched_pi_setprio(p, prio); + oldprio = p->prio; + p->prio = prio; + if (task_running(rq, p)){ + if (prio > oldprio) + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); + if (prio < oldprio) + try_preempt(p, rq); + } +out_unlock: + __task_rq_unlock(rq); +} + +#endif + +/* + * Adjust the deadline for when the priority is to change, before it's + * changed. + */ +static inline void adjust_deadline(struct task_struct *p, int new_prio) +{ + p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); +} + +void set_user_nice(struct task_struct *p, long nice) +{ + int new_static, old_static; + unsigned long flags; + struct rq *rq; + + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + return; + new_static = NICE_TO_PRIO(nice); + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL/SCHED_BATCH: + */ + if (has_rt_policy(p)) { + p->static_prio = new_static; + goto out_unlock; + } + + adjust_deadline(p, new_static); + old_static = p->static_prio; + p->static_prio = new_static; + p->prio = effective_prio(p); + + if (task_queued(p)) { + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); + if (new_static < old_static) + try_preempt(p, rq); + } else if (task_running(rq, p)) { + set_rq_task(rq, p); + if (old_static < new_static) + resched_task(p); + } +out_unlock: + task_rq_unlock(rq, p, &flags); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = nice_to_rlimit(nice); + + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || + capable(CAP_SYS_NICE)); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); + nice = task_nice(current) + increment; + + nice = clamp_val(nice, MIN_NICE, MAX_NICE); + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * Return: The priority value as seen by users in /proc. + * RT tasks are offset by -100. Normal tasks are centered around 1, value goes + * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). 
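+ *
+ * Worked example (approximate; the scaling depends on the rr_interval
+ * tunables feeding ms_longest_deadline_diff()): a SCHED_NORMAL task
+ * whose deadline lies half of ms_longest_deadline_diff() away gets
+ * delta = 40 / 2 = 20 added, and an idleprio task gets a further +40.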
+ */ +int task_prio(const struct task_struct *p) +{ + int delta, prio = p->prio - MAX_RT_PRIO; + + /* rt tasks and iso tasks */ + if (prio <= 0) + goto out; + + /* Convert to ms to avoid overflows */ + delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); + delta = delta * 40 / ms_longest_deadline_diff(); + if (delta > 0 && delta <= 80) + prio += delta; + if (idleprio_task(p)) + prio += 40; +out: + return prio; +} + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + * + * Return: The idle task for the cpu @cpu. + */ +struct task_struct *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise. + */ +static inline struct task_struct *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_vpid(pid) : current; +} + +/* Actually do priority change: must hold rq lock. */ +static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, + int prio, bool keep_boost) +{ + int oldrtprio, oldprio; + + p->policy = policy; + oldrtprio = p->rt_priority; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + oldprio = p->prio; + /* + * Keep a potential priority boosting if called from + * sched_setscheduler(). + */ + if (keep_boost) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); + } else + p->prio = p->normal_prio; + + if (task_running(rq, p)) { + set_rq_task(rq, p); + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); + if (p->prio < oldprio || p->rt_priority > oldrtprio) + try_preempt(p, rq); + } +} + +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); + rcu_read_unlock(); + return match; +} + +static int +__sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool user, bool pi) +{ + struct sched_param zero_param = { .sched_priority = 0 }; + unsigned long flags, rlim_rtprio = 0; + int retval, oldpolicy = -1; + int reset_on_fork; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); + + if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { + unsigned long lflags; + + if (!lock_task_sighand(p, &lflags)) + return -ESRCH; + rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); + unlock_task_sighand(p, &lflags); + if (rlim_rtprio) + goto recheck; + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + * We also set the parameter to zero to pass the checks. 
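+		 *
+		 * For example (illustrative values): an unprivileged
+		 * caller asking for SCHED_FIFO priority 50 with a zero
+		 * RLIMIT_RTPRIO ends up with SCHED_ISO, priority 0,
+		 * where mainline would return -EPERM instead.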
+ */ + policy = SCHED_ISO; + param = &zero_param; + } +recheck: + /* double check policy once rq lock held */ + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; + policy = oldpolicy = p->policy; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (!SCHED_RANGE(policy)) + return -EINVAL; + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and + * SCHED_BATCH is 0. + */ + if (param->sched_priority < 0 || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) + return -EINVAL; + if (is_rt_policy(policy) != (param->sched_priority != 0)) + return -EINVAL; + + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (user && !capable(CAP_SYS_NICE)) { + if (is_rt_policy(policy)) { + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } else { + switch (p->policy) { + /* + * Can only downgrade policies but not back to + * SCHED_NORMAL + */ + case SCHED_ISO: + if (policy == SCHED_ISO) + goto out; + if (policy != SCHED_NORMAL) + return -EPERM; + break; + case SCHED_BATCH: + if (policy == SCHED_BATCH) + goto out; + if (policy != SCHED_IDLEPRIO) + return -EPERM; + break; + case SCHED_IDLEPRIO: + if (policy == SCHED_IDLEPRIO) + goto out; + return -EPERM; + default: + break; + } + } + + /* can't change other user's priorities */ + if (!check_same_owner(p)) + return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; + } + + if (user) { + retval = security_task_setscheduler(p); + if (retval) + return retval; + } + + /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + * + * To be able to change p->policy safely, the runqueue lock must be + * held. + */ + rq = task_rq_lock(p, &flags); + + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + task_rq_unlock(rq, p, &flags); + return -EINVAL; + } + + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!is_rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + task_rq_unlock(rq, p, &flags); + return 0; + } + + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &flags); + goto recheck; + } + p->sched_reset_on_fork = reset_on_fork; + + __setscheduler(p, rq, policy, param->sched_priority, pi); + task_rq_unlock(rq, p, &flags); + + if (pi) + rt_mutex_adjust_pi(p); +out: + return 0; +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + * + * NOTE that the task may be already dead. 
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true, true);
+}
+
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	const struct sched_param param = { .sched_priority = attr->sched_priority };
+	int policy = attr->sched_policy;
+
+	return __sched_setscheduler(p, policy, &param, true, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+			       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param lparam;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we don't know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+	/* sched/core.c uses zero here but we already know ret is zero */
+	return 0;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	return -E2BIG;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+				       struct sched_param __user *param)
+{
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY	-1
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+			       unsigned int, flags)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || flags)
+		return -EINVAL;
+
+	retval = sched_copy_attr(uattr, &attr);
+	if (retval)
+		return retval;
+
+	if ((int)attr.sched_policy < 0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	rcu_read_unlock();
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+	struct sched_param lp = { .sched_priority = 0 };
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	if (has_rt_policy(p))
+		lp.sched_priority = p->rt_priority;
+	rcu_read_unlock();
+
+	/*
+	 * This one might sleep, we cannot do it with a spinlock held ...
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get incomplete information.
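+	 *
+	 * Example (sketch; SCHED_ATTR_SIZE_VER0 is 48 bytes): an old
+	 * binary passes usize == 48 while the kernel struct is larger.
+	 * If every byte of the kernel copy past offset 48 is zero,
+	 * attr->size is clamped to 48 and the copy succeeds; otherwise
+	 * -EFBIG tells user-space its view is too old for this task.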
+ */ + if (usize < sizeof(*attr)) { + unsigned char *addr; + unsigned char *end; + + addr = (void *)attr + usize; + end = (void *)attr + sizeof(*attr); + + for (; addr < end; addr++) { + if (*addr) + return -EFBIG; + } + + attr->size = usize; + } + + ret = copy_to_user(uattr, attr, attr->size); + if (ret) + return -EFAULT; + + /* sched/core.c uses zero here but we already know ret is zero */ + return ret; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. + */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || size > PAGE_SIZE || + size < SCHED_ATTR_SIZE_VER0 || flags) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + attr.sched_policy = p->policy; + if (rt_task(p)) + attr.sched_priority = p->rt_priority; + else + attr.sched_nice = task_nice(p); + + rcu_read_unlock(); + + retval = sched_read_attr(uattr, &attr, size); + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + cpumask_var_t cpus_allowed, new_mask; + struct task_struct *p; + int retval; + + get_online_cpus(); + rcu_read_lock(); + + p = find_process_by_pid(pid); + if (!p) { + rcu_read_unlock(); + put_online_cpus(); + return -ESRCH; + } + + /* Prevent p going away */ + get_task_struct(p); + rcu_read_unlock(); + + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + retval = -EPERM; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } + + retval = security_task_setscheduler(p); + if (retval) + goto out_unlock; + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); +again: + retval = __set_cpus_allowed_ptr(p, new_mask, true); + + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + } +out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: + put_task_struct(p); + put_online_cpus(); + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + cpumask_t *new_mask) +{ + if (len < sizeof(cpumask_t)) { + memset(new_mask, 0, sizeof(cpumask_t)); + } else if (len > sizeof(cpumask_t)) { + len = sizeof(cpumask_t); + } + return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; +} + + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; +} + +long sched_getaffinity(pid_t pid, cpumask_t *mask) +{ + struct task_struct *p; + unsigned long flags; + int retval; + + get_online_cpus(); + rcu_read_lock(); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +out_unlock: + rcu_read_unlock(); + put_online_cpus(); + + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) + return -EINVAL; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) + ret = -EFAULT; + else + ret = retlen; + } + free_cpumask_var(mask); + + return ret; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. It does this by + * scheduling away the current task. If it still has the earliest deadline + * it will be scheduled again as the next task. + * + * Return: 0. + */ +SYSCALL_DEFINE0(sched_yield) +{ + struct task_struct *p; + struct rq *rq; + + p = current; + rq = this_rq_lock(); + time_slice_expired(p, rq); + schedstat_inc(task_rq(p), yld_count); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); + sched_preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +int __sched _cond_resched(void) +{ + if (should_resched(0)) { + preempt_schedule_common(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(_cond_resched); + +/* + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). 
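+ *
+ * Typical usage, sketched with hypothetical helpers (more_work() and
+ * do_chunk() are placeholders; the lock may have been dropped and
+ * retaken whenever this returns 1):
+ *
+ *	spin_lock(&lock);
+ *	while (more_work()) {
+ *		do_chunk();
+ *		cond_resched_lock(&lock);
+ *	}
+ *	spin_unlock(&lock);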
+ */ +int __cond_resched_lock(spinlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held(lock); + + if (spin_needbreak(lock) || resched) { + spin_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + spin_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_lock); + +int __sched __cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { + local_bh_enable(); + preempt_schedule_common(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(__cond_resched_softirq); + +/** + * yield - yield the current processor to other threads. + * + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Return: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. + */ +int __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *rq_p; + struct rq *rq, *p_rq; + unsigned long flags; + int yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (task_running(p_rq, p) || p->state) { + yielded = -ESRCH; + goto out_irq; + } + + double_rq_lock(rq, p_rq); + if (unlikely(task_rq(p) != p_rq)) { + double_rq_unlock(rq, p_rq); + goto again; + } + + yielded = 1; + rq_p = rq->curr; + if (p->deadline > rq_p->deadline) + p->deadline = rq_p->deadline; + p->time_slice += rq_p->time_slice; + if (p->time_slice > timeslice()) + p->time_slice = timeslice(); + time_slice_expired(rq_p, rq); + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); + + if (yielded > 0) + schedule(); + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. 
+ * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ + +long __sched io_schedule_timeout(long timeout) +{ + int old_iowait = current->in_iowait; + struct rq *rq; + long ret; + + current->in_iowait = 1; + blk_schedule_flush_plug(current); + + delayacct_blkio_start(); + rq = raw_rq(); + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + current->in_iowait = old_iowait; + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); + + return ret; +} +EXPORT_SYMBOL(io_schedule_timeout); + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLEPRIO: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLEPRIO: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct task_struct *p; + unsigned int time_slice; + unsigned long flags; + struct timespec t; + struct rq *rq; + int retval; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + rcu_read_lock(); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + rq = task_rq_lock(p, &flags); + time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); + task_rq_unlock(rq, p, &flags); + + rcu_read_unlock(); + t = ns_to_timespec(time_slice); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + +void sched_show_task(struct task_struct *p) +{ + unsigned long free = 0; + int ppid; + unsigned long state = p->state; + + if (state) + state = __ffs(state) + 1; + printk(KERN_INFO "%-15.15s %c", p->comm, + state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 + if (state == TASK_RUNNING) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + free = stack_not_used(p); +#endif + ppid = 0; + rcu_read_lock(); + if (pid_alive(p)) + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), ppid, + (unsigned long)task_thread_info(p)->flags); + + print_worker_info(KERN_INFO, p); + show_stack(p, NULL); +} + +void show_state_filter(unsigned long state_filter) +{ + struct task_struct *g, *p; + +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); +#else + printk(KERN_INFO + " task PC stack pid father\n"); +#endif + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. + */ + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); + if (!state_filter || (p->state & state_filter)) + sched_show_task(p); + } + + rcu_read_unlock(); + /* + * Only show locks if all tasks are dumped: + */ + if (!state_filter) + debug_show_all_locks(); +} + +void dump_cpu_task(int cpu) +{ + pr_info("Task dump for CPU %d:\n", cpu); + sched_show_task(cpu_curr(cpu)); +} + +#ifdef CONFIG_SMP +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +{ + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); +} + +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&p->pi_lock); + + cpumask_copy(tsk_cpus_allowed(p), new_mask); + + if (task_queued(p)) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + } + if (needs_other_cpu(p, task_cpu(p))) + set_task_cpu(p, valid_task_cpu(p)); +} +#endif + +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. + */ +void init_idle(struct task_struct *idle, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); + idle->last_ran = rq->niffies; + idle->state = TASK_RUNNING; + /* Setting prio to illegal value shouldn't matter when never queued */ + idle->prio = PRIO_LIMIT; + + kasan_unpoison_task_stack(idle); + +#ifdef CONFIG_SMP + /* + * It's possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialisation. + */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMT_NICE + idle->smt_bias = 0; +#endif +#endif + set_rq_task(rq, idle); + + /* Silence PROVE_RCU */ + rcu_read_lock(); + set_task_cpu(idle, cpu); + rcu_read_unlock(); + + rq->curr = rq->idle = idle; + idle->on_rq = TASK_ON_RQ_QUEUED; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); + + /* Set the preempt count _outside_ the spinlocks! 
*/
+	init_idle_preempt_count(idle, cpu);
+
+	ftrace_graph_init_idle_task(idle, cpu);
+#ifdef CONFIG_SMP
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
+			      const struct cpumask __maybe_unused *trial)
+{
+	return 1;
+}
+
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary. Thus, cpusets are not
+	 * applicable for such threads. This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY)
+		ret = -EINVAL;
+
+	return ret;
+}
+
+void resched_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	rq_lock_irqsave(rq, &flags);
+	resched_task(cpu_curr(cpu));
+	rq_unlock_irqrestore(rq, &flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+void nohz_balance_enter_idle(int cpu)
+{
+}
+
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int i, cpu = smp_processor_id();
+	struct sched_domain *sd;
+
+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+		return cpu;
+
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (cpu == i)
+				continue;
+
+			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
+	}
+
+	if (!is_housekeeping_cpu(cpu))
+		cpu = housekeeping_any_cpu();
+unlock:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU.
In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + if (cpu == smp_processor_id()) + return; + + if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + +void wake_up_nohz_cpu(int cpu) +{ + wake_up_idle_cpu(cpu); +} +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool running_wrong = false; + struct cpumask old_mask; + bool queued = false; + unsigned long flags; + struct rq *rq; + int ret = 0; + + rq = task_rq_lock(p, &flags); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } + + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + + cpumask_copy(&old_mask, tsk_cpus_allowed(p)); + if (cpumask_equal(&old_mask, new_mask)) + goto out; + + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + ret = -EINVAL; + goto out; + } + + queued = task_queued(p); + + do_set_cpus_allowed(p, new_mask); + + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + tsk_nr_cpus_allowed(p) != 1); + } + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + if (task_running(rq, p)) { + /* Task is running on the wrong cpu now, reschedule it. */ + if (rq == this_rq()) { + set_tsk_need_resched(p); + running_wrong = true; + } else + resched_task(p); + } else { + int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + struct rq *dest_rq = cpu_rq(dest_cpu); + + /* Switch rq locks here */ + lock_second_rq(rq, dest_rq); + set_task_cpu(p, dest_cpu); + rq_unlock(rq); + + rq = dest_rq; + } +out: + if (queued && !cpumask_subset(new_mask, &old_mask)) + try_preempt(p, rq); + if (running_wrong) + preempt_disable(); + task_rq_unlock(rq, p, &flags); + + if (running_wrong) { + __schedule(true); + preempt_enable(); + } + + return ret; +} + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Run through task list and find tasks affined to the dead cpu, then remove + * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold + * cpu 0 and src_cpu's runqueue locks. 
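+ *
+ * Example (hypothetical numbers): when cpu 3 goes offline, a task
+ * affined only to cpu 3 gets cpu 3 cleared from and cpu 0 set in its
+ * allowed mask, and p->zerobound is flagged so that unbind_zero() can
+ * give cpu 3 back when it comes online again.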
+ */ +static void bind_zero(int src_cpu) +{ + struct task_struct *p, *t; + int bound = 0; + + if (src_cpu == 0) + return; + + do_each_thread(t, p) { + if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) { + bool local = (task_cpu(p) == src_cpu); + + /* task_running is the cpu stopper thread */ + if (local && task_running(task_rq(p), p)) + continue; + atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p)); + atomic_set_cpu(0, tsk_cpus_allowed(p)); + p->zerobound = true; + bound++; + if (local) + set_task_cpu(p, 0); + } + } while_each_thread(t, p); + + if (bound) { + printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", + bound, src_cpu); + } +} + +/* Find processes with the zerobound flag and reenable their affinity for the + * CPU coming alive. */ +static void unbind_zero(int src_cpu) +{ + int unbound = 0, zerobound = 0; + struct task_struct *p, *t; + + if (src_cpu == 0) + return; + + do_each_thread(t, p) { + if (!p->mm) + p->zerobound = false; + if (p->zerobound) { + unbound++; + cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p)); + /* Once every CPU affinity has been re-enabled, remove + * the zerobound flag */ + if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) { + p->zerobound = false; + zerobound++; + } + } + } while_each_thread(t, p); + + if (unbound) { + printk(KERN_INFO "Added affinity for %d processes to cpu %d\n", + unbound, src_cpu); + } + if (zerobound) { + printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n", + zerobound); + } +} + +/* + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) { + switch_mm_irqs_off(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } + mmdrop(mm); +} +#else /* CONFIG_HOTPLUG_CPU */ +static void unbind_zero(int src_cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ + +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param stop_param = { .sched_priority = STOP_PRIO }; + struct sched_param start_param = { .sched_priority = 0 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling policy so that + * it can die in pieces. + */ + sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); + } +} + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. 
+ */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +#define CPU_LOAD_IDX_MAX 5 +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + 
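+/*
+ * The tree built above shows up under /proc/sys/kernel/sched_domain/;
+ * roughly (the exact layout depends on this machine's topology):
+ *
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/flags
+ *	/proc/sys/kernel/sched_domain/cpu0/domain1/...
+ *	/proc/sys/kernel/sched_domain/cpu1/...
+ */
+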
+/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ + +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + cpumask_set_cpu(cpu_of(rq), rq->rd->online); + rq->online = true; + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + int cpu = cpu_of(rq); + + cpumask_clear_cpu(cpu, rq->rd->online); + rq->online = false; + clear_cpuidle_map(cpu); + } +} + +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + +#ifdef CONFIG_SCHED_DEBUG + +static __read_mostly int sched_debug_enabled; + +static int __init sched_debug_setup(char *str) +{ + sched_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + struct cpumask *groupmask) +{ + cpumask_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; + } + + printk(KERN_CONT "span %*pbl level %s\n", + cpumask_pr_args(sched_domain_span(sd)), sd->name); + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + + printk(KERN_CONT "\n"); + + if (!cpumask_equal(sched_domain_span(sd), groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + if (!sched_debug_enabled) + return; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + for (;;) { + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) + break; + level++; + sd = sd->parent; + if (!sd) + break; + } +} +#else /* !CONFIG_SCHED_DEBUG */ +# define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpumask_weight(sched_domain_span(sd)) == 1) + return 1; + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_AFFINE)) + return 0; + + return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) + return 0; + + if (~cflags & pflags) + return 0; + + return 1; +} + +static void free_rootdomain(struct rcu_head *rcu) +{ + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); + + cpupri_cleanup(&rd->cpupri); + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + struct root_domain *old_rd = NULL; + unsigned long flags; + + rq_lock_irqsave(rq, &flags); + + if 
(rq->rd) { + old_rd = rq->rd; + + if (cpumask_test_cpu(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpumask_clear_cpu(rq->cpu, old_rd->span); + + /* + * If we dont want to free the old_rd yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + + rq_unlock_irqrestore(rq, &flags); + + if (old_rd) + call_rcu_sched(&old_rd->rcu, free_rootdomain); +} + +static int init_rootdomain(struct root_domain *rd) +{ + memset(rd, 0, sizeof(*rd)); + + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri) != 0) + goto free_rto_mask; + return 0; + +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; +} + +static void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain); + + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + if (init_rootdomain(rd) != 0) { + kfree(rd); + return NULL; + } + + return rd; +} + +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + + if (sd_parent_degenerate(tmp, parent)) { + tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + /* + * Transfer SD_PREFER_SIBLING down in case of a + * degenerate parent; the spans match for this + * so the property transfers. 
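+ *
+ * (worked example, added for clarity: if a parent spans exactly
+ * the same CPUs as tmp and sets no flags that tmp lacks,
+ * sd_parent_degenerate() is true, the parent is spliced out of
+ * the parent/child chain here, and any SD_PREFER_SIBLING it
+ * carried moves down to tmp.)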
+ */ + if (parent->flags & SD_PREFER_SIBLING) + tmp->flags |= SD_PREFER_SIBLING; + destroy_sched_domain(parent, cpu); + } else + tmp = tmp->parent; + } + + if (sd && sd_degenerate(sd)) { + tmp = sd; + sd = sd->parent; + destroy_sched_domain(tmp, cpu); + if (sd) + sd->child = NULL; + } + + sched_domain_debug(sd, cpu); + + rq_attach_root(rq, rd); + tmp = rq->sd; + rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); +} + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ret; + + alloc_bootmem_cpumask_var(&cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } + return 1; +} + +__setup("isolcpus=", isolated_cpu_setup); + +struct s_data { + struct sched_domain ** __percpu sd; + struct root_domain *rd; +}; + +enum s_alloc { + sa_rootdomain, + sa_sd, + sa_sd_storage, + sa_none, +}; + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +static int default_relax_domain_level = -1; +int sched_domain_level_max; + +static int __init setup_relax_domain_level(char *str) +{ + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } +} + +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_rootdomain: + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); /* fall through */ + case sa_none: + break; + } +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) +{ + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; + d->rd = alloc_rootdomain(); + if (!d->rd) + return sa_sd; + return sa_rootdomain; +} + +/* + * NULL the sd_data elements we've used to build the sched_domain + * structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; +} + +#ifdef CONFIG_NUMA +static int sched_domains_numa_levels; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; +#endif + +/* + * SD_flags allowed in topology descriptions. 
+ * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) + +static struct sched_domain * +sd_init(struct sched_domain_topology_level *tl, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 125, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 1*SD_WAKE_AFFINE + | 0*SD_SHARE_CPUCAPACITY + | 0*SD_SHARE_PKG_RESOURCES + | 0*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | 0*SD_NUMA + | sd_flags + , + + .last_balance = jiffies, + .balance_interval = sd_weight, + .smt_gain = 0, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif + }; + + /* + * Convert topological properties into behaviour. + */ + + if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; + + return sd; +} + +/* + * Topology list, bottom-up. 
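+ *
+ * (clarifying note, added: "bottom-up" means smallest spans first,
+ * i.e. SMT siblings, then cores sharing a cache (MC), then the
+ * whole package (DIE), with NUMA levels appended at runtime by
+ * sched_init_numa().)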
+ */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + +static void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. + * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. + */ + next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { + for (j = 0; j < nr_node_ids; j++) { + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; + } + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; + } + /* + * 'level' contains the number of unique distances, excluding the + * identity distance node_distance(i,i). + * + * The sched_domains_numa_distance[] array includes the actual distance + * numbers. + */ + + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. 
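+ *
+ * (worked example for the selection sort above, added for
+ * illustration: on a machine whose node_distance() table holds 10
+ * on the diagonal and the values 20 and 30 off it, the sort finds
+ * the unique remote distances {20, 30}, so level == 2 and two NUMA
+ * topology levels are appended.)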
+ */ + sched_domains_numa_levels = 0; + + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + if (!sched_domains_numa_masks) + return; + + /* + * Now for each level, construct a mask per node which contains all + * cpus of nodes that are that many hops away from us. + */ + for (i = 0; i < level; i++) { + sched_domains_numa_masks[i] = + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!sched_domains_numa_masks[i]) + return; + + for (j = 0; j < nr_node_ids; j++) { + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!mask) + return; + + sched_domains_numa_masks[i][j] = mask; + + for_each_node(k) { + if (node_distance(j, k) > sched_domains_numa_distance[i]) + continue; + + cpumask_or(mask, mask, cpumask_of_node(k)); + } + } + } + + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level + 1) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; + + /* + * .. and append 'j' levels of NUMA goodness. + */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, + .flags = SDTL_OVERLAP, + .numa_level = j, + SD_INIT_NAME(NUMA) + }; + } + + sched_domain_topology = tl; + + sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ + int node = cpu_to_node(cpu); + int i, j; + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +#else +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } +#endif /* CONFIG_NUMA */ + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for_each_sd_topology(tl) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for_each_sd_topology(tl) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + } + free_percpu(sdd->sd); + sdd->sd = NULL; + } +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *child, int cpu) +{ + struct sched_domain *sd = sd_init(tl, cpu); + if (!sd) + return child; + + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + 
child->parent = sd; + sd->child = child; + + if (!cpumask_subset(sched_domain_span(child), + sched_domain_span(sd))) { + pr_err("BUG: arch topology borken\n"); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" the %s domain not a subset of the %s domain\n", + child->name, sd->name); +#endif + /* Fixup, ensure @sd has at least @child cpus. */ + cpumask_or(sched_domain_span(sd), + sched_domain_span(sd), + sched_domain_span(child)); + } + + } + set_domain_attribute(sd, attr); + + return sd; +} + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) +{ + enum s_alloc alloc_state; + struct sched_domain *sd; + struct s_data d; + int i, ret = -ENOMEM; + + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + + /* Set up domains for cpus specified by the cpu_map. */ + for_each_cpu(i, cpu_map) { + struct sched_domain_topology_level *tl; + + sd = NULL; + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; + if (tl->flags & SDTL_OVERLAP) + sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; + } + } + + /* Calculate CPU capacity for physical packages and nodes */ + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); + } + } + + /* Attach the domains */ + rcu_read_lock(); + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d.sd, i); + cpu_attach_domain(sd, d.rd, i); + } + rcu_read_unlock(); + + ret = 0; +error: + __free_domain_allocs(&d, alloc_state, cpu_map); + return ret; +} + +static cpumask_var_t *doms_cur; /* current sched domains */ +static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +static cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __weak arch_update_cpu_topology(void) +{ + return 0; +} + +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. 
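+ *
+ * (example, added for clarity: booting with isolcpus=2,3 removes
+ * CPUs 2 and 3 from doms_cur[0] via the cpu_isolated_map andnot
+ * below, so no sched domain is ever attached to them.)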
+ */ +static int init_sched_domains(const struct cpumask *cpu_map) +{ + int err; + + arch_update_cpu_topology(); + ndoms_cur = 1; + doms_cur = alloc_sched_domains(ndoms_cur); + if (!doms_cur) + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + err = build_sched_domains(doms_cur[0], NULL); + register_sched_domain_sysctl(); + + return err; +} + +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ + int i; + + rcu_read_lock(); + for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); + rcu_read_unlock(); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new == NULL it will be replaced with cpu_online_mask. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + int i, j, n; + int new_topology; + + mutex_lock(&sched_domains_mutex); + + /* always unregister in case we don't destroy any domains */ + unregister_sched_domain_sysctl(); + + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + + n = doms_new ? 
ndoms_new : 0; + + /* Destroy deleted domains */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) + goto match1; + } + /* no match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur[i]); +match1: + ; + } + + n = ndoms_cur; + if (doms_new == NULL) { + n = 0; + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); + } + + /* Build new domains */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) + goto match2; + } + /* no match - add a new doms_new */ + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); +match2: + ; + } + + /* Remember the new sched domains */ + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ + doms_cur = doms_new; + dattr_cur = dattr_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + mutex_unlock(&sched_domains_mutex); +} + +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + +/* + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. + */ +static void cpuset_cpu_active(void) +{ + if (cpuhp_tasks_frozen) { + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + return; + } + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + } + + cpuset_update_active_cpus(true); +} + +static int cpuset_cpu_inactive(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) { + cpuset_update_active_cpus(false); + } else { + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + } + return 0; +} + +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. + */ + rq_lock_irqsave(rq, &flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + unbind_zero(cpu); + rq_unlock_irqrestore(rq, &flags); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. 
+ * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +int sched_cpu_starting(unsigned int __maybe_unused cpu) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(rq, cpu_rq(0)); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + bind_zero(cpu); + double_rq_unlock(rq, cpu_rq(0)); + local_irq_restore(flags); + + return 0; +} +#endif + +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +/* + * Cheaper version of the below functions in case support for SMT and MC is + * compiled in but CPUs have no siblings. + */ +static bool sole_cpu_idle(struct rq *rq) +{ + return rq_idle(rq); +} +#endif +#ifdef CONFIG_SCHED_SMT +static const cpumask_t *thread_cpumask(int cpu) +{ + return topology_sibling_cpumask(cpu); +} +/* All this CPU's SMT siblings are idle */ +static bool siblings_cpu_idle(struct rq *rq) +{ + return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map); +} +#endif +#ifdef CONFIG_SCHED_MC +static const cpumask_t *core_cpumask(int cpu) +{ + return topology_core_cpumask(cpu); +} +/* All this CPU's shared cache siblings are idle */ +static bool cache_cpu_idle(struct rq *rq) +{ + return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map); +} +#endif + +enum sched_domain_level { + SD_LV_NONE = 0, + SD_LV_SIBLING, + SD_LV_MC, + SD_LV_BOOK, + SD_LV_CPU, + SD_LV_NODE, + SD_LV_ALLNODES, + SD_LV_MAX +}; + +void __init sched_init_smp(void) +{ + struct sched_domain *sd; + int cpu, other_cpu; +#ifdef CONFIG_SCHED_SMT + bool smt_threads = false; +#endif + cpumask_var_t non_isolated_cpus; + struct rq *rq; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + + sched_init_numa(); + + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ + mutex_lock(&sched_domains_mutex); + init_sched_domains(cpu_active_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + BUG(); + free_cpumask_var(non_isolated_cpus); + + mutex_lock(&sched_domains_mutex); + local_irq_disable(); + lock_all_rqs(); + /* + * Set up the relative cache distance of each online cpu from each + * other in a simple array for quick lookup. Locality is determined + * by the closest sched_domain that CPUs are separated by. CPUs with + * shared cache in SMT and MC are treated as local. Separate CPUs + * (within the same package or physically) within the same node are + * treated as not local. CPUs not even in the same domain (different + * nodes) are treated as very distant. 
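+ *
+ * (summary of the scale used below, added for clarity: 0 means the
+ * same CPU, 1 an SMT sibling, 2 a core sharing cache, 3 another
+ * core in the same node, 4 a CPU in a different node; lower is
+ * closer.)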
+ */ + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + + /* First check if this cpu is in the same node */ + for_each_domain(cpu, sd) { + if (sd->level > SD_LV_MC) + continue; + /* Set locality to local node if not already found lower */ + for_each_cpu(other_cpu, sched_domain_span(sd)) { + if (rq->cpu_locality[other_cpu] > 3) + rq->cpu_locality[other_cpu] = 3; + } + } + + /* + * Each runqueue has its own function in case it doesn't have + * siblings of its own allowing mixed topologies. + */ +#ifdef CONFIG_SCHED_MC + for_each_cpu(other_cpu, core_cpumask(cpu)) { + if (rq->cpu_locality[other_cpu] > 2) + rq->cpu_locality[other_cpu] = 2; + } + if (cpumask_weight(core_cpumask(cpu)) > 1) { + cpumask_copy(&rq->core_mask, core_cpumask(cpu)); + cpumask_clear_cpu(cpu, &rq->core_mask); + rq->cache_idle = cache_cpu_idle; + } +#endif +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(thread_cpumask(cpu)) > 1) { + cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); + cpumask_clear_cpu(cpu, &rq->thread_mask); + for_each_cpu(other_cpu, thread_cpumask(cpu)) + rq->cpu_locality[other_cpu] = 1; + rq->siblings_idle = siblings_cpu_idle; + smt_threads = true; + } +#endif + } + for_each_possible_cpu(cpu) { + int total_cpus = 1, locality; + + rq = cpu_rq(cpu); + for (locality = 1; locality <= 4; locality++) { + for_each_possible_cpu(other_cpu) { + if (rq->cpu_locality[other_cpu] == locality) + rq->rq_order[total_cpus++] = cpu_rq(other_cpu); + } + } + } +#ifdef CONFIG_SMT_NICE + if (smt_threads) { + check_siblings = &check_smt_siblings; + wake_siblings = &wake_smt_siblings; + smt_schedule = &smt_should_schedule; + } +#endif + unlock_all_rqs(); + local_irq_enable(); + mutex_unlock(&sched_domains_mutex); + + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + + for_each_online_cpu(other_cpu) { + if (other_cpu <= cpu) + continue; + printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); + } + } + sched_smp_initialized = true; +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +#ifdef CONFIG_CGROUP_SCHED +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; +}; + +/* + * Default task group. + * Every task in system belongs to this group at bootup. 
+ */ +struct task_group root_task_group; +LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; +#endif /* CONFIG_CGROUP_SCHED */ + +void __init sched_init(void) +{ +#ifdef CONFIG_SMP + int cpu_ids; +#endif + int i; + struct rq *rq; + + prio_ratios[0] = 128; + for (i = 1 ; i < NICE_WIDTH ; i++) + prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; + + atomic_set(&grq.nr_running, 0); + atomic_set(&grq.nr_uninterruptible, 0); + atomic64_set(&grq.nr_switches, 0); + skiplist_node_init(&init_task.node); + +#ifdef CONFIG_SMP + init_defrootdomain(); + atomic_set(&grq.qnr, 0); + cpumask_clear(&grq.cpu_idle_map); +#else + uprq = &per_cpu(runqueues, 0); +#endif + +#ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + + list_add(&root_task_group.list, &task_groups); + INIT_LIST_HEAD(&root_task_group.children); + INIT_LIST_HEAD(&root_task_group.siblings); +#endif /* CONFIG_CGROUP_SCHED */ + for_each_possible_cpu(i) { + rq = cpu_rq(i); + skiplist_init(&rq->node); + rq->sl = new_skiplist(&rq->node); + raw_spin_lock_init(&rq->lock); + rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; + rq->last_jiffy = jiffies; + rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = + rq->iowait_ns = rq->idle_ns = 0; + rq->dither = 0; + set_rq_task(rq, &init_task); + rq->iso_ticks = 0; + rq->iso_refractory = false; +#ifdef CONFIG_SMP + rq->sd = NULL; + rq->rd = NULL; + rq->online = false; + rq->cpu = i; + rq_attach_root(rq, &def_root_domain); +#endif + atomic_set(&rq->nr_iowait, 0); + } + +#ifdef CONFIG_SMP + cpu_ids = i; + /* + * Set the base locality for cpu cache distance calculation to + * "distant" (3). Make sure the distance from a CPU to itself is 0. + */ + for_each_possible_cpu(i) { + int j; + + rq = cpu_rq(i); +#ifdef CONFIG_SCHED_SMT + rq->siblings_idle = sole_cpu_idle; +#endif +#ifdef CONFIG_SCHED_MC + rq->cache_idle = sole_cpu_idle; +#endif + rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); + for_each_possible_cpu(j) { + if (i == j) + rq->cpu_locality[j] = 0; + else + rq->cpu_locality[j] = 4; + } + rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); + rq->rq_order[0] = rq; + for (j = 1; j < cpu_ids; j++) + rq->rq_order[j] = cpu_rq(j); + } +#endif + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, smp_processor_id()); + +#ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + idle_thread_set_boot_cpu(); +#endif /* SMP */ + + init_schedstats(); +} + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = preempt_count() + rcu_preempt_depth(); + + return (nested == preempt_offset); +} + +void __might_sleep(const char *file, int line, int preempt_offset) +{ + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change); + + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; /* ratelimiting */ + + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif + dump_stack(); +} +EXPORT_SYMBOL(___might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +static inline void normalise_rt_tasks(void) +{ + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + /* + * Only normalize user tasks: + */ + if (p->flags & PF_KTHREAD) + continue; + + if (!rt_task(p) && !iso_task(p)) + continue; + + rq = task_rq_lock(p, &flags); + __setscheduler(p, rq, SCHED_NORMAL, 0, false); + task_rq_unlock(rq, p, &flags); + } + read_unlock(&tasklist_lock); +} + +void normalize_rt_tasks(void) +{ + normalise_rt_tasks(); +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +/* + * These functions are only useful for the IA64 MCA handling, or kdb. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + * + * Return: The current task for @cpu. 
+ */ +struct task_struct *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronised, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, struct task_struct *p) +{ + cpu_curr(cpu) = p; +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_common_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + vtime_account_user(prev); +#endif + arch_vtime_task_switch(prev); +} +#endif + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} +EXPORT_SYMBOL_GPL(task_cputime_adjusted); + +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} + +void vtime_account_system_irqsafe(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + vtime_account_system(tsk); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); + else + vtime_account_system(tsk); +} +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +/* + * Perform (stime * rtime) / total, but avoid multiplication overflow by + * losing precision when the numbers are big. + */ +static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) +{ + u64 scaled; + + for (;;) { + /* Make sure "rtime" is the bigger of stime/rtime */ + if (stime > rtime) { + u64 tmp = rtime; rtime = stime; stime = tmp; + } + + /* Make sure 'total' fits in 32 bits */ + if (total >> 32) + goto drop_precision; + + /* Does rtime (and thus stime) fit in 32 bits? */ + if (!(rtime >> 32)) + break; + + /* Can we just balance rtime/stime rather than dropping bits? 
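+	 *
+	 * (illustrative, added: halving rtime while doubling stime
+	 * keeps the product stime * rtime roughly unchanged, and the
+	 * stime >> 31 check ensures the doubled stime still fits in
+	 * 32 bits for the exact 32x32->64 multiply below.)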
*/ + if (stime >> 31) + goto drop_precision; + + /* We can grow stime and shrink rtime and try to make them both fit */ + stime <<= 1; + rtime >>= 1; + continue; + +drop_precision: + /* We drop from rtime, it has more bits than stime */ + rtime >>= 1; + total >>= 1; + } + + /* + * Make sure gcc understands that this is a 32x32->64 multiply, + * followed by a 64/32->64 divide. + */ + scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); + return (__force cputime_t) scaled; +} + +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ +static void cputime_adjust(struct task_cputime *curr, + struct prev_cputime *prev, + cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, stime, utime, total; + + stime = curr->stime; + total = stime + curr->utime; + + /* + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. + */ + rtime = nsecs_to_cputime(curr->sum_exec_runtime); + + /* + * Update userspace visible utime/stime values only if actual execution + * time is bigger than already exported. Note that can happen, that we + * provided bigger values due to scaling inaccuracy on big numbers. + */ + if (prev->stime + prev->utime >= rtime) + goto out; + + if (total) { + stime = scale_stime((__force u64)stime, + (__force u64)rtime, (__force u64)total); + utime = rtime - stime; + } else { + stime = rtime; + utime = 0; + } + + /* + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. + */ + prev->stime = max(prev->stime, stime); + prev->utime = max(prev->utime, utime); + +out: + *ut = prev->utime; + *st = prev->stime; +} + +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); + cputime_adjust(&cputime, &p->prev_cputime, ut, st); +} +EXPORT_SYMBOL_GPL(task_cputime_adjusted); + +/* + * Must be called with siglock held. 
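+ *
+ * (added note: callers typically take the lock via
+ * lock_task_sighand() before the group's prev_cputime is read and
+ * updated here.)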
+ */ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +void init_idle_bootup_task(struct task_struct *idle) +{} + +#ifdef CONFIG_SCHED_DEBUG +void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +{} + +void proc_sched_set_task(struct task_struct *p) +{} +#endif + +#ifdef CONFIG_SMP +#define SCHED_LOAD_SHIFT (10) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} +#endif + +#ifdef CONFIG_CGROUP_SCHED +static void sched_free_group(struct task_group *tg) +{ + kmem_cache_free(task_group_cache, tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ + struct task_group *tg; + + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); + if (!tg) + return ERR_PTR(-ENOMEM); + + return tg; +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ +} + +/* rcu callback to free various structures associated with a task group */ +static void sched_free_group_rcu(struct rcu_head *rhp) +{ + /* now it should be safe to free those cfs_rqs */ + sched_free_group(container_of(rhp, struct task_group, rcu)); +} + +void sched_destroy_group(struct task_group *tg) +{ + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, sched_free_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{ +} + +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + +static struct cgroup_subsys_state * +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; + + if (!parent) { + /* This is early initialization for the top cgroup */ + return &root_task_group.css; + } + + tg = sched_create_group(parent); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + return &tg->css; +} + +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + sched_offline_group(tg); +} + +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + /* + * Relies on the RCU grace period between css_released() and this. 
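+ *
+ * (added note: in other words, by the time this css_free callback
+ * runs, an RCU grace period has elapsed since css_released(), so
+ * the task_group is no longer reachable and sched_free_group()
+ * below is safe.)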
+ */ + sched_free_group(tg); +} + +static void cpu_cgroup_fork(struct task_struct *task) +{ +} + +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) +{ + return 0; +} + +static void cpu_cgroup_attach(struct cgroup_taskset *tset) +{ +} + +static struct cftype cpu_files[] = { + { } /* terminate */ +}; + +struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, + .css_free = cpu_cgroup_css_free, + .fork = cpu_cgroup_fork, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .legacy_cftypes = cpu_files, + .early_init = true, +}; +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h new file mode 100644 index 000000000..4e3115dac --- /dev/null +++ b/kernel/sched/MuQSS.h @@ -0,0 +1,274 @@ +#include +#include +#include +#include + +#ifndef MUQSS_SCHED_H +#define MUQSS_SCHED_H + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 + +/* + * This is the main, per-CPU runqueue data structure. + * This data should only be modified by the local cpu. + */ +struct rq { + struct task_struct *curr, *idle, *stop; + struct mm_struct *prev_mm; + + raw_spinlock_t lock; + + /* Stored data about rq->curr to work outside rq lock */ + u64 rq_deadline; + int rq_prio; + + /* Best queued id for use outside lock */ + u64 best_key; + + unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ + unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ + u64 niffies; /* Last time this RQ updated rq clock */ + u64 last_niffy; /* Last niffies as updated by local clock */ + u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ + + u64 load_update; /* When we last updated load */ + unsigned long load_avg; /* Rolling load average */ +#ifdef CONFIG_SMT_NICE + struct mm_struct *rq_mm; + int rq_smt_bias; /* Policy/nice level bias across smt siblings */ +#endif + /* Accurate timekeeping data */ + unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, + iowait_ns, idle_ns; + atomic_t nr_iowait; + + skiplist_node node; + skiplist *sl; +#ifdef CONFIG_SMP + struct task_struct *preempt; /* Preempt triggered on this task */ + + int cpu; /* cpu of this runqueue */ + bool online; + + struct root_domain *rd; + struct sched_domain *sd; + int *cpu_locality; /* CPU relative cache distance */ + struct rq **rq_order; /* RQs ordered by relative cache distance */ + +#ifdef CONFIG_SCHED_SMT + cpumask_t thread_mask; + bool (*siblings_idle)(struct rq *rq); + /* See if all smt siblings are idle */ +#endif /* CONFIG_SCHED_SMT */ +#ifdef CONFIG_SCHED_MC + cpumask_t core_mask; + bool (*cache_idle)(struct rq *rq); + /* See if all cache siblings are idle */ +#endif /* CONFIG_SCHED_MC */ +#endif /* CONFIG_SMP */ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif /* CONFIG_PARAVIRT */ +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ + + u64 clock, old_clock, last_tick; + u64 clock_task; + int dither; + + int iso_ticks; + bool iso_refractory; + +#ifdef CONFIG_SCHEDSTATS + + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif /* CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif +}; + +#ifdef CONFIG_SMP +struct rq *cpu_rq(int cpu); +#endif + +#ifndef CONFIG_SMP +extern struct rq *uprq; +#define cpu_rq(cpu) (uprq) +#define this_rq() (uprq) +#define raw_rq() (uprq) +#define task_rq(p) (uprq) +#define cpu_curr(cpu) ((uprq)->curr) +#else /* CONFIG_SMP */ +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#define this_rq() this_cpu_ptr(&runqueues) +#define raw_rq() raw_cpu_ptr(&runqueues) +#endif /* CONFIG_SMP */ + +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x20 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif + +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return READ_ONCE(rq->clock); +} + +static inline u64 rq_clock(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + return rq->clock_task; +} + +extern struct mutex sched_domains_mutex; +extern struct static_key_false sched_schedstats; + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. 
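+ *
+ * (usage sketch, added, not from the original header:
+ *
+ *	rcu_read_lock();
+ *	for_each_domain(cpu, sd)
+ *		span_weight = cpumask_weight(sched_domain_span(sd));
+ *	rcu_read_unlock();
+ *
+ * any code walking rq->sd this way must hold an RCU read lock or
+ * run with preemption disabled, as stated above.)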
+ */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + +#ifdef CONFIG_SMP +extern void sched_ttwu_pending(void); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); +#else +static inline void sched_ttwu_pending(void) { } +#endif + +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +static inline void cpufreq_trigger(u64 time, unsigned long util) +{ + struct update_util_data *data; + + if (util > SCHED_CAPACITY_SCALE) + util = SCHED_CAPACITY_SCALE; + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, SCHED_CAPACITY_SCALE); +} +#else +static inline void cpufreq_trigger(u64 time, unsigned long util) +{ +} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif + +#endif /* MUQSS_SCHED_H */ diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c deleted file mode 100644 index bb5bac4b2..000000000 --- a/kernel/sched/bfs.c +++ /dev/null @@ -1,7671 +0,0 @@ -/* - * kernel/sched/bfs.c, was kernel/sched.c - * - * Kernel scheduler and related syscalls - * - * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz - * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes - * a whole lot of those previous things. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#ifdef CONFIG_PARAVIRT -#include -#endif - -#include "cpupri.h" -#include "../workqueue_internal.h" -#include "../smpboot.h" - -#define CREATE_TRACE_POINTS -#include - -#include "bfs_sched.h" - -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -#define rt_task(p) rt_prio((p)->prio) -#define rt_queue(rq) rt_prio((rq)->rq_prio) -#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ - (policy) == SCHED_RR) -#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) - -#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) -#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) -#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy))) - -#define is_iso_policy(policy) ((policy) == SCHED_ISO) -#define iso_task(p) unlikely(is_iso_policy((p)->policy)) -#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy)) -#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO) - -#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) - -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) - -#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) -#define STOP_PRIO (MAX_RT_PRIO - 1) - -/* - * Some helpers for converting to/from various scales. Use shifts to get - * approximate multiples of ten for less overhead. - */ -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define JIFFY_NS (1000000000 / HZ) -#define HALF_JIFFY_NS (1000000000 / HZ / 2) -#define HALF_JIFFY_US (1000000 / HZ / 2) -#define MS_TO_NS(TIME) ((TIME) << 20) -#define MS_TO_US(TIME) ((TIME) << 10) -#define NS_TO_MS(TIME) ((TIME) >> 20) -#define NS_TO_US(TIME) ((TIME) >> 10) - -#define RESCHED_US (100) /* Reschedule if less than this many μs left */ - -void print_scheduler_version(void) -{ - printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n"); -} - -/* - * This is the time all tasks within the same priority round robin. - * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. - * Tunable via /proc interface. - */ -#ifdef CONFIG_PCK_INTERACTIVE -int rr_interval __read_mostly = 3; -#else -int rr_interval __read_mostly = 6; -#endif - -/* Tunable to choose whether to prioritise latency or throughput, simple - * binary yes or no */ - -int sched_interactive __read_mostly = 1; - -/* - * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks - * are allowed to run five seconds as real time tasks. This is the total over - * all online cpus. - */ -#ifdef CONFIG_PCK_INTERACTIVE -int sched_iso_cpu __read_mostly = 25; -#else -int sched_iso_cpu __read_mostly = 70; -#endif - -/* - * The relative length of deadline for each priority(nice) level. - */ -static int prio_ratios[NICE_WIDTH] __read_mostly; - -/* - * The quota handed out to tasks of all priority levels when refilling their - * time_slice. 
- */ -static inline int timeslice(void) -{ - return MS_TO_US(rr_interval); -} - -/* - * The global runqueue data that all CPUs work off. Data is protected either - * by the global grq lock, or the discrete lock that precedes the data in this - * struct. - */ -struct global_rq { - raw_spinlock_t lock; - unsigned long nr_running; - unsigned long nr_uninterruptible; - unsigned long long nr_switches; - unsigned long qnr; /* queued not running */ -#ifdef CONFIG_SMP - cpumask_t cpu_idle_map; - bool idle_cpus; -#endif - int noc; /* num_online_cpus stored and updated when it changes */ - u64 niffies; /* Nanosecond jiffies */ - unsigned long last_jiffy; /* Last jiffy we updated niffies */ - - raw_spinlock_t iso_lock; - int iso_ticks; - bool iso_refractory; - - skiplist_node node; - skiplist *sl; -}; - -#ifdef CONFIG_SMP -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. - * - */ -struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_var_t rto_mask; - struct cpupri cpupri; -}; - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -#endif /* CONFIG_SMP */ - -/* There can be only one */ -static struct global_rq grq; - -static DEFINE_MUTEX(sched_hotcpu_mutex); - -/* cpus with isolated domains */ -cpumask_var_t cpu_isolated_map; - -DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SMP -struct rq *cpu_rq(int cpu) -{ - return &per_cpu(runqueues, (cpu)); -} -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * sched_domains_mutex serialises calls to init_sched_domains, - * detach_destroy_domains and partition_sched_domains. - */ -DEFINE_MUTEX(sched_domains_mutex); - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} -#else -struct rq *uprq; -#endif /* CONFIG_SMP */ - -static inline void update_rq_clock(struct rq *rq); - -/* - * Sanity check should sched_clock return bogus values. We make sure it does - * not appear to go backwards, and use jiffies to determine the maximum and - * minimum it could possibly have increased, and round down to the nearest - * jiffy when it falls outside this. - */ -static inline void niffy_diff(s64 *niff_diff, int jiff_diff) -{ - unsigned long min_diff, max_diff; - - if (jiff_diff > 1) - min_diff = JIFFIES_TO_NS(jiff_diff - 1); - else - min_diff = 1; - /* Round up to the nearest tick for maximum */ - max_diff = JIFFIES_TO_NS(jiff_diff + 1); - - if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff)) - *niff_diff = min_diff; -} - -#ifdef CONFIG_SMP -static inline int cpu_of(struct rq *rq) -{ - return rq->cpu; -} - -/* - * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the - * niffies value. 
Any CPU can update it by adding how much its clock has - * increased since it last updated niffies, minus any added niffies by other - * CPUs. - */ -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - /* old_clock is only updated when we are updating niffies */ - rq->old_clock = rq->clock; - ndiff -= grq.niffies - rq->last_niffy; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; - rq->last_niffy = grq.niffies; -} -#else /* CONFIG_SMP */ -static inline int cpu_of(struct rq *rq) -{ - return 0; -} - -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - rq->old_clock = rq->clock; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; -} -#endif - -#include "stats.h" - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif -#ifndef finish_arch_post_lock_switch -# define finish_arch_post_lock_switch() do { } while (0) -#endif - -/* - * All common locking functions performed on grq.lock. rq->clock is local to - * the CPU accessing it so it can be modified just with interrupts disabled - * when we're not updating niffies. - * Looking up task_rq must be done under grq.lock to be safe. - */ -static void update_rq_clock_task(struct rq *rq, s64 delta); - -static inline void update_rq_clock(struct rq *rq) -{ - s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - - if (unlikely(delta < 0)) - return; - rq->clock += delta; - update_rq_clock_task(rq, delta); -} - -static inline bool task_running(struct task_struct *p) -{ - return p->on_cpu; -} - -static inline void grq_lock(void) - __acquires(grq.lock) -{ - raw_spin_lock(&grq.lock); -} - -static inline void grq_unlock(void) - __releases(grq.lock) -{ - raw_spin_unlock(&grq.lock); -} - -static inline void grq_lock_irq(void) - __acquires(grq.lock) -{ - raw_spin_lock_irq(&grq.lock); -} - -static inline void time_lock_grq(struct rq *rq) -{ - grq_lock(); - update_clocks(rq); -} - -static inline void grq_unlock_irq(void) - __releases(grq.lock) -{ - raw_spin_unlock_irq(&grq.lock); -} - -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) -{ - raw_spin_lock_irqsave(&grq.lock, *flags); -} - -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) -{ - raw_spin_unlock_irqrestore(&grq.lock, *flags); -} - -static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) -{ - raw_spin_lock_irqsave(&p->pi_lock, *flags); - grq_lock(); - return task_rq(p); -} - -static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) -{ - struct rq *rq = task_grq_lock(p, flags); - - update_clocks(rq); - return rq; -} - -static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags) - __releases(p->pi_lock) -{ - grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -} - -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) -{ - local_irq_save(*flags); - time_lock_grq(rq); -} - -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is 
a valid case when another task releases the spinlock */ - grq.lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); - - grq_unlock_irq(); -} - -static inline bool deadline_before(u64 deadline, u64 time) -{ - return (deadline < time); -} - -static inline bool deadline_after(u64 deadline, u64 time) -{ - return (deadline > time); -} - -/* - * Deadline is "now" in niffies + (offset by priority). Setting the deadline - * is the key to everything. It distributes cpu fairly amongst tasks of the - * same nice value, it proportions cpu according to nice level, it means the - * task that last woke up the longest ago has the earliest deadline, thus - * ensuring that interactive tasks get low latency on wake up. The CPU - * proportion works out to the square of the virtual deadline difference, so - * this equation will give nice 19 3% CPU compared to nice 0. - */ -static inline u64 prio_deadline_diff(int user_prio) -{ - return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -} - -static inline u64 task_deadline_diff(struct task_struct *p) -{ - return prio_deadline_diff(TASK_USER_PRIO(p)); -} - -static inline u64 static_deadline_diff(int static_prio) -{ - return prio_deadline_diff(USER_PRIO(static_prio)); -} - -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - -static inline int ms_longest_deadline_diff(void) -{ - return NS_TO_MS(longest_deadline_diff()); -} - -/* - * A task that is not running or queued will not have a node set. - * A task that is queued but not running will have a node set. - * A task that is currently running will have ->on_cpu set but no node set. - */ -static inline bool task_queued(struct task_struct *p) -{ - return !skiplist_node_empty(&p->node); -} - -/* - * Removing from the global runqueue. Enter with grq locked. Deleting a task - * from the skip list is done via the stored node reference in the task struct - * and does not require a full look up. Thus it occurs in O(k) time where k - * is the "level" of the list the task was stored at - usually < 4, max 16. - */ -static void dequeue_task(struct task_struct *p) -{ - skiplist_delete(grq.sl, &p->node); - sched_info_dequeued(task_rq(p), p); -} - -#ifdef CONFIG_PREEMPT_RCU -static bool rcu_read_critical(struct task_struct *p) -{ - return p->rcu_read_unlock_special.b.blocked; -} -#else /* CONFIG_PREEMPT_RCU */ -#define rcu_read_critical(p) (false) -#endif /* CONFIG_PREEMPT_RCU */ - -/* - * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as - * an idle task, we ensure none of the following conditions are met. - */ -static bool idleprio_suitable(struct task_struct *p) -{ - return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && - !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); -} - -/* - * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check - * that the iso_refractory flag is not set. - */ -static bool isoprio_suitable(void) -{ - return !grq.iso_refractory; -} - -/* - * Adding to the global runqueue. Enter with grq locked. 
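For a concrete feel of the prio_deadline_diff() arithmetic above: the offsets grow roughly 10% per nice level, so nice 19's offset is about 1.1^19 ≈ 6x that of nice 0, and since CPU share falls off as roughly the square of the deadline gap, that yields the quoted ~3%. A minimal user-space sketch; the 10%-per-level seeding of prio_ratios mirrors the scheduler's boot-time init, while the harness itself is hypothetical:

#include <stdio.h>
#include <stdint.h>

#define NICE_WIDTH  40
#define MS_TO_NS(t) ((t) << 20)        /* same approximate shift as above */

static int prio_ratios[NICE_WIDTH];
static const int rr_interval = 6;      /* ms, the non-interactive default */

int main(void)
{
        int i;

        /* Boot-time seeding: nice -20 gets 128, each level ~10% more. */
        prio_ratios[0] = 128;
        for (i = 1; i < NICE_WIDTH; i++)
                prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

        /* prio_deadline_diff() as defined in the patch. */
        for (i = 20; i < NICE_WIDTH; i += 19) {
                uint64_t off = (uint64_t)prio_ratios[i] * rr_interval *
                               (MS_TO_NS(1) / 128);
                printf("nice %+3d -> deadline offset %.1f ms\n",
                       i - 20, off / 1e6);
        }
        return 0;
}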
- */ -static void enqueue_task(struct task_struct *p, struct rq *rq) -{ - unsigned int randseed; - u64 sl_id; - - if (!rt_task(p)) { - /* Check it hasn't gotten rt from PI */ - if ((idleprio_task(p) && idleprio_suitable(p)) || - (iso_task(p) && isoprio_suitable())) - p->prio = p->normal_prio; - else - p->prio = NORMAL_PRIO; - } - /* - * The sl_id key passed to the skiplist generates a sorted list. - * Realtime and sched iso tasks run FIFO so they only need be sorted - * according to priority. The skiplist will put tasks of the same - * key inserted later in FIFO order. Tasks of sched normal, batch - * and idleprio are sorted according to their deadlines. Idleprio - * tasks are offset by an impossibly large deadline value ensuring - * they get sorted into last positions, but still according to their - * own deadlines. This creates a "landscape" of skiplists running - * from priority 0 realtime in first place to the lowest priority - * idleprio tasks last. Skiplist insertion is an O(log n) process. - */ - if (p->prio <= ISO_PRIO) - sl_id = p->prio; - else { - sl_id = p->deadline; - if (idleprio_task(p)) { - /* Set it to cope with 4 left shifts with locality_diff */ - if (p->prio == IDLE_PRIO) - sl_id |= 0x00FF000000000000; - else - sl_id += longest_deadline_diff(); - } - } - /* - * Some architectures don't have better than microsecond resolution - * so mask out ~microseconds as the random seed for skiplist insertion. - */ - randseed = (grq.niffies >> 10) & 0xFFFFFFFF; - skiplist_insert(grq.sl, &p->node, sl_id, p, randseed); - sched_info_queued(rq, p); -} - -static inline void requeue_task(struct task_struct *p) -{ - sched_info_queued(task_rq(p), p); -} - -/* - * Returns the relative length of deadline all compared to the shortest - * deadline which is that of nice -20. - */ -static inline int task_prio_ratio(struct task_struct *p) -{ - return prio_ratios[TASK_USER_PRIO(p)]; -} - -/* - * task_timeslice - all tasks of all priorities get the exact same timeslice - * length. CPU distribution is handled by giving different deadlines to - * tasks of different priorities. Use 128 as the base value for fast shifts. - */ -static inline int task_timeslice(struct task_struct *p) -{ - return (rr_interval * task_prio_ratio(p) / 128); -} - -static void resched_task(struct task_struct *p); - -static inline void resched_curr(struct rq *rq) -{ - resched_task(rq->curr); -} - -/* - * qnr is the "queued but not running" count which is the total number of - * tasks on the global runqueue list waiting for cpu time but not actually - * currently running on a cpu. - */ -static inline void inc_qnr(void) -{ - grq.qnr++; -} - -static inline void dec_qnr(void) -{ - grq.qnr--; -} - -static inline int queued_notrunning(void) -{ - return grq.qnr; -} - -static unsigned long rq_load_avg(struct rq *rq) -{ - return rq->soft_affined * SCHED_CAPACITY_SCALE; -} - -#ifdef CONFIG_SMT_NICE -static const cpumask_t *thread_cpumask(int cpu); - -/* Find the best real time priority running on any SMT siblings of cpu and if - * none are running, the static priority of the best deadline task running. - * The lookups to the other runqueues is done lockless as the occasional wrong - * value would be harmless. 
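Compressed to its essentials, the sl_id construction in enqueue_task() above looks like the sketch below; kernel types, the PI-boost checks, and the longest_deadline_diff() offset for boosted idleprio tasks are omitted, and the demo_* names and priority constants are stand-ins:

#include <stdint.h>

/* Illustrative stand-ins for the patch's priority levels. */
enum { ISO_PRIO = 40, NORMAL_PRIO = 41, IDLE_PRIO = 42 };

struct demo_task {
        int      prio;       /* effective priority */
        uint64_t deadline;   /* virtual deadline in niffies */
};

/*
 * RT and ISO tasks sort purely by priority (FIFO within a level);
 * everything else sorts by deadline, with idleprio pushed to the far
 * end of the key space so it always sorts last, yet still in
 * deadline order among its peers.
 */
static uint64_t skiplist_key(const struct demo_task *p)
{
        uint64_t sl_id;

        if (p->prio <= ISO_PRIO)
                sl_id = p->prio;
        else {
                sl_id = p->deadline;
                if (p->prio == IDLE_PRIO)
                        sl_id |= 0x00FF000000000000ULL;
        }
        return sl_id;
}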
*/ -static int best_smt_bias(struct rq *this_rq) -{ - int other_cpu, best_bias = 0; - - for_each_cpu(other_cpu, &this_rq->thread_mask) { - struct rq *rq = cpu_rq(other_cpu); - - if (rq_idle(rq)) - continue; - if (unlikely(!rq->online)) - continue; - if (!rq->rq_mm) - continue; - if (likely(rq->rq_smt_bias > best_bias)) - best_bias = rq->rq_smt_bias; - } - return best_bias; -} - -static int task_prio_bias(struct task_struct *p) -{ - if (rt_task(p)) - return 1 << 30; - else if (task_running_iso(p)) - return 1 << 29; - else if (task_running_idle(p)) - return 0; - return MAX_PRIO - p->static_prio; -} - -static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) -{ - return true; -} - -static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; - -/* We've already decided p can run on CPU, now test if it shouldn't for SMT - * nice reasons. */ -static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) -{ - int best_bias, task_bias; - - /* Kernel threads always run */ - if (unlikely(!p->mm)) - return true; - if (rt_task(p)) - return true; - if (!idleprio_suitable(p)) - return true; - best_bias = best_smt_bias(this_rq); - /* The smt siblings are all idle or running IDLEPRIO */ - if (best_bias < 1) - return true; - task_bias = task_prio_bias(p); - if (task_bias < 1) - return false; - if (task_bias >= best_bias) - return true; - /* Dither 25% cpu of normal tasks regardless of nice difference */ - if (best_bias % 4 == 1) - return true; - /* Sorry, you lose */ - return false; -} -#else /* CONFIG_SMT_NICE */ -#define smt_schedule(p, this_rq) (true) -#endif /* CONFIG_SMT_NICE */ -#ifdef CONFIG_SMP -/* - * The cpu_idle_map stores a bitmap of all the CPUs currently idle to - * allow easy lookup of whether any suitable idle CPUs are available. - * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the - * idle_cpus variable than to do a full bitmask check when we are busy. - */ -static inline void set_cpuidle_map(int cpu) -{ - if (likely(cpu_online(cpu))) { - cpumask_set_cpu(cpu, &grq.cpu_idle_map); - grq.idle_cpus = true; - } -} - -static inline void clear_cpuidle_map(int cpu) -{ - cpumask_clear_cpu(cpu, &grq.cpu_idle_map); - if (cpumask_empty(&grq.cpu_idle_map)) - grq.idle_cpus = false; -} - -static bool suitable_idle_cpus(struct task_struct *p) -{ - if (!grq.idle_cpus) - return false; - return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); -} - -#define CPUIDLE_DIFF_THREAD (1) -#define CPUIDLE_DIFF_CORE (2) -#define CPUIDLE_CACHE_BUSY (4) -#define CPUIDLE_DIFF_CPU (8) -#define CPUIDLE_THREAD_BUSY (16) -#define CPUIDLE_DIFF_NODE (32) - -/* - * The best idle CPU is chosen according to the CPUIDLE ranking above where the - * lowest value would give the most suitable CPU to schedule p onto next. The - * order works out to be the following: - * - * Same thread, idle or busy cache, idle or busy threads - * Other core, same cache, idle or busy cache, idle threads. - * Same node, other CPU, idle cache, idle threads. - * Same node, other CPU, busy cache, idle threads. - * Other core, same cache, busy threads. - * Same node, other CPU, busy threads. - * Other node, other CPU, idle cache, idle threads. - * Other node, other CPU, busy cache, idle threads. - * Other node, other CPU, busy threads. 
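The CPUIDLE_* penalties above are powers of two, so the worst single property always outweighs every better property combined (32 > 1+2+4+8+16); that is what makes the OR-ed ranking reproduce the preference list exactly. A standalone check of one consequence, with the flag values copied from the patch and a hypothetical harness:

#include <assert.h>

#define CPUIDLE_DIFF_THREAD  (1)
#define CPUIDLE_DIFF_CORE    (2)
#define CPUIDLE_CACHE_BUSY   (4)
#define CPUIDLE_DIFF_CPU     (8)
#define CPUIDLE_THREAD_BUSY  (16)
#define CPUIDLE_DIFF_NODE    (32)

int main(void)
{
        /* A sibling thread with a busy cache and busy threads... */
        int same_core = CPUIDLE_DIFF_THREAD | CPUIDLE_CACHE_BUSY |
                        CPUIDLE_THREAD_BUSY;            /* = 21 */
        /* ...still ranks better than a CPU on another NUMA node. */
        int other_node = CPUIDLE_DIFF_NODE;             /* = 32 */

        assert(same_core < other_node);
        return 0;
}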
- */ -static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -{ - int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | - CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | - CPUIDLE_DIFF_THREAD; - int cpu_tmp; - - if (cpumask_test_cpu(best_cpu, tmpmask)) - goto out; - - for_each_cpu(cpu_tmp, tmpmask) { - int ranking, locality; - struct rq *tmp_rq; - - ranking = 0; - tmp_rq = cpu_rq(cpu_tmp); - - locality = rq->cpu_locality[cpu_tmp]; -#ifdef CONFIG_NUMA - if (locality > 3) - ranking |= CPUIDLE_DIFF_NODE; - else -#endif - if (locality > 2) - ranking |= CPUIDLE_DIFF_CPU; -#ifdef CONFIG_SCHED_MC - else if (locality == 2) - ranking |= CPUIDLE_DIFF_CORE; - else if (!(tmp_rq->cache_idle(tmp_rq))) - ranking |= CPUIDLE_CACHE_BUSY; -#endif -#ifdef CONFIG_SCHED_SMT - if (locality == 1) - ranking |= CPUIDLE_DIFF_THREAD; - if (!(tmp_rq->siblings_idle(tmp_rq))) - ranking |= CPUIDLE_THREAD_BUSY; -#endif - if (ranking < best_ranking) { - best_cpu = cpu_tmp; - best_ranking = ranking; - } - } -out: - return best_cpu; -} - -bool cpus_share_cache(int this_cpu, int that_cpu) -{ - struct rq *this_rq = cpu_rq(this_cpu); - - return (this_rq->cpu_locality[that_cpu] < 3); -} - -static bool resched_best_idle(struct task_struct *p) -{ - cpumask_t tmpmask; - struct rq *rq; - int best_cpu; - - cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); - best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask); - rq = cpu_rq(best_cpu); - if (!smt_schedule(p, rq)) - return false; - resched_curr(rq); - return true; -} - -static inline void resched_suitable_idle(struct task_struct *p) -{ - if (suitable_idle_cpus(p)) - resched_best_idle(p); -} - -static inline int locality_diff(int cpu, struct rq *rq) -{ - return rq->cpu_locality[cpu]; -} -#else /* CONFIG_SMP */ -static inline void set_cpuidle_map(int cpu) -{ -} - -static inline void clear_cpuidle_map(int cpu) -{ -} - -static inline bool suitable_idle_cpus(struct task_struct *p) -{ - return uprq->curr == uprq->idle; -} - -static inline void resched_suitable_idle(struct task_struct *p) -{ -} - -static inline int locality_diff(int cpu, struct rq *rq) -{ - return 0; -} -#endif /* CONFIG_SMP */ - -static inline int normal_prio(struct task_struct *p) -{ - if (has_rt_policy(p)) - return MAX_RT_PRIO - 1 - p->rt_priority; - if (idleprio_task(p)) - return IDLE_PRIO; - if (iso_task(p)) - return ISO_PRIO; - return NORMAL_PRIO; -} - -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks as it will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; -} - -/* - * Update the load average for feeding into cpu frequency governors. Use a - * rough estimate of a rolling average with ~ time constant of 32ms. - * 80/128 ~ 0.63. 
* 80 / 32768 / 128 == * 5 / 262144 - */ -static void update_load_avg(struct rq *rq) -{ - /* rq clock can go backwards so skip update if that happens */ - if (likely(rq->clock > rq->load_update)) { - unsigned long us_interval = (rq->clock - rq->load_update) >> 10; - long load; - - load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); - if (unlikely(load < 0)) - load = 0; - load += rq->soft_affined * rq_load_avg(rq) * us_interval * 5 / 262144; - rq->load_avg = load; - } - rq->load_update = rq->clock; -} - -/* - * activate_task - move a task to the runqueue. Enter with grq locked. - */ -static void activate_task(struct task_struct *p, struct rq *rq) -{ - update_clocks(rq); - - /* - * Sleep time is in units of nanosecs, so shift by 20 to get a - * milliseconds-range estimation of the amount of time that the task - * spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - if (p->state == TASK_UNINTERRUPTIBLE) - profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (rq->clock_task - p->last_ran) >> 20); - } - - p->prio = effective_prio(p); - if (task_contributes_to_load(p)) - grq.nr_uninterruptible--; - enqueue_task(p, rq); - rq->soft_affined++; - p->on_rq = 1; - grq.nr_running++; - inc_qnr(); - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); -} - -/* - * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. Enter with grq locked. - */ -static inline void deactivate_task(struct task_struct *p, struct rq *rq) -{ - if (task_contributes_to_load(p)) - grq.nr_uninterruptible++; - rq->soft_affined--; - p->on_rq = 0; - grq.nr_running--; - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); -} - -#ifdef CONFIG_SMP -void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_LOCKDEP - /* - * The caller should hold either p->pi_lock or grq lock, when changing - * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks. - * - * Furthermore, all task_rq users should acquire both locks, see - * task_grq_lock(). - */ - WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&grq.lock))); -#endif - if (task_cpu(p) == cpu) - return; - trace_sched_migrate_task(p, cpu); - perf_event_task_migrate(p); - - /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be - * successfully executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - - if (p->on_rq) { - struct rq *rq = task_rq(p); - - rq->soft_affined--; - update_load_avg(rq); - rq = cpu_rq(cpu); - rq->soft_affined++; - update_load_avg(rq); - } - task_thread_info(p)->cpu = cpu; -} -#endif /* CONFIG_SMP */ - -/* - * Move a task off the global queue and take it to a cpu for it will - * become the running task. - */ -static inline void take_task(int cpu, struct task_struct *p) -{ - set_task_cpu(p, cpu); - dequeue_task(p); - dec_qnr(); -} - -/* - * Returns a descheduling task to the grq runqueue unless it is being - * deactivated. - */ -static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate) -{ - if (deactivate) - deactivate_task(p, rq); - else { - inc_qnr(); - enqueue_task(p, rq); - } -} - -/* Enter with grq lock held. We know p is on the local cpu */ -static inline void __set_tsk_resched(struct task_struct *p) -{ - set_tsk_need_resched(p); - set_preempt_need_resched(); -} - -/* - * resched_task - mark a task 'to be rescheduled now'. 
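The 5/262144 factor is just 80/128 spread over 32768 us: each microsecond sheds load * 5/262144, so a full 32768 us interval sheds 80/128 ≈ 63% of the old value, matching one time constant of a ~32ms exponential (1 - 1/e ≈ 0.63). A user-space rendering of just the decay half of update_load_avg() above; the kernel then adds the current runqueue contribution weighted the same way:

#include <stdio.h>

/* Decay step of update_load_avg(), outside the kernel. */
static long decay_load(long load_avg, unsigned long us_interval)
{
        long load = load_avg - load_avg * (long)us_interval * 5 / 262144;

        return load < 0 ? 0 : load;
}

int main(void)
{
        /* One full 32768us interval sheds 62.5% of the old average. */
        printf("%ld\n", decay_load(1000, 32768));   /* prints 375 */
        return 0;
}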
- * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -void resched_task(struct task_struct *p) -{ - int cpu; - - lockdep_assert_held(&grq.lock); - - if (test_tsk_need_resched(p)) - return; - - set_tsk_need_resched(p); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) { - set_preempt_need_resched(); - return; - } - - smp_send_reschedule(cpu); -} - -/** - * task_curr - is this task currently executing on a CPU? - * @p: the task in question. - * - * Return: 1 if the task is currently executing. 0 otherwise. - */ -inline int task_curr(const struct task_struct *p) -{ - return cpu_curr(task_cpu(p)) == p; -} - -#ifdef CONFIG_SMP -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) -{ - unsigned long flags; - bool running, on_rq; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since this will return false - * if the runqueue has changed and p is actually now - * running somewhere else! - */ - while (task_running(p) && p == rq->curr) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the grq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_grq_lock(p, &flags); - trace_sched_wait_task(p); - running = task_running(p); - on_rq = p->on_rq; - ncsw = 0; - if (!match_state || p->state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(p, &flags); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. 
We're all done! - */ - break; - } - - return ncsw; -} - -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) - * - * NOTE: this function doesn't have to take the runqueue lock, - * because all it wants to ensure is that the remote task enters - * the kernel. If the IPI races and the task has been migrated - * to another CPU then no harm is done and the purpose has been - * achieved as well. - */ -void kick_process(struct task_struct *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} -EXPORT_SYMBOL_GPL(kick_process); -#endif - -/* - * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the - * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or - * between themselves, they cooperatively multitask. An idle rq scores as - * prio PRIO_LIMIT so it is always preempted. - */ -static inline bool -can_preempt(struct task_struct *p, int prio, u64 deadline) -{ - /* Better static priority RT task or better policy preemption */ - if (p->prio < prio) - return true; - if (p->prio > prio) - return false; - /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ - if (!deadline_before(p->deadline, deadline)) - return false; - return true; -} - -#ifdef CONFIG_SMP -#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -#ifdef CONFIG_HOTPLUG_CPU -/* - * Check to see if there is a task that is affined only to offline CPUs but - * still wants runtime. This happens to kernel threads during suspend/halt and - * disabling of CPUs. - */ -static inline bool online_cpus(struct task_struct *p) -{ - return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed))); -} -#else /* CONFIG_HOTPLUG_CPU */ -/* All available CPUs are always online without hotplug. */ -static inline bool online_cpus(struct task_struct *p) -{ - return true; -} -#endif - -/* - * Check to see if p can run on cpu, and if not, whether there are any online - * CPUs it can run on instead. - */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) - return true; - return false; -} - -static void try_preempt(struct task_struct *p, struct rq *this_rq) -{ - int i, this_entries = this_rq->soft_affined; - cpumask_t tmp; - - if (suitable_idle_cpus(p) && resched_best_idle(p)) - return; - - /* IDLEPRIO tasks never preempt anything but idle */ - if (p->policy == SCHED_IDLEPRIO) - return; - - cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); - - /* - * We iterate over CPUs in locality order using rq_order, finding the - * first one we can preempt if possible, thus staying closest in - * locality. - */ - for (i = 0; i < num_possible_cpus(); i++) { - struct rq *rq = this_rq->rq_order[i]; - - if (!cpumask_test_cpu(rq->cpu, &tmp)) - continue; - - if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries) - continue; - if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match thereby speeding up matching - * this task in earliest_deadline_task. 
- */ - set_task_cpu(p, rq->cpu); - resched_curr(rq); - return; - } - } -} - -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check); -#else /* CONFIG_SMP */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - return false; -} - -static void try_preempt(struct task_struct *p, struct rq *this_rq) -{ - if (p->policy == SCHED_IDLEPRIO) - return; - if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) - resched_curr(uprq); -} - -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) -{ - return set_cpus_allowed_ptr(p, new_mask); -} -#endif /* CONFIG_SMP */ - -static void -ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -{ -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); - -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - rcu_read_unlock(); - } - -#endif /* CONFIG_SMP */ - - schedstat_inc(rq, ttwu_count); -#endif /* CONFIG_SCHEDSTATS */ -} - -void wake_up_if_idle(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - rcu_read_lock(); - - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; - - grq_lock_irqsave(&flags); - if (likely(is_idle_task(rq->curr))) - smp_send_reschedule(cpu); - /* Else cpu is not idle, do nothing here */ - grq_unlock_irqrestore(&flags); - -out: - rcu_read_unlock(); -} - -#ifdef CONFIG_SMP -void scheduler_ipi(void) -{ - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); -} -#endif - -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync) -{ - activate_task(p, rq); - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption if there are no idle cpus, - * instead waiting for current to deschedule. - */ - if (!is_sync || suitable_idle_cpus(p)) - try_preempt(p, rq); -} - -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - bool success) -{ - trace_sched_wakeup(p); - p->state = TASK_RUNNING; - - /* - * If a worker is waking up, notify the workqueue. Note that on BFS, we - * don't really know what cpu it will be, so we fake it for - * wq_worker_waking_up :/ - */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); -} - -/* - * wake flags - */ -#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x4 /* internal use, task got migrated */ - -/*** - * try_to_wake_up - wake up a thread - * @p: the thread to be awakened - * @state: the mask of task states that can be woken - * @wake_flags: wake modifier flags (WF_*) - * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. - * - * Return: %true if @p was woken up, %false if it was already running, or @state didn't match @p's state.
- */ -static bool try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) -{ - bool success = false; - unsigned long flags; - struct rq *rq; - int cpu; - - /* - * If we are going to wake up a thread waiting for CONDITION we - * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. - */ - smp_mb__before_spinlock(); - - /* - * No need to do time_lock_grq as we only need to update the rq clock - * if we activate the task - */ - rq = task_grq_lock(p, &flags); - cpu = task_cpu(p); - - /* state is a volatile long, why? I don't understand */ - if (!((unsigned int)p->state & state)) - goto out_unlock; - - trace_sched_waking(p); - - if (task_queued(p) || task_running(p)) - goto out_running; - - ttwu_activate(p, rq, wake_flags & WF_SYNC); - success = true; - -out_running: - ttwu_post_activation(p, rq, success); -out_unlock: - task_grq_unlock(p, &flags); - - if (schedstat_enabled()) - ttwu_stat(p, cpu, wake_flags); - - return success; -} - -/** - * try_to_wake_up_local - try to wake up a local task with grq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that grq is locked and @p is not the current task. - * grq stays locked over invocation. - */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - bool success = false; - - lockdep_assert_held(&grq.lock); - - if (!(p->state & TASK_NORMAL)) - return; - - trace_sched_waking(p); - - if (!task_queued(p)) { - if (likely(!task_running(p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false); - if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); - success = true; - } - ttwu_post_activation(p, rq, success); -} - -/** - * wake_up_process - Wake up a specific process - * @p: The process to be woken up. - * - * Attempt to wake up the nominated process and move it to the set of runnable - * processes. - * - * Return: 1 if the process was woken up, 0 if it was already running. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -int wake_up_process(struct task_struct *p) -{ - return try_to_wake_up(p, TASK_NORMAL, 0); -} -EXPORT_SYMBOL(wake_up_process); - -int wake_up_state(struct task_struct *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - -static void time_slice_expired(struct task_struct *p); - -/* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. - */ -int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -{ -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); -#endif - /* - * The process state is set to the same value as that of the process - * executing do_fork() code. That is running. This guarantees that nobody will - * actually run it, and a signal or other external event cannot wake - * it up and insert it on the runqueue either. - */ - - /* Should be reset in fork.c but done here for ease of bfs patching */ - p->on_rq = - p->utime = - p->stime = - p->utimescaled = - p->stimescaled = - p->sched_time = - p->stime_pc = - p->utime_pc = 0; - skiplist_node_init(&p->node); - - /* - * We mark the process as NEW here.
This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_NEW; - - /* - * Revert to default priority/policy on fork if requested. - */ - if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { - p->policy = SCHED_NORMAL; - p->normal_prio = normal_prio(p); - } - - if (PRIO_TO_NICE(p->static_prio) < 0) { - p->static_prio = NICE_TO_PRIO(0); - p->normal_prio = p->static_prio; - } - - /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: - */ - p->sched_reset_on_fork = 0; - } - -#ifdef CONFIG_SCHED_INFO - if (unlikely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -#endif - p->on_cpu = false; - init_task_preempt_count(p); - return 0; -} - -#ifdef CONFIG_SCHEDSTATS - -DEFINE_STATIC_KEY_FALSE(sched_schedstats); -static bool __initdata __sched_schedstats = false; - -static void set_schedstats(bool enabled) -{ - if (enabled) - static_branch_enable(&sched_schedstats); - else - static_branch_disable(&sched_schedstats); -} - -void force_schedstat_enabled(void) -{ - if (!schedstat_enabled()) { - pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); - static_branch_enable(&sched_schedstats); - } -} - -static int __init setup_schedstats(char *str) -{ - int ret = 0; - if (!str) - goto out; - - /* - * This code is called before jump labels have been set up, so we can't - * change the static branch directly just yet. Instead set a temporary - * variable so init_schedstats() can do it later. - */ - if (!strcmp(str, "enable")) { - __sched_schedstats = true; - ret = 1; - } else if (!strcmp(str, "disable")) { - __sched_schedstats = false; - ret = 1; - } -out: - if (!ret) - pr_warn("Unable to parse schedstats=\n"); - - return ret; -} -__setup("schedstats=", setup_schedstats); - -static void __init init_schedstats(void) -{ - set_schedstats(__sched_schedstats); -} - -#ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - int err; - int state = static_branch_likely(&sched_schedstats); - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &state; - err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - if (write) - set_schedstats(state); - return err; -} -#endif /* CONFIG_PROC_SYSCTL */ -#else /* !CONFIG_SCHEDSTATS */ -static inline void init_schedstats(void) {} -#endif /* CONFIG_SCHEDSTATS */ - -/* - * wake_up_new_task - wake up a newly created task for the first time. - * - * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created context, then puts the task - * on the runqueue and wakes it. - */ -void wake_up_new_task(struct task_struct *p) -{ - struct task_struct *parent, *rq_curr; - struct rq *rq, *new_rq; - unsigned long flags; - - parent = p->parent; - rq = task_grq_lock(p, &flags); - if (unlikely(needs_other_cpu(p, task_cpu(p)))) - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); - rq_curr = rq->curr; - p->state = TASK_RUNNING; - - /* - * Reinit new task deadline as its creator deadline could have changed - * since call to dup_task_struct(). 
- */ - p->deadline = rq->rq_deadline; - - /* The new task might not be able to run on the same CPU as rq->curr */ - if (unlikely(needs_other_cpu(p, task_cpu(p)))) { - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); - new_rq = task_rq(p); - } else - new_rq = rq; - - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = rq_curr->normal_prio; - - activate_task(p, rq); - trace_sched_wakeup_new(p); - - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. If it's negative, it won't - * matter since that's the same as being 0. current's time_slice is - * actually in rq_time_slice when it's running, as is its last_ran - * value. rq->rq_deadline is only modified within schedule() so it - * is always equal to current->deadline. - */ - p->last_ran = rq->rq_last_ran; - if (likely(rq_curr->policy != SCHED_FIFO)) { - rq->rq_time_slice /= 2; - if (unlikely(rq->rq_time_slice < RESCHED_US)) { - /* - * Forking task has run out of timeslice. Reschedule it and - * start its child with a new time slice and deadline. The - * child will end up running first because its deadline will - * be slightly earlier. - */ - rq->rq_time_slice = 0; - __set_tsk_resched(rq_curr); - time_slice_expired(p); - if (suitable_idle_cpus(p)) - resched_best_idle(p); - else if (unlikely(rq != new_rq)) - try_preempt(p, new_rq); - } else { - p->time_slice = rq->rq_time_slice; - if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. - */ - __set_tsk_resched(rq_curr); - } else - try_preempt(p, new_rq); - } - } else { - time_slice_expired(p); - try_preempt(p, new_rq); - } - task_grq_unlock(p, &flags); -} - -#ifdef CONFIG_PREEMPT_NOTIFIERS - -static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; - -void preempt_notifier_inc(void) -{ - static_key_slow_inc(&preempt_notifier_key); -} -EXPORT_SYMBOL_GPL(preempt_notifier_inc); - -void preempt_notifier_dec(void) -{ - static_key_slow_dec(&preempt_notifier_key); -} -EXPORT_SYMBOL_GPL(preempt_notifier_dec); - -/** - * preempt_notifier_register - tell me when current is being preempted & rescheduled - * @notifier: notifier struct to register - */ -void preempt_notifier_register(struct preempt_notifier *notifier) -{ - if (!static_key_false(&preempt_notifier_key)) - WARN(1, "registering preempt_notifier while notifiers disabled\n"); - - hlist_add_head(&notifier->link, &current->preempt_notifiers); -} -EXPORT_SYMBOL_GPL(preempt_notifier_register); - -/** - * preempt_notifier_unregister - no longer interested in preemption notifications - * @notifier: notifier struct to unregister - * - * This is *not* safe to call from within a preemption notifier.
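A toy rendering of the timeslice split above: halving conserves the total time owed (a parent with 4400us left keeps 2200us and the child starts with 2200us), while a parent with under 2*RESCHED_US remaining is simply rescheduled and the child starts a fresh slice via time_slice_expired(). The names here are hypothetical:

#include <stdio.h>

#define RESCHED_US 100   /* reschedule threshold in us, from the patch */

/* fresh_slice stands in for time_slice_expired() handing out a quota. */
static void split_timeslice(int *parent_us, int *child_us, int fresh_slice)
{
        *parent_us /= 2;
        if (*parent_us < RESCHED_US) {
                *parent_us = 0;          /* parent rescheduled immediately */
                *child_us = fresh_slice; /* child gets a new slice/deadline */
        } else {
                *child_us = *parent_us;  /* halves: total owed unchanged */
        }
}

int main(void)
{
        int parent = 4400, child = 0;

        split_timeslice(&parent, &child, 6000);
        printf("parent=%dus child=%dus\n", parent, child); /* 2200 / 2200 */

        parent = 150;
        split_timeslice(&parent, &child, 6000);
        printf("parent=%dus child=%dus\n", parent, child); /* 0 / 6000 */
        return 0;
}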
- */ -void preempt_notifier_unregister(struct preempt_notifier *notifier) -{ - hlist_del(&notifier->link); -} -EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - -static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - struct preempt_notifier *notifier; - - hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) - notifier->ops->sched_in(notifier, raw_smp_processor_id()); -} - -static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - if (static_key_false(&preempt_notifier_key)) - __fire_sched_in_preempt_notifiers(curr); -} - -static void -__fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - struct preempt_notifier *notifier; - - hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) - notifier->ops->sched_out(notifier, next); -} - -static __always_inline void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - if (static_key_false(&preempt_notifier_key)) - __fire_sched_out_preempt_notifiers(curr, next); -} - -#else /* !CONFIG_PREEMPT_NOTIFIERS */ - -static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ -} - -static inline void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ -} - -#endif /* CONFIG_PREEMPT_NOTIFIERS */ - -/** - * prepare_task_switch - prepare to switch tasks - * @rq: the runqueue preparing to switch - * @next: the task we are going to switch to. - * - * This is called with the rq lock held and interrupts off. It must - * be paired with a subsequent finish_task_switch after the context - * switch. - * - * prepare_task_switch sets up locking and calls architecture specific - * hooks. - */ -static inline void -prepare_task_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - sched_info_switch(rq, prev, next); - perf_event_task_sched_out(prev, next); - fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); - prepare_arch_switch(next); -} - -/** - * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch - * @prev: the thread we just switched away from. - * - * finish_task_switch must be called after the context switch, paired - * with a prepare_task_switch call before the context switch. - * finish_task_switch will reconcile locking set up by prepare_task_switch, - * and do any other architecture-specific cleanup actions. - * - * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it - * with the lock held can cause deadlocks; see schedule() for - * details.) - * - * The context switch has flipped the stack from under us and restored the - * local variables which were saved when this task called schedule() in the - * past. prev == current is still correct but we need to recalculate this_rq - * because prev may have moved to another CPU. - */ -static struct rq *finish_task_switch(struct task_struct *prev) - __releases(grq.lock) -{ - struct rq *rq = this_rq(); - struct mm_struct *mm = rq->prev_mm; - long prev_state; - - /* - * The previous task will have left us with a preempt_count of 2 - * because it left us after: - * - * schedule() - * preempt_disable(); // 1 - * __schedule() - * raw_spin_lock_irq(&rq->lock) // 2 - * - * Also, see FORK_PREEMPT_COUNT.
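A minimal sketch of a client of the notifier machinery above, assuming the stock preempt_notifier_ops layout from include/linux/preempt.h; the demo_* names are invented for illustration:

#include <linux/preempt.h>
#include <linux/sched.h>

static void demo_sched_in(struct preempt_notifier *pn, int cpu)
{
        pr_info("scheduled back in on cpu %d\n", cpu);
}

static void demo_sched_out(struct preempt_notifier *pn,
                           struct task_struct *next)
{
        pr_info("preempted; next is %s\n", next->comm);
}

static struct preempt_notifier_ops demo_ops = {
        .sched_in  = demo_sched_in,
        .sched_out = demo_sched_out,
};

static struct preempt_notifier demo_notifier;

/* Watches *current*; must run in the task that wants notifications. */
static void demo_watch_current(void)
{
        preempt_notifier_inc();    /* enable the static key first, see WARN above */
        preempt_notifier_init(&demo_notifier, &demo_ops);
        preempt_notifier_register(&demo_notifier);
}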
- */ - if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, - "corrupted preempt_count: %s/%d/0x%x\n", - current->comm, current->pid, preempt_count())) - preempt_count_set(FORK_PREEMPT_COUNT); - - rq->prev_mm = NULL; - - /* - * A task struct has one reference for its use as "current". - * If a task dies, then it sets TASK_DEAD in tsk->state and calls - * schedule one last time. The schedule call will never return, and - * the scheduled task must drop that reference. - * - * We must observe prev->state before clearing prev->on_cpu (in - * finish_lock_switch), otherwise a concurrent wakeup can get prev - * running on another CPU and we could race with its RUNNING -> DEAD - * transition, resulting in a double drop. - */ - prev_state = prev->state; - vtime_task_switch(prev); - perf_event_task_sched_in(prev, current); - finish_lock_switch(rq, prev); - finish_arch_post_lock_switch(); - - fire_sched_in_preempt_notifiers(current); - if (mm) - mmdrop(mm); - if (unlikely(prev_state == TASK_DEAD)) { - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - put_task_struct(prev); - } - return rq; -} - -/** - * schedule_tail - first thing a freshly forked thread must call. - * @prev: the thread we just switched away from. - */ -asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(grq.lock) -{ - struct rq *rq; - - /* - * New tasks start with FORK_PREEMPT_COUNT, see there and - * finish_task_switch() for details. - * - * finish_task_switch() will drop rq->lock() and lower preempt_count - * and the preempt_enable() will end up enabling preemption (on - * PREEMPT_COUNT kernels). - */ - - rq = finish_task_switch(prev); - preempt_enable(); - - if (current->set_child_tid) - put_user(task_pid_vnr(current), current->set_child_tid); -} - -/* - * context_switch - switch to the new MM and the new thread's register state. - */ -static __always_inline struct rq * -context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - struct mm_struct *mm, *oldmm; - - prepare_task_switch(rq, prev, next); - - mm = next->mm; - oldmm = prev->active_mm; - /* - * For paravirt, this is coupled with an exit in switch_to to - * combine the page table reload and the switch backend into - * one hypercall. - */ - arch_start_context_switch(prev); - - if (!mm) { - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next); - } else - switch_mm_irqs_off(oldmm, mm, next); - - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; - } - /* - * The runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - - /* Here we just switch the register state and the stack. */ - switch_to(prev, next, prev); - barrier(); - - return finish_task_switch(prev); -} - -/* - * nr_running, nr_uninterruptible and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. All are - * measured without grabbing the grq lock but the occasional inaccurate result - * doesn't matter so long as it's positive.
- */ -unsigned long nr_running(void) -{ - long nr = grq.nr_running; - - if (unlikely(nr < 0)) - nr = 0; - return (unsigned long)nr; -} - -static unsigned long nr_uninterruptible(void) -{ - long nu = grq.nr_uninterruptible; - - if (unlikely(nu < 0)) - nu = 0; - return nu; -} - -/* - * Check if only the current task is running on the cpu. - * - * Caution: this function does not check that the caller has disabled - * preemption, thus the result might have a time-of-check-to-time-of-use - * race. The caller is responsible to use it correctly, for example: - * - * - from a non-preemptable section (of course) - * - * - from a thread that is bound to a single CPU - * - * - in a loop with very short iterations (e.g. a polling loop) - */ -bool single_task_running(void) -{ - if (cpu_rq(smp_processor_id())->soft_affined == 1) - return true; - else - return false; -} -EXPORT_SYMBOL(single_task_running); - -unsigned long long nr_context_switches(void) -{ - long long ns = grq.nr_switches; - - /* This is of course impossible */ - if (unlikely(ns < 0)) - ns = 1; - return (unsigned long long)ns; -} - -unsigned long nr_iowait(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); - - return sum; -} - -unsigned long nr_iowait_cpu(int cpu) -{ - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); -} - -unsigned long nr_active(void) -{ - return nr_running() + nr_uninterruptible(); -} - -/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we - * base it on the number of running or queued tasks with their ->rq pointer - * set to this cpu as being the CPU they're more likely to run on. */ -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -{ - struct rq *rq = this_rq(); - - *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->soft_affined; -} - -/* Variables and functions for calc_load */ -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - unsigned long newload; - - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; - - return newload / FIXED_1; -} - -/* - * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. - */ -void calc_global_load(unsigned long ticks) -{ - long active; - - if (time_before(jiffies, calc_load_update)) - return; - active = nr_active() * FIXED_1; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update = jiffies + LOAD_FREQ; -} - -DEFINE_PER_CPU(struct kernel_stat, kstat); -DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); - -EXPORT_PER_CPU_SYMBOL(kstat); -EXPORT_PER_CPU_SYMBOL(kernel_cpustat); - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. 
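Plugging in the kernel's fixed-point constants (FIXED_1 = 2048, i.e. 11 fraction bits, and EXP_1 = 1884 for the 1-minute series): one 5-second round of calc_load() above moves the average toward the instantaneous value by (2048-1884)/2048 ≈ 8%. A standalone check of the same formula:

#include <stdio.h>

/* Kernel fixed-point load-average constants (11-bit fraction). */
#define FIXED_1 (1 << 11)   /* 1.0 == 2048 */
#define EXP_1   1884        /* 1/exp(5s/1min) in FIXED_1 units */

/* Same formula as calc_load() above. */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        if (active >= load)
                newload += FIXED_1 - 1;   /* round up when load is rising */
        return newload / FIXED_1;
}

int main(void)
{
        /* avenrun of 0.50 with 3 tasks runnable at the sample point. */
        unsigned long avg = FIXED_1 / 2;

        avg = calc_load(avg, EXP_1, 3 * FIXED_1);
        printf("1-min load: %lu.%02lu\n",
               avg >> 11, ((avg & (FIXED_1 - 1)) * 100) >> 11);  /* ~0.70 */
        return 0;
}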
So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in another CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
-        sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
-        sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
-        __this_cpu_inc(irq_time_seq.sequence);
-        smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
-        smp_wmb();
-        __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-        u64 irq_time;
-        unsigned seq;
-
-        do {
-                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
-                irq_time = per_cpu(cpu_softirq_time, cpu) +
-                           per_cpu(cpu_hardirq_time, cpu);
-        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
-        return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-        return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void irqtime_account_irq(struct task_struct *curr)
-{
-        unsigned long flags;
-        s64 delta;
-        int cpu;
-
-        if (!sched_clock_irqtime)
-                return;
-
-        local_irq_save(flags);
-
-        cpu = smp_processor_id();
-        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-        __this_cpu_add(irq_start_time, delta);
-
-        irq_time_write_begin();
-        /*
-         * We do not account for softirq time from ksoftirqd here.
-         * We want to continue accounting softirq time to ksoftirqd thread
-         * in that case, so as not to confuse the scheduler with a special
-         * task that does not consume any time, but still wants to run.
-         */
-        if (hardirq_count())
-                __this_cpu_add(cpu_hardirq_time, delta);
-        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-                __this_cpu_add(cpu_softirq_time, delta);
-
-        irq_time_write_end();
-        local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
-        if (unlikely(steal > NSEC_PER_SEC))
-                return div_u64(steal, TICK_NSEC);
-
-        return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compiler should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-        s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-
-        /*
-         * Since irq_time is only updated on {soft,}irq_exit, we might run into
-         * this case when a previous update_rq_clock() happened inside a
-         * {soft,}irq region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight misattribution of {soft,}irq
- * time; a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
-        if (irq_delta > delta)
-                irq_delta = delta;
-
-        rq->prev_irq_time += irq_delta;
-        delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-        if (static_key_false((&paravirt_steal_rq_enabled))) {
-                s64 steal = paravirt_steal_clock(cpu_of(rq));
-
-                steal -= rq->prev_steal_time_rq;
-
-                if (unlikely(steal > delta))
-                        steal = delta;
-
-                rq->prev_steal_time_rq += steal;
-
-                delta -= steal;
-        }
-#endif
-
-        rq->clock_task += delta;
-}
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)        nsecs_to_jiffies(__nsecs)
-#endif
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static void irqtime_account_hi_si(void)
-{
-        u64 *cpustat = kcpustat_this_cpu->cpustat;
-        u64 latest_ns;
-
-        latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
-        if (latest_ns > cpustat[CPUTIME_IRQ])
-                cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
-
-        latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
-        if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
-                cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime        (0)
-
-static inline void irqtime_account_hi_si(void)
-{
-}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
-        if (static_key_false(&paravirt_steal_enabled)) {
-                u64 steal;
-                cputime_t steal_ct;
-
-                steal = paravirt_steal_clock(smp_processor_id());
-                steal -= this_rq()->prev_steal_time;
-
-                /*
-                 * cputime_t may be less precise than nsecs (eg: if it's
-                 * based on jiffies). Let's cast the result to cputime
-                 * granularity and account the rest on the next rounds.
-                 */
-                steal_ct = nsecs_to_cputime(steal);
-                this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
-
-                account_steal_time(steal_ct);
-                return steal_ct;
-        }
-#endif
-        return false;
-}
-
-/*
- * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
- * tasks (sum on group iteration) belonging to @tsk's group.
- */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
-        struct signal_struct *sig = tsk->signal;
-        cputime_t utime, stime;
-        struct task_struct *t;
-        unsigned int seq, nextseq;
-        unsigned long flags;
-
-        rcu_read_lock();
-        /* Attempt a lockless read on the first round. */
-        nextseq = 0;
-        do {
-                seq = nextseq;
-                flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
-                times->utime = sig->utime;
-                times->stime = sig->stime;
-                times->sum_exec_runtime = sig->sum_sched_runtime;
-
-                for_each_thread(tsk, t) {
-                        task_cputime(t, &utime, &stime);
-                        times->utime += utime;
-                        times->stime += stime;
-                        times->sum_exec_runtime += task_sched_runtime(t);
-                }
-                /* If lockless access failed, take the lock. */
-                nextseq = 1;
-        } while (need_seqretry(&sig->stats_lock, seq));
-        done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
-        rcu_read_unlock();
-}
-
-/*
- * On each tick, see what percentage of that tick was attributed to each
- * component and add the percentage to the _pc values. Once a _pc value has
- * accumulated one tick's worth, account for that.
This means the total - * percentage of load components will always be 128 (pseudo 100) per tick. - */ -static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (atomic_read(&rq->nr_iowait) > 0) { - rq->iowait_pc += pc; - if (rq->iowait_pc >= 128) { - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128; - rq->iowait_pc %= 128; - } - } else { - rq->idle_pc += pc; - if (rq->idle_pc >= 128) { - cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128; - rq->idle_pc %= 128; - } - } - acct_update_integrals(idle); -} - -static void -pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset, - unsigned long pc, unsigned long ns) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - - p->stime_pc += pc; - if (p->stime_pc >= 128) { - int jiffs = p->stime_pc / 128; - - p->stime_pc %= 128; - p->stime += (__force u64)cputime_one_jiffy * jiffs; - p->stimescaled += one_jiffy_scaled * jiffs; - account_group_system_time(p, cputime_one_jiffy * jiffs); - } - p->sched_time += ns; - account_group_exec_runtime(p, ns); - - if (hardirq_count() - hardirq_offset) { - rq->irq_pc += pc; - if (rq->irq_pc >= 128) { - cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128; - rq->irq_pc %= 128; - } - } else if (in_serving_softirq()) { - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; - } - } else { - rq->system_pc += pc; - if (rq->system_pc >= 128) { - cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128; - rq->system_pc %= 128; - } - } - acct_update_integrals(p); -} - -static void pc_user_time(struct rq *rq, struct task_struct *p, - unsigned long pc, unsigned long ns) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - - p->utime_pc += pc; - if (p->utime_pc >= 128) { - int jiffs = p->utime_pc / 128; - - p->utime_pc %= 128; - p->utime += (__force u64)cputime_one_jiffy * jiffs; - p->utimescaled += one_jiffy_scaled * jiffs; - account_group_user_time(p, cputime_one_jiffy * jiffs); - } - p->sched_time += ns; - account_group_exec_runtime(p, ns); - - if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - */ - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; - } - } - - if (task_nice(p) > 0 || idleprio_task(p)) { - rq->nice_pc += pc; - if (rq->nice_pc >= 128) { - cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128; - rq->nice_pc %= 128; - } - } else { - rq->user_pc += pc; - if (rq->user_pc >= 128) { - cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128; - rq->user_pc %= 128; - } - } - acct_update_integrals(p); -} - -/* - * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast - * shifts instead of 100 - */ -#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS) - -/* - * This is called on clock ticks. - * Bank in p->sched_time the ns elapsed since the last tick or switch. - * CPU scheduler quota accounting is also performed here in microseconds. 
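A standalone sketch of the 128-based pseudo-percentage bookkeeping above; HZ=250 (i.e. JIFFY_NS = 4,000,000) is assumed purely for illustration:

#include <stdio.h>

#define JIFFY_NS        4000000UL
#define NS_TO_PC(NS)    ((NS) * 128 / JIFFY_NS)

int main(void)
{
        unsigned long user_pc = 0, jiffies_banked = 0;

        /* eight 1ms bursts of user time: each is 32/128 of a tick */
        for (int i = 0; i < 8; i++) {
                user_pc += NS_TO_PC(1000000UL);
                if (user_pc >= 128) {           /* a full tick's worth */
                        jiffies_banked += user_pc / 128;
                        user_pc %= 128;
                }
        }
        /* prints: banked 2 jiffies, 0/128 left over */
        printf("banked %lu jiffies, %lu/128 left over\n",
               jiffies_banked, user_pc);
        return 0;
}
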
- */ -static void -update_cpu_clock_tick(struct rq *rq, struct task_struct *p) -{ - long account_ns = rq->clock_task - rq->rq_last_ran; - struct task_struct *idle = rq->idle; - unsigned long account_pc; - - if (unlikely(account_ns < 0) || steal_account_process_tick()) - goto ts_account; - - account_pc = NS_TO_PC(account_ns); - - /* Accurate tick timekeeping */ - if (user_mode(get_irq_regs())) - pc_user_time(rq, p, account_pc, account_ns); - else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) - pc_system_time(rq, p, HARDIRQ_OFFSET, - account_pc, account_ns); - else - pc_idle_time(rq, idle, account_pc); - - if (sched_clock_irqtime) - irqtime_account_hi_si(); - -ts_account: - /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } - - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; -} - -/* - * This is called on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. - * CPU scheduler quota accounting is also performed here in microseconds. - */ -static void -update_cpu_clock_switch(struct rq *rq, struct task_struct *p) -{ - long account_ns = rq->clock_task - rq->rq_last_ran; - struct task_struct *idle = rq->idle; - unsigned long account_pc; - - if (unlikely(account_ns < 0)) - goto ts_account; - - account_pc = NS_TO_PC(account_ns); - - /* Accurate subtick timekeeping */ - if (p != idle) { - pc_user_time(rq, p, account_pc, account_ns); - } - else - pc_idle_time(rq, idle, account_pc); - -ts_account: - /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } - - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; -} - -/* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_grq_lock() held. - */ -static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - /* - * Must be ->curr _and_ ->on_rq. If dequeued, we would - * project cycles that may never be accounted to this - * thread, breaking clock_gettime(). - */ - if (p == rq->curr && p->on_rq) { - update_clocks(rq); - ns = rq->clock_task - rq->rq_last_ran; - if (unlikely((s64)ns < 0)) - ns = 0; - } - - return ns; -} - -/* - * Return accounted runtime for the task. - * Return separately the current's pending runtime that have not been - * accounted yet. - * - */ -unsigned long long task_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns; - -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) - /* - * 64-bit doesn't need locks to atomically read a 64bit value. - * So we have a optimization chance when the task's delta_exec is 0. - * Reading ->on_cpu is racy, but this is ok. - * - * If we race with it leaving cpu, we'll take a lock. So we're correct. - * If we race with it entering cpu, unaccounted time is 0. This is - * indistinguishable from the read occurring a few cycles earlier. - * If we see ->on_cpu without ->on_rq, the task is leaving, and has - * been accounted, so we're correct here as well. 
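The "usecs to avoid overflow on 32bit" comment above is a headroom argument; a two-line check of the limits:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* a signed 32-bit slice counter overflows after ~2.1s of
         * nanoseconds but only after ~35 minutes of microseconds */
        printf("max ns in s32: %.1f s\n", INT32_MAX / 1e9);
        printf("max us in s32: %.1f min\n", INT32_MAX / 1e6 / 60);
        return 0;
}
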
- */ - if (!p->on_cpu || !p->on_rq) - return tsk_seruntime(p); -#endif - - rq = task_grq_lock(p, &flags); - ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(p, &flags); - - return ns; -} - -/* Compatibility crap */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ -} - -void account_idle_time(cputime_t cputime) -{ -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += (__force u64)cputime; - p->utimescaled += (__force u64)cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += (__force u64)cputime; - - /* Add guest time to cpustat. */ - if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64)cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64)cputime; - cpustat[CPUTIME_GUEST] += (__force u64)cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, cputime64_t *target_cputime64) -{ - /* Add system time to process. */ - p->stime += (__force u64)cputime; - p->stimescaled += (__force u64)cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - *target_cputime64 += (__force u64)cputime; - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * This is for guest only now. - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - account_guest_time(p, cputime, cputime_scaled); -} - -/* - * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64)cputime; -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -static void account_idle_times(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64)cputime; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - -void account_process_tick(struct task_struct *p, int user_tick) -{ -} - -/* - * Account multiple ticks of steal time. 
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
-        account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of idle ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-        account_idle_times(jiffies_to_cputime(ticks));
-}
-#endif
-
-static inline void grq_iso_lock(void)
-        __acquires(grq.iso_lock)
-{
-        raw_spin_lock(&grq.iso_lock);
-}
-
-static inline void grq_iso_unlock(void)
-        __releases(grq.iso_lock)
-{
-        raw_spin_unlock(&grq.iso_lock);
-}
-
-/*
- * Functions to test for when SCHED_ISO tasks have used their allocated
- * quota as real time scheduling and convert them back to SCHED_NORMAL.
- * Where possible, the data is tested lockless, to avoid grabbing iso_lock
- * because the occasional inaccurate result won't matter. However the
- * tick data is only ever modified under lock. iso_refractory is simply
- * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
- */
-static bool set_iso_refractory(void)
-{
-        grq.iso_refractory = true;
-        return grq.iso_refractory;
-}
-
-static bool clear_iso_refractory(void)
-{
-        grq.iso_refractory = false;
-        return grq.iso_refractory;
-}
-
-/*
- * Test if SCHED_ISO tasks have run longer than their allotted period as RT
- * tasks and set the refractory flag if necessary. There is 10% hysteresis
- * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
- * slow division.
- */
-static bool test_ret_isorefractory(struct rq *rq)
-{
-        if (likely(!grq.iso_refractory)) {
-                if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
-                        return set_iso_refractory();
-        } else {
-                if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
-                        return clear_iso_refractory();
-        }
-        return grq.iso_refractory;
-}
-
-static void iso_tick(void)
-{
-        grq_iso_lock();
-        grq.iso_ticks += 100;
-        grq_iso_unlock();
-}
-
-/* No SCHED_ISO task was running so decrease rq->iso_ticks */
-static inline void no_iso_tick(void)
-{
-        if (grq.iso_ticks) {
-                grq_iso_lock();
-                grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
-                if (unlikely(grq.iso_refractory && grq.iso_ticks <
-                    ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
-                        clear_iso_refractory();
-                grq_iso_unlock();
-        }
-}
-
-/* This manages tasks that have run out of timeslice during a scheduler_tick */
-static void task_running_tick(struct rq *rq)
-{
-        struct task_struct *p;
-
-        /*
-         * If a SCHED_ISO task is running we increment the iso_ticks. In
-         * order to prevent SCHED_ISO tasks from causing starvation in the
-         * presence of true RT tasks we account those as iso_ticks as well.
-         */
-        if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
-                if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
-                        iso_tick();
-        } else
-                no_iso_tick();
-
-        if (iso_queue(rq)) {
-                if (unlikely(test_ret_isorefractory(rq))) {
-                        if (rq_running_iso(rq)) {
-                                /*
-                                 * SCHED_ISO task is running as RT and limit
-                                 * has been hit. Force it to reschedule as
-                                 * SCHED_NORMAL by zeroing its time_slice
-                                 */
-                                rq->rq_time_slice = 0;
-                        }
-                }
-        }
-
-        /* SCHED_FIFO tasks never run out of timeslice. */
-        if (rq->rq_policy == SCHED_FIFO)
-                return;
-        /*
-         * Tasks that were scheduled in the first half of a tick are not
-         * allowed to run into the 2nd half of the next tick if they will
-         * run out of time slice in the interim. Otherwise, if they have
-         * less than RESCHED_US μs of time slice left they will be rescheduled.
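For concreteness, with the BFS-default sched_iso_cpu of 70 the refractory hysteresis above works out roughly as below; ISO_PERIOD's definition is outside this hunk, so a stand-in value is used:

#include <stdio.h>

int main(void)
{
        /* stand-in for ISO_PERIOD: about 5 seconds of ticks at HZ=250;
         * the real macro is defined elsewhere in bfs.c */
        unsigned long iso_period = 5 * 250;
        int sched_iso_cpu = 70;         /* default: 70% of CPU as ISO */

        unsigned long trip  = iso_period * sched_iso_cpu;
        unsigned long clear = iso_period * (sched_iso_cpu * 115 / 128);

        /* trips at the full quota, clears roughly 10% below it */
        printf("refractory above %lu iso_ticks, cleared below %lu\n",
               trip, clear);
        return 0;
}
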
- */ - if (rq->dither) { - if (rq->rq_time_slice > HALF_JIFFY_US) - return; - else - rq->rq_time_slice = 0; - } else if (rq->rq_time_slice >= RESCHED_US) - return; - - /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ - p = rq->curr; - - grq_lock(); - requeue_task(p); - resched_task(p); - grq_unlock(); -} - -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. The data modified is all - * local to struct rq so we don't need to grab grq lock. - */ -void scheduler_tick(void) -{ - int cpu __maybe_unused = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - sched_clock_tick(); - /* grq lock not grabbed, so only update rq clock */ - update_rq_clock(rq); - update_cpu_clock_tick(rq, rq->curr); - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); - if (!rq_idle(rq)) - task_running_tick(rq); - else - no_iso_tick(); - rq->last_tick = rq->clock; - perf_event_task_tick(); -} - -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) -/* - * If the value passed in is equal to the current preempt count - * then we just disabled preemption. Start timing the latency. - */ -static inline void preempt_latency_start(int val) -{ - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } -} - -void preempt_count_add(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; -#endif - __preempt_count_add(val); -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Spinlock count overflowing soon? - */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); -#endif - preempt_latency_start(val); -} -EXPORT_SYMBOL(preempt_count_add); -NOKPROBE_SYMBOL(preempt_count_add); - -/* - * If the value passed in equals to the current preempt count - * then we just enabled preemption. Stop timing the latency. - */ -static inline void preempt_latency_stop(int val) -{ - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -} - -void preempt_count_sub(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? - */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; -#endif - - preempt_latency_stop(val); - __preempt_count_sub(val); -} -EXPORT_SYMBOL(preempt_count_sub); -NOKPROBE_SYMBOL(preempt_count_sub); - -#else -static inline void preempt_latency_start(int val) { } -static inline void preempt_latency_stop(int val) { } -#endif - -/* - * The time_slice is only refilled when it is empty and that is when we set a - * new deadline. - */ -static void time_slice_expired(struct task_struct *p) -{ - p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); -#ifdef CONFIG_SMT_NICE - if (!p->mm) - p->smt_bias = 0; - else if (rt_task(p)) - p->smt_bias = 1 << 30; - else if (task_running_iso(p)) - p->smt_bias = 1 << 29; - else if (idleprio_task(p)) { - if (task_running_idle(p)) - p->smt_bias = 0; - else - p->smt_bias = 1; - } else if (--p->smt_bias < 1) - p->smt_bias = MAX_PRIO - p->static_prio; -#endif -} - -/* - * Timeslices below RESCHED_US are considered as good as expired as there's no - * point rescheduling when there's so little time left. 
SCHED_BATCH tasks
- * have been flagged as not latency sensitive and likely to be fully CPU
- * bound so every time they're rescheduled they have their time_slice
- * refilled, but get a new later deadline to have little effect on
- * SCHED_NORMAL tasks.
- */
-static inline void check_deadline(struct task_struct *p)
-{
-        if (p->time_slice < RESCHED_US || batch_task(p))
-                time_slice_expired(p);
-}
-
-#define BITOP_WORD(nr)        ((nr) / BITS_PER_LONG)
-
-/*
- * Scheduler queue bitmap specific find next bit.
- */
-static inline unsigned long
-next_sched_bit(const unsigned long *addr, unsigned long offset)
-{
-        const unsigned long *p;
-        unsigned long result;
-        unsigned long size;
-        unsigned long tmp;
-
-        size = PRIO_LIMIT;
-        if (offset >= size)
-                return size;
-
-        p = addr + BITOP_WORD(offset);
-        result = offset & ~(BITS_PER_LONG-1);
-        size -= result;
-        offset %= BITS_PER_LONG;
-        if (offset) {
-                tmp = *(p++);
-                tmp &= (~0UL << offset);
-                if (size < BITS_PER_LONG)
-                        goto found_first;
-                if (tmp)
-                        goto found_middle;
-                size -= BITS_PER_LONG;
-                result += BITS_PER_LONG;
-        }
-        while (size & ~(BITS_PER_LONG-1)) {
-                if ((tmp = *(p++)))
-                        goto found_middle;
-                result += BITS_PER_LONG;
-                size -= BITS_PER_LONG;
-        }
-        if (!size)
-                return result;
-        tmp = *p;
-
-found_first:
-        tmp &= (~0UL >> (BITS_PER_LONG - size));
-        if (tmp == 0UL)                /* Are any bits set? */
-                return result + size;  /* Nope. */
-found_middle:
-        return result + __ffs(tmp);
-}
-
-/*
- * Task selection with skiplists is a simple matter of picking off the first
- * task in the sorted list, an O(1) operation. The only time it takes longer
- * is if tasks do not have suitable affinity and then we iterate over entries
- * till we find the first that does. Worst case here is no tasks with suitable
- * affinity, taking O(n).
- */
-static inline struct
-task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
-{
-        skiplist_node *node = &grq.node;
-        struct task_struct *edt = idle;
-        u64 earliest_deadline = ~0ULL;
-
-        while ((node = node->next[0]) != &grq.node) {
-                struct task_struct *p = node->value;
-
-                /* Make sure affinity is ok */
-                if (needs_other_cpu(p, cpu))
-                        continue;
-
-                if (!smt_schedule(p, rq))
-                        continue;
-
-                if (!sched_interactive) {
-                        int tcpu;
-
-                        if ((tcpu = task_cpu(p)) != cpu) {
-                                u64 dl = p->deadline << locality_diff(tcpu, rq);
-
-                                if (!deadline_before(dl, earliest_deadline))
-                                        continue;
-                                earliest_deadline = dl;
-                                edt = p;
-                                /* We continue even though we've found the
-                                 * earliest deadline task, as the locality
-                                 * offset means there may be a better
-                                 * candidate after it.
*/ - continue; - } - } - /* We've encountered the best deadline local task */ - edt = p; - break; - } - if (likely(edt != idle)) - take_task(cpu, edt); - return edt; -} - -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) -{ - if (oops_in_progress) - return; - - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); - - debug_show_held_locks(prev); - print_modules(); - if (irqs_disabled()) - print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif - if (panic_on_warn) - panic("scheduling while atomic\n"); - - dump_stack(); - add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -} - -/* - * Various schedule()-time debugging checks and statistics: - */ -static inline void schedule_debug(struct task_struct *prev) -{ -#ifdef CONFIG_SCHED_STACK_END_CHECK - if (task_stack_end_corrupted(prev)) - panic("corrupted stack end detected inside scheduler\n"); -#endif - - if (unlikely(in_atomic_preempt_off())) { - __schedule_bug(prev); - preempt_count_set(PREEMPT_DISABLED); - } - rcu_sleep_check(); - - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - - schedstat_inc(this_rq(), sched_count); -} - -/* - * The currently running task's information is all stored in rq local data - * which is only modified by the local CPU, thereby allowing the data to be - * changed without grabbing the grq lock. - */ -static inline void set_rq_task(struct rq *rq, struct task_struct *p) -{ - rq->rq_time_slice = p->time_slice; - rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran = rq->clock_task; - rq->rq_policy = p->policy; - rq->rq_prio = p->prio; -#ifdef CONFIG_SMT_NICE - rq->rq_mm = p->mm; - rq->rq_smt_bias = p->smt_bias; -#endif -} - -static void reset_rq_task(struct rq *rq, struct task_struct *p) -{ - rq->rq_policy = p->policy; - rq->rq_prio = p->prio; -#ifdef CONFIG_SMT_NICE - rq->rq_smt_bias = p->smt_bias; -#endif -} - -#ifdef CONFIG_SMT_NICE -static void check_no_siblings(struct rq __maybe_unused *this_rq) {} -static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} -static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; -static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; - -/* Iterate over smt siblings when we've scheduled a process on cpu and decide - * whether they should continue running or be descheduled. */ -static void check_smt_siblings(struct rq *this_rq) -{ - int other_cpu; - - for_each_cpu(other_cpu, &this_rq->thread_mask) { - struct task_struct *p; - struct rq *rq; - - rq = cpu_rq(other_cpu); - if (rq_idle(rq)) - continue; - if (unlikely(!rq->online)) - continue; - p = rq->curr; - if (!smt_schedule(p, this_rq)) { - set_tsk_need_resched(p); - smp_send_reschedule(other_cpu); - } - } -} - -static void wake_smt_siblings(struct rq *this_rq) -{ - int other_cpu; - - if (!queued_notrunning()) - return; - - for_each_cpu(other_cpu, &this_rq->thread_mask) { - struct rq *rq; - - rq = cpu_rq(other_cpu); - if (unlikely(!rq->online)) - continue; - if (rq_idle(rq)) { - struct task_struct *p = rq->curr; - - set_tsk_need_resched(p); - smp_send_reschedule(other_cpu); - } - } -} -#else -static void check_siblings(struct rq __maybe_unused *this_rq) {} -static void wake_siblings(struct rq __maybe_unused *this_rq) {} -#endif - -/* - * schedule() is the main scheduler function. 
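The locality shift in earliest_deadline_task() above doubles a remote task's virtual deadline per step of cache distance, so a nearby task with a nominally later deadline can still win. A toy comparison (the locality_diff() values here are invented: 0 = same CPU, 2 = remote node):

#include <stdint.h>
#include <stdio.h>

struct fake_task {
        uint64_t deadline;      /* niffies when the slice expires */
        int locality;           /* invented locality_diff() result */
};

int main(void)
{
        /* a remote task with an earlier raw deadline... */
        struct fake_task remote = { .deadline = 1000, .locality = 2 };
        /* ...can lose to a local task with a later one */
        struct fake_task local  = { .deadline = 3000, .locality = 0 };

        uint64_t dl_remote = remote.deadline << remote.locality; /* 4000 */
        uint64_t dl_local  = local.deadline  << local.locality;  /* 3000 */

        printf("%s wins\n", dl_local < dl_remote ? "local" : "remote");
        return 0;
}
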
- * - * The main means of driving the scheduler and thus entering this function are: - * - * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. - * - * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return - * paths. For example, see arch/x86/entry_64.S. - * - * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). - * - * 3. Wakeups don't really cause entry into schedule(). They add a - * task to the run-queue and that's it. - * - * Now, if the new task added to the run-queue preempts the current - * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets - * called on the nearest possible occasion: - * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): - * - * - in syscall or exception context, at the next outmost - * preempt_enable(). (this might be as soon as the wake_up()'s - * spin_unlock()!) - * - * - in IRQ context, return from interrupt-handler to - * preemptible context - * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) - * then at the next: - * - * - cond_resched() call - * - explicit schedule() call - * - return from syscall or exception to user-space - * - return from interrupt-handler to user-space - * - * WARNING: must be called with preemption disabled! - */ -static void __sched notrace __schedule(bool preempt) -{ - struct task_struct *prev, *next, *idle; - unsigned long *switch_count; - bool deactivate = false; - struct rq *rq; - int cpu; - - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - prev = rq->curr; - - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. - */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - - schedule_debug(prev); - - local_irq_disable(); - rcu_note_context_switch(); - - /* - * Make sure that signal_pending_state()->signal_pending() below - * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). - */ - smp_mb__before_spinlock(); - grq_lock(); - - switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (unlikely(signal_pending_state(prev->state, prev))) { - prev->state = TASK_RUNNING; - } else { - deactivate = true; - prev->on_rq = 0; - - /* - * If a worker is going to sleep, notify and - * ask workqueue whether it wants to wake up a - * task to maintain concurrency. If so, wake - * up the task. 
- */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev); - if (to_wakeup) { - /* This shouldn't happen, but does */ - if (unlikely(to_wakeup == prev)) - deactivate = false; - else - try_to_wake_up_local(to_wakeup); - } - } - } - switch_count = &prev->nvcsw; - } - - update_clocks(rq); - update_cpu_clock_switch(rq, prev); - if (rq->clock - rq->last_tick > HALF_JIFFY_NS) - rq->dither = false; - else - rq->dither = true; - - clear_tsk_need_resched(prev); - clear_preempt_need_resched(); - - idle = rq->idle; - if (idle != prev) { - /* Update all the information stored on struct rq */ - prev->time_slice = rq->rq_time_slice; - prev->deadline = rq->rq_deadline; - check_deadline(prev); - prev->last_ran = rq->clock_task; - return_task(prev, rq, deactivate); - } - - if (unlikely(!queued_notrunning())) { - /* - * This CPU is now truly idle as opposed to when idle is - * scheduled as a high priority task in its own right. - */ - next = idle; - schedstat_inc(rq, sched_goidle); - set_cpuidle_map(cpu); - } else { - next = earliest_deadline_task(rq, cpu, idle); - if (likely(next->prio != PRIO_LIMIT)) - clear_cpuidle_map(cpu); - else - set_cpuidle_map(cpu); - } - - if (likely(prev != next)) { - /* - * Don't reschedule an idle task or deactivated tasks - */ - if (prev != idle && !deactivate) - resched_suitable_idle(prev); - set_rq_task(rq, next); - if (next != idle) - check_siblings(rq); - else - wake_siblings(rq); - grq.nr_switches++; - prev->on_cpu = false; - next->on_cpu = true; - rq->curr = next; - ++*switch_count; - - trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the grq */ - } else { - check_siblings(rq); - grq_unlock_irq(); - } -} - -static inline void sched_submit_work(struct task_struct *tsk) -{ - if (!tsk->state || tsk_is_pi_blocked(tsk) || - preempt_count() || - signal_pending_state(tsk->state, tsk)) - return; - - /* - * If we are going to sleep and we have plugged IO queued, - * make sure to submit it to avoid deadlocks. - */ - if (blk_needs_flush_plug(tsk)) - blk_schedule_flush_plug(tsk); -} - -asmlinkage __visible void __sched schedule(void) -{ - struct task_struct *tsk = current; - - sched_submit_work(tsk); - do { - preempt_disable(); - __schedule(false); - sched_preempt_enable_no_resched(); - } while (need_resched()); -} - -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_CONTEXT_TRACKING -asmlinkage __visible void __sched schedule_user(void) -{ - /* - * If we come here after a random call to set_need_resched(), - * or we have been woken up remotely but the IPI has not yet arrived, - * we haven't yet exited the RCU idle mode. Do it here manually until - * we find a better solution. - * - * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != IN_USER, but that will trigger - * too frequently to make sense yet. - */ - enum ctx_state prev_state = exception_enter(); - schedule(); - exception_exit(prev_state); -} -#endif - -/** - * schedule_preempt_disabled - called with preemption disabled - * - * Returns with preemption disabled. 
Note: preempt_count must be 1 - */ -void __sched schedule_preempt_disabled(void) -{ - sched_preempt_enable_no_resched(); - schedule(); - preempt_disable(); -} - -static void __sched notrace preempt_schedule_common(void) -{ - do { - /* - * Because the function tracer can trace preempt_count_sub() - * and it also uses preempt_enable/disable_notrace(), if - * NEED_RESCHED is set, the preempt_enable_notrace() called - * by the function tracer will call this function again and - * cause infinite recursion. - * - * Preemption must be disabled here before the function - * tracer can trace. Break up preempt_disable() into two - * calls. One to disable preemption without fear of being - * traced. The other to still record the preemption latency, - * which can also be traced by the function tracer. - */ - preempt_disable_notrace(); - preempt_latency_start(1); - __schedule(true); - preempt_latency_stop(1); - preempt_enable_no_resched_notrace(); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - } while (need_resched()); -} - -#ifdef CONFIG_PREEMPT -/* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage __visible void __sched notrace preempt_schedule(void) -{ - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (likely(!preemptible())) - return; - - preempt_schedule_common(); -} -NOKPROBE_SYMBOL(preempt_schedule); -EXPORT_SYMBOL(preempt_schedule); - -/** - * preempt_schedule_notrace - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - do { - /* - * Because the function tracer can trace preempt_count_sub() - * and it also uses preempt_enable/disable_notrace(), if - * NEED_RESCHED is set, the preempt_enable_notrace() called - * by the function tracer will call this function again and - * cause infinite recursion. - * - * Preemption must be disabled here before the function - * tracer can trace. Break up preempt_disable() into two - * calls. One to disable preemption without fear of being - * traced. The other to still record the preemption latency, - * which can also be traced by the function tracer. - */ - preempt_disable_notrace(); - preempt_latency_start(1); - /* - * Needs preempt disabled in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. 
- */ - prev_ctx = exception_enter(); - __schedule(true); - exception_exit(prev_ctx); - - preempt_latency_stop(1); - preempt_enable_no_resched_notrace(); - } while (need_resched()); -} -EXPORT_SYMBOL_GPL(preempt_schedule_notrace); - -#endif /* CONFIG_PREEMPT */ - -/* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. - */ -asmlinkage __visible void __sched preempt_schedule_irq(void) -{ - enum ctx_state prev_state; - - /* Catch callers which need to be fixed */ - BUG_ON(preempt_count() || !irqs_disabled()); - - prev_state = exception_enter(); - - do { - preempt_disable(); - local_irq_enable(); - __schedule(true); - local_irq_disable(); - sched_preempt_enable_no_resched(); - } while (need_resched()); - - exception_exit(prev_state); -} - -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, - void *key) -{ - return try_to_wake_up(curr->private, mode, wake_flags); -} -EXPORT_SYMBOL(default_wake_function); - -#ifdef CONFIG_RT_MUTEXES - -/* - * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) - * - * This function changes the 'effective' priority of a task. It does - * not touch ->normal_prio like __setscheduler(). - * - * Used by the rt_mutex code to implement priority inheritance - * logic. Call site only calls if the priority of the task changed. - */ -void rt_mutex_setprio(struct task_struct *p, int prio) -{ - unsigned long flags; - struct rq *rq; - int oldprio; - - BUG_ON(prio < 0 || prio > MAX_PRIO); - - rq = task_grq_lock(p, &flags); - - /* - * Idle task boosting is a nono in general. There is one - * exception, when PREEMPT_RT and NOHZ is active: - * - * The idle task calls get_next_timer_interrupt() and holds - * the timer wheel base->lock on the CPU and another CPU wants - * to access the timer (probably to cancel it). We can safely - * ignore the boosting request, as the idle CPU runs this code - * with interrupts disabled and will complete the lock - * protected section without being interrupted. So there is no - * real need to boost. - */ - if (unlikely(p == rq->idle)) { - WARN_ON(p != rq->curr); - WARN_ON(p->pi_blocked_on); - goto out_unlock; - } - - trace_sched_pi_setprio(p, prio); - oldprio = p->prio; - p->prio = prio; - if (task_running(p)){ - if (prio > oldprio) - resched_task(p); - } else if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); - if (prio < oldprio) - try_preempt(p, rq); - } -out_unlock: - task_grq_unlock(p, &flags); -} - -#endif - -/* - * Adjust the deadline for when the priority is to change, before it's - * changed. - */ -static inline void adjust_deadline(struct task_struct *p, int new_prio) -{ - p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); -} - -void set_user_nice(struct task_struct *p, long nice) -{ - int new_static, old_static; - unsigned long flags; - struct rq *rq; - - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) - return; - new_static = NICE_TO_PRIO(nice); - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. 
- */ - rq = time_task_grq_lock(p, &flags); - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL/SCHED_BATCH: - */ - if (has_rt_policy(p)) { - p->static_prio = new_static; - goto out_unlock; - } - - adjust_deadline(p, new_static); - old_static = p->static_prio; - p->static_prio = new_static; - p->prio = effective_prio(p); - - if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); - if (new_static < old_static) - try_preempt(p, rq); - } else if (task_running(p)) { - reset_rq_task(rq, p); - if (old_static < new_static) - resched_task(p); - } -out_unlock: - task_grq_unlock(p, &flags); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = nice_to_rlimit(nice); - - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || - capable(CAP_SYS_NICE)); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - - increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); - nice = task_nice(current) + increment; - - nice = clamp_val(nice, MIN_NICE, MAX_NICE); - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -100. Normal tasks are centered around 1, value goes - * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). - */ -int task_prio(const struct task_struct *p) -{ - int delta, prio = p->prio - MAX_RT_PRIO; - - /* rt tasks and iso tasks */ - if (prio <= 0) - goto out; - - /* Convert to ms to avoid overflows */ - delta = NS_TO_MS(p->deadline - grq.niffies); - delta = delta * 40 / ms_longest_deadline_diff(); - if (delta > 0 && delta <= 80) - prio += delta; - if (idleprio_task(p)) - prio += 40; -out: - return prio; -} - -/** - * idle_cpu - is a given cpu idle currently? - * @cpu: the processor in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int idle_cpu(int cpu) -{ - return cpu_curr(cpu) == cpu_rq(cpu)->idle; -} - -/** - * idle_task - return the idle task for a given cpu. - * @cpu: the processor in question. - * - * Return: The idle task for the cpu @cpu. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - * - * The task of @pid, if found. %NULL otherwise. - */ -static inline struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -/* Actually do priority change: must hold grq lock. 
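Roughly how task_prio() above maps a pending deadline onto the 40-step range shown in /proc; the ms_longest_deadline_diff() stand-in value below is invented:

#include <stdio.h>

#define MAX_RT_PRIO 100

static int ms_longest_deadline_diff(void) { return 1000; } /* stand-in */

static int fake_task_prio(int kernel_prio, int deadline_ms_away)
{
        int prio = kernel_prio - MAX_RT_PRIO;
        int delta;

        if (prio <= 0)          /* rt tasks and iso tasks */
                return prio;

        /* scale the remaining deadline onto 40 nice-like steps */
        delta = deadline_ms_away * 40 / ms_longest_deadline_diff();
        if (delta > 0 && delta <= 80)
                prio += delta;
        return prio;
}

int main(void)
{
        /* a SCHED_NORMAL task whose deadline is 500ms out: prints 21 */
        printf("%d\n", fake_task_prio(MAX_RT_PRIO + 1, 500));
        return 0;
}
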
*/ -static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, - int prio, bool keep_boost) -{ - int oldrtprio, oldprio; - - p->policy = policy; - oldrtprio = p->rt_priority; - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - oldprio = p->prio; - /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). - */ - if (keep_boost) { - /* - * Take priority boosted tasks into account. If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); - } else - p->prio = p->normal_prio; - - if (task_running(p)) { - reset_rq_task(rq, p); - resched_task(p); - } else if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); - if (p->prio < oldprio || p->rt_priority > oldrtprio) - try_preempt(p, rq); - } -} - -/* - * check the target process has a UID that matches the current process's - */ -static bool check_same_owner(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred; - bool match; - - rcu_read_lock(); - pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; -} - -static int -__sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user, bool pi) -{ - struct sched_param zero_param = { .sched_priority = 0 }; - unsigned long flags, rlim_rtprio = 0; - int retval, oldpolicy = -1; - int reset_on_fork; - struct rq *rq; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); - - if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { - unsigned long lflags; - - if (!lock_task_sighand(p, &lflags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &lflags); - if (rlim_rtprio) - goto recheck; - /* - * If the caller requested an RT policy without having the - * necessary rights, we downgrade the policy to SCHED_ISO. - * We also set the parameter to zero to pass the checks. - */ - policy = SCHED_ISO; - param = &zero_param; - } -recheck: - /* double check policy once rq lock held */ - if (policy < 0) { - reset_on_fork = p->sched_reset_on_fork; - policy = oldpolicy = p->policy; - } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; - - if (!SCHED_RANGE(policy)) - return -EINVAL; - } - - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and - * SCHED_BATCH is 0. 
- */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) - return -EINVAL; - if (is_rt_policy(policy) != (param->sched_priority != 0)) - return -EINVAL; - - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (is_rt_policy(policy)) { - unsigned long rlim_rtprio = - task_rlimit(p, RLIMIT_RTPRIO); - - /* can't set/change the rt policy */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; - } else { - switch (p->policy) { - /* - * Can only downgrade policies but not back to - * SCHED_NORMAL - */ - case SCHED_ISO: - if (policy == SCHED_ISO) - goto out; - if (policy == SCHED_NORMAL) - return -EPERM; - break; - case SCHED_BATCH: - if (policy == SCHED_BATCH) - goto out; - if (policy != SCHED_IDLEPRIO) - return -EPERM; - break; - case SCHED_IDLEPRIO: - if (policy == SCHED_IDLEPRIO) - goto out; - return -EPERM; - default: - break; - } - } - - /* can't change other user's priorities */ - if (!check_same_owner(p)) - return -EPERM; - - /* Normal users shall not reset the sched_reset_on_fork flag */ - if (p->sched_reset_on_fork && !reset_on_fork) - return -EPERM; - } - - if (user) { - retval = security_task_setscheduler(p); - if (retval) - return retval; - } - - /* - * make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: - * - * To be able to change p->policy safely, the grunqueue lock must be - * held. - */ - rq = task_grq_lock(p, &flags); - - /* - * Changing the policy of the stop threads its a very bad idea - */ - if (p == rq->stop) { - task_grq_unlock(p, &flags); - return -EINVAL; - } - - /* - * If not changing anything there's no need to proceed further: - */ - if (unlikely(policy == p->policy && (!is_rt_policy(policy) || - param->sched_priority == p->rt_priority))) { - - task_grq_unlock(p, &flags); - return 0; - } - - /* recheck policy now with rq lock held */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - task_grq_unlock(p, &flags); - goto recheck; - } - update_clocks(rq); - p->sched_reset_on_fork = reset_on_fork; - - __setscheduler(p, rq, policy, param->sched_priority, pi); - task_grq_unlock(p, &flags); - - if (pi) - rt_mutex_adjust_pi(p); -out: - return 0; -} - -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, true, true); -} - -EXPORT_SYMBOL_GPL(sched_setscheduler); - -int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -{ - const struct sched_param param = { .sched_priority = attr->sched_priority }; - int policy = attr->sched_policy; - - return __sched_setscheduler(p, policy, ¶m, true, true); -} -EXPORT_SYMBOL_GPL(sched_setattr); - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. 
- * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. - * - * Return: 0 on success. An error code otherwise. - */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, false, true); -} -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); - rcu_read_unlock(); - - return retval; -} - -/* - * Mimics kernel/events/core.c perf_copy_attr(). - */ -static int sched_copy_attr(struct sched_attr __user *uattr, - struct sched_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * XXX: do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? - */ - attr->sched_nice = clamp(attr->sched_nice, -20, 19); - - /* sched/core.c uses zero here but we already know ret is zero */ - return 0; - -err_size: - put_user(sizeof(*attr), &uattr->size); - return -E2BIG; -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * - * Return: 0 on success. An error code otherwise. - * @param: structure containing the new RT priority. - */ -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, - struct sched_param __user *param) -{ - /* negative values for policy are not valid */ - if (policy < 0) - return -EINVAL; - - return do_sched_setscheduler(pid, policy, param); -} - -/* - * sched_setparam() passes in -1 for its policy, to let the functions - * it calls know not to change it. - */ -#define SETPARAM_POLICY -1 - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. 
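From userspace this path is reached through the usual libc wrapper; note that, per __sched_setscheduler() above, an unprivileged RT request is quietly downgraded to SCHED_ISO rather than rejected. A minimal exercise:

#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* needs CAP_SYS_NICE or an RLIMIT_RTPRIO allowance, as the
         * permission checks above enforce; without them this kernel
         * falls back to SCHED_ISO (value 4 in the BFS headers) */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
                fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
        else
                printf("policy now %d\n", sched_getscheduler(0));
        return 0;
}
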
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
-        return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
-}
-
-/**
- * sys_sched_setattr - same as above, but with extended sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- */
-SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
-                unsigned int, flags)
-{
-        struct sched_attr attr;
-        struct task_struct *p;
-        int retval;
-
-        if (!uattr || pid < 0 || flags)
-                return -EINVAL;
-
-        retval = sched_copy_attr(uattr, &attr);
-        if (retval)
-                return retval;
-
-        if ((int)attr.sched_policy < 0)
-                return -EINVAL;
-
-        rcu_read_lock();
-        retval = -ESRCH;
-        p = find_process_by_pid(pid);
-        if (p != NULL)
-                retval = sched_setattr(p, &attr);
-        rcu_read_unlock();
-
-        return retval;
-}
-
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- *
- * Return: On success, the policy of the thread. Otherwise, a negative error
- * code.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-{
-        struct task_struct *p;
-        int retval = -EINVAL;
-
-        if (pid < 0)
-                goto out_nounlock;
-
-        retval = -ESRCH;
-        rcu_read_lock();
-        p = find_process_by_pid(pid);
-        if (p) {
-                retval = security_task_getscheduler(p);
-                if (!retval)
-                        retval = p->policy;
-        }
-        rcu_read_unlock();
-
-out_nounlock:
-        return retval;
-}
-
-/**
- * sys_sched_getparam - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- *
- * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
- * code.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
-        struct sched_param lp = { .sched_priority = 0 };
-        struct task_struct *p;
-        int retval = -EINVAL;
-
-        if (!param || pid < 0)
-                goto out_nounlock;
-
-        rcu_read_lock();
-        p = find_process_by_pid(pid);
-        retval = -ESRCH;
-        if (!p)
-                goto out_unlock;
-
-        retval = security_task_getscheduler(p);
-        if (retval)
-                goto out_unlock;
-
-        if (has_rt_policy(p))
-                lp.sched_priority = p->rt_priority;
-        rcu_read_unlock();
-
-        /*
-         * This one might sleep, we cannot do it with a spinlock held ...
-         */
-        retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
-out_nounlock:
-        return retval;
-
-out_unlock:
-        rcu_read_unlock();
-        return retval;
-}
-
-static int sched_read_attr(struct sched_attr __user *uattr,
-                           struct sched_attr *attr,
-                           unsigned int usize)
-{
-        int ret;
-
-        if (!access_ok(VERIFY_WRITE, uattr, usize))
-                return -EFAULT;
-
-        /*
-         * If we're handed a smaller struct than we know of,
-         * ensure all the unknown bits are 0 - i.e. old
-         * user-space does not get incomplete information.
-         */
-        if (usize < sizeof(*attr)) {
-                unsigned char *addr;
-                unsigned char *end;
-
-                addr = (void *)attr + usize;
-                end = (void *)attr + sizeof(*attr);
-
-                for (; addr < end; addr++) {
-                        if (*addr)
-                                return -EFBIG;
-                }
-
-                attr->size = usize;
-        }
-
-        ret = copy_to_user(uattr, attr, attr->size);
-        if (ret)
-                return -EFAULT;
-
-        /* sched/core.c uses zero here but we already know ret is zero */
-        return ret;
-}
-
-/**
- * sys_sched_getattr - similar to sched_getparam, but with sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
- * @flags: for future extension.
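sched_setattr()/sched_getattr() have no glibc wrappers, so the size handshake in sched_copy_attr() above is exercised via syscall(2). The struct below is hand-rolled to the SCHED_ATTR_SIZE_VER0 layout; note this scheduler's sched_setattr() only consumes sched_policy and sched_priority from it:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct my_sched_attr {          /* no uapi header assumed */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;  /* deadline fields: unused by this */
        uint64_t sched_deadline; /* scheduler, but part of VER0 */
        uint64_t sched_period;
};

int main(void)
{
        struct my_sched_attr attr = {
                .size = sizeof(attr),   /* 48 == SCHED_ATTR_SIZE_VER0 */
                .sched_policy = 0,      /* SCHED_NORMAL */
        };

        /* the kernel reads attr.size first and rejects anything it
         * can't reconcile, as sched_copy_attr() above shows */
        if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1)
                perror("sched_setattr");
        return 0;
}
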
- */ -SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size, unsigned int, flags) -{ - struct sched_attr attr = { - .size = sizeof(struct sched_attr), - }; - struct task_struct *p; - int retval; - - if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0 || flags) - return -EINVAL; - - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - attr.sched_policy = p->policy; - if (rt_task(p)) - attr.sched_priority = p->rt_priority; - else - attr.sched_nice = task_nice(p); - - rcu_read_unlock(); - - retval = sched_read_attr(uattr, &attr, size); - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; - int retval; - - get_online_cpus(); - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); - put_online_cpus(); - return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; - if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - goto out_unlock; - } - rcu_read_unlock(); - } - - retval = security_task_setscheduler(p); - if (retval) - goto out_unlock; - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); -again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_unlock: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); -out_put_task: - put_task_struct(p); - put_online_cpus(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) -{ - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - * - * Return: 0 on success. An error code otherwise. 
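
The userspace face of this pair is sched_setaffinity(2)/sched_getaffinity(2)
with the glibc cpu_set_t helpers. A short sketch, illustrative only:

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  int main(void)
  {
          cpu_set_t set;

          CPU_ZERO(&set);
          CPU_SET(0, &set);               /* pin this thread to CPU 0 */
          if (sched_setaffinity(0, sizeof(set), &set) == -1)
                  perror("sched_setaffinity");

          CPU_ZERO(&set);
          if (sched_getaffinity(0, sizeof(set), &set) == -1)
                  perror("sched_getaffinity");
          printf("eligible on %d CPU(s)\n", CPU_COUNT(&set));
          return 0;
  }

Passing sizeof(cpu_set_t) also satisfies the length rules checked in
sys_sched_getaffinity() further down: len must cover nr_cpu_ids bits and be a
multiple of sizeof(unsigned long).
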
- */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; -} - -long sched_getaffinity(pid_t pid, cpumask_t *mask) -{ - struct task_struct *p; - unsigned long flags; - int retval; - - get_online_cpus(); - rcu_read_lock(); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - grq_lock_irqsave(&flags); - cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); - grq_unlock_irqrestore(&flags); - -out_unlock: - rcu_read_unlock(); - put_online_cpus(); - - return retval; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - * - * Return: size of CPU mask copied to user_mask_ptr on success. An - * error code otherwise. - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(unsigned long)-1)) - return -EINVAL; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); - - if (copy_to_user(user_mask_ptr, mask, retlen)) - ret = -EFAULT; - else - ret = retlen; - } - free_cpumask_var(mask); - - return ret; -} - -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. It does this by - * scheduling away the current task. If it still has the earliest deadline - * it will be scheduled again as the next task. - * - * Return: 0. - */ -SYSCALL_DEFINE0(sched_yield) -{ - struct task_struct *p; - - p = current; - grq_lock_irq(); - schedstat_inc(task_rq(p), yld_count); - requeue_task(p); - - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); - sched_preempt_enable_no_resched(); - - schedule(); - - return 0; -} - -int __sched _cond_resched(void) -{ - if (should_resched(0)) { - preempt_schedule_common(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(_cond_resched); - -/* - * __cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). 
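
sys_sched_yield() above just requeues the caller; per its comment, the task
runs again immediately if it still has the earliest deadline. A trivial
userspace probe of its cost, illustrative only:

  #include <sched.h>
  #include <stdio.h>
  #include <time.h>

  int main(void)
  {
          struct timespec a, b;
          long n;

          clock_gettime(CLOCK_MONOTONIC, &a);
          for (n = 0; n < 100000; n++)
                  sched_yield();  /* requeue; reschedule if anyone else is due */
          clock_gettime(CLOCK_MONOTONIC, &b);

          printf("%ld yields in %.3f ms\n", n,
                 (b.tv_sec - a.tv_sec) * 1e3 +
                 (b.tv_nsec - a.tv_nsec) / 1e6);
          return 0;
  }

As the yield() comment further down stresses, this is a measurement toy, not
a synchronization primitive.
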
- */ -int __cond_resched_lock(spinlock_t *lock) -{ - int resched = should_resched(PREEMPT_LOCK_OFFSET); - int ret = 0; - - lockdep_assert_held(lock); - - if (spin_needbreak(lock) || resched) { - spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else - cpu_relax(); - ret = 1; - spin_lock(lock); - } - return ret; -} -EXPORT_SYMBOL(__cond_resched_lock); - -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - -/** - * yield - yield the current processor to other threads. - * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. - * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! - * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} -EXPORT_SYMBOL(yield); - -/** - * yield_to - yield the current processor to another thread in - * your thread group, or accelerate that thread toward the - * processor it's on. - * @p: target task - * @preempt: whether task preemption is allowed or not - * - * It's the caller's job to ensure that the target task struct - * can't go away on us before we can do any checks. - * - * Return: - * true (>0) if we indeed boosted the target task. - * false (0) if we failed to boost the target. - * -ESRCH if there's no task to yield to. - */ -int __sched yield_to(struct task_struct *p, bool preempt) -{ - struct rq *rq, *p_rq; - unsigned long flags; - int yielded = 0; - - rq = this_rq(); - grq_lock_irqsave(&flags); - if (task_running(p) || p->state) { - yielded = -ESRCH; - goto out_unlock; - } - - p_rq = task_rq(p); - yielded = 1; - if (p->deadline > rq->rq_deadline) - p->deadline = rq->rq_deadline; - p->time_slice += rq->rq_time_slice; - rq->rq_time_slice = 0; - if (p->time_slice > timeslice()) - p->time_slice = timeslice(); - if (preempt && rq != p_rq) - resched_curr(p_rq); -out_unlock: - grq_unlock_irqrestore(&flags); - - if (yielded > 0) - schedule(); - return yielded; -} -EXPORT_SYMBOL_GPL(yield_to); - -/* - * This task is about to go to sleep on IO. Increment rq->nr_iowait so - * that process accounting knows that this is a task in IO wait state. 
- * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) - */ - -long __sched io_schedule_timeout(long timeout) -{ - int old_iowait = current->in_iowait; - struct rq *rq; - long ret; - - current->in_iowait = 1; - blk_schedule_flush_plug(current); - - delayacct_blkio_start(); - rq = raw_rq(); - atomic_inc(&rq->nr_iowait); - ret = schedule_timeout(timeout); - current->in_iowait = old_iowait; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); - - return ret; -} -EXPORT_SYMBOL(io_schedule_timeout); - -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the maximum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_ISO: - case SCHED_IDLEPRIO: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the minimum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_ISO: - case SCHED_IDLEPRIO: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) -{ - struct task_struct *p; - unsigned int time_slice; - unsigned long flags; - int retval; - struct timespec t; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - grq_lock_irqsave(&flags); - time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); - - rcu_read_unlock(); - t = ns_to_timespec(time_slice); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - -void sched_show_task(struct task_struct *p) -{ - unsigned long free = 0; - int ppid; - unsigned long state = p->state; - - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - free = stack_not_used(p); -#endif - ppid = 0; - rcu_read_lock(); - if (pid_alive(p)) - ppid = task_pid_nr(rcu_dereference(p->real_parent)); - rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); - - print_worker_info(KERN_INFO, p); - show_stack(p, NULL); -} - -void show_state_filter(unsigned long state_filter) -{ - struct task_struct *g, *p; - -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif - rcu_read_lock(); - for_each_process_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take a lot of time: - * Also, reset softlockup watchdogs on all CPUs, because - * another CPU might be blocked waiting for us to process - * an IPI. - */ - touch_nmi_watchdog(); - touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) - sched_show_task(p); - } - - rcu_read_unlock(); - /* - * Only show locks if all tasks are dumped: - */ - if (!state_filter) - debug_show_all_locks(); -} - -void dump_cpu_task(int cpu) -{ - pr_info("Task dump for CPU %d:\n", cpu); - sched_show_task(cpu_curr(cpu)); -} - -#ifdef CONFIG_SMP -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -{ - cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); -} - -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - cpumask_copy(tsk_cpus_allowed(p), new_mask); - if (needs_other_cpu(p, task_cpu(p))) - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); -} -#endif - -/** - * init_idle - set up an idle thread for a given CPU - * @idle: task in question - * @cpu: cpu the idle task belongs to - * - * NOTE: this function does not set the idle thread's NEED_RESCHED - * flag, to make booting more robust. - */ -void init_idle(struct task_struct *idle, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&idle->pi_lock, flags); - time_lock_grq(rq); - idle->last_ran = rq->clock_task; - idle->state = TASK_RUNNING; - /* Setting prio to illegal value shouldn't matter when never queued */ - idle->prio = PRIO_LIMIT; - - kasan_unpoison_task_stack(idle); - -#ifdef CONFIG_SMP - /* - * It's possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialisation. - */ - set_cpus_allowed_common(idle, cpumask_of(cpu)); -#ifdef CONFIG_SMT_NICE - idle->smt_bias = 0; -#endif -#endif - set_rq_task(rq, idle); - - /* Silence PROVE_RCU */ - rcu_read_lock(); - set_task_cpu(idle, cpu); - rcu_read_unlock(); - - rq->curr = rq->idle = idle; - idle->on_cpu = 1; - grq_unlock(); - raw_spin_unlock_irqrestore(&idle->pi_lock, flags); - - /* Set the preempt count _outside_ the spinlocks! 
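
The one-letter state that sched_show_task() derives from p->state via
TASK_STATE_TO_CHAR_STR is the same letter userspace sees in field 3 of
/proc/<pid>/stat; a tiny reader, illustrative only:

  #include <stdio.h>

  int main(int argc, char **argv)
  {
          char path[64], comm[64], state;
          int pid;
          FILE *f;

          snprintf(path, sizeof(path), "/proc/%s/stat",
                   argc > 1 ? argv[1] : "self");
          f = fopen(path, "r");
          if (!f) {
                  perror(path);
                  return 1;
          }
          /* format: pid (comm) state ... */
          if (fscanf(f, "%d (%63[^)]) %c", &pid, comm, &state) == 3)
                  printf("%s: state %c\n", comm, state);
          fclose(f);
          return 0;
  }
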
*/ - init_idle_preempt_count(idle, cpu); - - ftrace_graph_init_idle_task(idle, cpu); -#ifdef CONFIG_SMP - sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif -} - -int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, - const struct cpumask __maybe_unused *trial) -{ - return 1; -} - -int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) -{ - int ret = 0; - - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. - */ - if (p->flags & PF_NO_SETAFFINITY) - ret = -EINVAL; - - return ret; -} - -void wake_q_add(struct wake_q_head *head, struct task_struct *task) -{ - struct wake_q_node *node = &task->wake_q; - - /* - * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the - * wakeup due to that. - * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). - */ - if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) - return; - - get_task_struct(task); - - /* - * The head is context local, there can be no concurrency. - */ - *head->lastp = node; - head->lastp = &node->next; -} - -void wake_up_q(struct wake_q_head *head) -{ - struct wake_q_node *node = head->first; - - while (node != WAKE_Q_TAIL) { - struct task_struct *task; - - task = container_of(node, struct task_struct, wake_q); - BUG_ON(!task); - /* task can safely be re-inserted now */ - node = node->next; - task->wake_q.next = NULL; - - /* - * wake_up_process() implies a wmb() to pair with the queueing - * in wake_q_add() so as not to miss wakeups. - */ - wake_up_process(task); - put_task_struct(task); - } -} - -void resched_cpu(int cpu) -{ - unsigned long flags; - - grq_lock_irqsave(&flags); - resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); -} - -#ifdef CONFIG_SMP -#ifdef CONFIG_NO_HZ_COMMON -void nohz_balance_enter_idle(int cpu) -{ -} - -void select_nohz_load_balancer(int stop_tick) -{ -} - -void set_cpu_sd_state_idle(void) {} -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu: The cpu whose lowest level of sched domain is to - * be returned. - * @flag: The flag to check for the lowest sched_domain - * for the given cpu. - * - * Returns the lowest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) - if (sd && (sd->flags & flag)) - break; - - return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu: The cpu whose domains we're iterating over. - * @sd: variable holding the value of the power_savings_sd - * for cpu. - * @flag: The flag to filter the sched_domains to be iterated. - * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. 
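
The cmpxchg() in wake_q_add() above is the whole trick: flip ->next from NULL
to a sentinel exactly once, so a task can only ever sit on one wake queue.
The same idea in portable C11 atomics, as a userspace sketch (not kernel
code):

  #include <stdatomic.h>
  #include <stdio.h>

  struct node {
          _Atomic(struct node *) next;
  };

  #define TAIL ((struct node *)1)         /* plays the role of WAKE_Q_TAIL */

  /* Returns 1 if this caller queued the node, 0 if it was already queued. */
  static int claim(struct node *n)
  {
          struct node *expect = NULL;

          /* Exactly one CAS can move next from NULL to the sentinel. */
          return atomic_compare_exchange_strong(&n->next, &expect, TAIL);
  }

  int main(void)
  {
          struct node n = { .next = NULL };

          printf("first claim: %d\n", claim(&n));   /* 1 */
          printf("second claim: %d\n", claim(&n));  /* 0 */
          return 0;
  }
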
- */
-#define for_each_flag_domain(cpu, sd, flag) \
- for (sd = lowest_flag_domain(cpu, flag); \
- (sd && (sd->flags & flag)); sd = sd->parent)
-
-#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-
-/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu. This is good for power-savings.
- *
- * We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be uptodate wrt jiffies etc).
- */
-int get_nohz_timer_target(void)
-{
- int i, cpu = smp_processor_id();
- struct sched_domain *sd;
-
- if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
- return cpu;
-
- rcu_read_lock();
- for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (cpu == i)
- continue;
-
- if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
- cpu = i;
- goto unlock;
- }
- }
- }
-
- if (!is_housekeeping_cpu(cpu))
- cpu = housekeeping_any_cpu();
-unlock:
- rcu_read_unlock();
- return cpu;
-}
-
-/*
- * When add_timer_on() enqueues a timer into the timer wheel of an
- * idle CPU then this timer might expire before the next timer event
- * which is scheduled to wake up that CPU. In case of a completely
- * idle system the next event might even be infinite time into the
- * future. wake_up_idle_cpu() ensures that the CPU is woken up and
- * leaves the inner idle loop so the newly added timer is taken into
- * account when the CPU goes back to idle and evaluates the timer
- * wheel for the next timer event.
- */
-void wake_up_idle_cpu(int cpu)
-{
- if (cpu == smp_processor_id())
- return;
-
- set_tsk_need_resched(cpu_rq(cpu)->idle);
- smp_send_reschedule(cpu);
-}
-
-void wake_up_nohz_cpu(int cpu)
-{
- wake_up_idle_cpu(cpu);
-}
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
-{
- const struct cpumask *cpu_valid_mask = cpu_active_mask;
- bool running_wrong = false;
- struct cpumask old_mask;
- bool queued = false;
- unsigned long flags;
- struct rq *rq;
- int ret = 0;
-
- rq = task_grq_lock(p, &flags);
-
- if (p->flags & PF_KTHREAD) {
- /*
- * Kernel threads are allowed on online && !active CPUs
- */
- cpu_valid_mask = cpu_online_mask;
- }
-
- /*
- * Must re-check here, to close a race against __kthread_bind(),
- * sched_setaffinity() is not guaranteed to observe the flag.
- */
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- goto out;
- }
-
- cpumask_copy(&old_mask, tsk_cpus_allowed(p));
- if (cpumask_equal(&old_mask, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- queued = task_queued(p);
-
- do_set_cpus_allowed(p, new_mask);
-
- if (p->flags & PF_KTHREAD) {
- /*
- * For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-cpu threads.
- */
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
- !cpumask_intersects(new_mask, cpu_active_mask) &&
- tsk_nr_cpus_allowed(p) != 1);
- }
-
- /* Can the task run on the task's current CPU?
If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; - - if (task_running(p)) { - /* Task is running on the wrong cpu now, reschedule it. */ - if (rq == this_rq()) { - set_tsk_need_resched(p); - running_wrong = true; - } else - resched_task(p); - } else - set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); - -out: - if (queued && !cpumask_subset(new_mask, &old_mask)) - try_preempt(p, rq); - if (running_wrong) - preempt_disable(); - task_grq_unlock(p, &flags); - - if (running_wrong) { - __schedule(true); - preempt_enable(); - } - - return ret; -} - -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - return __set_cpus_allowed_ptr(p, new_mask, false); -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -static bool sched_smp_initialized __read_mostly; - -#ifdef CONFIG_HOTPLUG_CPU -/* Run through task list and find tasks affined to the dead cpu, then remove - * that cpu from the list, enable cpu0 and set the zerobound flag. */ -static void bind_zero(int src_cpu) -{ - struct task_struct *p, *t; - int bound = 0; - - if (src_cpu == 0) - return; - - do_each_thread(t, p) { - if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) { - cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p)); - cpumask_set_cpu(0, tsk_cpus_allowed(p)); - p->zerobound = true; - bound++; - if (task_cpu(p) == src_cpu) { - set_task_cpu(p, 0); - if (task_running(p)) - resched_task(p); - } - } - } while_each_thread(t, p); - - if (bound) { - printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", - bound, src_cpu); - } -} - -/* Find processes with the zerobound flag and reenable their affinity for the - * CPU coming alive. */ -static void unbind_zero(int src_cpu) -{ - int unbound = 0, zerobound = 0; - struct task_struct *p, *t; - - if (src_cpu == 0) - return; - - do_each_thread(t, p) { - if (!p->mm) - p->zerobound = false; - if (p->zerobound) { - unbound++; - cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p)); - /* Once every CPU affinity has been re-enabled, remove - * the zerobound flag */ - if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) { - p->zerobound = false; - zerobound++; - } - } - } while_each_thread(t, p); - - if (unbound) { - printk(KERN_INFO "Added affinity for %d processes to cpu %d\n", - unbound, src_cpu); - } - if (zerobound) { - printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n", - zerobound); - } -} - -/* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. - */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) { - switch_mm_irqs_off(mm, &init_mm, current); - finish_arch_post_lock_switch(); - } - mmdrop(mm); -} -#else /* CONFIG_HOTPLUG_CPU */ -static void unbind_zero(int src_cpu) {} -#endif /* CONFIG_HOTPLUG_CPU */ - -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param stop_param = { .sched_priority = STOP_PRIO }; - struct sched_param start_param = { .sched_priority = 0 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. 
- */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling policy so that - * it can die in pieces. - */ - sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); - } -} - -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -#define CPU_LOAD_IDX_MAX 5 -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table 
*sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#endif /* CONFIG_SYSCTL */ - -static void set_rq_online(struct rq *rq) -{ - if (!rq->online) { - cpumask_set_cpu(cpu_of(rq), rq->rd->online); - rq->online = true; - } -} - -static void set_rq_offline(struct rq *rq) -{ - if (rq->online) { - int cpu = cpu_of(rq); - - cpumask_clear_cpu(cpu, rq->rd->online); - rq->online = false; - clear_cpuidle_map(cpu); - } -} - -static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ - -#ifdef CONFIG_SCHED_DEBUG - -static __read_mostly int sched_debug_enabled; - -static int __init sched_debug_setup(char *str) -{ - sched_debug_enabled = 1; - - return 0; -} -early_param("sched_debug", sched_debug_setup); - -static inline bool sched_debug(void) -{ - return sched_debug_enabled; -} - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - struct cpumask *groupmask) -{ - cpumask_clear(groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %*pbl level %s\n", - cpumask_pr_args(sched_domain_span(sd)), sd->name); - - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - - printk(KERN_CONT "\n"); - - if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && - !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - int level = 0; - - if (!sched_debug_enabled) - return; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } -} -#else /* !CONFIG_SCHED_DEBUG */ -# define sched_domain_debug(sd, cpu) do { } while (0) 
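
On kernels built with CONFIG_SCHED_DEBUG, the tables that
register_sched_domain_sysctl() above builds surface as
/proc/sys/kernel/sched_domain/cpuN/domainM/. A quick way to eyeball the
resulting tree from userspace, illustrative only:

  #include <dirent.h>
  #include <stdio.h>

  int main(void)
  {
          const char *base = "/proc/sys/kernel/sched_domain";
          struct dirent *e;
          DIR *d = opendir(base);

          if (!d) {
                  perror(base);   /* absent without CONFIG_SCHED_DEBUG */
                  return 1;
          }
          while ((e = readdir(d)))
                  if (e->d_name[0] != '.')
                          printf("%s/%s\n", base, e->d_name);  /* cpu0, cpu1, ... */
          closedir(d);
          return 0;
  }
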
-static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_AFFINE)) - return 0; - - return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) - return 0; - - if (~cflags & pflags) - return 0; - - return 1; -} - -static void free_rootdomain(struct rcu_head *rcu) -{ - struct root_domain *rd = container_of(rcu, struct root_domain, rcu); - - cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); - free_cpumask_var(rd->online); - free_cpumask_var(rd->span); - kfree(rd); -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - struct root_domain *old_rd = NULL; - unsigned long flags; - - grq_lock_irqsave(&flags); - - if (rq->rd) { - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - - /* - * If we dont want to free the old_rd yet then - * set old_rd to NULL to skip the freeing later - * in this function: - */ - if (!atomic_dec_and_test(&old_rd->refcount)) - old_rd = NULL; - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - - grq_unlock_irqrestore(&flags); - - if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); -} - -static int init_rootdomain(struct root_domain *rd) -{ - memset(rd, 0, sizeof(*rd)); - - if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto out; - if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) - goto free_span; - if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) - goto free_online; - - if (cpupri_init(&rd->cpupri) != 0) - goto free_rto_mask; - return 0; - -free_rto_mask: - free_cpumask_var(rd->rto_mask); -free_online: - free_cpumask_var(rd->online); -free_span: - free_cpumask_var(rd->span); -out: - return -ENOMEM; -} - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain); - - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - if (init_rootdomain(rd) != 0) { - kfree(rd); - return NULL; - } - - return rd; -} - -static void free_sched_domain(struct rcu_head *rcu) -{ - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - - kfree(sd); -} - -static void destroy_sched_domain(struct sched_domain *sd, int cpu) -{ - call_rcu(&sd->rcu, free_sched_domain); -} - -static void destroy_sched_domains(struct sched_domain *sd, int cpu) -{ - for (; sd; sd = sd->parent) - destroy_sched_domain(sd, cpu); -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. - */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. 
*/ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - /* - * Transfer SD_PREFER_SIBLING down in case of a - * degenerate parent; the spans match for this - * so the property transfers. - */ - if (parent->flags & SD_PREFER_SIBLING) - tmp->flags |= SD_PREFER_SIBLING; - destroy_sched_domain(parent, cpu); - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - tmp = sd; - sd = sd->parent; - destroy_sched_domain(tmp, cpu); - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); -} - -/* Setup the mask of cpus configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); - return 0; - } - return 1; -} - -__setup("isolcpus=", isolated_cpu_setup); - -struct s_data { - struct sched_domain ** __percpu sd; - struct root_domain *rd; -}; - -enum s_alloc { - sa_rootdomain, - sa_sd, - sa_sd_storage, - sa_none, -}; - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure in build_sched_domains() - */ - -static int default_relax_domain_level = -1; -int sched_domain_level_max; - -static int __init setup_relax_domain_level(char *str) -{ - if (kstrtoint(str, 0, &default_relax_domain_level)) - pr_warn("Unable to set relax_domain_level\n"); - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* turn off idle balance on this domain */ - sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* turn on idle balance on this domain */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } -} - -static void __sdt_free(const struct cpumask *cpu_map); -static int __sdt_alloc(const struct cpumask *cpu_map); - -static void __free_domain_allocs(struct s_data *d, enum s_alloc what, - const struct cpumask *cpu_map) -{ - switch (what) { - case sa_rootdomain: - if (!atomic_read(&d->rd->refcount)) - free_rootdomain(&d->rd->rcu); /* fall through */ - case sa_sd: - free_percpu(d->sd); /* fall through */ - case sa_sd_storage: - __sdt_free(cpu_map); /* fall through */ - case sa_none: - break; - } -} - -static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, - const struct cpumask *cpu_map) -{ - memset(d, 0, sizeof(*d)); - - if (__sdt_alloc(cpu_map)) - return sa_sd_storage; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) - return sa_sd_storage; - d->rd = alloc_rootdomain(); - if (!d->rd) - return sa_sd; - return sa_rootdomain; -} - -/* - * NULL the sd_data elements we've used to build the sched_domain - * structure so that the subsequent __free_domain_allocs() - * will not free the data we're using. 
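
The cpu_isolated_map filled in by the isolcpus= handler above can also be
checked from userspace: newer kernels (the sysfs file postdates this tree)
export it as /sys/devices/system/cpu/isolated. Sketch, illustrative only:

  #include <stdio.h>

  int main(void)
  {
          char buf[256];
          FILE *f = fopen("/sys/devices/system/cpu/isolated", "r");

          if (!f) {
                  perror("cpu/isolated");  /* not there on older kernels */
                  return 1;
          }
          if (fgets(buf, sizeof(buf), f))
                  printf("isolated CPUs: %s", buf);  /* e.g. "2-3\n", empty if none */
          fclose(f);
          return 0;
  }
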
- */ -static void claim_allocations(int cpu, struct sched_domain *sd) -{ - struct sd_data *sdd = sd->private; - - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; -} - -#ifdef CONFIG_NUMA -static int sched_domains_numa_levels; -static int *sched_domains_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; -#endif - -/* - * SD_flags allowed in topology descriptions. - * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * - * Odd one out: - * SD_ASYM_PACKING - describes SMT quirks - */ -#define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ - SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) - -static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) -{ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int sd_weight, sd_flags = 0; - -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); - - if (tl->sd_flags) - sd_flags = (*tl->sd_flags)(); - if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; - - *sd = (struct sched_domain){ - .min_interval = sd_weight, - .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, - - .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, - - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE - | 1*SD_BALANCE_EXEC - | 1*SD_BALANCE_FORK - | 0*SD_BALANCE_WAKE - | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUCAPACITY - | 0*SD_SHARE_PKG_RESOURCES - | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING - | 0*SD_NUMA - | sd_flags - , - - .last_balance = jiffies, - .balance_interval = sd_weight, - .smt_gain = 0, - .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, -#ifdef CONFIG_SCHED_DEBUG - .name = tl->name, -#endif - }; - - /* - * Convert topological properties into behaviour. - */ - - if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; - sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ - - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->imbalance_pct = 117; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - -#ifdef CONFIG_NUMA - } else if (sd->flags & SD_NUMA) { - sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; - - sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { - sd->flags &= ~(SD_BALANCE_EXEC | - SD_BALANCE_FORK | - SD_WAKE_AFFINE); - } - -#endif - } else { - sd->flags |= SD_PREFER_SIBLING; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; - } - - sd->private = &tl->data; - - return sd; -} - -/* - * Topology list, bottom-up. 
- */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = - default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) - -void set_sched_topology(struct sched_domain_topology_level *tl) -{ - sched_domain_topology = tl; -} - -#ifdef CONFIG_NUMA - -static const struct cpumask *sd_numa_mask(int cpu) -{ - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; -} - -static void sched_numa_warn(const char *str) -{ - static int done = false; - int i,j; - - if (done) - return; - - done = true; - - printk(KERN_WARNING "ERROR: %s\n\n", str); - - for (i = 0; i < nr_node_ids; i++) { - printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); - printk(KERN_CONT "\n"); - } - printk(KERN_WARNING "\n"); -} - -static bool find_numa_distance(int distance) -{ - int i; - - if (distance == node_distance(0, 0)) - return true; - - for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; - } - - return false; -} - -static void sched_init_numa(void) -{ - int next_distance, curr_distance = node_distance(0, 0); - struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the - * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. - */ - next_distance = curr_distance; - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); - } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; - } - - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; - } - /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). - * - * The sched_domains_numa_distance[] array includes the actual distance - * numbers. - */ - - /* - * Here, we should temporarily reset sched_domains_numa_levels to 0. - * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be - * dangerous when we use it to iterate array sched_domains_numa_masks[][] - * in other functions. - * - * We reset it to 'level' at the end of this function. 
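
The distance table that sched_init_numa() deduplicates is the same one
exposed per node under sysfs. A sketch that reads node 0's row and counts the
unique values (the "levels"), assuming a NUMA-enabled kernel, illustrative
only:

  #include <stdio.h>

  int main(void)
  {
          int d, seen[64], n = 0, i, j, uniq = 0;
          FILE *f = fopen("/sys/devices/system/node/node0/distance", "r");

          if (!f) {
                  perror("node0/distance");
                  return 1;
          }
          while (n < 64 && fscanf(f, "%d", &d) == 1)
                  seen[n++] = d;  /* one distance per node, 10 = local */
          fclose(f);

          for (i = 0; i < n; i++) {       /* count distinct distances */
                  for (j = 0; j < i; j++)
                          if (seen[j] == seen[i])
                                  break;
                  if (j == i)
                          uniq++;
          }
          printf("%d node(s), %d distance level(s)\n", n, uniq);
          return 0;
  }
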
- */ - sched_domains_numa_levels = 0; - - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); - if (!sched_domains_numa_masks) - return; - - /* - * Now for each level, construct a mask per node which contains all - * cpus of nodes that are that many hops away from us. - */ - for (i = 0; i < level; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) - return; - - for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!mask) - return; - - sched_domains_numa_masks[i][j] = mask; - - for_each_node(k) { - if (node_distance(j, k) > sched_domains_numa_distance[i]) - continue; - - cpumask_or(mask, mask, cpumask_of_node(k)); - } - } - } - - /* Compute default topology size */ - for (i = 0; sched_domain_topology[i].mask; i++); - - tl = kzalloc((i + level + 1) * - sizeof(struct sched_domain_topology_level), GFP_KERNEL); - if (!tl) - return; - - /* - * Copy the default topology bits.. - */ - for (i = 0; sched_domain_topology[i].mask; i++) - tl[i] = sched_domain_topology[i]; - - /* - * .. and append 'j' levels of NUMA goodness. - */ - for (j = 0; j < level; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; - } - - sched_domain_topology = tl; - - sched_domains_numa_levels = level; -} - -static void sched_domains_numa_masks_set(int cpu) -{ - int node = cpu_to_node(cpu); - int i, j; - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) { - if (node_distance(j, node) <= sched_domains_numa_distance[i]) - cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); - } - } -} - -static void sched_domains_numa_masks_clear(int cpu) -{ - int i, j; - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); - } -} - -#else -static inline void sched_init_numa(void) { } -static void sched_domains_numa_masks_set(unsigned int cpu) { } -static void sched_domains_numa_masks_clear(unsigned int cpu) { } -#endif /* CONFIG_NUMA */ - -static int __sdt_alloc(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - sdd->sd = alloc_percpu(struct sched_domain *); - if (!sdd->sd) - return -ENOMEM; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sd) - return -ENOMEM; - - *per_cpu_ptr(sdd->sd, j) = sd; - } - } - - return 0; -} - -static void __sdt_free(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - - if (sdd->sd) { - sd = *per_cpu_ptr(sdd->sd, j); - kfree(*per_cpu_ptr(sdd->sd, j)); - } - } - free_percpu(sdd->sd); - sdd->sd = NULL; - } -} - -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) -{ - struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; - - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - if (child) { - sd->level = child->level + 1; - sched_domain_level_max = max(sched_domain_level_max, sd->level); - 
child->parent = sd; - sd->child = child; - - if (!cpumask_subset(sched_domain_span(child), - sched_domain_span(sd))) { - pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG - pr_err(" the %s domain not a subset of the %s domain\n", - child->name, sd->name); -#endif - /* Fixup, ensure @sd has at least @child cpus. */ - cpumask_or(sched_domain_span(sd), - sched_domain_span(sd), - sched_domain_span(child)); - } - - } - set_domain_attribute(sd, attr); - - return sd; -} - -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) -{ - enum s_alloc alloc_state; - struct sched_domain *sd; - struct s_data d; - int i, ret = -ENOMEM; - - alloc_state = __visit_domain_allocation_hell(&d, cpu_map); - if (alloc_state != sa_rootdomain) - goto error; - - /* Set up domains for cpus specified by the cpu_map. */ - for_each_cpu(i, cpu_map) { - struct sched_domain_topology_level *tl; - - sd = NULL; - for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); - if (tl == sched_domain_topology) - *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; - } - } - - /* Calculate CPU capacity for physical packages and nodes */ - for (i = nr_cpumask_bits-1; i >= 0; i--) { - if (!cpumask_test_cpu(i, cpu_map)) - continue; - - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); - } - } - - /* Attach the domains */ - rcu_read_lock(); - for_each_cpu(i, cpu_map) { - sd = *per_cpu_ptr(d.sd, i); - cpu_attach_domain(sd, d.rd, i); - } - rcu_read_unlock(); - - ret = 0; -error: - __free_domain_allocs(&d, alloc_state, cpu_map); - return ret; -} - -static cpumask_var_t *doms_cur; /* current sched domains */ -static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - /* attribues of custom domains in 'doms_cur' */ - -/* - * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask) fails, then fallback to a single sched domain, - * as determined by the single cpumask fallback_doms. - */ -static cpumask_var_t fallback_doms; - -/* - * arch_update_cpu_topology lets virtualized architectures update the - * cpu core maps. It is supposed to return 1 if the topology changed - * or 0 if it stayed the same. - */ -int __weak arch_update_cpu_topology(void) -{ - return 0; -} - -cpumask_var_t *alloc_sched_domains(unsigned int ndoms) -{ - int i; - cpumask_var_t *doms; - - doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); - if (!doms) - return NULL; - for (i = 0; i < ndoms; i++) { - if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { - free_sched_domains(doms, i); - return NULL; - } - } - return doms; -} - -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) -{ - unsigned int i; - for (i = 0; i < ndoms; i++) - free_cpumask_var(doms[i]); - kfree(doms); -} - -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. 
- */ -static int init_sched_domains(const struct cpumask *cpu_map) -{ - int err; - - arch_update_cpu_topology(); - ndoms_cur = 1; - doms_cur = alloc_sched_domains(ndoms_cur); - if (!doms_cur) - doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); - - return err; -} - -/* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain - */ -static void detach_destroy_domains(const struct cpumask *cpu_map) -{ - int i; - - rcu_read_lock(); - for_each_cpu(i, cpu_map) - cpu_attach_domain(NULL, &def_root_domain, i); - rcu_read_unlock(); -} - -/* handle null as "default" */ -static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, - struct sched_domain_attr *new, int idx_new) -{ - struct sched_domain_attr tmp; - - /* fast path */ - if (!new && !cur) - return 1; - - tmp = SD_ATTR_INIT; - return !memcmp(cur ? (cur + idx_cur) : &tmp, - new ? (new + idx_new) : &tmp, - sizeof(struct sched_domain_attr)); -} - -/* - * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares - * doms_new[] to the current sched domain partitioning, doms_cur[]. - * It destroys each deleted domain and builds each new domain. - * - * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the - * current 'doms_cur' domains and in the new 'doms_new', we can leave - * it as it is. - * - * The passed in 'doms_new' should be allocated using - * alloc_sched_domains. This routine takes ownership of it and will - * free_sched_domains it when done with it. If the caller failed the - * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. - * - * If doms_new == NULL it will be replaced with cpu_online_mask. - * ndoms_new == 0 is a special case for destroying existing domains, - * and it will not create the default domain. - * - * Call with hotplug lock held - */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - int i, j, n; - int new_topology; - - mutex_lock(&sched_domains_mutex); - - /* always unregister in case we don't destroy any domains */ - unregister_sched_domain_sysctl(); - - /* Let architecture update cpu core mappings. */ - new_topology = arch_update_cpu_topology(); - - n = doms_new ? 
ndoms_new : 0; - - /* Destroy deleted domains */ - for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) - goto match1; - } - /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur[i]); -match1: - ; - } - - n = ndoms_cur; - if (doms_new == NULL) { - n = 0; - doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); - } - - /* Build new domains */ - for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) - goto match2; - } - /* no match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); -match2: - ; - } - - /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) - free_sched_domains(doms_cur, ndoms_cur); - kfree(dattr_cur); /* kfree(NULL) is safe */ - doms_cur = doms_new; - dattr_cur = dattr_new; - ndoms_cur = ndoms_new; - - register_sched_domain_sysctl(); - - mutex_unlock(&sched_domains_mutex); -} - -static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ - -/* - * Update cpusets according to cpu_active mask. If cpusets are - * disabled, cpuset_update_active_cpus() becomes a simple wrapper - * around partition_sched_domains(). - * - * If we come here as part of a suspend/resume, don't touch cpusets because we - * want to restore it back to its original state upon resume anyway. - */ -static void cpuset_cpu_active(void) -{ - if (cpuhp_tasks_frozen) { - /* - * num_cpus_frozen tracks how many CPUs are involved in suspend - * resume sequence. As long as this is not the last online - * operation in the resume sequence, just build a single sched - * domain, ignoring cpusets. - */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); - return; - } - /* - * This is the last CPU online operation. So fall through and - * restore the original sched domains by considering the - * cpuset configurations. - */ - } - - cpuset_update_active_cpus(true); -} - -static int cpuset_cpu_inactive(unsigned int cpu) -{ - if (!cpuhp_tasks_frozen) { - cpuset_update_active_cpus(false); - } else { - num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); - } - return 0; -} - -int sched_cpu_activate(unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - set_cpu_active(cpu, true); - - if (sched_smp_initialized) { - sched_domains_numa_masks_set(cpu); - cpuset_cpu_active(); - } - - /* - * Put the rq online, if not already. This happens: - * - * 1) In the early boot process, because we build the real domains - * after all cpus have been brought up. - * - * 2) At runtime, if cpuset_cpu_active() fails to rebuild the - * domains. - */ - grq_lock_irqsave(&flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); - } - unbind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); - - return 0; -} - -int sched_cpu_deactivate(unsigned int cpu) -{ - int ret; - - set_cpu_active(cpu, false); - /* - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU - * users of this state to go away such that all new such users will - * observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. 
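
sched_cpu_activate()/sched_cpu_deactivate() above run as part of the hotplug
state machine that writing /sys/devices/system/cpu/cpuN/online drives. A
root-only sketch that cycles CPU 1, illustrative only:

  #include <stdio.h>

  static int set_online(int cpu, int on)
  {
          char path[64];
          FILE *f;

          snprintf(path, sizeof(path),
                   "/sys/devices/system/cpu/cpu%d/online", cpu);
          f = fopen(path, "w");
          if (!f) {
                  perror(path);   /* needs root; cpu0 may lack the file */
                  return -1;
          }
          fprintf(f, "%d\n", on);
          fclose(f);
          return 0;
  }

  int main(void)
  {
          if (set_online(1, 0) == 0) {    /* offline CPU 1 ... */
                  puts("cpu1 offline");
                  set_online(1, 1);       /* ... and bring it back */
          }
          return 0;
  }
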
- * - * Do sync before park smpboot threads to take care the rcu boost case. - */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); - - if (!sched_smp_initialized) - return 0; - - ret = cpuset_cpu_inactive(cpu); - if (ret) { - set_cpu_active(cpu, true); - return ret; - } - sched_domains_numa_masks_clear(cpu); - return 0; -} - -int sched_cpu_starting(unsigned int __maybe_unused cpu) -{ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -int sched_cpu_dying(unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - grq_lock_irqsave(&flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - bind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); - - return 0; -} -#endif - -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) -/* - * Cheaper version of the below functions in case support for SMT and MC is - * compiled in but CPUs have no siblings. - */ -static bool sole_cpu_idle(struct rq *rq) -{ - return rq_idle(rq); -} -#endif -#ifdef CONFIG_SCHED_SMT -static const cpumask_t *thread_cpumask(int cpu) -{ - return topology_sibling_cpumask(cpu); -} -/* All this CPU's SMT siblings are idle */ -static bool siblings_cpu_idle(struct rq *rq) -{ - return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map); -} -#endif -#ifdef CONFIG_SCHED_MC -static const cpumask_t *core_cpumask(int cpu) -{ - return topology_core_cpumask(cpu); -} -/* All this CPU's shared cache siblings are idle */ -static bool cache_cpu_idle(struct rq *rq) -{ - return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map); -} -#endif - -enum sched_domain_level { - SD_LV_NONE = 0, - SD_LV_SIBLING, - SD_LV_MC, - SD_LV_BOOK, - SD_LV_CPU, - SD_LV_NODE, - SD_LV_ALLNODES, - SD_LV_MAX -}; - -void __init sched_init_smp(void) -{ - struct sched_domain *sd; - int cpu, other_cpu; -#ifdef CONFIG_SCHED_SMT - bool smt_threads = false; -#endif - struct rq *rq; - - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - - sched_init_numa(); - - /* - * There's no userspace yet to cause hotplug operations; hence all the - * cpu masks are stable and all blatant races in the below code cannot - * happen. - */ - mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); - mutex_unlock(&sched_domains_mutex); - - /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) - BUG(); - free_cpumask_var(non_isolated_cpus); - - mutex_lock(&sched_domains_mutex); - grq_lock_irq(); - /* - * Set up the relative cache distance of each online cpu from each - * other in a simple array for quick lookup. Locality is determined - * by the closest sched_domain that CPUs are separated by. CPUs with - * shared cache in SMT and MC are treated as local. Separate CPUs - * (within the same package or physically) within the same node are - * treated as not local. CPUs not even in the same domain (different - * nodes) are treated as very distant. 
- */ - for_each_online_cpu(cpu) { - rq = cpu_rq(cpu); - - /* First check if this cpu is in the same node */ - for_each_domain(cpu, sd) { - if (sd->level > SD_LV_MC) - continue; - /* Set locality to local node if not already found lower */ - for_each_cpu(other_cpu, sched_domain_span(sd)) { - if (rq->cpu_locality[other_cpu] > 3) - rq->cpu_locality[other_cpu] = 3; - } - } - - /* - * Each runqueue has its own function in case it doesn't have - * siblings of its own allowing mixed topologies. - */ -#ifdef CONFIG_SCHED_MC - for_each_cpu(other_cpu, core_cpumask(cpu)) { - if (rq->cpu_locality[other_cpu] > 2) - rq->cpu_locality[other_cpu] = 2; - } - if (cpumask_weight(core_cpumask(cpu)) > 1) { - cpumask_copy(&rq->core_mask, core_cpumask(cpu)); - cpumask_clear_cpu(cpu, &rq->core_mask); - rq->cache_idle = cache_cpu_idle; - } -#endif -#ifdef CONFIG_SCHED_SMT - for_each_cpu(other_cpu, thread_cpumask(cpu)) - rq->cpu_locality[other_cpu] = 1; - if (cpumask_weight(thread_cpumask(cpu)) > 1) { - cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); - cpumask_clear_cpu(cpu, &rq->thread_mask); - rq->siblings_idle = siblings_cpu_idle; - smt_threads = true; - } -#endif - } - for_each_possible_cpu(cpu) { - int total_cpus = 0, locality; - - rq = cpu_rq(cpu); - for (locality = 0; locality <= 4; locality++) { - for_each_possible_cpu(other_cpu) { - if (rq->cpu_locality[other_cpu] == locality) - rq->rq_order[total_cpus++] = cpu_rq(other_cpu); - } - } - } -#ifdef CONFIG_SMT_NICE - if (smt_threads) { - check_siblings = &check_smt_siblings; - wake_siblings = &wake_smt_siblings; - smt_schedule = &smt_should_schedule; - } -#endif - grq_unlock_irq(); - mutex_unlock(&sched_domains_mutex); - - for_each_online_cpu(cpu) { - rq = cpu_rq(cpu); - - for_each_online_cpu(other_cpu) { - if (other_cpu <= cpu) - continue; - printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); - } - } - sched_smp_initialized = true; -} -#else -void __init sched_init_smp(void) -{ -} -#endif /* CONFIG_SMP */ - -int in_sched_functions(unsigned long addr) -{ - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -#ifdef CONFIG_CGROUP_SCHED -/* task group related information */ -struct task_group { - struct cgroup_subsys_state css; - - struct rcu_head rcu; - struct list_head list; - - struct task_group *parent; - struct list_head siblings; - struct list_head children; -}; - -/* - * Default task group. - * Every task in system belongs to this group at bootup. 
- */ -struct task_group root_task_group; -LIST_HEAD(task_groups); - -/* Cacheline aligned slab cache for task_group */ -static struct kmem_cache *task_group_cache __read_mostly; -#endif /* CONFIG_CGROUP_SCHED */ - -void __init sched_init(void) -{ -#ifdef CONFIG_SMP - int cpu_ids; -#endif - int i; - struct rq *rq; - - prio_ratios[0] = 128; - for (i = 1 ; i < NICE_WIDTH ; i++) - prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - - raw_spin_lock_init(&grq.lock); - grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; - grq.niffies = 0; - grq.last_jiffy = jiffies; - raw_spin_lock_init(&grq.iso_lock); - grq.iso_ticks = 0; - grq.iso_refractory = false; - grq.noc = 1; - skiplist_init(&grq.node); - grq.sl = new_skiplist(&grq.node); - skiplist_node_init(&init_task.node); - -#ifdef CONFIG_SMP - init_defrootdomain(); - grq.qnr = grq.idle_cpus = 0; - cpumask_clear(&grq.cpu_idle_map); -#else - uprq = &per_cpu(runqueues, 0); -#endif - -#ifdef CONFIG_CGROUP_SCHED - task_group_cache = KMEM_CACHE(task_group, 0); - - list_add(&root_task_group.list, &task_groups); - INIT_LIST_HEAD(&root_task_group.children); - INIT_LIST_HEAD(&root_task_group.siblings); -#endif /* CONFIG_CGROUP_SCHED */ - for_each_possible_cpu(i) { - rq = cpu_rq(i); - rq->grq_lock = &grq.lock; - rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = - rq->iowait_pc = rq->idle_pc = 0; - rq->dither = false; -#ifdef CONFIG_SMP - rq->last_niffy = 0; - rq->sd = NULL; - rq->rd = NULL; - rq->online = false; - rq->cpu = i; - rq_attach_root(rq, &def_root_domain); -#endif - atomic_set(&rq->nr_iowait, 0); - } - -#ifdef CONFIG_SMP - cpu_ids = i; - /* - * Set the base locality for cpu cache distance calculation to - * "distant" (3). Make sure the distance from a CPU to itself is 0. - */ - for_each_possible_cpu(i) { - int j; - - rq = cpu_rq(i); -#ifdef CONFIG_SCHED_SMT - rq->siblings_idle = sole_cpu_idle; -#endif -#ifdef CONFIG_SCHED_MC - rq->cache_idle = sole_cpu_idle; -#endif - rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); - for_each_possible_cpu(j) { - if (i == j) - rq->cpu_locality[j] = 0; - else - rq->cpu_locality[j] = 4; - } - rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); - rq->rq_order[0] = rq; - for (j = 1; j < cpu_ids; j++) - rq->rq_order[j] = cpu_rq(j); - } -#endif - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - - /* - * The boot idle thread does lazy MMU switching as well: - */ - atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current); - - /* - * Make us the idle thread. Technically, schedule() should not be - * called from this thread, however somewhere below it might be, - * but because we are the idle thread, we just pick up running again - * when this runqueue becomes "idle". 
- */ - init_idle(current, smp_processor_id()); - -#ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); - idle_thread_set_boot_cpu(); -#endif /* SMP */ - - init_schedstats(); -} - -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = preempt_count() + rcu_preempt_depth(); - - return (nested == preempt_offset); -} - -void __might_sleep(const char *file, int line, int preempt_offset) -{ - /* - * Blocking primitives will set (and therefore destroy) current->state, - * since we will exit with TASK_RUNNING make sure we enter with it, - * otherwise we will destroy state. - */ - WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, - "do not call blocking ops when !TASK_RUNNING; " - "state=%lx set at [<%p>] %pS\n", - current->state, - (void *)current->task_state_change, - (void *)current->task_state_change); - - ___might_sleep(file, line, preempt_offset); -} -EXPORT_SYMBOL(__might_sleep); - -void ___might_sleep(const char *file, int line, int preempt_offset) -{ - static unsigned long prev_jiffy; /* ratelimiting */ - - rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) - return; - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); - - if (task_stack_end_corrupted(current)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); - - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif - dump_stack(); -} -EXPORT_SYMBOL(___might_sleep); -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -static inline void normalise_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; - - read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - /* - * Only normalize user tasks: - */ - if (p->flags & PF_KTHREAD) - continue; - - if (!rt_task(p) && !iso_task(p)) - continue; - - rq = task_grq_lock(p, &flags); - __setscheduler(p, rq, SCHED_NORMAL, 0, false); - task_grq_unlock(p, &flags); - } - read_unlock(&tasklist_lock); -} - -void normalize_rt_tasks(void) -{ - normalise_rt_tasks(); -} -#endif /* CONFIG_MAGIC_SYSRQ */ - -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -/* - * These functions are only useful for the IA64 MCA handling, or kdb. - * - * They can only be called when the whole system has been - * stopped - every CPU needs to be quiescent, and no scheduling - * activity can take place. Using them for anything else would - * be a serious bug, and as a result, they aren't even visible - * under any other configuration. - */ - -/** - * curr_task - return the current task for a given cpu. - * @cpu: the processor in question. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - * - * Return: The current task for @cpu. 
- */ -struct task_struct *curr_task(int cpu) -{ - return cpu_curr(cpu); -} - -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ - -#ifdef CONFIG_IA64 -/** - * set_curr_task - set the current task for a given cpu. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function - * must be called with all CPU's synchronised, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -void set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} -EXPORT_SYMBOL_GPL(task_cputime_adjusted); - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} - -void vtime_account_system_irqsafe(struct task_struct *tsk) -{ - unsigned long flags; - - local_irq_save(flags); - vtime_account_system(tsk); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_system(prev); - - vtime_account_user(prev); - arch_vtime_task_switch(prev); -} -#endif - -#else -/* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) { - u64 tmp = rtime; rtime = stime; stime = tmp; - } - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return (__force cputime_t) scaled; -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. - */ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, stime, utime, total; - - stime = curr->stime; - total = stime + curr->utime; - - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. 
- * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. - * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ - rtime = nsecs_to_cputime(curr->sum_exec_runtime); - - /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. - */ - if (prev->stime + prev->utime >= rtime) - goto out; - - if (total) { - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; - } else { - stime = rtime; - utime = 0; - } - - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); - -out: - *ut = prev->utime; - *st = prev->stime; -} - -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime = { - .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); - cputime_adjust(&cputime, &p->prev_cputime, ut, st); -} -EXPORT_SYMBOL_GPL(task_cputime_adjusted); - -/* - * Must be called with siglock held. - */ -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); -} -#endif - -void init_idle_bootup_task(struct task_struct *idle) -{} - -#ifdef CONFIG_SCHED_DEBUG -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{} - -void proc_sched_set_task(struct task_struct *p) -{} -#endif - -#ifdef CONFIG_SMP -#define SCHED_LOAD_SHIFT (10) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return SCHED_LOAD_SCALE; -} - -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) -{ - unsigned long weight = cpumask_weight(sched_domain_span(sd)); - unsigned long smt_gain = sd->smt_gain; - - smt_gain /= weight; - - return smt_gain; -} -#endif - -#ifdef CONFIG_CGROUP_SCHED -static void sched_free_group(struct task_group *tg) -{ - kmem_cache_free(task_group_cache, tg); -} - -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(struct task_group *parent) -{ - struct task_group *tg; - - tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); - if (!tg) - return ERR_PTR(-ENOMEM); - - return tg; -} - -void sched_online_group(struct task_group *tg, struct task_group *parent) -{ -} - -/* rcu callback to free various structures associated with a task group */ -static void sched_free_group_rcu(struct rcu_head *rhp) -{ - /* now it should be safe to free those cfs_rqs */ - sched_free_group(container_of(rhp, struct task_group, rcu)); -} - -void sched_destroy_group(struct task_group *tg) -{ - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, sched_free_group_rcu); -} - -void sched_offline_group(struct task_group *tg) -{ -} - -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? 
container_of(css, struct task_group, css) : NULL; -} - -static struct cgroup_subsys_state * -cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct task_group *parent = css_tg(parent_css); - struct task_group *tg; - - if (!parent) { - /* This is early initialization for the top cgroup */ - return &root_task_group.css; - } - - tg = sched_create_group(parent); - if (IS_ERR(tg)) - return ERR_PTR(-ENOMEM); - return &tg->css; -} - -static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); -} - -static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - /* - * Relies on the RCU grace period between css_released() and this. - */ - sched_free_group(tg); -} - -static void cpu_cgroup_fork(struct task_struct *task) -{ -} - -static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -{ - return 0; -} - -static void cpu_cgroup_attach(struct cgroup_taskset *tset) -{ -} - -static struct cftype cpu_files[] = { - { } /* terminate */ -}; - -struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = cpu_cgroup_css_alloc, - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .fork = cpu_cgroup_fork, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_files, - .early_init = true, -}; -#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h deleted file mode 100644 index 00a16ba0a..000000000 --- a/kernel/sched/bfs_sched.h +++ /dev/null @@ -1,224 +0,0 @@ -#include -#include -#include - -#ifndef BFS_SCHED_H -#define BFS_SCHED_H - -/* - * This is the main, per-CPU runqueue data structure. - * This data should only be modified by the local cpu. 
- */ -struct rq { - struct task_struct *curr, *idle, *stop; - struct mm_struct *prev_mm; - - /* Pointer to grq spinlock */ - raw_spinlock_t *grq_lock; - - /* Stored data about rq->curr to work outside grq lock */ - u64 rq_deadline; - unsigned int rq_policy; - int rq_time_slice; - u64 rq_last_ran; - int rq_prio; - int soft_affined; /* Running or queued tasks with this set as their rq */ - u64 load_update; /* When we last updated load */ - unsigned long load_avg; /* Rolling load average */ -#ifdef CONFIG_SMT_NICE - struct mm_struct *rq_mm; - int rq_smt_bias; /* Policy/nice level bias across smt siblings */ -#endif - /* Accurate timekeeping data */ - u64 timekeep_clock; - unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc, - iowait_pc, idle_pc; - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - int cpu; /* cpu of this runqueue */ - bool online; - - struct root_domain *rd; - struct sched_domain *sd; - int *cpu_locality; /* CPU relative cache distance */ - struct rq **rq_order; /* RQs ordered by relative cache distance */ -#ifdef CONFIG_SCHED_SMT - cpumask_t thread_mask; - bool (*siblings_idle)(struct rq *rq); - /* See if all smt siblings are idle */ -#endif /* CONFIG_SCHED_SMT */ -#ifdef CONFIG_SCHED_MC - cpumask_t core_mask; - bool (*cache_idle)(struct rq *rq); - /* See if all cache siblings are idle */ -#endif /* CONFIG_SCHED_MC */ - u64 last_niffy; /* Last time this RQ updated grq.niffies */ -#endif /* CONFIG_SMP */ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -#ifdef CONFIG_PARAVIRT - u64 prev_steal_time; -#endif /* CONFIG_PARAVIRT */ -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; -#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ - - u64 clock, old_clock, last_tick; - u64 clock_task; - bool dither; - -#ifdef CONFIG_SCHEDSTATS - - /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ - - /* sys_sched_yield() stats */ - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; -#endif /* CONFIG_SCHEDSTATS */ -#ifdef CONFIG_CPU_IDLE - /* Must be inspected within a rcu lock section */ - struct cpuidle_state *idle_state; -#endif -}; - -#ifdef CONFIG_SMP -struct rq *cpu_rq(int cpu); -#endif - -#ifndef CONFIG_SMP -extern struct rq *uprq; -#define cpu_rq(cpu) (uprq) -#define this_rq() (uprq) -#define raw_rq() (uprq) -#define task_rq(p) (uprq) -#define cpu_curr(cpu) ((uprq)->curr) -#else /* CONFIG_SMP */ -DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#define this_rq() this_cpu_ptr(&runqueues) -#define raw_rq() raw_cpu_ptr(&runqueues) -#endif /* CONFIG_SMP */ - -static inline u64 __rq_clock_broken(struct rq *rq) -{ - return READ_ONCE(rq->clock); -} - -static inline u64 rq_clock(struct rq *rq) -{ - lockdep_assert_held(rq->grq_lock); - return rq->clock; -} - -static inline u64 rq_clock_task(struct rq *rq) -{ - lockdep_assert_held(rq->grq_lock); - return rq->clock_task; -} - -extern struct mutex sched_domains_mutex; -extern struct static_key_false sched_schedstats; - -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. 
- * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. - */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) - -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -void register_sched_domain_sysctl(void); -void unregister_sched_domain_sysctl(void); -#else -static inline void register_sched_domain_sysctl(void) -{ -} -static inline void unregister_sched_domain_sysctl(void) -{ -} -#endif - -static inline void sched_ttwu_pending(void) { } - -static inline int task_on_rq_queued(struct task_struct *p) -{ - return p->on_rq; -} - -#ifdef CONFIG_SMP - -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); - -#endif - -#ifdef CONFIG_CPU_IDLE -static inline void idle_set_state(struct rq *rq, - struct cpuidle_state *idle_state) -{ - rq->idle_state = idle_state; -} - -static inline struct cpuidle_state *idle_get_state(struct rq *rq) -{ - WARN_ON(!rcu_read_lock_held()); - return rq->idle_state; -} -#else -static inline void idle_set_state(struct rq *rq, - struct cpuidle_state *idle_state) -{ -} - -static inline struct cpuidle_state *idle_get_state(struct rq *rq) -{ - return NULL; -} -#endif - -#ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); - -static inline void cpufreq_trigger(u64 time, unsigned long util) -{ - struct update_util_data *data; - - if (util > SCHED_CAPACITY_SCALE) - util = SCHED_CAPACITY_SCALE; - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); - if (data) - data->func(data, time, util, SCHED_CAPACITY_SCALE); -} -#else -static inline void cpufreq_trigger(u64 time, unsigned long util) -{ -} -#endif /* CONFIG_CPU_FREQ */ - -#ifdef arch_scale_freq_capacity -#ifndef arch_scale_freq_invariant -#define arch_scale_freq_invariant() (true) -#endif -#else /* arch_scale_freq_capacity */ -#define arch_scale_freq_invariant() (false) -#endif - -#endif /* BFS_SCHED_H */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 2ebd7b0c4..2a644b6be 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -9,8 +9,8 @@ * published by the Free Software Foundation. */ -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index eba226d7c..a2bf1ea69 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -16,8 +16,8 @@ #include #include -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1e855dcbd..060b76d85 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -14,8 +14,8 @@ #include -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 7466a0bb2..ba7b137c3 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -4,10 +4,10 @@ #include #include -#ifndef CONFIG_SCHED_BFS +#ifndef CONFIG_SCHED_MUQSS #include "sched.h" #else -#include "bfs_sched.h" +#include "MuQSS.h" #endif /* diff --git a/kernel/skip_list.c b/kernel/skip_list.c new file mode 100644 index 000000000..5c66067f2 --- /dev/null +++ b/kernel/skip_list.c @@ -0,0 +1,174 @@ +/* + Copyright (C) 2011,2016 Con Kolivas. 
+
+ Code based on example originally by William Pugh.
+
+Skip Lists are a probabilistic alternative to balanced trees, as
+described in the June 1990 issue of CACM and were invented by
+William Pugh in 1987.
+
+A couple of comments about this implementation:
+The routine randomLevel has been hard-coded to generate random
+levels using p=0.25. It can be easily changed.
+
+The insertion routine has been implemented so as to use the
+dirty hack described in the CACM paper: if a random level is
+generated that is more than the current maximum level, the
+current maximum level plus one is used instead.
+
+Levels start at zero and go up to MaxLevel (which is equal to
+MaxNumberOfLevels-1).
+
+The routines defined in this file are:
+
+init: defines slnode
+
+new_skiplist: returns a new, empty list
+
+randomLevel: Returns a random level based on a u64 random seed passed to it.
+In MuQSS, the "niffy" time is used for this purpose.
+
+insert(l, key, value): inserts the binding (key, value) into l. This operation
+occurs in O(log n) time.
+
+delnode(slnode, l, node): deletes any binding of key from the list l based on
+the actual node value. This operation occurs in O(k) time where k is the
+number of levels of the node in question (max 8). The original delete
+function occurred in O(log n) time and involved a search.
+
+MuQSS Notes: In this implementation of skiplists, there are bidirectional
+next/prev pointers and the insert function returns a pointer to the actual
+node where the value is stored. The key here is chosen by the scheduler so as
+to sort tasks according to the priority list requirements and is no longer
+used by the scheduler after insertion. The scheduler lookup, however, occurs
+in O(1) time because it is always the first item in the level 0 linked list.
+Since the task struct stores a copy of the node pointer upon skiplist_insert,
+it can also remove it much faster than the original implementation with the
+aid of prev<->next pointer manipulation and no searching.
+
+*/
+
+#include <linux/slab.h>
+#include <linux/skip_list.h>
+
+#define MaxNumberOfLevels 8
+#define MaxLevel (MaxNumberOfLevels - 1)
+
+void skiplist_init(skiplist_node *slnode)
+{
+	int i;
+
+	slnode->key = 0xFFFFFFFFFFFFFFFF;
+	slnode->level = 0;
+	slnode->value = NULL;
+	for (i = 0; i < MaxNumberOfLevels; i++)
+		slnode->next[i] = slnode->prev[i] = slnode;
+}
+
+skiplist *new_skiplist(skiplist_node *slnode)
+{
+	skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC);
+
+	BUG_ON(!l);
+	l->header = slnode;
+	return l;
+}
+
+void free_skiplist(skiplist *l)
+{
+	skiplist_node *p, *q;
+
+	p = l->header;
+	do {
+		q = p->next[0];
+		p->next[0]->prev[0] = q->prev[0];
+		skiplist_node_init(p);
+		p = q;
+	} while (p != l->header);
+	kfree(l);
+}
+
+void skiplist_node_init(skiplist_node *node)
+{
+	memset(node, 0, sizeof(skiplist_node));
+}
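The routines above are enough to stand up a list; a minimal usage sketch follows. It is illustrative only and rests on assumptions drawn from this patch rather than stated by it: keyType behaves as a 64-bit integer and valueType as a void pointer (suggested by the 0xFFFFFFFFFFFFFFFF and NULL sentinels written by skiplist_init()), and tasks embed their own skiplist_node (as the skiplist_node_init(&init_task.node) call in sched_init() implies). The demo_* names are invented for the example.

#include <linux/sched.h>
#include <linux/skip_list.h>

static skiplist_node demo_header;	/* sentinel node, never holds a task */
static skiplist *demo_list;

static void demo_skiplist(struct task_struct *p, u64 key, u64 niffies)
{
	struct task_struct *best;

	/* One-time setup, mirroring grq.node/grq.sl in sched_init(). */
	skiplist_init(&demo_header);
	demo_list = new_skiplist(&demo_header);

	/* Insert: the key fixes the task's position (lowest key first);
	 * the niffy clock doubles as the random level seed. */
	skiplist_node_init(&p->node);
	skiplist_insert(demo_list, &p->node, key, p, (unsigned int)niffies);

	/* O(1) lookup: the best entry is always first in level 0. */
	best = demo_list->header->next[0]->value;

	/* O(k) removal via the remembered node, no search needed. */
	skiplist_delete(demo_list, &p->node);

	(void)best;
}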
+
+/*
+ * Returns a pseudo-random number based on the randseed value by masking it
+ * down to the range 0-7. As many levels are not required when only a few
+ * values are on the list, we limit the height of the levels according to how
+ * many list entries there are, in a cheap manner. The height of the levels
+ * may have been higher while there were more entries queued previously, but
+ * as this code is used only by the scheduler, entries are short lived and
+ * will be torn down regularly.
+ *
+ * 00-03 entries - 1 level
+ * 04-07 entries - 2 levels
+ * 08-15 entries - 4 levels
+ * 16+  entries - up to 8 levels
+ */
+static inline unsigned int randomLevel(int entries, unsigned int randseed)
+{
+	unsigned int mask;
+
+	if (entries > 15)
+		mask = 0x7;
+	else if (entries > 7)
+		mask = 0x3;
+	else if (entries > 3)
+		mask = 0x1;
+	else
+		return 0;
+
+	return randseed & mask;
+}
+
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed)
+{
+	skiplist_node *update[MaxNumberOfLevels];
+	skiplist_node *p, *q;
+	int k = l->level;
+
+	p = l->header;
+	do {
+		while (q = p->next[k], q->key <= key)
+			p = q;
+		update[k] = p;
+	} while (--k >= 0);
+
+	k = randomLevel(++l->entries, randseed);
+	if (k > MaxLevel)
+		k = MaxLevel;
+	if (k > l->level) {
+		k = ++l->level;
+		update[k] = l->header;
+	}
+
+	node->level = k;
+	node->key = key;
+	node->value = value;
+	do {
+		p = update[k];
+		node->next[k] = p->next[k];
+		p->next[k] = node;
+		node->prev[k] = p;
+		node->next[k]->prev[k] = node;
+	} while (--k >= 0);
+}
+
+void skiplist_delete(skiplist *l, skiplist_node *node)
+{
+	int k, m = node->level;
+
+	for (k = 0; k <= m; k++) {
+		node->prev[k]->next[k] = node->next[k];
+		node->next[k]->prev[k] = node->prev[k];
+	}
+	skiplist_node_init(node);
+	if (m == l->level) {
+		while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0)
+			m--;
+		l->level = m;
+	}
+	l->entries--;
+}
diff --git a/kernel/skip_lists.c b/kernel/skip_lists.c
deleted file mode 100644
index 40b7ba240..000000000
--- a/kernel/skip_lists.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- Copyright (C) 2011,2016 Con Kolivas.
-
- Code based on example originally by William Pugh.
-
-Skip Lists are a probabilistic alternative to balanced trees, as
-described in the June 1990 issue of CACM and were invented by
-William Pugh in 1987.
-
-A couple of comments about this implementation:
-The routine randomLevel has been hard-coded to generate random
-levels using p=0.25. It can be easily changed.
-
-The insertion routine has been implemented so as to use the
-dirty hack described in the CACM paper: if a random level is
-generated that is more than the current maximum level, the
-current maximum level plus one is used instead.
-
-Levels start at zero and go up to MaxLevel (which is equal to
-MaxNumberOfLevels-1).
-
-The routines defined in this file are:
-
-init: defines slnode
-
-new_skiplist: returns a new, empty list
-
-randomLevel: Returns a random level based on a u64 random seed passed to it.
-In BFS, the "niffy" time is used for this purpose.
-
-insert(l,key, value): inserts the binding (key, value) into l. This operation
-occurs in O(log n) time.
-
-delnode(slnode, l, node): deletes any binding of key from the l based on the
-actual node value. This operation occurs in O(k) time where k is the
-number of levels of the node in question (max 16). The original delete
-function occurred in O(log n) time and involved a search.
-
-BFS Notes: In this implementation of skiplists, there are bidirectional
-next/prev pointers and the insert function returns a pointer to the actual
-node the value is stored. The key here is chosen by the scheduler so as to
-sort tasks according to the priority list requirements and is no longer used
-by the scheduler after insertion. The scheduler lookup, however, occurs in
-O(1) time because it is always the first item in the level 0 linked list.
-Since the task struct stores a copy of the node pointer upon skiplist_insert, -it can also remove it much faster than the original implementation with the -aid of prev<->next pointer manipulation and no searching. - -*/ - -#include -#include - -#define MaxNumberOfLevels 16 -#define MaxLevel (MaxNumberOfLevels - 1) - -void skiplist_init(skiplist_node *slnode) -{ - int i; - - slnode->key = 0xFFFFFFFFFFFFFFFF; - slnode->level = 0; - slnode->value = NULL; - for (i = 0; i < MaxNumberOfLevels; i++) - slnode->next[i] = slnode->prev[i] = slnode; -} - -skiplist *new_skiplist(skiplist_node *slnode) -{ - skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); - - BUG_ON(!l); - l->header = slnode; - return l; -} - -void free_skiplist(skiplist *l) -{ - skiplist_node *p, *q; - - p = l->header; - do { - q = p->next[0]; - p->next[0]->prev[0] = q->prev[0]; - skiplist_node_init(p); - p = q; - } while (p != l->header); - kfree(l); -} - -void skiplist_node_init(skiplist_node *node) -{ - memset(node, 0, sizeof(skiplist_node)); -} - -/* - * Returns a pseudo-random number based on the randseed value by masking out - * 0-15. As many levels are not required when only few values are on the list, - * we limit the height of the levels according to how many list entries there - * are in a cheap manner. The height of the levels may have been higher while - * there were more entries queued previously but as this code is used only by - * the scheduler, entries are short lived and will be torn down regularly. - * - * 00-03 entries - 1 level - * 04-07 entries - 2 levels - * 08-15 entries - 4 levels - * 15-31 entries - 7 levels - * 32+ entries - max(16) levels - */ -static inline unsigned int randomLevel(int entries, unsigned int randseed) -{ - unsigned int mask; - - if (entries > 31) - mask = 0xF; - else if (entries > 15) - mask = 0x7; - else if (entries > 7) - mask = 0x3; - else if (entries > 3) - mask = 0x1; - else - return 0; - - return randseed & mask; -} - -void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) -{ - skiplist_node *update[MaxNumberOfLevels]; - skiplist_node *p, *q; - int k = l->level; - - p = l->header; - do { - while (q = p->next[k], q->key <= key) - p = q; - update[k] = p; - } while (--k >= 0); - - k = randomLevel(++l->entries, randseed); - if (k > l->level) { - k = ++l->level; - update[k] = l->header; - } - - node->level = k; - node->key = key; - node->value = value; - do { - p = update[k]; - node->next[k] = p->next[k]; - p->next[k] = node; - node->prev[k] = p; - node->next[k]->prev[k] = node; - } while (--k >= 0); -} - -void skiplist_delete(skiplist *l, skiplist_node *node) -{ - int k, m = node->level; - - for (k = 0; k <= m; k++) { - node->prev[k]->next[k] = node->next[k]; - node->next[k]->prev[k] = node->prev[k]; - } - skiplist_node_init(node); - if (m == l->level) { - while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) - m--; - l->level = m; - } - l->entries--; -} diff --git a/kernel/smpboot.c b/kernel/smpboot.c index fc0d8270f..13bc43d1f 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) if (kthread_should_park()) { __set_current_state(TASK_RUNNING); + preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); ht->park(td->cpu); td->status = HP_THREAD_PARKED; } - preempt_enable(); kthread_parkme(); /* We might have been woken for stop */ continue; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 
ca8093ed7..4bd185938 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -127,7 +127,7 @@ static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int __read_mostly one_hundred = 100; static int __read_mostly one_thousand = 1000; -#ifdef CONFIG_SCHED_BFS +#ifdef CONFIG_SCHED_MUQSS extern int rr_interval; extern int sched_interactive; extern int sched_iso_cpu; @@ -269,7 +269,7 @@ static struct ctl_table sysctl_base_table[] = { { } }; -#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS) +#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ @@ -286,7 +286,7 @@ static int max_extfrag_threshold = 1000; #endif static struct ctl_table kern_table[] = { -#ifndef CONFIG_SCHED_BFS +#ifndef CONFIG_SCHED_MUQSS { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, @@ -455,7 +455,7 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif -#endif /* !CONFIG_SCHED_BFS */ +#endif /* !CONFIG_SCHED_MUQSS */ #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -1020,7 +1020,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SCHED_BFS +#ifdef CONFIG_SCHED_MUQSS { .procname = "rr_interval", .data = &rr_interval, diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 6931b6e3c..10e18d267 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -89,7 +89,7 @@ config NO_HZ_IDLE config NO_HZ_FULL bool "Full dynticks system (tickless)" # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_MUQSS # We need at least one periodic CPU for timekeeping depends on SMP depends on HAVE_CONTEXT_TRACKING diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 287cf721c..69ee53a67 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1039,8 +1039,8 @@ static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ static const struct sched_attr attr = { -#ifdef CONFIG_SCHED_BFS - /* No deadline on BFS, use RR */ +#ifdef CONFIG_SCHED_MUQSS + /* No deadline on MuQSS, use RR */ .sched_policy = SCHED_RR, #else .sched_policy = SCHED_DEADLINE, -- cgit v1.2.3-54-g00ecf
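The trace_selftest.c hunk above reflects that MuQSS, like BFS before it, implements no SCHED_DEADLINE class: callers that would request a deadline policy fall back to SCHED_RR, whose quantum is the rr_interval value exported by the sysctl hunk. A rough userspace sketch of that fallback (it assumes a kernel built with CONFIG_SCHED_MUQSS, takes rr_interval to be in milliseconds, and picks priority 50 arbitrarily):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };
	char buf[32];
	FILE *f;

	/* No SCHED_DEADLINE on MuQSS: request round-robin instead. */
	if (sched_setscheduler(0, SCHED_RR, &sp) != 0)
		perror("sched_setscheduler");

	/* The RR quantum is tunable via the sysctl added in this patch. */
	f = fopen("/proc/sys/kernel/rr_interval", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("rr_interval (ms): %s", buf);
		fclose(f);
	}
	return 0;
}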