Diffstat (limited to 'kernel')
59 files changed, 1639 insertions(+), 18751 deletions(-)
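
The bulk of the deletions below is TuxOnIce, whose removed header (kernel/power/tuxonice.h, the first file in the diff) tracks hibernation status as bit indices into a single unsigned long, with set_abort_result() recording both the generic TOI_ABORTED bit and the specific failure bit. A minimal standalone sketch of that convention, assuming a plain, non-atomic test_and_set_bit() helper and a main() harness as stand-ins for the kernel's atomic bitops (neither is part of the removed code):

/* Sketch of the flag convention from the removed tuxonice.h:
 * status bits live in one unsigned long; aborting records both
 * TOI_ABORTED and the specific failure bit. */
#include <stdio.h>

enum { TOI_ABORTED, TOI_ABORT_REQUESTED, TOI_NOSTORAGE_AVAILABLE };

static unsigned long toi_result;

/* Non-atomic stand-in for the kernel's test_and_set_bit(). */
static int test_and_set_bit(int bit, unsigned long *word)
{
        int old = (*word >> bit) & 1;
        *word |= 1UL << bit;
        return old;
}

#define set_result_state(bit)  test_and_set_bit(bit, &toi_result)
#define set_abort_result(bit)  (test_and_set_bit(TOI_ABORTED, &toi_result), \
                                test_and_set_bit(bit, &toi_result))
#define test_result_state(bit) ((toi_result >> (bit)) & 1)

int main(void)
{
        set_abort_result(TOI_NOSTORAGE_AVAILABLE);
        printf("aborted=%lu no_storage=%lu\n",
               test_result_state(TOI_ABORTED),
               test_result_state(TOI_NOSTORAGE_AVAILABLE));
        return 0;
}
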
diff --git a/kernel/Makefile b/kernel/Makefile index dd84575cb..7241e3459 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o skip_lists.o + async.o range.o smpboot.o skip_list.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h deleted file mode 100644 index 10b65633f..000000000 --- a/kernel/power/tuxonice.h +++ /dev/null @@ -1,260 +0,0 @@ -/* - * kernel/power/tuxonice.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations used throughout swsusp. - * - */ - -#ifndef KERNEL_POWER_TOI_H -#define KERNEL_POWER_TOI_H - -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/suspend.h> -#include <linux/fs.h> -#include <asm/setup.h> -#include "tuxonice_pageflags.h" -#include "power.h" - -#define TOI_CORE_VERSION "3.3" -#define TOI_HEADER_VERSION 3 -#define MY_BOOT_KERNEL_DATA_VERSION 4 - -struct toi_boot_kernel_data { - int version; - int size; - unsigned long toi_action; - unsigned long toi_debug_state; - u32 toi_default_console_level; - int toi_io_time[2][2]; - char toi_nosave_commandline[COMMAND_LINE_SIZE]; - unsigned long pages_used[33]; - unsigned long incremental_bytes_in; - unsigned long incremental_bytes_out; - unsigned long compress_bytes_in; - unsigned long compress_bytes_out; - unsigned long pruned_pages; -}; - -extern struct toi_boot_kernel_data toi_bkd; - -/* Location of book kernel data struct in kernel being resumed */ -extern unsigned long boot_kernel_data_buffer; - -/* == Action states == */ - -enum { - TOI_REBOOT, - TOI_PAUSE, - TOI_LOGALL, - TOI_CAN_CANCEL, - TOI_KEEP_IMAGE, - TOI_FREEZER_TEST, - TOI_SINGLESTEP, - TOI_PAUSE_NEAR_PAGESET_END, - TOI_TEST_FILTER_SPEED, - TOI_TEST_BIO, - TOI_NO_PAGESET2, - TOI_IGNORE_ROOTFS, - TOI_REPLACE_SWSUSP, - TOI_PAGESET2_FULL, - TOI_ABORT_ON_RESAVE_NEEDED, - TOI_NO_MULTITHREADED_IO, - TOI_NO_DIRECT_LOAD, /* Obsolete */ - TOI_LATE_CPU_HOTPLUG, /* Obsolete */ - TOI_GET_MAX_MEM_ALLOCD, - TOI_NO_FLUSHER_THREAD, - TOI_NO_PS2_IF_UNNEEDED, - TOI_POST_RESUME_BREAKPOINT, - TOI_NO_READAHEAD, - TOI_TRACE_DEBUG_ON, - TOI_INCREMENTAL_IMAGE, -}; - -extern unsigned long toi_bootflags_mask; - -#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action)) - -/* == Result states == */ - -enum { - TOI_ABORTED, - TOI_ABORT_REQUESTED, - TOI_NOSTORAGE_AVAILABLE, - TOI_INSUFFICIENT_STORAGE, - TOI_FREEZING_FAILED, - TOI_KEPT_IMAGE, - TOI_WOULD_EAT_MEMORY, - TOI_UNABLE_TO_FREE_ENOUGH_MEMORY, - TOI_PM_SEM, - TOI_DEVICE_REFUSED, - TOI_SYSDEV_REFUSED, - TOI_EXTRA_PAGES_ALLOW_TOO_SMALL, - TOI_UNABLE_TO_PREPARE_IMAGE, - TOI_FAILED_MODULE_INIT, - TOI_FAILED_MODULE_CLEANUP, - TOI_FAILED_IO, - TOI_OUT_OF_MEMORY, - TOI_IMAGE_ERROR, - TOI_PLATFORM_PREP_FAILED, - TOI_CPU_HOTPLUG_FAILED, - TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */ - TOI_RESAVE_NEEDED, - TOI_CANT_SUSPEND, - TOI_NOTIFIERS_PREPARE_FAILED, - TOI_PRE_SNAPSHOT_FAILED, - TOI_PRE_RESTORE_FAILED, - TOI_USERMODE_HELPERS_ERR, - TOI_CANT_USE_ALT_RESUME, - TOI_HEADER_TOO_BIG, - TOI_WAKEUP_EVENT, - TOI_SYSCORE_REFUSED, - TOI_DPM_PREPARE_FAILED, - TOI_DPM_SUSPEND_FAILED, - TOI_NUM_RESULT_STATES /* Used in printing debug info only */ -}; - -extern unsigned long toi_result; - -#define set_result_state(bit) (test_and_set_bit(bit, &toi_result)) -#define 
set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \ - test_and_set_bit(bit, &toi_result)) -#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result)) -#define test_result_state(bit) (test_bit(bit, &toi_result)) - -/* == Debug sections and levels == */ - -/* debugging levels. */ -enum { - TOI_STATUS = 0, - TOI_ERROR = 2, - TOI_LOW, - TOI_MEDIUM, - TOI_HIGH, - TOI_VERBOSE, -}; - -enum { - TOI_ANY_SECTION, - TOI_EAT_MEMORY, - TOI_IO, - TOI_HEADER, - TOI_WRITER, - TOI_MEMORY, - TOI_PAGEDIR, - TOI_COMPRESS, - TOI_BIO, -}; - -#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) -#define clear_debug_state(bit) \ - (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) -#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) - -/* == Steps in hibernating == */ - -enum { - STEP_HIBERNATE_PREPARE_IMAGE, - STEP_HIBERNATE_SAVE_IMAGE, - STEP_HIBERNATE_POWERDOWN, - STEP_RESUME_CAN_RESUME, - STEP_RESUME_LOAD_PS1, - STEP_RESUME_DO_RESTORE, - STEP_RESUME_READ_PS2, - STEP_RESUME_GO, - STEP_RESUME_ALT_IMAGE, - STEP_CLEANUP, - STEP_QUIET_CLEANUP -}; - -/* == TuxOnIce states == - (see also include/linux/suspend.h) */ - -#define get_toi_state() (toi_state) -#define restore_toi_state(saved_state) \ - do { toi_state = saved_state; } while (0) - -/* == Module support == */ - -struct toi_core_fns { - int (*post_context_save)(void); - unsigned long (*get_nonconflicting_page)(void); - int (*try_hibernate)(void); - void (*try_resume)(void); -}; - -extern struct toi_core_fns *toi_core_fns; - -/* == All else == */ -#define KB(x) ((x) << (PAGE_SHIFT - 10)) -#define MB(x) ((x) >> (20 - PAGE_SHIFT)) - -extern int toi_start_anything(int toi_or_resume); -extern void toi_finish_anything(int toi_or_resume); - -extern int save_image_part1(void); -extern int toi_atomic_restore(void); - -extern int toi_try_hibernate(void); -extern void toi_try_resume(void); - -extern int __toi_post_context_save(void); - -extern unsigned int nr_hibernates; -extern char alt_resume_param[256]; - -extern void copyback_post(void); -extern int toi_hibernate(void); -extern unsigned long extra_pd1_pages_used; - -#define SECTOR_SIZE 512 - -extern void toi_early_boot_message(int can_erase_image, int default_answer, - char *warning_reason, ...); - -extern int do_check_can_resume(void); -extern int do_toi_step(int step); -extern int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug); - -extern char tuxonice_signature[9]; - -extern int toi_start_other_threads(void); -extern void toi_stop_other_threads(void); - -extern int toi_trace_index; -#define TOI_TRACE_DEBUG(PFN, DESC, ...) 
\ - do { \ - if (test_action_state(TOI_TRACE_DEBUG_ON)) { \ - printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \ - } \ - } while(0) - -#ifdef CONFIG_TOI_KEEP_IMAGE -#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE)) -#else -#define toi_keeping_image (0) -#endif - -#ifdef CONFIG_TOI_INCREMENTAL -extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose); -extern int toi_reset_dirtiness(int verbose); -extern void toi_cbw_write(void); -extern void toi_cbw_restore(void); -extern int toi_allocate_cbw_data(void); -extern void toi_free_cbw_data(void); -extern int toi_cbw_init(void); -extern void toi_mark_tasks_cbw(void); -#else -static inline int toi_reset_dirtiness(int verbose) { return 0; } -#define toi_cbw_write() do { } while(0) -#define toi_cbw_restore() do { } while(0) -#define toi_allocate_cbw_data() do { } while(0) -#define toi_free_cbw_data() do { } while(0) -static inline int toi_cbw_init(void) { return 0; } -#endif -#endif diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c deleted file mode 100644 index 1d8b1cbda..000000000 --- a/kernel/power/tuxonice_alloc.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.c - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include <linux/export.h> -#include <linux/slab.h> -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -#define TOI_ALLOC_PATHS 41 - -static DEFINE_MUTEX(toi_alloc_mutex); - -static struct toi_module_ops toi_alloc_ops; - -static int toi_fail_num; - -static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], - toi_free_count[TOI_ALLOC_PATHS], - toi_test_count[TOI_ALLOC_PATHS], - toi_fail_count[TOI_ALLOC_PATHS]; -static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; -static int cur_allocd, max_allocd; - -static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { - "", /* 0 */ - "get_io_info_struct", - "extent", - "extent (loading chain)", - "userui channel", - "userui arg", /* 5 */ - "attention list metadata", - "extra pagedir memory metadata", - "bdev metadata", - "extra pagedir memory", - "header_locations_read", /* 10 */ - "bio queue", - "prepare_readahead", - "i/o buffer", - "writer buffer in bio_init", - "checksum buffer", /* 15 */ - "compression buffer", - "filewriter signature op", - "set resume param alloc1", - "set resume param alloc2", - "debugging info buffer", /* 20 */ - "check can resume buffer", - "write module config buffer", - "read module config buffer", - "write image header buffer", - "read pageset1 buffer", /* 25 */ - "get_have_image_data buffer", - "checksum page", - "worker rw loop", - "get nonconflicting page", - "ps1 load addresses", /* 30 */ - "remove swap image", - "swap image exists", - "swap parse sig location", - "sysfs kobj", - "swap mark resume attempted buffer", /* 35 */ - "cluster member", - "boot kernel data buffer", - "setting swap signature", - "block i/o bdev struct", - "copy before write", /* 40 */ -}; - -#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ - do { \ - BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ - \ - if (FAIL_NUM == toi_fail_num) { \ - atomic_inc(&toi_test_count[FAIL_NUM]); \ - toi_fail_num = 0; \ - return FAIL_VAL; \ - } \ - } while (0) - -static void alloc_update_stats(int fail_num, void *result, int size) -{ - if (!result) { - atomic_inc(&toi_fail_count[fail_num]); - return; - } - - 
atomic_inc(&toi_alloc_count[fail_num]); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - toi_cur_allocd[fail_num]++; - cur_allocd += size; - if (unlikely(cur_allocd > max_allocd)) { - int i; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - toi_max_allocd[i] = toi_cur_allocd[i]; - max_allocd = cur_allocd; - } - mutex_unlock(&toi_alloc_mutex); - } -} - -static void free_update_stats(int fail_num, int size) -{ - BUG_ON(fail_num >= TOI_ALLOC_PATHS); - atomic_inc(&toi_free_count[fail_num]); - if (unlikely(atomic_read(&toi_free_count[fail_num]) > - atomic_read(&toi_alloc_count[fail_num]))) - dump_stack(); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - cur_allocd -= size; - toi_cur_allocd[fail_num]--; - mutex_unlock(&toi_alloc_mutex); - } -} - -void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) -{ - void *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - result = kzalloc(size, flags); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, result, size); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order) -{ - unsigned long result; - - mask |= ___GFP_TOI_NOTRACK; - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - result = __get_free_pages(mask, order); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, - PAGE_SIZE << order); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -struct page *toi_alloc_page(int fail_num, gfp_t mask) -{ - struct page *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - mask |= ___GFP_TOI_NOTRACK; - result = alloc_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) -{ - unsigned long result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - mask |= ___GFP_TOI_NOTRACK; - result = get_zeroed_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -void toi_kfree(int fail_num, const void *arg, int size) -{ - if (arg && toi_alloc_ops.enabled) - free_update_stats(fail_num, size); - - if (fail_num == toi_trace_allocs) - dump_stack(); - kfree(arg); -} - -void toi_free_page(int fail_num, unsigned long virt) -{ - if (virt && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - free_page(virt); -} - -void toi__free_page(int fail_num, struct page *page) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_page(page); -} - -void toi_free_pages(int fail_num, struct page *page, int order) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE << order); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_pages(page, order); -} - -void toi_alloc_print_debug_stats(void) -{ - int i, header_done = 0; - - if (!toi_alloc_ops.enabled) - return; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - if (atomic_read(&toi_alloc_count[i]) != - atomic_read(&toi_free_count[i])) { - if (!header_done) { - printk(KERN_INFO "Idx Allocs Frees Tests " - " Fails Max Description\n"); - header_done = 1; - } - - 
printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, - atomic_read(&toi_alloc_count[i]), - atomic_read(&toi_free_count[i]), - atomic_read(&toi_test_count[i]), - atomic_read(&toi_fail_count[i]), - toi_max_allocd[i], - toi_alloc_desc[i]); - } -} - -static int toi_alloc_initialise(int starting_cycle) -{ - int i; - - if (!starting_cycle) - return 0; - - if (toi_trace_allocs) - dump_stack(); - - for (i = 0; i < TOI_ALLOC_PATHS; i++) { - atomic_set(&toi_alloc_count[i], 0); - atomic_set(&toi_free_count[i], 0); - atomic_set(&toi_test_count[i], 0); - atomic_set(&toi_fail_count[i], 0); - toi_cur_allocd[i] = 0; - toi_max_allocd[i] = 0; - }; - - max_allocd = 0; - cur_allocd = 0; - return 0; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), - SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, - NULL), - SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, - TOI_GET_MAX_MEM_ALLOCD, 0), - SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, - NULL) -}; - -static struct toi_module_ops toi_alloc_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "allocation debugging", - .directory = "alloc", - .module = THIS_MODULE, - .early = 1, - .initialise = toi_alloc_initialise, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_alloc_init(void) -{ - int result = toi_register_module(&toi_alloc_ops); - return result; -} - -void toi_alloc_exit(void) -{ - toi_unregister_module(&toi_alloc_ops); -} diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h deleted file mode 100644 index 0cd6b686f..000000000 --- a/kernel/power/tuxonice_alloc.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.h - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - */ - -#include <linux/slab.h> -#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) -#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) - -#ifdef CONFIG_PM_DEBUG -extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); -extern void toi_kfree(int fail_num, const void *arg, int size); - -extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order); -#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) -extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); -extern void toi_free_page(int fail_num, unsigned long buf); -extern void toi__free_page(int fail_num, struct page *page); -extern void toi_free_pages(int fail_num, struct page *page, int order); -extern struct page *toi_alloc_page(int fail_num, gfp_t mask); -extern int toi_alloc_init(void); -extern void toi_alloc_exit(void); - -extern void toi_alloc_print_debug_stats(void); - -#else /* CONFIG_PM_DEBUG */ - -#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) -#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) - -#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) -#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) -#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) -#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) -#define toi__free_page(FAIL, PAGE) __free_page(PAGE) -#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) -#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) -static inline int toi_alloc_init(void) -{ - return 0; -} - -static inline void toi_alloc_exit(void) { } - -static inline void toi_alloc_print_debug_stats(void) { } - -#endif - -extern int toi_trace_allocs; diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c deleted file mode 100644 index 5845217f8..000000000 --- a/kernel/power/tuxonice_atomic_copy.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.c - * - * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/cpu.h> -#include <linux/freezer.h> -#include <linux/console.h> -#include <linux/syscore_ops.h> -#include <linux/ftrace.h> -#include <asm/suspend.h> -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_power_off.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" - -unsigned long extra_pd1_pages_used; - -/** - * free_pbe_list - free page backup entries used by the atomic copy code. - * @list: List to free. - * @highmem: Whether the list is in highmem. - * - * Normally, this function isn't used. If, however, we need to abort before - * doing the atomic copy, we use this to free the pbes previously allocated. 
- **/ -static void free_pbe_list(struct pbe **list, int highmem) -{ - while (*list) { - int i; - struct pbe *free_pbe, *next_page = NULL; - struct page *page; - - if (highmem) { - page = (struct page *) *list; - free_pbe = (struct pbe *) kmap(page); - } else { - page = virt_to_page(*list); - free_pbe = *list; - } - - for (i = 0; i < PBES_PER_PAGE; i++) { - if (!free_pbe) - break; - if (highmem) - toi__free_page(29, free_pbe->address); - else - toi_free_page(29, - (unsigned long) free_pbe->address); - free_pbe = free_pbe->next; - } - - if (highmem) { - if (free_pbe) - next_page = free_pbe; - kunmap(page); - } else { - if (free_pbe) - next_page = free_pbe; - } - - toi__free_page(29, page); - *list = (struct pbe *) next_page; - }; -} - -/** - * copyback_post - post atomic-restore actions - * - * After doing the atomic restore, we have a few more things to do: - * 1) We want to retain some values across the restore, so we now copy - * these from the nosave variables to the normal ones. - * 2) Set the status flags. - * 3) Resume devices. - * 4) Tell userui so it can redraw & restore settings. - * 5) Reread the page cache. - **/ -void copyback_post(void) -{ - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - toi_post_atomic_restore_modules(bkd); - - toi_cond_pause(1, "About to reload secondary pagedir."); - - if (read_pageset2(0)) - panic("Unable to successfully reread the page cache."); - - /* - * If the user wants to sleep again after resuming from full-off, - * it's most likely to be in order to suspend to ram, so we'll - * do this check after loading pageset2, to give them the fastest - * wakeup when they are ready to use the computer again. - */ - toi_check_resleep(); - - if (test_action_state(TOI_INCREMENTAL_IMAGE)) - toi_reset_dirtiness(1); -} - -/** - * toi_copy_pageset1 - do the atomic copy of pageset1 - * - * Make the atomic copy of pageset1. We can't use copy_page (as we once did) - * because we can't be sure what side effects it has. On my old Duron, with - * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt - * count at resume time 4 instead of 3. - * - * We don't want to call kmap_atomic unconditionally because it has the side - * effect of incrementing the preempt count, which will leave it one too high - * post resume (the page containing the preempt count will be copied after - * its incremented. This is essentially the same problem. - **/ -void toi_copy_pageset1(void) -{ - int i; - unsigned long source_index, dest_index; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - - for (i = 0; i < pagedir1.size; i++) { - unsigned long *origvirt, *copyvirt; - struct page *origpage, *copypage; - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1, - was_present1, was_present2; - - origpage = pfn_to_page(source_index); - copypage = pfn_to_page(dest_index); - - origvirt = PageHighMem(origpage) ? - kmap_atomic(origpage) : - page_address(origpage); - - copyvirt = PageHighMem(copypage) ? 
- kmap_atomic(copypage) : - page_address(copypage); - - was_present1 = kernel_page_present(origpage); - if (!was_present1) - kernel_map_pages(origpage, 1, 1); - - was_present2 = kernel_page_present(copypage); - if (!was_present2) - kernel_map_pages(copypage, 1, 1); - - while (loop >= 0) { - *(copyvirt + loop) = *(origvirt + loop); - loop--; - } - - if (!was_present1) - kernel_map_pages(origpage, 1, 0); - - if (!was_present2) - kernel_map_pages(copypage, 1, 0); - - if (PageHighMem(origpage)) - kunmap_atomic(origvirt); - - if (PageHighMem(copypage)) - kunmap_atomic(copyvirt); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - } -} - -/** - * __toi_post_context_save - steps after saving the cpu context - * - * Steps taken after saving the CPU state to make the actual - * atomic copy. - * - * Called from swsusp_save in snapshot.c via toi_post_context_save. - **/ -int __toi_post_context_save(void) -{ - unsigned long old_ps1_size = pagedir1.size; - - check_checksums(); - - free_checksum_pages(); - - toi_recalculate_image_contents(1); - - extra_pd1_pages_used = pagedir1.size > old_ps1_size ? - pagedir1.size - old_ps1_size : 0; - - if (extra_pd1_pages_used > extra_pd1_pages_allowance) { - printk(KERN_INFO "Pageset1 has grown by %lu pages. " - "extra_pages_allowance is currently only %lu.\n", - pagedir1.size - old_ps1_size, - extra_pd1_pages_allowance); - - /* - * Highlevel code will see this, clear the state and - * retry if we haven't already done so twice. - */ - if (any_to_free(1)) { - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - if (try_allocate_extra_memory()) { - printk(KERN_INFO "Failed to allocate the extra memory" - " needed. Restarting the process."); - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - printk(KERN_INFO "However it looks like there's enough" - " free ram and storage to handle this, so " - " continuing anyway."); - /* - * What if try_allocate_extra_memory above calls - * toi_allocate_extra_pagedir_memory and it allocs a new - * slab page via toi_kzalloc which should be in ps1? So... - */ - toi_recalculate_image_contents(1); - } - - if (!test_action_state(TOI_TEST_FILTER_SPEED) && - !test_action_state(TOI_TEST_BIO)) - toi_copy_pageset1(); - - return 0; -} - -/** - * toi_hibernate - high level code for doing the atomic copy - * - * High-level code which prepares to do the atomic copy. Loosely based - * on the swsusp version, but with the following twists: - * - We set toi_running so the swsusp code uses our code paths. - * - We give better feedback regarding what goes wrong if there is a - * problem. - * - We use an extra function to call the assembly, just in case this code - * is in a module (return address). - **/ -int toi_hibernate(void) -{ - int error; - - error = toi_lowlevel_builtin(); - - if (!error) { - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - /* - * The boot kernel's data may be larger (newer version) or - * smaller (older version) than ours. Copy the minimum - * of the two sizes, so that we don't overwrite valid values - * from pre-atomic copy. - */ - - memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, - min_t(int, sizeof(struct toi_boot_kernel_data), - bkd->size)); - } - - return error; -} - -/** - * toi_atomic_restore - prepare to do the atomic restore - * - * Get ready to do the atomic restore. 
This part gets us into the same - * state we are in prior to do calling do_toi_lowlevel while - * hibernating: hot-unplugging secondary cpus and freeze processes, - * before starting the thread that will do the restore. - **/ -int toi_atomic_restore(void) -{ - int error; - - toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore."); - - memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line, - strlen(saved_command_line)); - - toi_pre_atomic_restore_modules(&toi_bkd); - - if (add_boot_kernel_data_pbe()) - goto Failed; - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - if (toi_go_atomic(PMSG_QUIESCE, 0)) - goto Failed; - - /* We'll ignore saved state, but this gets preempt count (etc) right */ - save_processor_state(); - - error = swsusp_arch_resume(); - /* - * Code below is only ever reached in case of failure. Otherwise - * execution continues at place where swsusp_arch_suspend was called. - * - * We don't know whether it's safe to continue (this shouldn't happen), - * so lets err on the side of caution. - */ - BUG(); - -Failed: - free_pbe_list(&restore_pblist, 0); -#ifdef CONFIG_HIGHMEM - free_pbe_list(&restore_highmem_pblist, 1); -#endif - return 1; -} - -/** - * toi_go_atomic - do the actual atomic copy/restore - * @state: The state to use for dpm_suspend_start & power_down calls. - * @suspend_time: Whether we're suspending or resuming. - **/ -int toi_go_atomic(pm_message_t state, int suspend_time) -{ - if (suspend_time) { - if (platform_begin(1)) { - set_abort_result(TOI_PLATFORM_PREP_FAILED); - toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); - return 1; - } - - if (dpm_prepare(PMSG_FREEZE)) { - set_abort_result(TOI_DPM_PREPARE_FAILED); - dpm_complete(PMSG_RECOVER); - toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); - return 1; - } - } - - suspend_console(); - pm_restrict_gfp_mask(); - - if (suspend_time) { - if (dpm_suspend(state)) { - set_abort_result(TOI_DPM_SUSPEND_FAILED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); - return 1; - } - } else { - if (dpm_suspend_start(state)) { - set_abort_result(TOI_DPM_SUSPEND_FAILED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); - return 1; - } - } - - /* At this point, dpm_suspend_start() has been called, but *not* - * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. 
- */ - - if (dpm_suspend_end(state)) { - set_abort_result(TOI_DEVICE_REFUSED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); - return 1; - } - - if (suspend_time) { - if (platform_pre_snapshot(1)) - set_abort_result(TOI_PRE_SNAPSHOT_FAILED); - } else { - if (platform_pre_restore(1)) - set_abort_result(TOI_PRE_RESTORE_FAILED); - } - - if (test_result_state(TOI_ABORTED)) { - toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); - return 1; - } - - if (disable_nonboot_cpus()) { - set_abort_result(TOI_CPU_HOTPLUG_FAILED); - toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, - suspend_time, 1); - return 1; - } - - local_irq_disable(); - - if (syscore_suspend()) { - set_abort_result(TOI_SYSCORE_REFUSED); - toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); - return 1; - } - - if (suspend_time && pm_wakeup_pending()) { - set_abort_result(TOI_WAKEUP_EVENT); - toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); - return 1; - } - return 0; -} - -/** - * toi_end_atomic - post atomic copy/restore routines - * @stage: What step to start at. - * @suspend_time: Whether we're suspending or resuming. - * @error: Whether we're recovering from an error. - **/ -void toi_end_atomic(int stage, int suspend_time, int error) -{ - pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) : - PMSG_RESTORE; - - switch (stage) { - case ATOMIC_ALL_STEPS: - if (!suspend_time) { - events_check_enabled = false; - } - platform_leave(1); - case ATOMIC_STEP_SYSCORE_RESUME: - syscore_resume(); - case ATOMIC_STEP_IRQS: - local_irq_enable(); - case ATOMIC_STEP_CPU_HOTPLUG: - enable_nonboot_cpus(); - case ATOMIC_STEP_PLATFORM_FINISH: - if (!suspend_time && error & 2) - platform_restore_cleanup(1); - else - platform_finish(1); - dpm_resume_start(msg); - case ATOMIC_STEP_DEVICE_RESUME: - if (suspend_time && (error & 2)) - platform_recover(1); - dpm_resume(msg); - if (!toi_in_suspend()) { - dpm_resume_end(PMSG_RECOVER); - } - if (error || !toi_in_suspend()) { - pm_restore_gfp_mask(); - } - resume_console(); - case ATOMIC_STEP_DPM_COMPLETE: - dpm_complete(msg); - case ATOMIC_STEP_PLATFORM_END: - platform_end(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); - } -} diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h deleted file mode 100644 index e2d2b4fb3..000000000 --- a/kernel/power/tuxonice_atomic_copy.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.h - * - * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -enum { - ATOMIC_ALL_STEPS, - ATOMIC_STEP_SYSCORE_RESUME, - ATOMIC_STEP_IRQS, - ATOMIC_STEP_CPU_HOTPLUG, - ATOMIC_STEP_PLATFORM_FINISH, - ATOMIC_STEP_DEVICE_RESUME, - ATOMIC_STEP_DPM_COMPLETE, - ATOMIC_STEP_PLATFORM_END, -}; - -int toi_go_atomic(pm_message_t state, int toi_time); -void toi_end_atomic(int stage, int toi_time, int error); - -extern void platform_recover(int platform_mode); diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h deleted file mode 100644 index 2f717f5c5..000000000 --- a/kernel/power/tuxonice_bio.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * kernel/power/tuxonice_bio.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
- */ - -#include <linux/buffer_head.h> -#include "tuxonice_extent.h" - -void toi_put_extent_chain(struct hibernate_extent_chain *chain); -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end); - -struct hibernate_extent_saved_state { - int extent_num; - struct hibernate_extent *extent_ptr; - unsigned long offset; -}; - -struct toi_bdev_info { - struct toi_bdev_info *next; - struct hibernate_extent_chain blocks; - struct block_device *bdev; - struct toi_module_ops *allocator; - int allocator_index; - struct hibernate_extent_chain allocations; - char name[266]; /* "swap on " or "file " + up to 256 chars */ - - /* Saved in header */ - char uuid[17]; - dev_t dev_t; - int prio; - int bmap_shift; - int blocks_per_page; - unsigned long pages_used; - struct hibernate_extent_saved_state saved_state[4]; -}; - -struct toi_extent_iterate_state { - struct toi_bdev_info *current_chain; - int num_chains; - int saved_chain_number[4]; - struct toi_bdev_info *saved_chain_ptr[4]; -}; - -/* - * Our exported interface so the swapwriter and filewriter don't - * need these functions duplicated. - */ -struct toi_bio_ops { - int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, - struct page *page); - int (*register_storage)(struct toi_bdev_info *new); - void (*free_storage)(void); -}; - -struct toi_allocator_ops { - unsigned long (*toi_swap_storage_available) (void); -}; - -extern struct toi_bio_ops toi_bio_ops; - -extern char *toi_writer_buffer; -extern int toi_writer_buffer_posn; - -struct toi_bio_allocator_ops { - int (*register_storage) (void); - unsigned long (*storage_available)(void); - int (*allocate_storage) (struct toi_bdev_info *, unsigned long); - int (*bmap) (struct toi_bdev_info *); - void (*free_storage) (struct toi_bdev_info *); - unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used); -}; - -extern int toi_bio_register_storage(void); diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c deleted file mode 100644 index 11cd37f77..000000000 --- a/kernel/power/tuxonice_bio_chains.c +++ /dev/null @@ -1,1121 +0,0 @@ -/* - * kernel/power/tuxonice_bio_devinfo.c - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - */ - -#include <linux/mm_types.h> -#include "tuxonice_bio.h" -#include "tuxonice_bio_internal.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" -#include "tuxonice_io.h" - -static struct toi_bdev_info *prio_chain_head; -static int num_chains; - -/* Pointer to current entry being loaded/saved. */ -struct toi_extent_iterate_state toi_writer_posn; - -#define metadata_size (sizeof(struct toi_bdev_info) - \ - offsetof(struct toi_bdev_info, uuid)) - -/* - * After section 0 (header) comes 2 => next_section[0] = 2 - */ -static int next_section[3] = { 2, 3, 1 }; - -/** - * dump_block_chains - print the contents of the bdev info array. - **/ -void dump_block_chains(void) -{ - int i = 0; - int j; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - struct hibernate_extent *this = cur_chain->blocks.first; - - printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); - - while (this) { - printk(KERN_CONT " [%lu-%lu]%s", this->start, - this->end, this->next ? 
"," : ""); - this = this->next; - } - - printk("\n"); - cur_chain = cur_chain->next; - i++; - } - - printk(KERN_DEBUG "Saved states:\n"); - for (i = 0; i < 4; i++) { - printk(KERN_DEBUG "Slot %d: Chain %d.\n", - i, toi_writer_posn.saved_chain_number[i]); - - cur_chain = prio_chain_head; - j = 0; - while (cur_chain) { - printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", - j, cur_chain->saved_state[i].extent_num, - cur_chain->saved_state[i].offset); - cur_chain = cur_chain->next; - j++; - } - printk(KERN_CONT "\n"); - } -} - -/** - * - **/ -static void toi_extent_chain_next(void) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain; - - if (!this->blocks.current_extent) - return; - - if (this->blocks.current_offset == this->blocks.current_extent->end) { - if (this->blocks.current_extent->next) { - this->blocks.current_extent = - this->blocks.current_extent->next; - this->blocks.current_offset = - this->blocks.current_extent->start; - } else { - this->blocks.current_extent = NULL; - this->blocks.current_offset = 0; - } - } else - this->blocks.current_offset++; -} - -/** - * - */ - -static struct toi_bdev_info *__find_next_chain_same_prio(void) -{ - struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; - struct toi_bdev_info *this = start_chain; - int orig_prio = this->prio; - - do { - this = this->next; - - if (!this) - this = prio_chain_head; - - /* Back on original chain? Use it again. */ - if (this == start_chain) - return start_chain; - - } while (!this->blocks.current_extent || this->prio != orig_prio); - - return this; -} - -static void find_next_chain(void) -{ - struct toi_bdev_info *this; - - this = __find_next_chain_same_prio(); - - /* - * If we didn't get another chain of the same priority that we - * can use, look for the next priority. - */ - while (this && !this->blocks.current_extent) - this = this->next; - - toi_writer_posn.current_chain = this; -} - -/** - * toi_extent_state_next - go to the next extent - * @blocks: The number of values to progress. - * @stripe_mode: Whether to spread usage across all chains. - * - * Given a state, progress to the next valid entry. We may begin in an - * invalid state, as we do when invoked after extent_state_goto_start below. - * - * When using compression and expected_compression > 0, we let the image size - * be larger than storage, so we can validly run out of data to return. - **/ -static unsigned long toi_extent_state_next(int blocks, int current_stream) -{ - int i; - - if (!toi_writer_posn.current_chain) - return -ENOSPC; - - /* Assume chains always have lengths that are multiples of @blocks */ - for (i = 0; i < blocks; i++) - toi_extent_chain_next(); - - /* The header stream is not striped */ - if (current_stream || - !toi_writer_posn.current_chain->blocks.current_extent) - find_next_chain(); - - return toi_writer_posn.current_chain ? 
0 : -ENOSPC; -} - -static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) -{ - struct toi_bdev_info **prev_ptr; - struct toi_bdev_info *cur; - - /* Loop through the existing chain, finding where to insert it */ - prev_ptr = &prio_chain_head; - cur = prio_chain_head; - - while (cur && cur->prio >= this->prio) { - prev_ptr = &cur->next; - cur = cur->next; - } - - this->next = *prev_ptr; - *prev_ptr = this; - - this = prio_chain_head; - while (this) - this = this->next; - num_chains++; -} - -/** - * toi_extent_state_goto_start - reinitialize an extent chain iterator - * @state: Iterator to reinitialize - **/ -void toi_extent_state_goto_start(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current extent to %p.", this->blocks.first); - this->blocks.current_extent = this->blocks.first; - if (this->blocks.current_extent) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current offset to %lu.", - this->blocks.current_extent->start); - this->blocks.current_offset = - this->blocks.current_extent->start; - } - - this = this->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", - prio_chain_head); - toi_writer_posn.current_chain = prio_chain_head; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); -} - -/** - * toi_extent_state_save - save state of the iterator - * @state: Current state of the chain - * @saved_state: Iterator to populate - * - * Given a state and a struct hibernate_extent_state_store, save the current - * position in a format that can be used with relocated chains (at - * resume time). - **/ -void toi_extent_state_save(int slot) -{ - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent *extent; - struct hibernate_extent_saved_state *chain_state; - int i = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", - slot); - - if (!toi_writer_posn.current_chain) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " - "chain_num = -1."); - toi_writer_posn.saved_chain_number[slot] = -1; - return; - } - - while (cur_chain) { - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - chain_state->offset = cur_chain->blocks.current_offset; - - if (toi_writer_posn.current_chain == cur_chain) { - toi_writer_posn.saved_chain_number[slot] = i; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " - "we were on => chain_num is %d.", i); - } - - if (!cur_chain->blocks.current_extent) { - chain_state->extent_num = 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " - "for this chain => extent_num %d is 0.", - i); - cur_chain = cur_chain->next; - continue; - } - - extent = cur_chain->blocks.first; - chain_state->extent_num = 1; - - while (extent != cur_chain->blocks.current_extent) { - chain_state->extent_num++; - extent = extent->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, - chain_state->extent_num); - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Completed saving extent state slot %d.", slot); -} - -/** - * toi_extent_state_restore - restore the position saved by extent_state_save - * @state: State to populate - * @saved_state: Iterator saved to restore - **/ -void toi_extent_state_restore(int slot) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent_saved_state 
*chain_state; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "toi_extent_state_restore - slot %d.", slot); - - if (toi_writer_posn.saved_chain_number[slot] == -1) { - toi_writer_posn.current_chain = NULL; - return; - } - - while (cur_chain) { - int posn; - int j; - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - posn = chain_state->extent_num; - - cur_chain->blocks.current_extent = cur_chain->blocks.first; - cur_chain->blocks.current_offset = chain_state->offset; - - if (i == toi_writer_posn.saved_chain_number[slot]) { - toi_writer_posn.current_chain = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found current chain."); - } - - for (j = 0; j < 4; j++) - if (i == toi_writer_posn.saved_chain_number[j]) { - toi_writer_posn.saved_chain_ptr[j] = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found saved chain ptr %d (%p) (offset" - " %d).", j, cur_chain, - cur_chain->saved_state[j].offset); - } - - if (posn) { - while (--posn) - cur_chain->blocks.current_extent = - cur_chain->blocks.current_extent->next; - } else - cur_chain->blocks.current_extent = NULL; - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); -} - -/* - * Storage needed - * - * Returns amount of space in the image header required - * for the chain data. This ignores the links between - * pages, which we factor in when allocating the space. - */ -int toi_bio_devinfo_storage_needed(void) -{ - int result = sizeof(num_chains); - struct toi_bdev_info *chain = prio_chain_head; - - while (chain) { - result += metadata_size; - - /* Chain size */ - result += sizeof(int); - - /* Extents */ - result += (2 * sizeof(unsigned long) * - chain->blocks.num_extents); - - chain = chain->next; - } - - result += 4 * sizeof(int); - return result; -} - -static unsigned long chain_pages_used(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this = chain->blocks.first; - struct hibernate_extent_saved_state *state = &chain->saved_state[3]; - unsigned long size = 0; - int extent_idx = 1; - - if (!state->extent_num) { - if (!this) - return 0; - else - return chain->blocks.size; - } - - while (extent_idx < state->extent_num) { - size += (this->end - this->start + 1); - this = this->next; - extent_idx++; - } - - /* We didn't use the one we're sitting on, so don't count it */ - return size + state->offset - this->start; -} - -void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain) -{ - unsigned long used = chain_pages_used(chain); - - /* Free the storage */ - unsigned long first_freed = 0; - - if (chain->allocator->bio_allocator_ops->free_unused_storage) - first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used); - - printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed); - - /* Adjust / free the extents. */ - toi_put_extent_chain_from(&chain->blocks, first_freed); - - { - struct hibernate_extent *this = chain->blocks.first; - while (this) { - printk("Extent %lx-%lx.\n", this->start, this->end); - this = this->next; - } - } -} - -/** - * toi_serialise_extent_chain - write a chain in the image - * @chain: Chain to write. 
- **/ -static int toi_serialise_extent_chain(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this; - int ret; - int i = 1; - - chain->pages_used = chain_pages_used(chain); - - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", - chain->dev_t); - /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->uuid, metadata_size); - if (ret) - return ret; - - /* Num extents */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) - return ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - this = chain->blocks.first; - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); - ret = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) this, 2 * sizeof(this->start)); - if (ret) - return ret; - this = this->next; - i++; - } - - return ret; -} - -int toi_serialise_extent_chains(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - /* Write the number of chains */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", - num_chains); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, (char *) &num_chains, - sizeof(int)); - if (result) - return result; - - /* Then the chains themselves */ - while (this) { - result = toi_serialise_extent_chain(this); - if (result) - return result; - this = this->next; - } - - /* - * Finally, the chain we should be on at the start of each - * section. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -int toi_register_storage_chain(struct toi_bdev_info *new) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", - new); - toi_insert_chain_in_prio_list(new); - return 0; -} - -static void free_bdev_info(struct toi_bdev_info *chain) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); - toi_put_extent_chain(&chain->blocks); - - /* - * The allocator may need to do more than just free the chains - * (swap_free, for example). Don't call from boot kernel. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); - if (chain->allocator) - chain->allocator->bio_allocator_ops->free_storage(chain); - - /* - * Dropping out of reading atomic copy? Need to undo - * toi_open_by_devnum. 
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); - if (chain->bdev && !IS_ERR(chain->bdev) && - chain->bdev != resume_block_device && - chain->bdev != header_block_device && - test_toi_state(TOI_TRYING_TO_RESUME)) - toi_close_bdev(chain->bdev); - - /* Poison */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); - toi_kfree(39, chain, sizeof(*chain)); - - if (prio_chain_head == chain) - prio_chain_head = NULL; - - num_chains--; -} - -void free_all_bdev_info(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - struct toi_bdev_info *next = this->next; - free_bdev_info(this); - this = next; - } - - memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); - prio_chain_head = NULL; -} - -static void set_up_start_position(void) -{ - toi_writer_posn.current_chain = prio_chain_head; - go_next_page(0, 0); -} - -/** - * toi_load_extent_chain - read back a chain saved in the image - * @chain: Chain to load - * - * The linked list of extents is reconstructed from the disk. chain will point - * to the first entry. - **/ -int toi_load_extent_chain(int index, int *num_loaded) -{ - struct toi_bdev_info *chain = toi_kzalloc(39, - sizeof(struct toi_bdev_info), GFP_ATOMIC); - struct hibernate_extent *this, *last = NULL; - int i, ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); - /* Get dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->uuid, metadata_size); - - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_bkd.pages_used[index] = chain->pages_used; - - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - for (i = 0; i < chain->blocks.num_extents; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); - - this = toi_kzalloc(2, sizeof(struct hibernate_extent), - TOI_ATOMIC_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate a new extent.\n"); - free_bdev_info(chain); - return -ENOMEM; - } - this->next = NULL; - /* Get the next page */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - NULL, (char *) this, 2 * sizeof(this->start)); - if (ret) { - printk(KERN_INFO "Failed to read an extent.\n"); - toi_kfree(2, this, sizeof(struct hibernate_extent)); - free_bdev_info(chain); - return 1; - } - - if (last) - last->next = this; - else { - char b1[32], b2[32], b3[32]; - /* - * Open the bdev - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Chain dev_t is %s. Resume dev t is %s. 
Header" - " bdev_t is %s.\n", - format_dev_t(b1, chain->dev_t), - format_dev_t(b2, resume_dev_t), - format_dev_t(b3, toi_sig_data->header_dev_t)); - - if (chain->dev_t == resume_dev_t) - chain->bdev = resume_block_device; - else if (chain->dev_t == toi_sig_data->header_dev_t) - chain->bdev = header_block_device; - else { - chain->bdev = toi_open_bdev(chain->uuid, - chain->dev_t, 1); - if (IS_ERR(chain->bdev)) { - free_bdev_info(chain); - return -ENODEV; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " - "is %d and blocks per page is %d.", - chain->bmap_shift, - chain->blocks_per_page); - - chain->blocks.first = this; - - /* - * Couldn't do this earlier, but can't do - * goto_start now - we may have already used blocks - * in the first chain. - */ - chain->blocks.current_extent = this; - chain->blocks.current_offset = this->start; - - /* - * Can't wait until we've read the whole chain - * before we insert it in the list. We might need - * this chain to read the next page in the header - */ - toi_insert_chain_in_prio_list(chain); - } - - /* - * We have to wait until 2 extents are loaded before setting up - * properly because if the first extent has only one page, we - * will need to put the position on the second extent. Sounds - * obvious, but it wasn't! - */ - (*num_loaded)++; - if ((*num_loaded) == 2) - set_up_start_position(); - last = this; - } - - /* - * Shouldn't get empty chains, but it's not impossible. Link them in so - * they get freed properly later. - */ - if (!chain->blocks.num_extents) - toi_insert_chain_in_prio_list(chain); - - if (!chain->blocks.current_extent) { - chain->blocks.current_extent = chain->blocks.first; - if (chain->blocks.current_extent) - chain->blocks.current_offset = - chain->blocks.current_extent->start; - } - return 0; -} - -int toi_load_extent_chains(void) -{ - int result; - int to_load; - int i; - int extents_loaded = 0; - - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &to_load, - sizeof(int)); - if (result) - return result; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); - - for (i = 0; i < to_load; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", - i, to_load); - result = toi_load_extent_chain(i, &extents_loaded); - if (result) - return result; - } - - /* If we never got to a second extent, we still need to do this. */ - if (extents_loaded == 1) - set_up_start_position(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -static int toi_end_of_stream(int writing, int section_barrier) -{ - struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; - int compare_to = next_section[current_stream]; - struct toi_bdev_info *compare_chain = - toi_writer_posn.saved_chain_ptr[compare_to]; - int compare_offset = compare_chain ? 
- compare_chain->saved_state[compare_to].offset : 0; - - if (!section_barrier) - return 0; - - if (!cur_chain) - return 1; - - if (cur_chain == compare_chain && - cur_chain->blocks.current_offset == compare_offset) { - if (writing) { - if (!current_stream) { - debug_broken_header(); - return 1; - } - } else { - more_readahead = 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Reached the end of stream %d " - "(not an error).", current_stream); - return 1; - } - } - - return 0; -} - -/** - * go_next_page - skip blocks to the start of the next page - * @writing: Whether we're reading or writing the image. - * - * Go forward one page. - **/ -int go_next_page(int writing, int section_barrier) -{ - struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; - int max = cur_chain ? cur_chain->blocks_per_page : 1; - - /* Nope. Go foward a page - or maybe two. Don't stripe the header, - * so that bad fragmentation doesn't put the extent data containing - * the location of the second page out of the first header page. - */ - if (toi_extent_state_next(max, current_stream)) { - /* Don't complain if readahead falls off the end */ - if (writing && section_barrier) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. " - "Expected compression ratio too optimistic?"); - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to " - "read/write. (Not necessarily a fatal error."); - return -ENOSPC; - } - - return 0; -} - -int devices_of_same_priority(struct toi_bdev_info *this) -{ - struct toi_bdev_info *check = prio_chain_head; - int i = 0; - - while (check) { - if (check->prio == this->prio) - i++; - check = check->next; - } - - return i; -} - -/** - * toi_bio_rw_page - do i/o on the next disk page in the image - * @writing: Whether reading or writing. - * @page: Page to do i/o on. - * @is_readahead: Whether we're doing readahead - * @free_group: The group used in allocating the page - * - * Submit a page for reading or writing, possibly readahead. - * Pass the group used in allocating the page as well, as it should - * be freed on completion of the bio if we're writing the page. - **/ -int toi_bio_rw_page(int writing, struct page *page, - int is_readahead, int free_group) -{ - int result = toi_end_of_stream(writing, 1); - struct toi_bdev_info *dev_info = toi_writer_posn.current_chain; - - if (result) { - if (writing) - abort_hibernate(TOI_INSUFFICIENT_STORAGE, - "Insufficient storage for your image."); - else - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to " - "read/write another page when stream has " - "ended."); - return -ENOSPC; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "%s %lx:%ld", - writing ? 
"Write" : "Read", - dev_info->dev_t, dev_info->blocks.current_offset); - - result = toi_do_io(writing, dev_info->bdev, - dev_info->blocks.current_offset << dev_info->bmap_shift, - page, is_readahead, 0, free_group); - - /* Ignore the result here - will check end of stream if come in again */ - go_next_page(writing, 1); - - if (result) - printk(KERN_ERR "toi_do_io returned %d.\n", result); - return result; -} - -dev_t get_header_dev_t(void) -{ - return prio_chain_head->dev_t; -} - -struct block_device *get_header_bdev(void) -{ - return prio_chain_head->bdev; -} - -unsigned long get_headerblock(void) -{ - return prio_chain_head->blocks.first->start << - prio_chain_head->bmap_shift; -} - -int get_main_pool_phys_params(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - while (this) { - result = this->allocator->bio_allocator_ops->bmap(this); - if (result) - return result; - this = this->next; - } - - return 0; -} - -static int apply_header_reservation(void) -{ - int i; - - if (!header_pages_reserved) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "No header pages reserved at the moment."); - return 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); - - /* Apply header space reservation */ - toi_extent_state_goto_start(); - - for (i = 0; i < header_pages_reserved; i++) - if (go_next_page(1, 0)) - return -ENOSPC; - - /* The end of header pages will be the start of pageset 2 */ - toi_extent_state_save(2); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Finished applying header reservation."); - return 0; -} - -int toi_bio_register_storage(void) -{ - int result = 0; - struct toi_module_ops *this_module; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Registering storage."); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Registering storage from %s.", - this_module->name); - result = this_module->bio_allocator_ops->register_storage(); - if (result) - break; - } - - return result; -} - -void toi_bio_free_unused_storage(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_bio_free_unused_storage_chain(this); - this = this->next; - } -} - -int toi_bio_allocate_storage(unsigned long request) -{ - struct toi_bdev_info *chain = prio_chain_head; - unsigned long to_get = request; - unsigned long extra_pages, needed; - int no_free = 0; - - if (!chain) { - printk("TuxOnIce: No storage was registered.\n"); - return 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Request is %lu pages.", request); - extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) - + sizeof(int)), PAGE_SIZE); - needed = request + extra_pages + header_pages_reserved; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " - "for header => %lu.", - extra_pages, header_pages_reserved, needed); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.", - raw_pages_allocd); - - to_get = needed > raw_pages_allocd ? 
needed - raw_pages_allocd : 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); - - if (!to_get) - return apply_header_reservation(); - - while (to_get && chain) { - int num_group = devices_of_same_priority(chain); - int divisor = num_group - no_free; - int i; - unsigned long portion = DIV_ROUND_UP(to_get, divisor); - unsigned long got = 0; - unsigned long got_this_round = 0; - struct toi_bdev_info *top = chain; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Start of loop. To get is %lu. Divisor is %d.", - to_get, divisor); - no_free = 0; - - /* - * We're aiming to spread the allocated storage as evenly - * as possible, but we also want to get all the storage we - * can off this priority. - */ - for (i = 0; i < num_group; i++) { - struct toi_bio_allocator_ops *ops = - chain->allocator->bio_allocator_ops; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Asking for %lu pages from chain %p.", - portion, chain); - got = ops->allocate_storage(chain, portion); - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Got %lu pages from allocator %p.", - got, chain); - if (!got) - no_free++; - got_this_round += got; - chain = chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " - "total of %lu pages from %d allocators.", - got_this_round, divisor - no_free); - - raw_pages_allocd += got_this_round; - to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : - 0; - - /* - * If we got anything from chains of this priority and we - * still have storage to allocate, go over this priority - * again. - */ - if (got_this_round && to_get) - chain = top; - else - no_free = 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " - "get_main_pool_phys_params"); - /* Now let swap allocator bmap the pages */ - get_main_pool_phys_params(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. 
Reserving header."); - return apply_header_reservation(); -} - -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - cur_chain->pages_used = bkd->pages_used[i]; - cur_chain = cur_chain->next; - i++; - } -} - -int toi_bio_chains_debug_info(char *buffer, int size) -{ - /* Show what we actually used */ - struct toi_bdev_info *cur_chain = prio_chain_head; - int len = 0; - - while (cur_chain) { - len += scnprintf(buffer + len, size - len, " Used %lu pages " - "from %s.\n", cur_chain->pages_used, - cur_chain->name); - cur_chain = cur_chain->next; - } - - return len; -} - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain, - *cmp = prio_chain_head; - - ptr->save.chain = 1; - while (this != cmp) { - ptr->save.chain++; - cmp = cmp->next; - } - ptr->save.block = this->blocks.current_offset; - - /* Save the raw info internally for quicker access when updating pointers */ - ptr->bdev = this->bdev; - ptr->block = this->blocks.current_offset << this->bmap_shift; -} - -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - int i = ptr->save.chain - 1; - struct toi_bdev_info *this; - struct hibernate_extent *hib; - - /* Find chain by stored index */ - this = prio_chain_head; - while (i) { - this = this->next; - i--; - } - toi_writer_posn.current_chain = this; - - /* Restore block */ - this->blocks.current_offset = ptr->save.block; - - /* Find current offset from block number */ - hib = this->blocks.first; - - while (hib->start > ptr->save.block) { - hib = hib->next; - } - - this->blocks.last_touched = this->blocks.current_extent = hib; -} diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c deleted file mode 100644 index bc27662d7..000000000 --- a/kernel/power/tuxonice_bio_core.c +++ /dev/null @@ -1,1937 +0,0 @@ -/* - * kernel/power/tuxonice_bio.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains block io functions for TuxOnIce. These are - * used by the swapwriter and it is planned that they will also - * be used by the NFSwriter. 
- * - */ - -#include <linux/blkdev.h> -#include <linux/syscalls.h> -#include <linux/suspend.h> -#include <linux/ctype.h> -#include <linux/mount.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -#define MEMORY_ONLY 1 -#define THROTTLE_WAIT 2 - -/* #define MEASURE_MUTEX_CONTENTION */ -#ifndef MEASURE_MUTEX_CONTENTION -#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) -#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) -#else -unsigned long mutex_times[2][2][NR_CPUS]; -#define my_mutex_lock(index, the_lock) do { \ - int have_mutex; \ - have_mutex = mutex_trylock(the_lock); \ - if (!have_mutex) { \ - mutex_lock(the_lock); \ - mutex_times[index][0][smp_processor_id()]++; \ - } else { \ - mutex_times[index][1][smp_processor_id()]++; \ - } - -#define my_mutex_unlock(index, the_lock) \ - mutex_unlock(the_lock); \ -} while (0) -#endif - -static int page_idx, reset_idx; - -static int target_outstanding_io = 1024; -static int max_outstanding_writes, max_outstanding_reads; - -static struct page *bio_queue_head, *bio_queue_tail; -static atomic_t toi_bio_queue_size; -static DEFINE_SPINLOCK(bio_queue_lock); - -static int free_mem_throttle, throughput_throttle; -int more_readahead = 1; -static struct page *readahead_list_head, *readahead_list_tail; - -static struct page *waiting_on; - -static atomic_t toi_io_in_progress, toi_io_done; -static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); - -int current_stream; -/* Not static, so that the allocators can setup and complete - * writing the header */ -char *toi_writer_buffer; -int toi_writer_buffer_posn; - -static DEFINE_MUTEX(toi_bio_mutex); -static DEFINE_MUTEX(toi_bio_readahead_mutex); - -static struct task_struct *toi_queue_flusher; -static int toi_bio_queue_flush_pages(int dedicated_thread); - -struct toi_module_ops toi_blockwriter_ops; - -struct toi_incremental_image_pointer toi_inc_ptr[2][2]; - -#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ - atomic_read(&toi_bio_queue_size)) - -unsigned long raw_pages_allocd, header_pages_reserved; - -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead); - -/** - * set_free_mem_throttle - set the point where we pause to avoid oom. - * - * Initially, this value is zero, but when we first fail to allocate memory, - * we set it (plus a buffer) and thereafter throttle i/o once that limit is - * reached. - **/ -static void set_free_mem_throttle(void) -{ - int new_throttle = nr_free_buffer_pages() + 256; - - if (new_throttle > free_mem_throttle) - free_mem_throttle = new_throttle; -} - -#define NUM_REASONS 7 -static atomic_t reasons[NUM_REASONS]; -static char *reason_name[NUM_REASONS] = { - "readahead not ready", - "bio allocation", - "synchronous I/O", - "toi_bio_get_new_page", - "memory low", - "readahead buffer allocation", - "throughput_throttle", -}; - -/* User Specified Parameters. */ -unsigned long resume_firstblock; -dev_t resume_dev_t; -struct block_device *resume_block_device; -static atomic_t resume_bdev_open_count; - -struct block_device *header_block_device; - -/** - * toi_open_bdev: Open a bdev at resume time. - * - * index: The swap index. 
May be MAX_SWAPFILES for the resume_dev_t
- * (the user can have resume= pointing at a swap partition/file that isn't
- * swapon'd when they hibernate). MAX_SWAPFILES+1 for the first page of the
- * header. It will be from a swap partition that was enabled when we hibernated,
- * but we don't know its real index until we read that first page.
- * dev_t: The device major/minor.
- * display_errs: Whether to display error messages (when 0, fail quietly).
- *
- * We stored a dev_t in the image header. Open the matching device without
- * requiring /dev/<whatever> in most cases and record the details needed
- * to close it later and avoid duplicating work.
- */
-struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
- int display_errs)
-{
- struct block_device *bdev;
- dev_t device = default_device;
- char buf[32];
- int retried = 0;
-
-retry:
- if (uuid) {
- struct fs_info seek;
- strncpy((char *) &seek.uuid, uuid, 16);
- seek.dev_t = 0;
- seek.last_mount_size = 0;
- device = blk_lookup_fs_info(&seek);
- if (!device) {
- device = default_device;
- printk(KERN_DEBUG "Unable to resolve uuid. Falling back"
- " to dev_t.\n");
- } else
- printk(KERN_DEBUG "Resolved uuid to device %s.\n",
- format_dev_t(buf, device));
- }
-
- if (!device) {
- printk(KERN_ERR "TuxOnIce attempting to open a "
- "blank dev_t!\n");
- dump_stack();
- return NULL;
- }
- bdev = toi_open_by_devnum(device);
-
- if (IS_ERR(bdev) || !bdev) {
- if (!retried) {
- retried = 1;
- wait_for_device_probe();
- goto retry;
- }
- if (display_errs)
- toi_early_boot_message(1, TOI_CONTINUE_REQ,
- "Failed to get access to block device "
- "\"%x\" (error %d).\n Maybe you need "
- "to run mknod and/or lvmsetup in an "
- "initrd/ramfs?", device, bdev);
- return ERR_PTR(-EINVAL);
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "TuxOnIce got bdev %p for dev_t %x.",
- bdev, device);
-
- return bdev;
-}
-
-static void toi_bio_reserve_header_space(unsigned long request)
-{
- header_pages_reserved = request;
-}
-
-/**
- * do_bio_wait - wait for some TuxOnIce I/O to complete
- * @reason: The array index of the reason we're waiting.
- *
- * Wait for a particular page of I/O if we're after a particular page.
- * If we're not after a particular page, wait instead for all in flight
- * I/O to be completed or for us to have enough free memory to be able
- * to submit more I/O.
- *
- * If we wait, we also update our statistics regarding why we waited.
- **/
-static void do_bio_wait(int reason)
-{
- struct page *was_waiting_on = waiting_on;
-
- /* On SMP, waiting_on can be reset, so we make a copy */
- if (was_waiting_on) {
- wait_on_page_locked(was_waiting_on);
- atomic_inc(&reasons[reason]);
- } else {
- atomic_inc(&reasons[reason]);
-
- wait_event(num_in_progress_wait,
- !atomic_read(&toi_io_in_progress) ||
- nr_free_buffer_pages() > free_mem_throttle);
- }
-}
-
-/**
- * throttle_if_needed - wait for I/O completion if throttle points are reached
- * @flags: What to check and how to act.
- *
- * Check whether we need to wait for some I/O to complete. We always check
- * whether we have enough memory available, but may also (depending upon
- * @flags) check if the throughput throttle limit has been reached.
- **/
-static int throttle_if_needed(int flags)
-{
- int free_pages = nr_free_buffer_pages();
-
- /* Getting low on memory and I/O is in progress?
*/
- while (unlikely(free_pages < free_mem_throttle) &&
- atomic_read(&toi_io_in_progress) &&
- !test_result_state(TOI_ABORTED)) {
- if (!(flags & THROTTLE_WAIT))
- return -ENOMEM;
- do_bio_wait(4);
- free_pages = nr_free_buffer_pages();
- }
-
- while (!(flags & MEMORY_ONLY) && throughput_throttle &&
- TOTAL_OUTSTANDING_IO >= throughput_throttle &&
- !test_result_state(TOI_ABORTED)) {
- int result = toi_bio_queue_flush_pages(0);
- if (result)
- return result;
- atomic_inc(&reasons[6]);
- wait_event(num_in_progress_wait,
- !atomic_read(&toi_io_in_progress) ||
- TOTAL_OUTSTANDING_IO < throughput_throttle);
- }
-
- return 0;
-}
-
-/**
- * update_throughput_throttle - update the raw throughput throttle
- * @jif_index: The number of times this function has been called.
- *
- * This function is called four times per second by the core, and is used to
- * limit the amount of I/O we submit at once, spreading out our waiting through
- * the whole job and letting userui get an opportunity to do its work.
- *
- * We don't start limiting I/O until 1/4s has gone so that we get a
- * decent sample for our initial limit, and keep updating it because
- * throughput may vary (on rotating media, e.g.) with our block number.
- *
- * We throttle to 1/10s worth of I/O.
- **/
-static void update_throughput_throttle(int jif_index)
-{
- int done = atomic_read(&toi_io_done);
- throughput_throttle = done * 2 / 5 / jif_index;
-}
-
-/**
- * toi_finish_all_io - wait for all outstanding i/o to complete
- *
- * Flush any queued but unsubmitted I/O and wait for it all to complete.
- **/
-static int toi_finish_all_io(void)
-{
- int result = toi_bio_queue_flush_pages(0);
- toi_bio_queue_flusher_should_finish = 1;
- wake_up(&toi_io_queue_flusher);
- wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO);
- return result;
-}
-
-/**
- * toi_end_bio - bio completion function.
- * @bio: bio that has completed.
- *
- * Function called by the block driver from interrupt context when I/O is
- * completed. If we were writing the page, we want to free it and will have
- * set bio->bi_private to the parameter we should use in telling the page
- * allocation accounting code what the page was allocated for. If we're
- * reading the page, it will be in the singly linked list made from
- * page->private pointers.
- **/
-static void toi_end_bio(struct bio *bio)
-{
- struct page *page = bio->bi_io_vec[0].bv_page;
-
- BUG_ON(bio->bi_error);
-
- unlock_page(page);
- bio_put(bio);
-
- if (waiting_on == page)
- waiting_on = NULL;
-
- put_page(page);
-
- if (bio->bi_private)
- toi__free_page((int) ((unsigned long) bio->bi_private), page);
-
- bio_put(bio);
-
- atomic_dec(&toi_io_in_progress);
- atomic_inc(&toi_io_done);
-
- wake_up(&num_in_progress_wait);
-}
-
-/**
- * submit - submit BIO request
- * @writing: READ or WRITE.
- * @dev: The block device we're using.
- * @first_block: The first sector we're using.
- * @page: The page being used for I/O.
- * @free_group: If writing, the group that was used in allocating the page
- * and which will be used in freeing the page from the completion
- * routine.
- *
- * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the
- * textbook - allocate and initialize the bio. If we're writing, make sure
- * the page is marked as dirty. Then submit it and carry on."
- *
- * If we're just testing the speed of our own code, we fake having done all
- * the hard work and call toi_end_bio immediately.
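- *
- * Note that the bio_get() below takes a second reference on the bio,
- * which is why toi_end_bio() above calls bio_put() twice: once for the
- * reference from bio_alloc() and once for the reference taken here.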
- **/ -static int submit(int writing, struct block_device *dev, sector_t first_block, - struct page *page, int free_group) -{ - struct bio *bio = NULL; - int cur_outstanding_io, result; - - /* - * Shouldn't throttle if reading - can deadlock in the single - * threaded case as pages are only freed when we use the - * readahead. - */ - if (writing) { - result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); - if (result) - return result; - } - - while (!bio) { - bio = bio_alloc(TOI_ATOMIC_GFP, 1); - if (!bio) { - set_free_mem_throttle(); - do_bio_wait(1); - } - } - - bio->bi_bdev = dev; - bio->bi_iter.bi_sector = first_block; - bio->bi_private = (void *) ((unsigned long) free_group); - bio->bi_end_io = toi_end_bio; - bio_set_flag(bio, BIO_TOI); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", - (unsigned long long) first_block); - bio_put(bio); - return -EFAULT; - } - - bio_get(bio); - - cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); - if (writing) { - if (cur_outstanding_io > max_outstanding_writes) - max_outstanding_writes = cur_outstanding_io; - } else { - if (cur_outstanding_io > max_outstanding_reads) - max_outstanding_reads = cur_outstanding_io; - } - - /* Still read the header! */ - if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { - /* Fake having done the hard work */ - bio->bi_error = 0; - toi_end_bio(bio); - } else - submit_bio(writing | REQ_SYNC, bio); - - return 0; -} - -/** - * toi_do_io: Prepare to do some i/o on a page and submit or batch it. - * - * @writing: Whether reading or writing. - * @bdev: The block device which we're using. - * @block0: The first sector we're reading or writing. - * @page: The page on which I/O is being done. - * @readahead_index: If doing readahead, the index (reset this flag when done). - * @syncio: Whether the i/o is being done synchronously. - * - * Prepare and start a read or write operation. - * - * Note that we always work with our own page. If writing, we might be given a - * compression buffer that will immediately be used to start compressing the - * next page. For reading, we do readahead and therefore don't know the final - * address where the data needs to go. - **/ -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group) -{ - page->private = 0; - - /* Do here so we don't race against toi_bio_get_next_page_read */ - lock_page(page); - - if (is_readahead) { - if (readahead_list_head) - readahead_list_tail->private = (unsigned long) page; - else - readahead_list_head = page; - - readahead_list_tail = page; - } - - /* Done before submitting to avoid races. */ - if (syncio) - waiting_on = page; - - /* Submit the page */ - get_page(page); - - if (submit(writing, bdev, block0, page, free_group)) - return -EFAULT; - - if (syncio) - do_bio_wait(2); - - return 0; -} - -/** - * toi_bdev_page_io - simpler interface to do directly i/o on a single page - * @writing: Whether reading or writing. - * @bdev: Block device on which we're operating. - * @pos: Sector at which page to read or write starts. - * @page: Page to be read/written. - * - * A simple interface to submit a page of I/O and wait for its completion. - * The caller must free the page used. 
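- *
- * A minimal usage sketch (illustrative only; the allocation key 12 and
- * the sector value are placeholders, and callers normally reach this
- * through toi_bio_ops.bdev_page_io):
- *
- * char *buf = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP);
- * if (buf) {
- * int ret = toi_bio_ops.bdev_page_io(READ, bdev, sector,
- * virt_to_page(buf));
- * ... use buf if !ret ...
- * toi__free_page(12, virt_to_page(buf));
- * }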
- **/ -static int toi_bdev_page_io(int writing, struct block_device *bdev, - long pos, struct page *page) -{ - return toi_do_io(writing, bdev, pos, page, 0, 1, 0); -} - -/** - * toi_bio_memory_needed - report the amount of memory needed for block i/o - * - * We want to have at least enough memory so as to have target_outstanding_io - * or more transactions on the fly at once. If we can do more, fine. - **/ -static int toi_bio_memory_needed(void) -{ - return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + - sizeof(struct bio)); -} - -/** - * toi_bio_print_debug_stats - put out debugging info in the buffer provided - * @buffer: A buffer of size @size into which text should be placed. - * @size: The size of @buffer. - * - * Fill a buffer with debugging info. This is used for both our debug_info sysfs - * entry and for recording the same info in dmesg. - **/ -static int toi_bio_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - if (toiActiveAllocator != &toi_blockwriter_ops) { - len = scnprintf(buffer, size, - "- Block I/O inactive.\n"); - return len; - } - - len = scnprintf(buffer, size, "- Block I/O active.\n"); - - len += toi_bio_chains_debug_info(buffer + len, size - len); - - len += scnprintf(buffer + len, size - len, - "- Max outstanding reads %d. Max writes %d.\n", - max_outstanding_reads, max_outstanding_writes); - - len += scnprintf(buffer + len, size - len, - " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", - target_outstanding_io, - PAGE_SIZE, (unsigned int) sizeof(struct request), - (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); - -#ifdef MEASURE_MUTEX_CONTENTION - { - int i; - - len += scnprintf(buffer + len, size - len, - " Mutex contention while reading:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[0][0][i], mutex_times[0][1][i]); - - len += scnprintf(buffer + len, size - len, - " Mutex contention while writing:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[1][0][i], mutex_times[1][1][i]); - - } -#endif - - return len + scnprintf(buffer + len, size - len, - " Free mem throttle point reached %d.\n", free_mem_throttle); -} - -static int total_header_bytes; -static int unowned; - -void debug_broken_header(void) -{ - printk(KERN_DEBUG "Image header too big for size allocated!\n"); - print_toi_header_storage_for_modules(); - printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); - printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); - printk(KERN_DEBUG "Total unowned : %d.\n", unowned); - printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, - DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); - printk(KERN_DEBUG "Space needed now : %ld.\n", - get_header_storage_needed(0)); - dump_block_chains(); - abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); -} - -static int toi_bio_update_previous_inc_img_ptr(int stream) -{ - int result; - char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP); - struct page *page; - struct toi_incremental_image_pointer *prev, *this; - - prev = &toi_inc_ptr[stream][0]; - this = &toi_inc_ptr[stream][1]; - - if (!buffer) { - // We're at the start of writing a pageset. Memory should not be that scarce. 
- return -ENOMEM;
- }
-
- page = virt_to_page(buffer);
- result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0);
-
- if (result)
- goto out;
-
- memcpy(buffer, (char *) this, sizeof(this->save));
-
- result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12);
-
- // If the IO is successfully submitted (!result), the page will be freed
- // asynchronously on completion.
-out:
- if (result)
- toi__free_page(12, virt_to_page(buffer));
- return result;
-}
-
-/**
- * toi_write_init_incremental - incremental image part of setting up to write a new section
- */
-static int toi_write_init_incremental(int stream)
-{
- int result = 0;
-
- // Remember the location of this block so we can link to it.
- toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]);
-
- // Update the pointer at the start of the last pageset with the same stream number.
- result = toi_bio_update_previous_inc_img_ptr(stream);
- if (result)
- return result;
-
- // Move the current to the previous slot.
- memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]));
-
- // Store a blank pointer at the start of this incremental pageset
- memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1]));
- result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0);
- if (result)
- return result;
-
- // Serialise extent chains if this is an incremental pageset
- return toi_serialise_extent_chains();
-}
-
-/**
- * toi_read_init_incremental - incremental image part of setting up to read a new section
- */
-static int toi_read_init_incremental(int stream)
-{
- int result;
-
- // Set our position to the start of the next pageset
- toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]);
-
- // Read the start of the next incremental pageset (if any)
- result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0);
-
- if (!result)
- result = toi_load_extent_chains();
-
- return result;
-}
-
-/**
- * toi_rw_init - prepare to read or write a stream in the image
- * @writing: Whether reading or writing.
- * @stream_number: Section of the image being processed.
- *
- * Prepare to read or write a section ('stream') in the image.
- **/
-static int toi_rw_init(int writing, int stream_number)
-{
- if (stream_number)
- toi_extent_state_restore(stream_number);
- else
- toi_extent_state_goto_start();
-
- if (writing) {
- reset_idx = 0;
- if (!current_stream)
- page_idx = 0;
- } else {
- reset_idx = 1;
- }
-
- atomic_set(&toi_io_done, 0);
- if (!toi_writer_buffer)
- toi_writer_buffer = (char *) toi_get_zeroed_page(11,
- TOI_ATOMIC_GFP);
- toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE;
-
- current_stream = stream_number;
-
- more_readahead = 1;
-
- if (test_result_state(TOI_KEPT_IMAGE)) {
- int result;
-
- if (writing) {
- result = toi_write_init_incremental(stream_number);
- } else {
- result = toi_read_init_incremental(stream_number);
- }
-
- if (result)
- return result;
- }
-
- return toi_writer_buffer ? 0 : -ENOMEM;
-}
-
-/**
- * toi_bio_queue_write - queue a page for writing
- * @full_buffer: Pointer to a page to be queued.
- *
- * Add a page to the queue to be submitted. If we're the queue flusher,
- * we'll do this once we've dropped toi_bio_mutex, so other threads can
- * continue to submit I/O while we're on the slow path doing the actual
- * submission.
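- *
- * The queue itself is a singly linked list threaded through
- * page->private and guarded by bio_queue_lock;
- * toi_bio_queue_flush_pages() drains it.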
- **/ -static void toi_bio_queue_write(char **full_buffer) -{ - struct page *page = virt_to_page(*full_buffer); - unsigned long flags; - - *full_buffer = NULL; - page->private = 0; - - spin_lock_irqsave(&bio_queue_lock, flags); - if (!bio_queue_head) - bio_queue_head = page; - else - bio_queue_tail->private = (unsigned long) page; - - bio_queue_tail = page; - atomic_inc(&toi_bio_queue_size); - - spin_unlock_irqrestore(&bio_queue_lock, flags); - wake_up(&toi_io_queue_flusher); -} - -/** - * toi_rw_cleanup - Cleanup after i/o. - * @writing: Whether we were reading or writing. - * - * Flush all I/O and clean everything up after reading or writing a - * section of the image. - **/ -static int toi_rw_cleanup(int writing) -{ - int i, result = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup."); - if (writing) { - if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED)) - toi_bio_queue_write(&toi_writer_buffer); - - while (bio_queue_head && !result) - result = toi_bio_queue_flush_pages(0); - - if (result) - return result; - - if (current_stream == 2) - toi_extent_state_save(1); - else if (current_stream == 1) - toi_extent_state_save(3); - } - - result = toi_finish_all_io(); - - while (readahead_list_head) { - void *next = (void *) readahead_list_head->private; - toi__free_page(12, readahead_list_head); - readahead_list_head = next; - } - - readahead_list_tail = NULL; - - if (!current_stream) - return result; - - for (i = 0; i < NUM_REASONS; i++) { - if (!atomic_read(&reasons[i])) - continue; - printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n", - reason_name[i], atomic_read(&reasons[i])); - atomic_set(&reasons[i], 0); - } - - current_stream = 0; - return result; -} - -/** - * toi_start_one_readahead - start one page of readahead - * @dedicated_thread: Is this a thread dedicated to doing readahead? - * - * Start one new page of readahead. If this is being called by a thread - * whose only just is to submit readahead, don't quit because we failed - * to allocate a page. - **/ -static int toi_start_one_readahead(int dedicated_thread) -{ - char *buffer = NULL; - int oom = 0, result; - - result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0); - if (result) { - printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result); - return result; - } - - mutex_lock(&toi_bio_readahead_mutex); - - while (!buffer) { - buffer = (char *) toi_get_zeroed_page(12, - TOI_ATOMIC_GFP); - if (!buffer) { - if (oom && !dedicated_thread) { - mutex_unlock(&toi_bio_readahead_mutex); - printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result); - return -ENOMEM; - } - - oom = 1; - set_free_mem_throttle(); - do_bio_wait(5); - } - } - - result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0); - if (result) { - printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result); - } - if (result == -ENOSPC) - toi__free_page(12, virt_to_page(buffer)); - mutex_unlock(&toi_bio_readahead_mutex); - if (result) { - if (result == -ENOSPC) - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Last readahead page submitted."); - else - printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n", - result); - } - return result; -} - -/** - * toi_start_new_readahead - start new readahead - * @dedicated_thread: Are we dedicated to this task? - * - * Start readahead of image pages. - * - * We can be called as a thread dedicated to this task (may be helpful on - * systems with lots of CPUs), in which case we don't exit until there's no - * more readahead. 
- * - * If this is not called by a dedicated thread, we top up our queue until - * there's no more readahead to submit, we've submitted the number given - * in target_outstanding_io or the number in progress exceeds the target - * outstanding I/O value. - * - * No mutex needed because this is only ever called by the first cpu. - **/ -static int toi_start_new_readahead(int dedicated_thread) -{ - int last_result, num_submitted = 0; - - /* Start a new readahead? */ - if (!more_readahead) - return 0; - - do { - last_result = toi_start_one_readahead(dedicated_thread); - - if (last_result) { - if (last_result == -ENOMEM || last_result == -ENOSPC) - return 0; - - printk(KERN_DEBUG - "Begin read chunk returned %d.\n", - last_result); - } else - num_submitted++; - - } while (more_readahead && !last_result && - (dedicated_thread || - (num_submitted < target_outstanding_io && - atomic_read(&toi_io_in_progress) < target_outstanding_io))); - - return last_result; -} - -/** - * bio_io_flusher - start the dedicated I/O flushing routine - * @writing: Whether we're writing the image. - **/ -static int bio_io_flusher(int writing) -{ - - if (writing) - return toi_bio_queue_flush_pages(1); - else - return toi_start_new_readahead(1); -} - -/** - * toi_bio_get_next_page_read - read a disk page, perhaps with readahead - * @no_readahead: Whether we can use readahead - * - * Read a page from disk, submitting readahead and cleaning up finished i/o - * while we wait for the page we're after. - **/ -static int toi_bio_get_next_page_read(int no_readahead) -{ - char *virt; - struct page *old_readahead_list_head; - - /* - * When reading the second page of the header, we have to - * delay submitting the read until after we've gotten the - * extents out of the first page. - */ - if (unlikely(no_readahead)) { - int result = toi_start_one_readahead(0); - if (result) { - printk(KERN_EMERG "No readahead and toi_start_one_readahead " - "returned non-zero.\n"); - return -EIO; - } - } - - if (unlikely(!readahead_list_head)) { - /* - * If the last page finishes exactly on the page - * boundary, we will be called one extra time and - * have no data to return. In this case, we should - * not BUG(), like we used to! - */ - if (!more_readahead) { - printk(KERN_EMERG "No more readahead.\n"); - return -ENOSPC; - } - if (unlikely(toi_start_one_readahead(0))) { - printk(KERN_EMERG "No readahead and " - "toi_start_one_readahead returned non-zero.\n"); - return -EIO; - } - } - - if (PageLocked(readahead_list_head)) { - waiting_on = readahead_list_head; - do_bio_wait(0); - } - - virt = page_address(readahead_list_head); - memcpy(toi_writer_buffer, virt, PAGE_SIZE); - - mutex_lock(&toi_bio_readahead_mutex); - old_readahead_list_head = readahead_list_head; - readahead_list_head = (struct page *) readahead_list_head->private; - mutex_unlock(&toi_bio_readahead_mutex); - toi__free_page(12, old_readahead_list_head); - return 0; -} - -/** - * toi_bio_queue_flush_pages - flush the queue of pages queued for writing - * @dedicated_thread: Whether we're a dedicated thread - * - * Flush the queue of pages ready to be written to disk. - * - * If we're a dedicated thread, stay in here until told to leave, - * sleeping in wait_event. - * - * The first thread is normally the only one to come in here. Another - * thread can enter this routine too, though, via throttle_if_needed. - * Since that's the case, we must be careful to only have one thread - * doing this work at a time. Otherwise we have a race and could save - * pages out of order. 
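- * The static "busy" mutex below enforces this: a second caller's
- * mutex_trylock() fails and it simply returns 0.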
- * - * If an error occurs, free all remaining pages without submitting them - * for I/O. - **/ - -int toi_bio_queue_flush_pages(int dedicated_thread) -{ - unsigned long flags; - int result = 0; - static DEFINE_MUTEX(busy); - - if (!mutex_trylock(&busy)) - return 0; - -top: - spin_lock_irqsave(&bio_queue_lock, flags); - while (bio_queue_head) { - struct page *page = bio_queue_head; - bio_queue_head = (struct page *) page->private; - if (bio_queue_tail == page) - bio_queue_tail = NULL; - atomic_dec(&toi_bio_queue_size); - spin_unlock_irqrestore(&bio_queue_lock, flags); - - /* Don't generate more error messages if already had one */ - if (!result) - result = toi_bio_rw_page(WRITE, page, 0, 11); - /* - * If writing the page failed, don't drop out. - * Flush the rest of the queue too. - */ - if (result) - toi__free_page(11 , page); - spin_lock_irqsave(&bio_queue_lock, flags); - } - spin_unlock_irqrestore(&bio_queue_lock, flags); - - if (dedicated_thread) { - wait_event(toi_io_queue_flusher, bio_queue_head || - toi_bio_queue_flusher_should_finish); - if (likely(!toi_bio_queue_flusher_should_finish)) - goto top; - toi_bio_queue_flusher_should_finish = 0; - } - - mutex_unlock(&busy); - return result; -} - -/** - * toi_bio_get_new_page - get a new page for I/O - * @full_buffer: Pointer to a page to allocate. - **/ -static int toi_bio_get_new_page(char **full_buffer) -{ - int result = throttle_if_needed(THROTTLE_WAIT); - if (result) - return result; - - while (!*full_buffer) { - *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); - if (!*full_buffer) { - set_free_mem_throttle(); - do_bio_wait(3); - } - } - - return 0; -} - -/** - * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O - * @writing: Bool - whether writing (or reading). - * @buffer: The start of the buffer to write or fill. - * @buffer_size: The size of the buffer to write or fill. - * @no_readahead: Don't try to start readhead (when getting extents). - **/ -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead) -{ - int bytes_left = buffer_size, result = 0; - - while (bytes_left) { - char *source_start = buffer + buffer_size - bytes_left; - char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; - int capacity = PAGE_SIZE - toi_writer_buffer_posn; - char *to = writing ? dest_start : source_start; - char *from = writing ? source_start : dest_start; - - if (bytes_left <= capacity) { - memcpy(to, from, bytes_left); - toi_writer_buffer_posn += bytes_left; - return 0; - } - - /* Complete this page and start a new one */ - memcpy(to, from, capacity); - bytes_left -= capacity; - - if (!writing) { - /* - * Perform actual I/O: - * read readahead_list_head into toi_writer_buffer - */ - int result = toi_bio_get_next_page_read(no_readahead); - if (result && bytes_left) { - printk("toi_bio_get_next_page_read " - "returned %d. Expecting to read %d bytes.\n", result, bytes_left); - return result; - } - } else { - toi_bio_queue_write(&toi_writer_buffer); - result = toi_bio_get_new_page(&toi_writer_buffer); - if (result) { - printk(KERN_ERR "toi_bio_get_new_page returned " - "%d.\n", result); - return result; - } - } - - toi_writer_buffer_posn = 0; - toi_cond_pause(0, NULL); - } - - return 0; -} - -/** - * toi_bio_read_page - read a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). 
- *
- * Read a (possibly compressed) page from the image, into buffer_page,
- * returning its pfn and the buffer size.
- **/
-static int toi_bio_read_page(unsigned long *pfn, int buf_type,
- void *buffer_page, unsigned int *buf_size)
-{
- int result = 0;
- int this_idx;
- char *buffer_virt = TOI_MAP(buf_type, buffer_page);
-
- /*
- * Only call start_new_readahead if we don't have a dedicated thread
- * and we're the queue flusher.
- */
- if (current == toi_queue_flusher && more_readahead &&
- !test_action_state(TOI_NO_READAHEAD)) {
- int result2 = toi_start_new_readahead(0);
- if (result2) {
- printk(KERN_DEBUG "Queue flusher and "
- "toi_start_one_readahead returned non-zero.\n");
- result = -EIO;
- goto out;
- }
- }
-
- my_mutex_lock(0, &toi_bio_mutex);
-
- /*
- * Structure in the image:
- * [destination pfn|page size|page data]
- * buf_size is PAGE_SIZE
- * We can validly find there's nothing to read in a multithreaded
- * situation.
- */
- if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) ||
- toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
- toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
- toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
- result = -ENODATA;
- goto out_unlock;
- }
-
- if (reset_idx) {
- page_idx = this_idx;
- reset_idx = 0;
- } else {
- page_idx++;
- if (!this_idx)
- result = -ENODATA;
- else if (page_idx != this_idx)
- printk(KERN_ERR "Got page index %d, expected %d.\n",
- this_idx, page_idx);
- }
-
-out_unlock:
- my_mutex_unlock(0, &toi_bio_mutex);
-out:
- TOI_UNMAP(buf_type, buffer_page);
- return result;
-}
-
-/**
- * toi_bio_write_page - write a page of the image
- * @pfn: The pfn where the data belongs.
- * @buf_type: Passed to TOI_MAP/TOI_UNMAP to map @buffer_page.
- * @buffer_page: The page containing the (possibly compressed) data.
- * @buf_size: The number of bytes on @buffer_page used.
- *
- * Write a (possibly compressed) page to the image from the buffer, together
- * with its index and buffer size.
- **/
-static int toi_bio_write_page(unsigned long pfn, int buf_type,
- void *buffer_page, unsigned int buf_size)
-{
- char *buffer_virt;
- int result = 0, result2 = 0;
-
- if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
- return 0;
-
- my_mutex_lock(1, &toi_bio_mutex);
-
- if (test_result_state(TOI_ABORTED)) {
- my_mutex_unlock(1, &toi_bio_mutex);
- return 0;
- }
-
- buffer_virt = TOI_MAP(buf_type, buffer_page);
- page_idx++;
-
- /*
- * Structure in the image:
- * [destination pfn|page size|page data]
- * buf_size is PAGE_SIZE
- */
- if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) ||
- toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
- toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
- toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
- printk(KERN_DEBUG "toi_rw_buffer returned non-zero to "
- "toi_bio_write_page.\n");
- result = -EIO;
- }
-
- TOI_UNMAP(buf_type, buffer_page);
- my_mutex_unlock(1, &toi_bio_mutex);
-
- if (current == toi_queue_flusher)
- result2 = toi_bio_queue_flush_pages(0);
-
- return result ? result : result2;
-}
-
-/**
- * _toi_rw_header_chunk - read or write a portion of the image header
- * @writing: Whether reading or writing.
- * @owner: The module for which we're writing.
- * Used for confirming that modules
- * don't use more header space than they asked for.
- * @buffer: Address of the data to write.
- * @buffer_size: Size of the data buffer.
- * @no_readahead: Don't try to start readahead (when getting extents).
- *
- * Perform PAGE_SIZE I/O. Start readahead if needed.
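- *
- * Writing more than @owner->header_requested is treated as a header
- * accounting bug: the overrun is logged at KERN_EMERG and the chunk is
- * not written (buffer_size is returned instead of 0).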
- **/ -static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int buffer_size, int no_readahead) -{ - int result = 0; - - if (owner) { - owner->header_used += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: %s : %d bytes (%d/%d) from offset %d.", - owner->name, - buffer_size, owner->header_used, - owner->header_requested, - toi_writer_buffer_posn); - if (owner->header_used > owner->header_requested && writing) { - printk(KERN_EMERG "TuxOnIce module %s is using more " - "header space (%u) than it requested (%u).\n", - owner->name, - owner->header_used, - owner->header_requested); - return buffer_size; - } - } else { - unowned += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: (No owner): %d bytes (%d total so far) from " - "offset %d.", buffer_size, unowned, - toi_writer_buffer_posn); - } - - if (!writing && !no_readahead && more_readahead) { - result = toi_start_new_readahead(0); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " - "returned %d.", result); - } - - if (!result) { - result = toi_rw_buffer(writing, buffer, buffer_size, - no_readahead); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " - "%d.", result); - } - - total_header_bytes += buffer_size; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " - "%d.", result); - return result; -} - -static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -static int toi_rw_header_chunk_noreadahead(int writing, - struct toi_module_ops *owner, char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -/** - * toi_bio_storage_needed - get the amount of storage needed for my fns - **/ -static int toi_bio_storage_needed(void) -{ - return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); -} - -/** - * toi_bio_save_config_info - save block I/O config to image header - * @buf: PAGE_SIZE'd buffer into which data should be saved. - **/ -static int toi_bio_save_config_info(char *buf) -{ - int *ints = (int *) buf; - ints[0] = target_outstanding_io; - return sizeof(int); -} - -/** - * toi_bio_load_config_info - restore block I/O config - * @buf: Data to be reloaded. - * @size: Size of the buffer saved. - **/ -static void toi_bio_load_config_info(char *buf, int size) -{ - int *ints = (int *) buf; - target_outstanding_io = ints[0]; -} - -void close_resume_dev_t(int force) -{ - if (!resume_block_device) - return; - - if (force) - atomic_set(&resume_bdev_open_count, 0); - else - atomic_dec(&resume_bdev_open_count); - - if (!atomic_read(&resume_bdev_open_count)) { - toi_close_bdev(resume_block_device); - resume_block_device = NULL; - } -} - -int open_resume_dev_t(int force, int quiet) -{ - if (force) { - close_resume_dev_t(1); - atomic_set(&resume_bdev_open_count, 1); - } else - atomic_inc(&resume_bdev_open_count); - - if (resume_block_device) - return 0; - - resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); - if (IS_ERR(resume_block_device)) { - if (!quiet) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to open device %x, where" - " the header should be found.", - resume_dev_t); - resume_block_device = NULL; - atomic_set(&resume_bdev_open_count, 0); - return 1; - } - - return 0; -} - -/** - * toi_bio_initialise - initialise bio code at start of some action - * @starting_cycle: Whether starting a hibernation cycle, or just reading or - * writing a sysfs value. 
- **/ -static int toi_bio_initialise(int starting_cycle) -{ - int result; - - if (!starting_cycle || !resume_dev_t) - return 0; - - max_outstanding_writes = 0; - max_outstanding_reads = 0; - current_stream = 0; - toi_queue_flusher = current; -#ifdef MEASURE_MUTEX_CONTENTION - { - int i, j, k; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - for_each_online_cpu(k) - mutex_times[i][j][k] = 0; - } -#endif - result = open_resume_dev_t(0, 1); - - if (result) - return result; - - result = toi_bio_register_storage(); - - if (result) - return result; - - return get_signature_page(); -} - -static unsigned long raw_to_real(unsigned long raw) -{ - unsigned long extra; - - extra = (raw * (sizeof(unsigned long) + sizeof(int)) + - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); - - return raw > extra ? raw - extra : 0; -} - -static unsigned long toi_bio_storage_available(void) -{ - unsigned long sum = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " - "available from %s.", this_module->name); - sum += this_module->bio_allocator_ops->storage_available(); - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " - "pages (%d header pages).", sum, header_pages_reserved); - - return sum > header_pages_reserved ? - raw_to_real(sum - header_pages_reserved) : 0; - -} - -static unsigned long toi_bio_storage_allocated(void) -{ - return raw_pages_allocd > header_pages_reserved ? - raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; -} - -/* - * If we have read part of the image, we might have filled memory with - * data that should be zeroed out. - */ -static void toi_bio_noresume_reset(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); - toi_rw_cleanup(READ); - free_all_bdev_info(); -} - -/** - * toi_bio_cleanup - cleanup after some action - * @finishing_cycle: Whether completing a cycle. - **/ -static void toi_bio_cleanup(int finishing_cycle) -{ - if (!finishing_cycle) - return; - - if (toi_writer_buffer) { - toi_free_page(11, (unsigned long) toi_writer_buffer); - toi_writer_buffer = NULL; - } - - forget_signature_page(); - - if (header_block_device && toi_sig_data && - toi_sig_data->header_dev_t != resume_dev_t) - toi_close_bdev(header_block_device); - - header_block_device = NULL; - - close_resume_dev_t(0); -} - -static int toi_bio_write_header_init(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); - toi_rw_init(WRITE, 0); - toi_writer_buffer_posn = 0; - - /* Info needed to bootstrap goes at the start of the header. - * First we save the positions and devinfo, including the number - * of header pages. Then we save the structs containing data needed - * for reading the header pages back. - * Note that even if header pages take more than one page, when we - * read back the info, we will have restored the location of the - * next header page by the time we go to use it. - */ - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); - result = toi_serialise_extent_chains(); - - if (result) - return result; - - /* - * Signature page hasn't been modified at this point. Write it in - * the header so we can restore it later. 
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page.");
- return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops,
- (char *) toi_cur_sig_page,
- PAGE_SIZE);
-}
-
-static int toi_bio_write_header_cleanup(void)
-{
- int result = 0;
-
- if (toi_writer_buffer_posn)
- toi_bio_queue_write(&toi_writer_buffer);
-
- result = toi_finish_all_io();
-
- unowned = 0;
- total_header_bytes = 0;
-
- /* Set signature to say we have an image */
- if (!result)
- result = toi_bio_mark_have_image();
-
- return result;
-}
-
-/*
- * toi_bio_read_header_init()
- *
- * Description:
- * 1. Attempt to read the device specified with resume=.
- * 2. Check the contents of the swap header for our signature.
- * 3. Warn, ignore, reset and/or continue as appropriate.
- * 4. If continuing, read the toi_swap configuration section
- * of the header and set up block device info so we can read
- * the rest of the header & image.
- *
- * Returns:
- * May not return if the user chooses to reboot at a warning.
- * -EINVAL if we cannot resume at this time. Booting should continue
- * normally.
- */
-
-static int toi_bio_read_header_init(void)
-{
- int result = 0;
- char buf[32];
-
- toi_writer_buffer_posn = 0;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init");
-
- if (!toi_sig_data) {
- printk(KERN_INFO "toi_bio_read_header_init called when we "
- "haven't verified there is an image!\n");
- return -EINVAL;
- }
-
- /*
- * If the header is not on the resume_swap_dev_t, get the resume device
- * first.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.",
- toi_sig_data->header_dev_t);
- if (toi_sig_data->have_uuid) {
- struct fs_info seek;
- dev_t device;
-
- strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16);
- seek.dev_t = toi_sig_data->header_dev_t;
- seek.last_mount_size = 0;
- device = blk_lookup_fs_info(&seek);
- if (device) {
- printk("Using dev_t %s, returned by blk_lookup_fs_info.\n",
- format_dev_t(buf, device));
- toi_sig_data->header_dev_t = device;
- }
- }
- if (toi_sig_data->header_dev_t != resume_dev_t) {
- header_block_device = toi_open_bdev(NULL,
- toi_sig_data->header_dev_t, 1);
-
- if (IS_ERR(header_block_device))
- return PTR_ERR(header_block_device);
- } else
- header_block_device = resume_block_device;
-
- if (!toi_writer_buffer)
- toi_writer_buffer = (char *) toi_get_zeroed_page(11,
- TOI_ATOMIC_GFP);
- more_readahead = 1;
-
- /*
- * Read toi_swap configuration.
- * Headerblock size taken into account already.
- */
- result = toi_bio_ops.bdev_page_io(READ, header_block_device,
- toi_sig_data->first_header_block,
- virt_to_page((unsigned long) toi_writer_buffer));
- if (result)
- return result;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains.");
- result = toi_load_extent_chains();
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page.");
- toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
- if (!toi_orig_sig_page) {
- printk(KERN_ERR "Failed to allocate memory for the current"
- " image signature.\n");
- return -ENOMEM;
- }
-
- return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops,
- (char *) toi_orig_sig_page,
- PAGE_SIZE);
-}
-
-static int toi_bio_read_header_cleanup(void)
-{
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup.");
- return toi_rw_cleanup(READ);
-}
-
-/* Works only for digits and letters, but small and fast */
-#define TOLOWER(x) ((x) | 0x20)
-
-/*
- * UUID must be 32 chars long. It may have dashes, but nothing
- * else.
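- * Both a bare 32-hex-digit string and the conventionally dashed form
- * are accepted; pairs of hex digits are packed into 16 raw bytes.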
- */ -char *uuid_from_commandline(char *commandline) -{ - int low = 0; - char *result = NULL, *output, *ptr; - - if (strncmp(commandline, "UUID=", 5)) - return NULL; - - result = kzalloc(17, GFP_KERNEL); - if (!result) { - printk("Failed to kzalloc UUID text memory.\n"); - return NULL; - } - - ptr = commandline + 5; - output = result; - - while (*ptr && (output - result) < 16) { - if (isxdigit(*ptr)) { - int value = isdigit(*ptr) ? *ptr - '0' : - TOLOWER(*ptr) - 'a' + 10; - if (low) { - *output += value; - output++; - } else { - *output = value << 4; - } - low = !low; - } else if (*ptr != '-') - break; - ptr++; - } - - if ((output - result) < 16 || *ptr) { - printk(KERN_DEBUG "Found resume=UUID=, but the value looks " - "invalid.\n"); - kfree(result); - result = NULL; - } - - return result; -} - -#define retry_if_fails(command) \ -do { \ - command; \ - if (!resume_dev_t && !waited_for_device_probe) { \ - wait_for_device_probe(); \ - command; \ - waited_for_device_probe = 1; \ - } \ -} while(0) - -/** - * try_to_open_resume_device: Try to parse and open resume= - * - * Any "swap:" has been stripped away and we just have the path to deal with. - * We attempt to do name_to_dev_t, open and stat the file. Having opened the - * file, get the struct block_device * to match. - */ -static int try_to_open_resume_device(char *commandline, int quiet) -{ - struct kstat stat; - int error = 0; - char *uuid = uuid_from_commandline(commandline); - int waited_for_device_probe = 0; - - resume_dev_t = MKDEV(0, 0); - - if (!strlen(commandline)) - retry_if_fails(toi_bio_scan_for_image(quiet)); - - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = resume_dev_t; - seek.last_mount_size = 0; - retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); - kfree(uuid); - } - - if (!resume_dev_t) - retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); - - if (!resume_dev_t) { - struct file *file = filp_open(commandline, - O_RDONLY|O_LARGEFILE, 0); - - if (!IS_ERR(file) && file) { - vfs_getattr(&file->f_path, &stat); - filp_close(file, NULL); - } else - error = vfs_stat(commandline, &stat); - if (!error) - resume_dev_t = stat.rdev; - } - - if (!resume_dev_t) { - if (quiet) - return 1; - - if (test_toi_state(TOI_TRYING_TO_RESUME)) - toi_early_boot_message(1, toi_translate_err_default, - "Failed to translate \"%s\" into a device id.\n", - commandline); - else - printk("TuxOnIce: Can't translate \"%s\" into a device " - "id yet.\n", commandline); - return 1; - } - - return open_resume_dev_t(1, quiet); -} - -/* - * Parse Image Location - * - * Attempt to parse a resume= parameter. - * Swap Writer accepts: - * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] - * - * Where: - * DEVNAME is convertable to a dev_t by name_to_dev_t - * FIRSTBLOCK is the location of the first block in the swap file - * (specifying for a swap partition is nonsensical but not prohibited). - * Data is validated by attempting to read a swap header from the - * location given. Failure will result in toi_swap refusing to - * save an image, and a reboot with correct parameters will be - * necessary. 
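- *
- * Examples (illustrative values only):
- * resume=swap:/dev/sda2
- * resume=file:/dev/sda1:0x2d8a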
- */ -static int toi_bio_parse_sig_location(char *commandline, - int only_allocator, int quiet) -{ - char *thischar, *devstart, *colon = NULL; - int signature_found, result = -EINVAL, temp_result = 0; - - if (strncmp(commandline, "swap:", 5) && - strncmp(commandline, "file:", 5)) { - /* - * Failing swap:, we'll take a simple resume=/dev/hda2, or a - * blank value (scan) but fall through to other allocators - * if /dev/ or UUID= isn't matched. - */ - if (strncmp(commandline, "/dev/", 5) && - strncmp(commandline, "UUID=", 5) && - strlen(commandline)) - return 1; - } else - commandline += 5; - - devstart = commandline; - thischar = commandline; - while ((*thischar != ':') && (*thischar != '@') && - ((thischar - commandline) < 250) && (*thischar)) - thischar++; - - if (*thischar == ':') { - colon = thischar; - *colon = 0; - thischar++; - } - - while ((thischar - commandline) < 250 && *thischar) - thischar++; - - if (colon) { - unsigned long block; - temp_result = kstrtoul(colon + 1, 0, &block); - if (!temp_result) - resume_firstblock = (int) block; - } else - resume_firstblock = 0; - - clear_toi_state(TOI_CAN_HIBERNATE); - clear_toi_state(TOI_CAN_RESUME); - - if (!temp_result) - temp_result = try_to_open_resume_device(devstart, quiet); - - if (colon) - *colon = ':'; - - /* No error if we only scanned */ - if (temp_result) - return strlen(commandline) ? -EINVAL : 1; - - signature_found = toi_bio_image_exists(quiet); - - if (signature_found != -1) { - result = 0; - /* - * TODO: If only file storage, CAN_HIBERNATE should only be - * set if file allocator's target is valid. - */ - set_toi_state(TOI_CAN_HIBERNATE); - set_toi_state(TOI_CAN_RESUME); - } else - if (!quiet) - printk(KERN_ERR "TuxOnIce: Block I/O: No " - "signature found at %s.\n", devstart); - - return result; -} - -static void toi_bio_release_storage(void) -{ - header_pages_reserved = 0; - raw_pages_allocd = 0; - - free_all_bdev_info(); -} - -/* toi_swap_remove_image - * - */ -static int toi_bio_remove_image(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); - - result = toi_bio_restore_original_signature(); - - /* - * We don't do a sanity check here: we want to restore the swap - * whatever version of kernel made the hibernate image. - * - * We need to write swap, but swap may not be enabled so - * we write the device directly - * - * If we don't have an current_signature_page, we didn't - * read an image header, so don't change anything. 
- */ - - toi_bio_release_storage(); - - return result; -} - -struct toi_bio_ops toi_bio_ops = { - .bdev_page_io = toi_bdev_page_io, - .register_storage = toi_register_storage_chain, - .free_storage = toi_bio_release_storage, -}; - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, - 0, 16384, 0, NULL), -}; - -struct toi_module_ops toi_blockwriter_ops = { - .type = WRITER_MODULE, - .name = "block i/o", - .directory = "block_io", - .module = THIS_MODULE, - .memory_needed = toi_bio_memory_needed, - .print_debug_info = toi_bio_print_debug_stats, - .storage_needed = toi_bio_storage_needed, - .save_config_info = toi_bio_save_config_info, - .load_config_info = toi_bio_load_config_info, - .initialise = toi_bio_initialise, - .cleanup = toi_bio_cleanup, - .post_atomic_restore = toi_bio_chains_post_atomic, - - .rw_init = toi_rw_init, - .rw_cleanup = toi_rw_cleanup, - .read_page = toi_bio_read_page, - .write_page = toi_bio_write_page, - .rw_header_chunk = toi_rw_header_chunk, - .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, - .io_flusher = bio_io_flusher, - .update_throughput_throttle = update_throughput_throttle, - .finish_all_io = toi_finish_all_io, - - .noresume_reset = toi_bio_noresume_reset, - .storage_available = toi_bio_storage_available, - .storage_allocated = toi_bio_storage_allocated, - .reserve_header_space = toi_bio_reserve_header_space, - .allocate_storage = toi_bio_allocate_storage, - .free_unused_storage = toi_bio_free_unused_storage, - .image_exists = toi_bio_image_exists, - .mark_resume_attempted = toi_bio_mark_resume_attempted, - .write_header_init = toi_bio_write_header_init, - .write_header_cleanup = toi_bio_write_header_cleanup, - .read_header_init = toi_bio_read_header_init, - .read_header_cleanup = toi_bio_read_header_cleanup, - .get_header_version = toi_bio_get_header_version, - .remove_image = toi_bio_remove_image, - .parse_sig_location = toi_bio_parse_sig_location, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/** - * toi_block_io_load - load time routine for block I/O module - * - * Register block i/o ops and sysfs entries. - **/ -static __init int toi_block_io_load(void) -{ - return toi_register_module(&toi_blockwriter_ops); -} - -late_initcall(toi_block_io_load); diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h deleted file mode 100644 index 5e1964a61..000000000 --- a/kernel/power/tuxonice_bio_internal.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * kernel/power/tuxonice_bio_internal.h - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
- */ - -/* Extent chains */ -void toi_extent_state_goto_start(void); -void toi_extent_state_save(int slot); -int go_next_page(int writing, int section_barrier); -void toi_extent_state_restore(int slot); -void free_all_bdev_info(void); -int devices_of_same_priority(struct toi_bdev_info *this); -int toi_register_storage_chain(struct toi_bdev_info *new); -int toi_serialise_extent_chains(void); -int toi_load_extent_chains(void); -int toi_bio_rw_page(int writing, struct page *page, int is_readahead, - int free_group); -int toi_bio_restore_original_signature(void); -int toi_bio_devinfo_storage_needed(void); -unsigned long get_headerblock(void); -dev_t get_header_dev_t(void); -struct block_device *get_header_bdev(void); -int toi_bio_allocate_storage(unsigned long request); -void toi_bio_free_unused_storage(void); - -/* Signature functions */ -#define HaveImage "HaveImage" -#define NoImage "TuxOnIce" -#define sig_size (sizeof(HaveImage)) - -struct sig_data { - char sig[sig_size]; - int have_image; - int resumed_before; - - char have_uuid; - char header_uuid[17]; - dev_t header_dev_t; - unsigned long first_header_block; - - /* Repeat the signature to be sure we have a header version */ - char sig2[sig_size]; - int header_version; -}; - -void forget_signature_page(void); -int toi_check_for_signature(void); -int toi_bio_image_exists(int quiet); -int get_signature_page(void); -int toi_bio_mark_resume_attempted(int); -extern char *toi_cur_sig_page; -extern char *toi_orig_sig_page; -int toi_bio_mark_have_image(void); -extern struct sig_data *toi_sig_data; -extern dev_t resume_dev_t; -extern struct block_device *resume_block_device; -extern struct block_device *header_block_device; -extern unsigned long resume_firstblock; - -struct block_device *open_bdev(dev_t device, int display_errs); -extern int current_stream; -extern int more_readahead; -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group); -int get_main_pool_phys_params(void); - -void toi_close_bdev(struct block_device *bdev); -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs); - -extern struct toi_module_ops toi_blockwriter_ops; -void dump_block_chains(void); -void debug_broken_header(void); -extern unsigned long raw_pages_allocd, header_pages_reserved; -int toi_bio_chains_debug_info(char *buffer, int size); -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); -int toi_bio_scan_for_image(int quiet); -int toi_bio_get_header_version(void); - -void close_resume_dev_t(int force); -int open_resume_dev_t(int force, int quiet); - -struct toi_incremental_image_pointer_saved_data { - unsigned long block; - int chain; -}; - -struct toi_incremental_image_pointer { - struct toi_incremental_image_pointer_saved_data save; - struct block_device *bdev; - unsigned long block; -}; - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr); -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr); diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c deleted file mode 100644 index f5418f092..000000000 --- a/kernel/power/tuxonice_bio_signature.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * kernel/power/tuxonice_bio_signature.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. 
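The sig_data layout declared above repeats the magic at both ends of the structure: header_version is only meaningful when sig2 also matches, since images from before the versioned header wrote only the leading signature. A cut-down model of that check (hypothetical sizes and values, mirroring what toi_bio_get_header_version does later in this patch):

    #include <stdio.h>
    #include <string.h>

    #define SIG_SIZE 10

    /* Trimmed model of struct sig_data: magic at both ends. */
    struct mini_sig_data {
            char sig[SIG_SIZE];
            int have_image;
            char sig2[SIG_SIZE];
            int header_version;
    };

    static const char signature[SIG_SIZE] = "TuxOnIce";

    static int mini_get_header_version(const struct mini_sig_data *sd)
    {
            /* No trailing copy of the magic => pre-versioned image. */
            return memcmp(sd->sig2, signature, SIG_SIZE) ?
                    0 : sd->header_version;
    }

    int main(void)
    {
            struct mini_sig_data sd = { .have_image = 1, .header_version = 3 };

            memcpy(sd.sig, signature, SIG_SIZE);
            printf("old image: version %d\n", mini_get_header_version(&sd));
            memcpy(sd.sig2, signature, SIG_SIZE);
            printf("new image: version %d\n", mini_get_header_version(&sd));
            return 0;
    }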
- * - */ - -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -struct sig_data *toi_sig_data; - -/* Struct of swap header pages */ - -struct old_sig_data { - dev_t device; - unsigned long sector; - int resume_attempted; - int orig_sig_type; -}; - -union diskpage { - union swap_header swh; /* swh.magic is the only member used */ - struct sig_data sig_data; - struct old_sig_data old_sig_data; -}; - -union p_diskpage { - union diskpage *pointer; - char *ptr; - unsigned long address; -}; - -char *toi_cur_sig_page; -char *toi_orig_sig_page; -int have_image; -int have_old_image; - -int get_signature_page(void) -{ - if (!toi_cur_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Allocating current signature page."); - toi_cur_sig_page = (char *) toi_get_zeroed_page(38, - TOI_ATOMIC_GFP); - if (!toi_cur_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the " - "current image signature.\n"); - return -ENOMEM; - } - - toi_sig_data = (struct sig_data *) toi_cur_sig_page; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," - " sector %d.", - resume_block_device->bd_dev, resume_firstblock); - - return toi_bio_ops.bdev_page_io(READ, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -void forget_signature_page(void) -{ - if (toi_cur_sig_page) { - toi_sig_data = NULL; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" - " (%p).", toi_cur_sig_page); - toi_free_page(38, (unsigned long) toi_cur_sig_page); - toi_cur_sig_page = NULL; - } - - if (toi_orig_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" - " (%p).", toi_orig_sig_page); - toi_free_page(38, (unsigned long) toi_orig_sig_page); - toi_orig_sig_page = NULL; - } -} - -/* - * We need to ensure we use the signature page that's currently on disk, - * so as to not remove the image header. Post-atomic-restore, the orig sig - * page will be empty, so we can use that as our method of knowing that we - * need to load the on-disk signature and not use the non-image sig in - * memory. (We're going to powerdown after writing the change, so it's safe. 
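The functions that follow all use the same read-modify-write cycle on that single signature page: read the first block of the resume device, flip one field, write the same block back. A toy illustration under obviously simplified assumptions (a flat file stands in for the block device, offset 0 for resume_firstblock, a fixed byte for resumed_before):

    #include <stdio.h>

    #define PAGE_SZ 4096

    /* Read-modify-write of a "signature page" held in a file. */
    static int rmw_resumed_flag(const char *path, int flag)
    {
            unsigned char page[PAGE_SZ] = { 0 };
            FILE *f = fopen(path, "r+b");

            if (!f) {               /* create a blank "device" on first run */
                    f = fopen(path, "w+b");
                    if (!f)
                            return -1;
                    fwrite(page, 1, PAGE_SZ, f);
                    rewind(f);
            }
            if (fread(page, 1, PAGE_SZ, f) != PAGE_SZ)
                    goto err;

            page[16] = (unsigned char) flag; /* stand-in for resumed_before */

            if (fseek(f, 0, SEEK_SET) || fwrite(page, 1, PAGE_SZ, f) != PAGE_SZ)
                    goto err;
            return fclose(f) ? -1 : 0;
    err:
            fclose(f);
            return -1;
    }

    int main(void)
    {
            return rmw_resumed_flag("sigpage.img", 1);
    }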
- */ -int toi_bio_mark_resume_attempted(int flag) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", - flag); - if (!toi_orig_sig_page) { - forget_signature_page(); - get_signature_page(); - } - toi_sig_data->resumed_before = flag; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int toi_bio_mark_have_image(void) -{ - int result = 0; - char buf[32]; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); - memcpy(toi_sig_data->sig, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->have_image = 1; - toi_sig_data->resumed_before = 0; - toi_sig_data->header_dev_t = get_header_dev_t(); - toi_sig_data->have_uuid = 0; - - fs_info = fs_info_from_block_dev(get_header_bdev()); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!result) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - toi_sig_data->have_uuid = 1; - } else - toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " - "dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - - toi_sig_data->first_header_block = get_headerblock(); - have_image = 1; - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block " - "is %d.", toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - - memcpy(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->header_version = TOI_HEADER_VERSION; - - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int remove_old_signature(void) -{ - union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; - char *orig_sig; - char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - int result; - struct block_device *header_bdev; - struct old_sig_data *old_sig_data = - &swap_header_page.pointer->old_sig_data; - - header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); - result = toi_bio_ops.bdev_page_io(READ, header_bdev, - old_sig_data->sector, virt_to_page(header_start)); - - if (result) - goto out; - - /* - * TODO: Get the original contents of the first bytes of the swap - * header page. - */ - if (!old_sig_data->orig_sig_type) - orig_sig = "SWAP-SPACE"; - else - orig_sig = "SWAPSPACE2"; - - memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); - memcpy(swap_header_page.ptr, header_start, 10); - - result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(swap_header_page.ptr)); - -out: - toi_close_bdev(header_bdev); - have_old_image = 0; - toi_free_page(38, (unsigned long) header_start); - return result; -} - -/* - * toi_bio_restore_original_signature - restore the original signature - * - * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. - * It will have the original signature page contents, stored in the image - * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain - * the contents that were loaded when we started the cycle. - */ -int toi_bio_restore_original_signature(void) -{ - char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; - - if (have_old_image) - return remove_old_signature(); - - if (!use) { - printk("toi_bio_restore_original_signature: No signature " - "page loaded.\n"); - return 0; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); - have_image = 0; - toi_sig_data->have_image = 0; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(use)); -} - -/* - * check_for_signature - See whether we have an image. - * - * Returns 0 if no image, 1 if there is one, -1 if indeterminate. - */ -int toi_check_for_signature(void) -{ - union p_diskpage swap_header_page; - int type; - const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; - const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; - char *swap_header; - - if (!toi_cur_sig_page) { - int result = get_signature_page(); - - if (result) - return result; - } - - /* - * Start by looking for the binary header. - */ - if (!memcmp(tuxonice_signature, toi_cur_sig_page, - sizeof(tuxonice_signature))) { - have_image = toi_sig_data->have_image; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " - "Have image is %d.", have_image); - if (have_image) - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " - "%x. First block is %d.", - toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - return toi_sig_data->have_image; - } - - /* - * Failing that, try old file allocator headers. - */ - - if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { - have_image = 1; - return 1; - } - - have_image = 0; - - if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) - return 0; - - /* - * Nope? How about swap? - */ - swap_header_page = (union p_diskpage) toi_cur_sig_page; - swap_header = swap_header_page.pointer->swh.magic.magic; - - /* Normal swapspace? */ - for (type = 0; type < 2; type++) - if (!memcmp(normal_sigs[type], swap_header, - strlen(normal_sigs[type]))) - return 0; - - /* Swsusp or uswsusp? */ - for (type = 0; type < 3; type++) - if (!memcmp(swsusp_sigs[type], swap_header, - strlen(swsusp_sigs[type]))) - return 2; - - /* Old TuxOnIce version? */ - if (!memcmp(tuxonice_signature, swap_header, - sizeof(tuxonice_signature) - 1)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " - "signature."); - have_old_image = 1; - return 3; - } - - return -1; -} - -/* - * Image_exists - * - * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). - */ -int toi_bio_image_exists(int quiet) -{ - int result; - char *msg = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); - - if (!resume_dev_t) { - if (!quiet) - printk(KERN_INFO "Not even trying to read header " - "because resume_dev_t is not set.\n"); - return -1; - } - - if (open_resume_dev_t(0, quiet)) - return -1; - - result = toi_check_for_signature(); - - clear_toi_state(TOI_RESUMED_BEFORE); - if (toi_sig_data->resumed_before) - set_toi_state(TOI_RESUMED_BEFORE); - - if (quiet || result == -ENOMEM) - return result; - - if (result == -1) - msg = "TuxOnIce: Unable to find a signature." 
- " Could you have moved a swap file?\n"; - else if (!result) - msg = "TuxOnIce: No image found.\n"; - else if (result == 1) - msg = "TuxOnIce: Image found.\n"; - else if (result == 2) - msg = "TuxOnIce: uswsusp or swsusp image found.\n"; - else if (result == 3) - msg = "TuxOnIce: Old implementation's signature found.\n"; - - printk(KERN_INFO "%s", msg); - - return result; -} - -int toi_bio_scan_for_image(int quiet) -{ - struct block_device *bdev; - char default_name[255] = ""; - - if (!quiet) - printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " - "signature...\n"); - for (bdev = next_bdev_of_type(NULL, "swap"); bdev; - bdev = next_bdev_of_type(bdev, "swap")) { - int result; - char name[255] = ""; - sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - if (!quiet) - printk(KERN_DEBUG "- Trying %s.\n", name); - resume_block_device = bdev; - resume_dev_t = bdev->bd_dev; - - result = toi_check_for_signature(); - - resume_block_device = NULL; - resume_dev_t = MKDEV(0, 0); - - if (!default_name[0]) - strcpy(default_name, name); - - if (result == 1) { - /* Got one! */ - strcpy(resume_file, name); - next_bdev_of_type(bdev, NULL); - if (!quiet) - printk(KERN_DEBUG " ==> Image found on %s.\n", - resume_file); - return 1; - } - forget_signature_page(); - } - - if (!quiet) - printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); - strcpy(resume_file, default_name); - return 0; -} - -int toi_bio_get_header_version(void) -{ - return (memcmp(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature))) ? - 0 : toi_sig_data->header_version; - -} diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c deleted file mode 100644 index 22bf07a43..000000000 --- a/kernel/power/tuxonice_builtin.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include <linux/kernel.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/bio.h> -#include <linux/root_dev.h> -#include <linux/freezer.h> -#include <linux/reboot.h> -#include <linux/writeback.h> -#include <linux/tty.h> -#include <linux/crypto.h> -#include <linux/cpu.h> -#include <linux/ctype.h> -#include <linux/kthread.h> -#include "tuxonice_io.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_netlink.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_ui.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_power_off.h" -#include "tuxonice_alloc.h" - -unsigned long toi_bootflags_mask; - -/* - * Highmem related functions (x86 only). - */ - -#ifdef CONFIG_HIGHMEM - -/** - * copyback_high: Restore highmem pages. - * - * Highmem data and pbe lists are/can be stored in highmem. - * The format is slightly different to the lowmem pbe lists - * used for the assembly code: the last pbe in each page is - * a struct page * instead of struct pbe *, pointing to the - * next page where pbes are stored (or NULL if happens to be - * the end of the list). Since we don't want to generate - * unnecessary deltas against swsusp code, we use a cast - * instead of a union. 
- **/ - -static void copyback_high(void) -{ - struct page *pbe_page = (struct page *) restore_highmem_pblist; - struct pbe *this_pbe, *first_pbe; - unsigned long *origpage, *copypage; - int pbe_index = 1; - - if (!pbe_page) - return; - - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - - while (this_pbe) { - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; - - origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); - copypage = kmap_atomic((struct page *) this_pbe->address); - - while (loop >= 0) { - *(origpage + loop) = *(copypage + loop); - loop--; - } - - kunmap_atomic(origpage); - kunmap_atomic(copypage); - - if (!this_pbe->next) - break; - - if (pbe_index < PBES_PER_PAGE) { - this_pbe++; - pbe_index++; - } else { - pbe_page = (struct page *) this_pbe->next; - kunmap_atomic(first_pbe); - if (!pbe_page) - return; - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - pbe_index = 1; - } - } - kunmap_atomic(first_pbe); -} - -#else /* CONFIG_HIGHMEM */ -static void copyback_high(void) { } -#endif - -char toi_wait_for_keypress_dev_console(int timeout) -{ - int fd, this_timeout = 255, orig_kthread = 0; - char key = '\0'; - struct termios t, t_backup; - - /* We should be guaranteed /dev/console exists after populate_rootfs() - * in init/main.c. - */ - fd = sys_open("/dev/console", O_RDONLY, 0); - if (fd < 0) { - printk(KERN_INFO "Couldn't open /dev/console.\n"); - return key; - } - - if (sys_ioctl(fd, TCGETS, (long)&t) < 0) - goto out_close; - - memcpy(&t_backup, &t, sizeof(t)); - - t.c_lflag &= ~(ISIG|ICANON|ECHO); - t.c_cc[VMIN] = 0; - -new_timeout: - if (timeout > 0) { - this_timeout = timeout < 26 ? timeout : 25; - timeout -= this_timeout; - this_timeout *= 10; - } - - t.c_cc[VTIME] = this_timeout; - - if (sys_ioctl(fd, TCSETS, (long)&t) < 0) - goto out_restore; - - if (current->flags & PF_KTHREAD) { - orig_kthread = (current->flags & PF_KTHREAD); - current->flags &= ~PF_KTHREAD; - } - - while (1) { - if (sys_read(fd, &key, 1) <= 0) { - if (timeout) - goto new_timeout; - key = '\0'; - break; - } - key = tolower(key); - if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { - if (key == 'c') { - set_toi_state(TOI_CONTINUE_REQ); - break; - } else if (key == ' ') - break; - } else - break; - } - if (orig_kthread) { - current->flags |= PF_KTHREAD; - } - -out_restore: - sys_ioctl(fd, TCSETS, (long)&t_backup); -out_close: - sys_close(fd); - - return key; -} - -struct toi_boot_kernel_data toi_bkd __nosavedata - __attribute__((aligned(PAGE_SIZE))) = { - MY_BOOT_KERNEL_DATA_VERSION, - 0, -#ifdef CONFIG_TOI_REPLACE_SWSUSP - (1 << TOI_REPLACE_SWSUSP) | -#endif - (1 << TOI_NO_FLUSHER_THREAD) | - (1 << TOI_PAGESET2_FULL), -}; - -struct block_device *toi_open_by_devnum(dev_t dev) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); - return err ? ERR_PTR(err) : bdev; -} - -/** - * toi_close_bdev: Close a swap bdev. - * - * int: The swap entry number to close. 
- */ -void toi_close_bdev(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); -} - -int toi_wait = CONFIG_TOI_DEFAULT_WAIT; -struct toi_core_fns *toi_core_fns; -unsigned long toi_result; -struct pagedir pagedir1 = {1}; -struct toi_cbw **toi_first_cbw; -int toi_next_cbw; - -unsigned long toi_get_nonconflicting_page(void) -{ - return toi_core_fns->get_nonconflicting_page(); -} - -int toi_post_context_save(void) -{ - return toi_core_fns->post_context_save(); -} - -int try_tuxonice_hibernate(void) -{ - if (!toi_core_fns) - return -ENODEV; - - return toi_core_fns->try_hibernate(); -} - -static int num_resume_calls; -#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL -static int ignore_late_initcall = 1; -#else -static int ignore_late_initcall; -#endif - -int toi_translate_err_default = TOI_CONTINUE_REQ; - -void try_tuxonice_resume(void) -{ - if (!hibernation_available()) - return; - - /* Don't let it wrap around eventually */ - if (num_resume_calls < 2) - num_resume_calls++; - - if (num_resume_calls == 1 && ignore_late_initcall) { - printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); - return; - } - - if (toi_core_fns) - toi_core_fns->try_resume(); - else - printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); -} - -int toi_lowlevel_builtin(void) -{ - int error = 0; - - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "Error %d hibernating\n", error); - - /* Restore control flow appears here */ - if (!toi_in_hibernate) { - copyback_high(); - set_toi_state(TOI_NOW_RESUMING); - } - - restore_processor_state(); - return error; -} - -unsigned long toi_compress_bytes_in; -unsigned long toi_compress_bytes_out; - -int toi_in_suspend(void) -{ - return in_suspend; -} - -unsigned long toi_state = ((1 << TOI_BOOT_TIME) | - (1 << TOI_IGNORE_LOGLEVEL) | - (1 << TOI_IO_STOPPED)); - -/* The number of hibernates we have started (some may have been cancelled) */ -unsigned int nr_hibernates; -int toi_running; -__nosavedata int toi_in_hibernate; -__nosavedata struct pbe *restore_highmem_pblist; - -int toi_trace_allocs; - -void toi_read_lock_tasklist(void) -{ - read_lock(&tasklist_lock); -} - -void toi_read_unlock_tasklist(void) -{ - read_unlock(&tasklist_lock); -} - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -int (*toi_flag_zram_disks) (void); - -int toi_do_flag_zram_disks(void) -{ - return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0; -} - -#endif - -/* toi_generate_free_page_map - * - * Description: This routine generates a bitmap of free pages from the - * lists used by the memory manager. We then use the bitmap - * to quickly calculate which pages to save and in which - * pagesets. 
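Once such a bitmap exists, "how big is the free region starting here?" reduces to counting the run of set bits from a starting pfn, which is all toi_size_of_free_region below does. A toy version under simplified assumptions (one byte per pfn instead of one bit, a fixed sixteen-pfn "zone"):

    #include <stdio.h>

    #define NR_PFNS 16

    static unsigned char free_map[NR_PFNS]; /* 1 byte per pfn for clarity */

    /* Count the run of free pages beginning at start_pfn. */
    static int size_of_free_region(int start_pfn)
    {
            int pfn = start_pfn;

            while (pfn < NR_PFNS && free_map[pfn])
                    pfn++;
            return pfn - start_pfn;
    }

    int main(void)
    {
            int pfn;

            /* Mark pfns 3..9 free, as the zone free lists would. */
            for (pfn = 3; pfn < 10; pfn++)
                    free_map[pfn] = 1;

            printf("free run at 3: %d pages\n", size_of_free_region(3));
            printf("free run at 0: %d pages\n", size_of_free_region(0));
            return 0;
    }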
- */ -void toi_generate_free_page_map(void) -{ - int order, cpu, t; - unsigned long flags, i; - struct zone *zone; - struct list_head *curr; - unsigned long pfn; - struct page *page; - - for_each_populated_zone(zone) { - - if (!zone->spanned_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - - for (i = 0; i < zone->spanned_pages; i++) { - pfn = zone->zone_start_pfn + i; - - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - - ClearPageNosaveFree(page); - } - - for_each_migratetype_order(order, t) { - list_for_each(curr, - &zone->free_area[order].free_list[t]) { - unsigned long j; - - pfn = page_to_pfn(list_entry(curr, struct page, - lru)); - for (j = 0; j < (1UL << order); j++) - SetPageNosaveFree(pfn_to_page(pfn + j)); - } - } - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - struct page *page; - int t; - - for (t = 0; t < MIGRATE_PCPTYPES; t++) - list_for_each_entry(page, &pcp->lists[t], lru) - SetPageNosaveFree(page); - } - - spin_unlock_irqrestore(&zone->lock, flags); - } -} - -/* toi_size_of_free_region - * - * Description: Return the number of pages that are free, beginning with and - * including this one. - */ -int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn) -{ - unsigned long this_pfn = start_pfn, - end_pfn = zone_end_pfn(zone); - - while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) - this_pfn++; - - return this_pfn - start_pfn; -} - -static int __init toi_wait_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) { - if (value < -1 || value > 255) - printk(KERN_INFO "TuxOnIce_wait outside range -1 to " - "255.\n"); - else - toi_wait = value; - } - - return 1; -} -__setup("toi_wait", toi_wait_setup); - -static int __init toi_translate_retry_setup(char *str) -{ - toi_translate_err_default = 0; - return 1; -} -__setup("toi_translate_retry", toi_translate_retry_setup); - -static int __init toi_debug_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_LOGALL); - toi_bootflags_mask |= (1 << TOI_LOGALL); - toi_bkd.toi_debug_state = 255; - toi_bkd.toi_default_console_level = 7; - return 1; -} -__setup("toi_debug_setup", toi_debug_setup); - -static int __init toi_pause_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_PAUSE); - toi_bootflags_mask |= (1 << TOI_PAUSE); - return 1; -} -__setup("toi_pause", toi_pause_setup); - -#ifdef CONFIG_PM_DEBUG -static int __init toi_trace_allocs_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - toi_trace_allocs = value; - - return 1; -} -__setup("toi_trace_allocs", toi_trace_allocs_setup); -#endif - -static int __init toi_ignore_late_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - ignore_late_initcall = value; - - return 1; -} -__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); - -static int __init toi_force_no_multithreaded_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); - toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); - - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); - - return 1; -} -__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); - -#ifdef CONFIG_KGDB -static int __init toi_post_resume_breakpoint_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); - toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); - if 
(sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); - - return 1; -} -__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); -#endif - -static int __init toi_disable_readahead_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); - toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); - - return 1; -} -__setup("toi_no_readahead", toi_disable_readahead_setup); diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h deleted file mode 100644 index 9539818e0..000000000 --- a/kernel/power/tuxonice_builtin.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include <asm/setup.h> - -extern struct toi_core_fns *toi_core_fns; -extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; -extern unsigned int nr_hibernates; -extern int toi_in_hibernate; - -extern __nosavedata struct pbe *restore_highmem_pblist; - -int toi_lowlevel_builtin(void); - -#ifdef CONFIG_HIGHMEM -extern __nosavedata struct zone_data *toi_nosave_zone_list; -extern __nosavedata unsigned long toi_nosave_max_pfn; -#endif - -extern unsigned long toi_get_nonconflicting_page(void); -extern int toi_post_context_save(void); - -extern char toi_wait_for_keypress_dev_console(int timeout); -extern struct block_device *toi_open_by_devnum(dev_t dev); -extern void toi_close_bdev(struct block_device *bdev); -extern int toi_wait; -extern int toi_translate_err_default; -extern int toi_force_no_multithreaded; -extern void toi_read_lock_tasklist(void); -extern void toi_read_unlock_tasklist(void); -extern int toi_in_suspend(void); -extern void toi_generate_free_page_map(void); -extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn); - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -extern int toi_do_flag_zram_disks(void); -#else -#define toi_do_flag_zram_disks() (0) -#endif diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c deleted file mode 100644 index 1c4e10c72..000000000 --- a/kernel/power/tuxonice_checksum.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
- */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" - -static struct toi_module_ops toi_checksum_ops; - -/* Constant at the mo, but I might allow tuning later */ -static char toi_checksum_name[32] = "md4"; -/* Bytes per checksum */ -#define CHECKSUM_SIZE (16) - -#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) - -struct cpu_context { - struct crypto_hash *transform; - struct hash_desc desc; - struct scatterlist sg[2]; - char *buf; -}; - -static DEFINE_PER_CPU(struct cpu_context, contexts); -static int pages_allocated; -static unsigned long page_list; - -static int toi_num_resaved; - -static unsigned long this_checksum, next_page; -static int checksum_count; - -static inline int checksum_pages_needed(void) -{ - return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); -} - -/* ---- Local buffer management ---- */ - -/* - * toi_checksum_cleanup - * - * Frees memory allocated for our labours. - */ -static void toi_checksum_cleanup(int ending_cycle) -{ - int cpu; - - if (ending_cycle) { - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_hash(this->transform); - this->transform = NULL; - this->desc.tfm = NULL; - } - - if (this->buf) { - toi_free_page(27, (unsigned long) this->buf); - this->buf = NULL; - } - } - } -} - -/* - * toi_crypto_initialise - * - * Prepare to do some work by allocating buffers and transforms. - * Returns: Int: Zero. Even if we can't set up checksum, we still - * seek to hibernate. - */ -static int toi_checksum_initialise(int starting_cycle) -{ - int cpu; - - if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) - return 0; - - if (!*toi_checksum_name) { - printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - struct page *page; - - this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s checksum algorithm: %ld.\n", - toi_checksum_name, (long) this->transform); - this->transform = NULL; - return 1; - } - - this->desc.tfm = this->transform; - this->desc.flags = 0; - - page = toi_alloc_page(27, GFP_KERNEL); - if (!page) - return 1; - this->buf = page_address(page); - sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); - } - return 0; -} - -/* - * toi_checksum_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_checksum_print_debug_stats(char *buffer, int size) -{ - int len; - - if (!toi_checksum_ops.enabled) - return scnprintf(buffer, size, - "- Checksumming disabled.\n"); - - len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", - toi_checksum_name); - len += scnprintf(buffer + len, size - len, - " %d pages resaved in atomic copy.\n", toi_num_resaved); - return len; -} - -static int toi_checksum_memory_needed(void) -{ - return toi_checksum_ops.enabled ? 
- checksum_pages_needed() << PAGE_SHIFT : 0; -} - -static int toi_checksum_storage_needed(void) -{ - if (toi_checksum_ops.enabled) - return strlen(toi_checksum_name) + sizeof(int) + 1; - else - return 0; -} - -/* - * toi_checksum_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_checksum_save_config_info(char *buffer) -{ - int namelen = strlen(toi_checksum_name) + 1; - int total_len; - - *((unsigned int *) buffer) = namelen; - strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); - total_len = sizeof(unsigned int) + namelen; - return total_len; -} - -/* toi_checksum_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for dechecksuming the image at - * resume time. - */ -static void toi_checksum_load_config_info(char *buffer, int size) -{ - int namelen; - - namelen = *((unsigned int *) (buffer)); - strncpy(toi_checksum_name, buffer + sizeof(unsigned int), - namelen); - return; -} - -/* - * Free Checksum Memory - */ - -void free_checksum_pages(void) -{ - while (pages_allocated) { - unsigned long next = *((unsigned long *) page_list); - ClearPageNosave(virt_to_page(page_list)); - toi_free_page(15, (unsigned long) page_list); - page_list = next; - pages_allocated--; - } -} - -/* - * Allocate Checksum Memory - */ - -int allocate_checksum_pages(void) -{ - int pages_needed = checksum_pages_needed(); - - if (!toi_checksum_ops.enabled) - return 0; - - while (pages_allocated < pages_needed) { - unsigned long *new_page = - (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); - if (!new_page) { - printk(KERN_ERR "Unable to allocate checksum pages.\n"); - return -ENOMEM; - } - SetPageNosave(virt_to_page(new_page)); - (*new_page) = page_list; - page_list = (unsigned long) new_page; - pages_allocated++; - } - - next_page = (unsigned long) page_list; - checksum_count = 0; - - return 0; -} - -char *tuxonice_get_next_checksum(void) -{ - if (!toi_checksum_ops.enabled) - return NULL; - - if (checksum_count % CHECKSUMS_PER_PAGE) - this_checksum += CHECKSUM_SIZE; - else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - checksum_count++; - return (char *) this_checksum; -} - -int tuxonice_calc_checksum(struct page *page, char *checksum_locn) -{ - char *pa; - int result, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!toi_checksum_ops.enabled) - return 0; - - pa = kmap(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap(page); - result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - checksum_locn); - if (result) - printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " - "returned %d.\n", result); - return result; -} -/* - * Calculate checksums - */ - -void check_checksums(void) -{ - int index = 0, cpu = smp_processor_id(); - char current_checksum[CHECKSUM_SIZE]; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - unsigned long pfn; - - if (!toi_checksum_ops.enabled) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); - return; - } - - next_page = (unsigned long) page_list; - - toi_num_resaved = 0; - this_checksum = 0; - - toi_trace_index++; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != 
BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) { - int ret, resave_needed = false; - char *pa; - struct page *page = pfn_to_page(pfn); - - if (index < checksum_count) { - if (index % CHECKSUMS_PER_PAGE) { - this_checksum += CHECKSUM_SIZE; - } else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - /* Done when IRQs disabled so must be atomic */ - pa = kmap_atomic(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap_atomic(pa); - ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - current_checksum); - - if (ret) { - printk(KERN_INFO "Digest failed. Returned %d.\n", ret); - return; - } - - resave_needed = memcmp(current_checksum, (char *) this_checksum, - CHECKSUM_SIZE); - } else { - resave_needed = true; - } - - if (resave_needed) { - TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed); - SetPageResave(pfn_to_page(pfn)); - toi_num_resaved++; - if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) - set_abort_result(TOI_RESAVE_NEEDED); - } - - index++; - } - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, - NULL), - SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, - TOI_ABORT_ON_RESAVE_NEEDED, 0) -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_checksum_ops = { - .type = MISC_MODULE, - .name = "checksumming", - .directory = "checksum", - .module = THIS_MODULE, - .initialise = toi_checksum_initialise, - .cleanup = toi_checksum_cleanup, - .print_debug_info = toi_checksum_print_debug_stats, - .save_config_info = toi_checksum_save_config_info, - .load_config_info = toi_checksum_load_config_info, - .memory_needed = toi_checksum_memory_needed, - .storage_needed = toi_checksum_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -int toi_checksum_init(void) -{ - int result = toi_register_module(&toi_checksum_ops); - return result; -} - -void toi_checksum_exit(void) -{ - toi_unregister_module(&toi_checksum_ops); -} diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h deleted file mode 100644 index c8196fbb0..000000000 --- a/kernel/power/tuxonice_checksum.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
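The header below uses the standard config-gated stub pattern: when the feature is compiled out, static inline no-ops keep every call site building unchanged, so callers never need their own #ifdefs. A minimal sketch of the idiom (hypothetical feature name):

    #include <stdio.h>

    #define FEATURE_ON 1    /* stands in for CONFIG_TOI_CHECKSUM */

    #if FEATURE_ON
    static int feature_init(void) { printf("real init\n"); return 0; }
    #else
    static inline int feature_init(void) { return 0; }  /* no-op stub */
    #endif

    int main(void)
    {
            /* Caller is identical either way: no #ifdef here. */
            return feature_init();
    }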
- */
-
-#if defined(CONFIG_TOI_CHECKSUM)
-extern int toi_checksum_init(void);
-extern void toi_checksum_exit(void);
-void check_checksums(void);
-int allocate_checksum_pages(void);
-void free_checksum_pages(void);
-char *tuxonice_get_next_checksum(void);
-int tuxonice_calc_checksum(struct page *page, char *checksum_locn);
-#else
-static inline int toi_checksum_init(void) { return 0; }
-static inline void toi_checksum_exit(void) { }
-static inline void check_checksums(void) { }
-static inline int allocate_checksum_pages(void) { return 0; }
-static inline void free_checksum_pages(void) { }
-static inline char *tuxonice_get_next_checksum(void) { return NULL; }
-static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
-	{ return 0; }
-#endif
-
diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
deleted file mode 100644
index 2873f93c6..000000000
--- a/kernel/power/tuxonice_cluster.c
+++ /dev/null
@@ -1,1058 +0,0 @@
-/*
- * kernel/power/tuxonice_cluster.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines for cluster hibernation support.
- *
- * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
- *
- * How does it work?
- *
- * There is no 'master' node that tells everyone else what to do. All nodes
- * send messages to the broadcast address/port, maintain a list of peers
- * and figure out when to progress to the next step in hibernating or resuming.
- * This makes us more fault tolerant when it comes to nodes coming and going
- * (which may be more of an issue if we're hibernating when power supplies
- * are being unreliable).
- *
- * At boot time, we start a ktuxonice thread that handles communication with
- * other nodes. This node maintains a state machine that controls our progress
- * through hibernating and resuming, keeping us in step with other nodes. Nodes
- * are identified by their hw address.
- *
- * On startup, the node sends CLUSTER_PING on the configured interface's
- * broadcast address, port $toi_cluster_port (see below) and begins to listen
- * for other broadcast messages. CLUSTER_PING messages are repeated at
- * intervals of 5 minutes, with a random offset to spread traffic out.
- *
- * A hibernation cycle is initiated from any node via
- *
- * echo > /sys/power/tuxonice/do_hibernate
- *
- * and (possibly) the hibernate script. At each step of the process, the node
- * completes its work, and waits for all other nodes to signal completion of
- * their work (or timeout) before progressing to the next step.
- *
- * Request/state   Action before reply   Possible reply    Next state
- * HIBERNATE       capable, pre-script   HIBERNATE|ACK     NODE_PREP
- *                                       HIBERNATE|NACK    INIT_0
- *
- * PREP            prepare_image         PREP|ACK          IMAGE_WRITE
- *                                       PREP|NACK         INIT_0
- *                                       ABORT             RUNNING
- *
- * IO              write image           IO|ACK            power off
- *                                       ABORT             POST_RESUME
- *
- * (Boot time)     check for image       IMAGE|ACK         RESUME_PREP
- *                                                         (Note 1)
- *                                       IMAGE|NACK        (Note 2)
- *
- * PREP            prepare read image    PREP|ACK          IMAGE_READ
- *                                       PREP|NACK         (As NACK_IMAGE)
- *
- * IO              read image            IO|ACK            POST_RESUME
- *
- * POST_RESUME     thaw, post-script                       RUNNING
- *
- * INIT_0          init 0
- *
- * Other messages:
- *
- * - PING: Request for all other live nodes to send a PONG. Used at startup to
- *   announce presence, when a node is suspected dead and periodically, in case
- *   segments of the network are [un]plugged.
- *
- * - PONG: Response to a PING.
- *
- * - ABORT: Request to cancel writing an image.
- * - * - BYE: Notification that this node is shutting down. - * - * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that - * nodes which are slower to start up can get state synchronised. If a node - * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send - * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it - * must invalidate its image (if any) and boot normally. - * - * Note 2: May occur when one node lost power or powered off while others - * hibernated. This node waits for others to complete resuming (ACK_READ) - * before completing its boot, so that it appears as a fail node restarting. - * - * If any node has an image, then it also has a list of nodes that hibernated - * in synchronisation with it. The node will wait for other nodes to appear - * or timeout before beginning its restoration. - * - * If a node has no image, it needs to wait, in case other nodes which do have - * an image are going to resume, but are taking longer to announce their - * presence. For this reason, the user can specify a timeout value and a number - * of nodes detected before we just continue. (We might want to assume in a - * cluster of, say, 15 nodes, if 8 others have booted without finding an image, - * the remaining nodes will too. This might help in situations where some nodes - * are much slower to boot, or more subject to hardware failures or such like). - */ - -#include <linux/suspend.h> -#include <linux/if.h> -#include <linux/rtnetlink.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/in.h> -#include <linux/if_arp.h> -#include <linux/kthread.h> -#include <linux/wait.h> -#include <linux/netdevice.h> -#include <net/ip.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" - -#if 1 -#define PRINTK(a, b...) do { printk(a, ##b); } while (0) -#else -#define PRINTK(a, b...) do { } while (0) -#endif - -static int loopback_mode; -static int num_local_nodes = 1; -#define MAX_LOCAL_NODES 8 -#define SADDR (loopback_mode ? 
b->sid : h->saddr) - -#define MYNAME "TuxOnIce Clustering" - -enum cluster_message { - MSG_ACK = 1, - MSG_NACK = 2, - MSG_PING = 4, - MSG_ABORT = 8, - MSG_BYE = 16, - MSG_HIBERNATE = 32, - MSG_IMAGE = 64, - MSG_IO = 128, - MSG_RUNNING = 256 -}; - -static char *str_message(int message) -{ - switch (message) { - case 4: - return "Ping"; - case 8: - return "Abort"; - case 9: - return "Abort acked"; - case 10: - return "Abort nacked"; - case 16: - return "Bye"; - case 17: - return "Bye acked"; - case 18: - return "Bye nacked"; - case 32: - return "Hibernate request"; - case 33: - return "Hibernate ack"; - case 34: - return "Hibernate nack"; - case 64: - return "Image exists?"; - case 65: - return "Image does exist"; - case 66: - return "No image here"; - case 128: - return "I/O"; - case 129: - return "I/O okay"; - case 130: - return "I/O failed"; - case 256: - return "Running"; - default: - printk(KERN_ERR "Unrecognised message %d.\n", message); - return "Unrecognised message (see dmesg)"; - } -} - -#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) -#define MSG_STATE_MASK (~MSG_ACK_MASK) - -struct node_info { - struct list_head member_list; - wait_queue_head_t member_events; - spinlock_t member_list_lock; - spinlock_t receive_lock; - int peer_count, ignored_peer_count; - struct toi_sysfs_data sysfs_data; - enum cluster_message current_message; -}; - -struct node_info node_array[MAX_LOCAL_NODES]; - -struct cluster_member { - __be32 addr; - enum cluster_message message; - struct list_head list; - int ignore; -}; - -#define toi_cluster_port_send 3501 -#define toi_cluster_port_recv 3502 - -static struct net_device *net_dev; -static struct toi_module_ops toi_cluster_ops; - -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -static struct packet_type toi_cluster_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = toi_recv, -}; - -struct toi_pkt { /* BOOTP packet format */ - struct iphdr iph; /* IP header */ - struct udphdr udph; /* UDP header */ - u8 htype; /* HW address type */ - u8 hlen; /* HW address length */ - __be32 xid; /* Transaction ID */ - __be16 secs; /* Seconds since we started */ - __be16 flags; /* Just what it says */ - u8 hw_addr[16]; /* Sender's HW address */ - u16 message; /* Message */ - unsigned long sid; /* Source ID for loopback testing */ -}; - -static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; - -static int added_pack; - -static int others_have_image; - -/* Key used to allow multiple clusters on the same lan */ -static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY; -static char pre_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; -static char post_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; - -/* List of cluster members */ -static unsigned long continue_delay = 5 * HZ; -static unsigned long cluster_message_timeout = 3 * HZ; - -/* === Membership list === */ - -static void print_member_info(int index) -{ - struct cluster_member *this; - - printk(KERN_INFO "==> Dumping node %d.\n", index); - - list_for_each_entry(this, &node_array[index].member_list, list) - printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", - NIPQUAD(this->addr), - str_message(this->message), - this->ignore ? 
"(Ignored)" : ""); - printk(KERN_INFO "== Done ==\n"); -} - -static struct cluster_member *__find_member(int index, __be32 addr) -{ - struct cluster_member *this; - - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->addr != addr) - continue; - - return this; - } - - return NULL; -} - -static void set_ignore(int index, __be32 addr, struct cluster_member *this) -{ - if (this->ignore) { - PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", - index, NIPQUAD(addr)); - return; - } - - PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", - index, NIPQUAD(addr)); - this->ignore = 1; - node_array[index].ignored_peer_count++; -} - -static int __add_update_member(int index, __be32 addr, int message) -{ - struct cluster_member *this; - - this = __find_member(index, addr); - if (this) { - if (this->message != message) { - this->message = message; - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - PRINTK("Node %d sees node %d.%d.%d.%d now sending " - "%s.\n", index, NIPQUAD(addr), - str_message(message)); - wake_up(&node_array[index].member_events); - } - return 0; - } - - this = (struct cluster_member *) toi_kzalloc(36, - sizeof(struct cluster_member), GFP_KERNEL); - - if (!this) - return -1; - - this->addr = addr; - this->message = message; - this->ignore = 0; - INIT_LIST_HEAD(&this->list); - - node_array[index].peer_count++; - - PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, - NIPQUAD(addr), str_message(message)); - - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - list_add_tail(&this->list, &node_array[index].member_list); - return 1; -} - -static int add_update_member(int index, __be32 addr, int message) -{ - int result; - unsigned long flags; - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - result = __add_update_member(index, addr, message); - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - - print_member_info(index); - - wake_up(&node_array[index].member_events); - - return result; -} - -static void del_member(int index, __be32 addr) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - this = __find_member(index, addr); - - if (this) { - list_del_init(&this->list); - toi_kfree(36, this, sizeof(*this)); - node_array[index].peer_count--; - } - - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -/* === Message transmission === */ - -static void toi_send_if(int message, unsigned long my_id); - -/* - * Process received TOI packet. - */ -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct toi_pkt *b; - struct iphdr *h; - int len, result, index; - unsigned long addr, message, ack; - - /* Perform verifications before taking the lock. 
*/ - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop; - - if (dev != net_dev) - goto drop; - - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - return NET_RX_DROP; - - if (!pskb_may_pull(skb, - sizeof(struct iphdr) + - sizeof(struct udphdr))) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) - goto drop; - - /* Fragments are not supported */ - if (h->frag_off & htons(IP_OFFSET | IP_MF)) { - if (net_ratelimit()) - printk(KERN_ERR "TuxOnIce: Ignoring fragmented " - "cluster message.\n"); - goto drop; - } - - if (skb->len < ntohs(h->tot_len)) - goto drop; - - if (ip_fast_csum((char *) h, h->ihl)) - goto drop; - - if (b->udph.source != htons(toi_cluster_port_send) || - b->udph.dest != htons(toi_cluster_port_recv)) - goto drop; - - if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) - goto drop; - - len = ntohs(b->udph.len) - sizeof(struct udphdr); - - /* Ok the front looks good, make sure we can get at the rest. */ - if (!pskb_may_pull(skb, skb->len)) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - addr = SADDR; - PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", - str_message(b->message), NIPQUAD(addr)); - - message = b->message & MSG_STATE_MASK; - ack = b->message & MSG_ACK_MASK; - - for (index = 0; index < num_local_nodes; index++) { - int new_message = node_array[index].current_message, - old_message = new_message; - - if (index == SADDR || !old_message) { - PRINTK("Ignoring node %d (offline or self).\n", index); - continue; - } - - /* One message at a time, please. */ - spin_lock(&node_array[index].receive_lock); - - result = add_update_member(index, SADDR, b->message); - if (result == -1) { - printk(KERN_INFO "Failed to add new cluster member " - NIPQUAD_FMT ".\n", - NIPQUAD(addr)); - goto drop_unlock; - } - - switch (b->message & MSG_STATE_MASK) { - case MSG_PING: - break; - case MSG_ABORT: - break; - case MSG_BYE: - break; - case MSG_HIBERNATE: - /* Can I hibernate? */ - new_message = MSG_HIBERNATE | - ((index & 1) ? MSG_NACK : MSG_ACK); - break; - case MSG_IMAGE: - /* Can I resume? */ - new_message = MSG_IMAGE | - ((index & 1) ? MSG_NACK : MSG_ACK); - if (new_message != old_message) - printk(KERN_ERR "Setting whether I can resume " - "to %d.\n", new_message); - break; - case MSG_IO: - new_message = MSG_IO | MSG_ACK; - break; - case MSG_RUNNING: - break; - default: - if (net_ratelimit()) - printk(KERN_ERR "Unrecognised TuxOnIce cluster" - " message %d from " NIPQUAD_FMT ".\n", - b->message, NIPQUAD(addr)); - }; - - if (old_message != new_message) { - node_array[index].current_message = new_message; - printk(KERN_INFO ">>> Sending new message for node " - "%d.\n", index); - toi_send_if(new_message, index); - } else if (!ack) { - printk(KERN_INFO ">>> Resending message for node %d.\n", - index); - toi_send_if(new_message, index); - } -drop_unlock: - spin_unlock(&node_array[index].receive_lock); - }; - -drop: - /* Throw the packet out. */ - kfree_skb(skb); - - return 0; -} - -/* - * Send cluster message to single interface. 
- */ -static void toi_send_if(int message, unsigned long my_id) -{ - struct sk_buff *skb; - struct toi_pkt *b; - int hh_len = LL_RESERVED_SPACE(net_dev); - struct iphdr *h; - - /* Allocate packet */ - skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); - if (!skb) - return; - skb_reserve(skb, hh_len); - b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); - memset(b, 0, sizeof(struct toi_pkt)); - - /* Construct IP header */ - skb_reset_network_header(skb); - h = ip_hdr(skb); - h->version = 4; - h->ihl = 5; - h->tot_len = htons(sizeof(struct toi_pkt)); - h->frag_off = htons(IP_DF); - h->ttl = 64; - h->protocol = IPPROTO_UDP; - h->daddr = htonl(INADDR_BROADCAST); - h->check = ip_fast_csum((unsigned char *) h, h->ihl); - - /* Construct UDP header */ - b->udph.source = htons(toi_cluster_port_send); - b->udph.dest = htons(toi_cluster_port_recv); - b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); - /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ - - /* Construct message */ - b->message = message; - b->sid = my_id; - b->htype = net_dev->type; /* can cause undefined behavior */ - b->hlen = net_dev->addr_len; - memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); - b->secs = htons(3); /* 3 seconds */ - - /* Chain packet down the line... */ - skb->dev = net_dev; - skb->protocol = htons(ETH_P_IP); - if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), - net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || - dev_queue_xmit(skb) < 0) - printk(KERN_INFO "E"); -} - -/* ========================================= */ - -/* kTOICluster */ - -static atomic_t num_cluster_threads; -static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); - -static int kTOICluster(void *data) -{ - unsigned long my_id; - - my_id = atomic_add_return(1, &num_cluster_threads) - 1; - node_array[my_id].current_message = (unsigned long) data; - - PRINTK("kTOICluster daemon %lu starting.\n", my_id); - - current->flags |= PF_NOFREEZE; - - while (node_array[my_id].current_message) { - toi_send_if(node_array[my_id].current_message, my_id); - sleep_on_timeout(&clusterd_events, - cluster_message_timeout); - PRINTK("Link state %lu is %d.\n", my_id, - node_array[my_id].current_message); - } - - toi_send_if(MSG_BYE, my_id); - atomic_dec(&num_cluster_threads); - wake_up(&clusterd_events); - - PRINTK("kTOICluster daemon %lu exiting.\n", my_id); - __set_current_state(TASK_RUNNING); - return 0; -} - -static void kill_clusterd(void) -{ - int i; - - for (i = 0; i < num_local_nodes; i++) { - if (node_array[i].current_message) { - PRINTK("Seeking to kill clusterd %d.\n", i); - node_array[i].current_message = 0; - } - } - wait_event(clusterd_events, - !atomic_read(&num_cluster_threads)); - PRINTK("All cluster daemons have exited.\n"); -} - -static int peers_not_in_message(int index, int message, int precise) -{ - struct cluster_member *this; - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->ignore) - continue; - - PRINTK("Peer %d.%d.%d.%d sending %s. " - "Seeking %s.\n", - NIPQUAD(this->addr), - str_message(this->message), str_message(message)); - if ((precise ? 
this->message : - this->message & MSG_STATE_MASK) != - message) - result++; - } - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - PRINTK("%d peers in sought message.\n", result); - return result; -} - -static void reset_ignored(int index) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) - this->ignore = 0; - node_array[index].ignored_peer_count = 0; - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -static int peers_in_message(int index, int message, int precise) -{ - return node_array[index].peer_count - - node_array[index].ignored_peer_count - - peers_not_in_message(index, message, precise); -} - -static int time_to_continue(int index, unsigned long start, int message) -{ - int first = peers_not_in_message(index, message, 0); - int second = peers_in_message(index, message, 1); - - PRINTK("First part returns %d, second returns %d.\n", first, second); - - if (!first && !second) { - PRINTK("All peers answered message %d.\n", - message); - return 1; - } - - if (time_after(jiffies, start + continue_delay)) { - PRINTK("Timeout reached.\n"); - return 1; - } - - PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, - start + continue_delay); - return 0; -} - -void toi_initiate_cluster_hibernate(void) -{ - int result; - unsigned long start; - - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - if (result) - return; - - toi_send_if(MSG_HIBERNATE, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_HIBERNATE)); - - if (test_action_state(TOI_FREEZER_TEST)) { - toi_send_if(MSG_ABORT, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_RUNNING)); - - do_toi_step(STEP_QUIET_CLEANUP); - return; - } - - toi_send_if(MSG_IO, 0); - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - if (result) - return; - - /* This code runs at resume time too! */ - if (toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); -} - -/* toi_cluster_print_debug_stats - * - * Description: Print information to be recorded for debugging purposes into a - * buffer. - * Arguments: buffer: Pointer to a buffer into which the debug info will be - * printed. - * size: Size of the buffer. - * Returns: Number of characters written to the buffer. - */ -static int toi_cluster_print_debug_stats(char *buffer, int size) -{ - int len; - - if (strlen(toi_cluster_iface)) - len = scnprintf(buffer, size, - "- Cluster interface is '%s'.\n", - toi_cluster_iface); - else - len = scnprintf(buffer, size, - "- Cluster support is disabled.\n"); - return len; -} - -/* cluster_memory_needed - * - * Description: Tell the caller how much memory we need to operate during - * hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_cluster_memory_needed(void) -{ - return 0; -} - -static int toi_cluster_storage_needed(void) -{ - return 1 + strlen(toi_cluster_iface); -} - -/* toi_cluster_save_config_info - * - * Description: Save informaton needed when reloading the image at resume time. - * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. - * Returns: Number of bytes used for saving our data. 
- */
-static int toi_cluster_save_config_info(char *buffer)
-{
-	strcpy(buffer, toi_cluster_iface);
-	return strlen(toi_cluster_iface) + 1;
-}
-
-/* toi_cluster_load_config_info
- *
- * Description: Reload information needed for declustering the image at
- *		resume time.
- * Arguments:	Buffer:	Pointer to the start of the data.
- *		Size:	Number of bytes that were saved.
- */
-static void toi_cluster_load_config_info(char *buffer, int size)
-{
-	strncpy(toi_cluster_iface, buffer, size);
-	return;
-}
-
-static void cluster_startup(void)
-{
-	int have_image = do_check_can_resume(), i;
-	unsigned long start = jiffies, initial_message;
-	struct task_struct *p;
-
-	initial_message = MSG_IMAGE;
-
-	have_image = 1;
-
-	for (i = 0; i < num_local_nodes; i++) {
-		PRINTK("Starting ktoiclusterd %d.\n", i);
-		p = kthread_create(kTOICluster, (void *) initial_message,
-				"ktoiclusterd/%d", i);
-		if (IS_ERR(p)) {
-			printk(KERN_ERR "Failed to start ktoiclusterd.\n");
-			return;
-		}
-
-		wake_up_process(p);
-	}
-
-	/* Wait for delay or someone else sending first message */
-	wait_event(node_array[0].member_events, time_to_continue(0, start,
-				MSG_IMAGE));
-
-	others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
-
-	printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
-		" %d.\n", have_image ? "" : "don't ", others_have_image);
-
-	if (have_image) {
-		int result;
-
-		/* Start to resume */
-		printk(KERN_INFO " === Starting to resume === \n");
-		node_array[0].current_message = MSG_IO;
-		toi_send_if(MSG_IO, 0);
-
-		/* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
-		result = 0;
-
-		if (!result) {
-			/*
-			 * Atomic restore - we'll come back in the hibernation
-			 * path.
-			 */
-
-			/* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
-			result = 0;
-
-			/* do_toi_step(STEP_QUIET_CLEANUP); */
-		}
-
-		node_array[0].current_message |= MSG_NACK;
-
-		/* For debugging - disable for real life? */
-		wait_event(node_array[0].member_events,
-				time_to_continue(0, start, MSG_IO));
-	}
-
-	if (others_have_image) {
-		/* Wait for them to resume */
-		printk(KERN_INFO "Waiting for other nodes to resume.\n");
-		start = jiffies;
-		wait_event(node_array[0].member_events,
-				time_to_continue(0, start, MSG_RUNNING));
-		if (peers_not_in_message(0, MSG_RUNNING, 0))
-			printk(KERN_INFO "Timed out while waiting for other "
-					"nodes to resume.\n");
-	}
-
-	/* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
-	 * as appropriate.
-	 *
-	 * If we don't have an image:
-	 * - Wait until someone else says they have one, or conditions are met
-	 *   for continuing to boot (n machines or t seconds).
-	 * - If anyone has an image, wait for them to resume before continuing
-	 *   to boot.
-	 *
-	 * If we have an image:
-	 * - Wait until conditions are met before continuing to resume (n
-	 *   machines or t seconds). Send RESUME_PREP and freeze processes.
-	 *   NACK_PREP if freezing fails (shouldn't) and follow logic for
-	 *   us having no image above. On success, wait for [N]ACK_PREP from
-	 *   other machines. Read image (including atomic restore) until done.
-	 *   Wait for ACK_READ from others (should never fail). Thaw processes
-	 *   and do post-resume. (The section after the atomic restore is done
-	 *   via the code for hibernating).
-	 */
-
-	node_array[0].current_message = MSG_RUNNING;
-}
-
-/* toi_cluster_open_iface
- *
- * Description:	Prepare to use an interface.
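- *
- * The device named in toi_cluster_iface is found by walking init_net's
- * device list under rtnl_lock(). A minimal sketch of the same lookup
- * pattern ("wanted" here standing in for toi_cluster_iface):
- *
- *	struct net_device *dev, *found = NULL;
- *
- *	rtnl_lock();
- *	for_each_netdev(&init_net, dev)
- *		if (!strcmp(dev->name, wanted)) {
- *			found = dev;
- *			break;
- *		}
- *	rtnl_unlock();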
- */ - -static int toi_cluster_open_iface(void) -{ - struct net_device *dev; - - rtnl_lock(); - - for_each_netdev(&init_net, dev) { - if (/* dev == &init_net.loopback_dev || */ - strcmp(dev->name, toi_cluster_iface)) - continue; - - net_dev = dev; - break; - } - - rtnl_unlock(); - - if (!net_dev) { - printk(KERN_ERR MYNAME ": Device %s not found.\n", - toi_cluster_iface); - return -ENODEV; - } - - dev_add_pack(&toi_cluster_packet_type); - added_pack = 1; - - loopback_mode = (net_dev == init_net.loopback_dev); - num_local_nodes = loopback_mode ? 8 : 1; - - PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", - loopback_mode ? "on" : "off", num_local_nodes); - - cluster_startup(); - return 0; -} - -/* toi_cluster_close_iface - * - * Description: Stop using an interface. - */ - -static int toi_cluster_close_iface(void) -{ - kill_clusterd(); - if (added_pack) { - dev_remove_pack(&toi_cluster_packet_type); - added_pack = 0; - } - return 0; -} - -static void write_side_effect(void) -{ - if (toi_cluster_ops.enabled) { - toi_cluster_open_iface(); - set_toi_state(TOI_CLUSTER_MODE); - } else { - toi_cluster_close_iface(); - clear_toi_state(TOI_CLUSTER_MODE); - } -} - -static void node_write_side_effect(void) -{ -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, - NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, - write_side_effect), - SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), - SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, - 256, 0, NULL), - SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, - 256, 0, STRING), - SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, - 0) -}; - -/* - * Ops structure. 
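- *
- * Registered as a FILTER_MODULE, but only the config save/load, debug
- * and storage-estimate hooks are populated; unlike the compression
- * filter, the cluster module does not hook write_page/read_page.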
- */
-
-static struct toi_module_ops toi_cluster_ops = {
-	.type = FILTER_MODULE,
-	.name = "Cluster",
-	.directory = "cluster",
-	.module = THIS_MODULE,
-	.memory_needed = toi_cluster_memory_needed,
-	.print_debug_info = toi_cluster_print_debug_stats,
-	.save_config_info = toi_cluster_save_config_info,
-	.load_config_info = toi_cluster_load_config_info,
-	.storage_needed = toi_cluster_storage_needed,
-
-	.sysfs_data = sysfs_params,
-	.num_sysfs_entries = sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-#ifdef MODULE
-#define INIT static __init
-#define EXIT static __exit
-#else
-#define INIT
-#define EXIT
-#endif
-
-INIT int toi_cluster_init(void)
-{
-	int temp = toi_register_module(&toi_cluster_ops), i;
-	struct kobject *kobj = toi_cluster_ops.dir_kobj;
-
-	for (i = 0; i < MAX_LOCAL_NODES; i++) {
-		node_array[i].current_message = 0;
-		INIT_LIST_HEAD(&node_array[i].member_list);
-		init_waitqueue_head(&node_array[i].member_events);
-		spin_lock_init(&node_array[i].member_list_lock);
-		spin_lock_init(&node_array[i].receive_lock);
-
-		/* Set up sysfs entry */
-		node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
-				sizeof(node_array[i].sysfs_data.attr.name),
-				GFP_KERNEL);
-		sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d",
-				i);
-		node_array[i].sysfs_data.attr.mode = SYSFS_RW;
-		node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
-		node_array[i].sysfs_data.flags = 0;
-		node_array[i].sysfs_data.data.integer.variable =
-			(int *) &node_array[i].current_message;
-		node_array[i].sysfs_data.data.integer.minimum = 0;
-		node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
-		node_array[i].sysfs_data.write_side_effect =
-			node_write_side_effect;
-		toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
-	}
-
-	toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
-
-	if (toi_cluster_ops.enabled)
-		toi_cluster_open_iface();
-
-	return temp;
-}
-
-EXIT void toi_cluster_exit(void)
-{
-	int i;
-	toi_cluster_close_iface();
-
-	for (i = 0; i < MAX_LOCAL_NODES; i++)
-		toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
-				&node_array[i].sysfs_data);
-	toi_unregister_module(&toi_cluster_ops);
-}
-
-static int __init toi_cluster_iface_setup(char *iface)
-{
-	toi_cluster_ops.enabled = (*iface &&
-			strcmp(iface, "off"));
-
-	if (toi_cluster_ops.enabled)
-		strncpy(toi_cluster_iface, iface, strlen(iface));
-
-	return 1;
-}
-
-__setup("toi_cluster=", toi_cluster_iface_setup);
diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h
deleted file mode 100644
index 84356b304..000000000
--- a/kernel/power/tuxonice_cluster.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * kernel/power/tuxonice_cluster.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifdef CONFIG_TOI_CLUSTER
-extern int toi_cluster_init(void);
-extern void toi_cluster_exit(void);
-extern void toi_initiate_cluster_hibernate(void);
-#else
-static inline int toi_cluster_init(void) { return 0; }
-static inline void toi_cluster_exit(void) { }
-static inline void toi_initiate_cluster_hibernate(void) { }
-#endif
-
diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
deleted file mode 100644
index 84b85226d..000000000
--- a/kernel/power/tuxonice_compress.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * kernel/power/tuxonice_compress.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- * - * This file contains data compression routines for TuxOnIce, - * using cryptoapi. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -static int toi_expected_compression; - -static struct toi_module_ops toi_compression_ops; -static struct toi_module_ops *next_driver; - -static char toi_compressor_name[32] = "lzo"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - u8 *page_buffer; - struct crypto_comp *transform; - unsigned int len; - u8 *buffer_start; - u8 *output_buffer; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. - */ -static int toi_compress_crypto_prepare(void) -{ - int cpu; - - if (!*toi_compressor_name) { - printk(KERN_INFO "TuxOnIce: Compression enabled but no " - "compressor name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s compression transform.\n", - toi_compressor_name); - this->transform = NULL; - return 1; - } - - this->page_buffer = - (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); - - if (!this->page_buffer) { - printk(KERN_ERR - "Failed to allocate a page buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - - this->output_buffer = - (char *) vmalloc_32(OUT_BUF_SIZE); - - if (!this->output_buffer) { - printk(KERN_ERR - "Failed to allocate a output buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - } - - return 0; -} - -static int toi_compress_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_comp(this->transform); - this->transform = NULL; - } - - if (this->page_buffer) - toi_free_page(16, (unsigned long) this->page_buffer); - - this->page_buffer = NULL; - - if (this->output_buffer) - vfree(this->output_buffer); - - this->output_buffer = NULL; - } - - return 0; -} - -/* - * toi_compress_init - */ - -static int toi_compress_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_compress_bytes_in = 0; - toi_compress_bytes_out = 0; - - next_driver = toi_get_next_filter(&toi_compression_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_compress_rw_init() - */ - -static int toi_compress_rw_init(int rw, int stream_number) -{ - if (toi_compress_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise compression " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "compressing the image.\n"); - toi_compression_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_compress_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be compressed. - * - * Returns: 0 on success. 
Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. - */ -static int toi_compress_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - - if (ctx->transform) { - - ctx->buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_comp_compress(ctx->transform, - ctx->buffer_start, buf_size, - ctx->output_buffer, &ctx->len); - - TOI_UNMAP(buf_type, buffer_page); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d bytes", - cpu, index, ctx->len); - - if (!ret && ctx->len < buf_size) { /* some compression */ - output_buffer = ctx->output_buffer; - output_len = ctx->len; - out_buf_type = TOI_VIRT; - } - - } - - mutex_lock(&stats_lock); - - toi_compress_bytes_in += buf_size; - toi_compress_bytes_out += output_len; - - mutex_unlock(&stats_lock); - - if (!ret) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_compress_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules and decompress it until the input buffer - * is filled. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_compress_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - unsigned int outlen = PAGE_SIZE; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->transform) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't decompress - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); - - buffer_start = kmap(buffer_page); - - /* Error or uncompressed data */ - if (ret || len == PAGE_SIZE) { - memcpy(buffer_start, ctx->page_buffer, len); - goto out; - } - - ret = crypto_comp_decompress( - ctx->transform, - ctx->page_buffer, - len, buffer_start, &outlen); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d=>%d (%d).", - cpu, *index, len, outlen, ret); - - if (ret) - abort_hibernate(TOI_FAILED_IO, - "Compress_read returned %d.\n", ret); - else if (outlen != PAGE_SIZE) { - abort_hibernate(TOI_FAILED_IO, - "Decompression yielded %d bytes instead of %ld.\n", - outlen, PAGE_SIZE); - printk(KERN_ERR "Decompression yielded %d bytes instead of " - "%ld.\n", outlen, PAGE_SIZE); - ret = -EIO; - *buf_size = outlen; - } -out: - TOI_UNMAP(buf_type, buffer_page); - return ret; -} - -/* - * toi_compress_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_compress_print_debug_stats(char *buffer, int size) -{ - unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, - pages_out = toi_compress_bytes_out >> PAGE_SHIFT; - int len; - - /* Output the compression ratio achieved. 
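- *
- * Worked example (illustrative figures): 4096 pages in and 1638 pages
- * out gives (4096 - 1638) * 100 / 4096 == 60 percent compression; the
- * pages_in test below guards that division against zero.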
*/ - if (*toi_compressor_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_compressor_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (pages_in) - len += scnprintf(buffer+len, size - len, " Compressed " - "%lu bytes into %lu (%ld percent compression).\n", - toi_compress_bytes_in, - toi_compress_bytes_out, - (pages_in - pages_out) * 100 / pages_in); - return len; -} - -/* - * toi_compress_compression_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_compress_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_compress_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_compressor_name) + 1; -} - -/* - * toi_compress_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_compress_save_config_info(char *buffer) -{ - int len = strlen(toi_compressor_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_compress_bytes_in; - offset += sizeof(unsigned long); - *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = toi_expected_compression; - offset += sizeof(int); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_compressor_name, len); - return offset + len; -} - -/* toi_compress_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for decompressing the image at - * resume time. - */ -static void toi_compress_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_compress_bytes_in = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); - offset += sizeof(unsigned long); - toi_expected_compression = *((int *) (buffer + offset)); - offset += sizeof(int); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_compressor_name, buffer + offset, len); -} - -static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->compress_bytes_in = toi_compress_bytes_in; - bkd->compress_bytes_out = toi_compress_bytes_out; -} - -static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_compress_bytes_in = bkd->compress_bytes_in; - toi_compress_bytes_out = bkd->compress_bytes_out; -} - -/* - * toi_expected_compression_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 if the module is disabled. Otherwise the value set by the - * user via our sysfs entry. - */ - -static int toi_compress_expected_ratio(void) -{ - if (!toi_compression_ops.enabled) - return 100; - else - return 100 - toi_expected_compression; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, - 0, 99, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), -}; - -/* - * Ops structure. 
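- *
- * The write_page/read_page hooks place this module in the I/O
- * pipeline: toi_compress_init() finds the next filter via
- * toi_get_next_filter(), and compressed pages are handed on to it, so
- * further filters (e.g. encryption) can be chained after compression.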
- */ -static struct toi_module_ops toi_compression_ops = { - .type = FILTER_MODULE, - .name = "compression", - .directory = "compression", - .module = THIS_MODULE, - .initialise = toi_compress_init, - .memory_needed = toi_compress_memory_needed, - .print_debug_info = toi_compress_print_debug_stats, - .save_config_info = toi_compress_save_config_info, - .load_config_info = toi_compress_load_config_info, - .storage_needed = toi_compress_storage_needed, - .expected_compression = toi_compress_expected_ratio, - - .pre_atomic_restore = toi_compress_pre_atomic_restore, - .post_atomic_restore = toi_compress_post_atomic_restore, - - .rw_init = toi_compress_rw_init, - .rw_cleanup = toi_compress_rw_cleanup, - - .write_page = toi_compress_write_page, - .read_page = toi_compress_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_compress_load(void) -{ - return toi_register_module(&toi_compression_ops); -} - -late_initcall(toi_compress_load); diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c deleted file mode 100644 index eb627915e..000000000 --- a/kernel/power/tuxonice_copy_before_write.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * kernel/power/tuxonice_copy_before_write.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines (apart from the fault handling code) to deal with allocating memory - * for copying pages before they are modified, restoring the contents and getting - * the contents written to disk. - */ - -#include <linux/percpu-defs.h> -#include <linux/sched.h> -#include <linux/tuxonice.h> -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states); -#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw)) -#define toi_cbw_pool_size 100 - -static void _toi_free_cbw_data(struct toi_cbw_state *state) -{ - struct toi_cbw *page_ptr, *ptr, *next; - - page_ptr = ptr = state->first; - - while(ptr) { - next = ptr->next; - - if (ptr->virt) { - toi__free_page(40, virt_to_page(ptr->virt)); - } - if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) { - /* Must be on a new page - free the previous one. 
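-			 * (toi_cbw structs are carved CBWS_PER_PAGE at a
-			 * time out of whole pages, so a backing page can
-			 * only be freed once the walk crosses a page
-			 * boundary; the PAGE_MASK comparison above detects
-			 * that.)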
*/ - toi__free_page(40, virt_to_page(page_ptr)); - page_ptr = ptr; - } - ptr = next; - } - - if (page_ptr) { - toi__free_page(40, virt_to_page(page_ptr)); - } - - state->first = state->next = state->last = NULL; - state->size = 0; -} - -void toi_free_cbw_data(void) -{ - int i; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - if (!state->first) - continue; - - state->enabled = 0; - - while (state->active) { - schedule(); - } - - _toi_free_cbw_data(state); - } -} - -static int _toi_allocate_cbw_data(struct toi_cbw_state *state) -{ - while(state->size < toi_cbw_pool_size) { - int i; - struct toi_cbw *ptr; - - ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL); - - if (!ptr) { - return -ENOMEM; - } - - if (!state->first) { - state->first = state->next = state->last = ptr; - } - - for (i = 0; i < CBWS_PER_PAGE; i++) { - struct toi_cbw *cbw = &ptr[i]; - - cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL); - if (!cbw->virt) { - state->size += i; - printk("Out of memory allocating CBW pages.\n"); - return -ENOMEM; - } - - if (cbw == state->first) - continue; - - state->last->next = cbw; - state->last = cbw; - } - - state->size += CBWS_PER_PAGE; - } - - state->enabled = 1; - - return 0; -} - - -int toi_allocate_cbw_data(void) -{ - int i, result; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - result = _toi_allocate_cbw_data(state); - - if (result) - return result; - } - - return 0; -} - -void toi_cbw_restore(void) -{ - if (!toi_keeping_image) - return; - -} - -void toi_cbw_write(void) -{ - if (!toi_keeping_image) - return; - -} - -/** - * toi_cbw_test_read - Test copy before write on one page - * - * Allocate copy before write buffers, then make one page only copy-before-write - * and attempt to write to it. We should then be able to retrieve the original - * version from the cbw buffer and the modified version from the page itself. - */ -static int toi_cbw_test_read(const char *buffer, int count) -{ - unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL); - char *original = "Original contents"; - char *modified = "Modified material"; - struct page *page = virt_to_page(virt); - int i, len = 0, found = 0, pfn = page_to_pfn(page); - - if (!page) { - printk("toi_cbw_test_read: Unable to allocate a page for testing.\n"); - return -ENOMEM; - } - - memcpy((char *) virt, original, strlen(original)); - - if (toi_allocate_cbw_data()) { - printk("toi_cbw_test_read: Unable to allocate cbw data.\n"); - return -ENOMEM; - } - - toi_reset_dirtiness_one(pfn, 0); - - SetPageTOI_CBW(page); - - memcpy((char *) virt, modified, strlen(modified)); - - if (strncmp((char *) virt, modified, strlen(modified))) { - len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n"); - } - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - struct toi_cbw *ptr = state->first, *last_ptr = ptr; - - if (!found) { - while (ptr) { - if (ptr->pfn == pfn) { - found = 1; - if (strncmp(ptr->virt, original, strlen(original))) { - len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n"); - } else { - len += sprintf((char *) buffer + len, "Test passed. 
Buffer changed and original contents preserved.\n");
-					}
-					break;
-				}
-
-				last_ptr = ptr;
-				ptr = ptr->next;
-			}
-		}
-
-		if (!last_ptr)
-			len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i);
-	}
-
-	if (!found)
-		len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n");
-
-	toi_free_cbw_data();
-
-	return len;
-}
-
-/*
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read,
-			NULL, SYSFS_NEEDS_SM_FOR_READ, NULL),
-};
-
-static struct toi_module_ops toi_cbw_ops = {
-	.type = MISC_HIDDEN_MODULE,
-	.name = "copy_before_write debugging",
-	.directory = "cbw",
-	.module = THIS_MODULE,
-	.early = 1,
-
-	.sysfs_data = sysfs_params,
-	.num_sysfs_entries = sizeof(sysfs_params) /
-		sizeof(struct toi_sysfs_data),
-};
-
-int toi_cbw_init(void)
-{
-	int result = toi_register_module(&toi_cbw_ops);
-	return result;
-}
diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c
deleted file mode 100644
index 522c836ad..000000000
--- a/kernel/power/tuxonice_extent.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * kernel/power/tuxonice_extent.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * These functions encapsulate the manipulation of storage metadata.
- */
-
-#include <linux/suspend.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-#include "tuxonice.h"
-
-/**
- * toi_get_extent - return a free extent
- *
- * May fail, returning NULL instead.
- **/
-static struct hibernate_extent *toi_get_extent(void)
-{
-	return (struct hibernate_extent *) toi_kzalloc(2,
-			sizeof(struct hibernate_extent), TOI_ATOMIC_GFP);
-}
-
-/**
- * toi_put_extent_chain_from - free a chain of extents starting from value 'from'
- * @chain: Chain to free.
- *
- * Note that 'from' is an extent value, and may be part way through an extent.
- * In this case, the extent should be truncated (if necessary) and following
- * extents freed.
- **/
-void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from)
-{
-	struct hibernate_extent *this;
-
-	this = chain->first;
-
-	while (this) {
-		struct hibernate_extent *next = this->next;
-
-		// Delete the whole extent?
-		if (this->start >= from) {
-			chain->size -= (this->end - this->start + 1);
-			if (chain->first == this)
-				chain->first = next;
-			if (chain->last_touched == this)
-				chain->last_touched = NULL;
-			if (chain->current_extent == this)
-				chain->current_extent = NULL;
-			toi_kfree(2, this, sizeof(*this));
-			chain->num_extents--;
-		} else if (this->end >= from) {
-			// Truncate the extent: keep [start, from - 1]
-			chain->size -= (this->end - from + 1);
-			this->end = from - 1;
-		}
-		this = next;
-	}
-}
-
-/**
- * toi_put_extent_chain - free a whole chain of extents
- * @chain: Chain to free.
- **/
-void toi_put_extent_chain(struct hibernate_extent_chain *chain)
-{
-	toi_put_extent_chain_from(chain, 0);
-}
-
-/**
- * toi_add_to_extent_chain - add an extent to an existing chain
- * @chain: Chain to which the extent should be added
- * @start: Start of the extent (first physical block)
- * @end: End of the extent (last physical block)
- *
- * The chain information is updated if the insertion is successful.
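- *
- * Example (illustrative): starting from an empty chain, adding 10-19
- * and then 20-29 merges into a single extent 10-29 with size 20;
- * adding 40-49 afterwards starts a second extent.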
- **/ -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end) -{ - struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Adding extent %lu-%lu to chain %p.\n", start, end, chain); - - /* Find the right place in the chain */ - if (chain->last_touched && chain->last_touched->start < start) - cur_ext = chain->last_touched; - else if (chain->first && chain->first->start < start) - cur_ext = chain->first; - - if (cur_ext) { - while (cur_ext->next && cur_ext->next->start < start) - cur_ext = cur_ext->next; - - if (cur_ext->end == (start - 1)) { - struct hibernate_extent *next_ext = cur_ext->next; - cur_ext->end = end; - - /* Merge with the following one? */ - if (next_ext && cur_ext->end + 1 == next_ext->start) { - cur_ext->end = next_ext->end; - cur_ext->next = next_ext->next; - toi_kfree(2, next_ext, sizeof(*next_ext)); - chain->num_extents--; - } - - chain->last_touched = cur_ext; - chain->size += (end - start + 1); - - return 0; - } - } - - new_ext = toi_get_extent(); - if (!new_ext) { - printk(KERN_INFO "Error unable to append a new extent to the " - "chain.\n"); - return -ENOMEM; - } - - chain->num_extents++; - chain->size += (end - start + 1); - new_ext->start = start; - new_ext->end = end; - - chain->last_touched = new_ext; - - if (cur_ext) { - new_ext->next = cur_ext->next; - cur_ext->next = new_ext; - } else { - if (chain->first) - new_ext->next = chain->first; - chain->first = new_ext; - } - - return 0; -} diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h deleted file mode 100644 index aeccf1f5e..000000000 --- a/kernel/power/tuxonice_extent.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_extent.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations related to extents. Extents are - * TuxOnIce's method of storing some of the metadata for the image. - * See tuxonice_extent.c for more info. - * - */ - -#include "tuxonice_modules.h" - -#ifndef EXTENT_H -#define EXTENT_H - -struct hibernate_extent { - unsigned long start, end; - struct hibernate_extent *next; -}; - -struct hibernate_extent_chain { - unsigned long size; /* size of the chain ie sum (max-min+1) */ - int num_extents; - struct hibernate_extent *first, *last_touched; - struct hibernate_extent *current_extent; - unsigned long current_offset; -}; - -/* Simplify iterating through all the values in an extent chain */ -#define toi_extent_for_each(extent_chain, extentpointer, value) \ -if ((extent_chain)->first) \ - for ((extentpointer) = (extent_chain)->first, (value) = \ - (extentpointer)->start; \ - ((extentpointer) && ((extentpointer)->next || (value) <= \ - (extentpointer)->end)); \ - (((value) == (extentpointer)->end) ? \ - ((extentpointer) = (extentpointer)->next, (value) = \ - ((extentpointer) ? (extentpointer)->start : 0)) : \ - (value)++)) - -extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from); -#endif diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c deleted file mode 100644 index baf191211..000000000 --- a/kernel/power/tuxonice_file.c +++ /dev/null @@ -1,484 +0,0 @@ -/* - * kernel/power/tuxonice_file.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of a simple file as a - * backing store. 
It is based upon the swapallocator, and shares the - * same basic working. Here, though, we have nothing to do with - * swapspace, and only one device to worry about. - * - * The user can just - * - * echo TuxOnIce > /path/to/my_file - * - * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file - * - * and - * - * echo /path/to/my_file > /sys/power/tuxonice/file/target - * - * then put what they find in /sys/power/tuxonice/resume - * as their resume= parameter in lilo.conf (and rerun lilo if using it). - * - * Having done this, they're ready to hibernate and resume. - * - * TODO: - * - File resizing. - */ - -#include <linux/blkdev.h> -#include <linux/mount.h> -#include <linux/fs.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" - -#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) - -static struct toi_module_ops toi_fileops; - -static struct file *target_file; -static struct block_device *toi_file_target_bdev; -static unsigned long pages_available, pages_allocated; -static char toi_file_target[256]; -static struct inode *target_inode; -static int file_target_priority; -static int used_devt; -static int target_claim; -static dev_t toi_file_dev_t; -static int sig_page_index; - -/* For test_toi_file_target */ -static struct toi_bdev_info *file_chain; - -static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) -{ - int j; - sector_t last = 0; - - for (j = 0; j < dev_info->blocks_per_page; j++) { - sector_t this = bmap(target_inode, - page_num * dev_info->blocks_per_page + j); - - if (!this || (last && (last + 1) != this)) - break; - - last = this; - } - - return j == dev_info->blocks_per_page; -} - -static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) -{ - unsigned long result = 0; - struct block_device *bdev = dev_info->bdev; - int i; - - switch (target_inode->i_mode & S_IFMT) { - case S_IFSOCK: - case S_IFCHR: - case S_IFIFO: /* Socket, Char, Fifo */ - return -1; - case S_IFREG: /* Regular file: current size - holes + free - space on part */ - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { - if (has_contiguous_blocks(dev_info, i)) - result++; - } - break; - case S_IFBLK: /* Block device */ - if (!bdev->bd_disk) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "bdev->bd_disk null."); - return 0; - } - - result = (bdev->bd_part ? 
- bdev->bd_part->nr_sects : - get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); - } - - - return result; -} - -static int toi_file_register_storage(void) -{ - struct toi_bdev_info *devinfo; - int result = 0; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); - if (!strlen(toi_file_target)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " - "No target filename set."); - return 0; - } - - target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); - toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", - toi_file_target, target_file); - - if (IS_ERR(target_file) || !target_file) { - target_file = NULL; - toi_file_dev_t = name_to_dev_t(toi_file_target); - if (!toi_file_dev_t) { - struct kstat stat; - int error = vfs_stat(toi_file_target, &stat); - printk(KERN_INFO "Open file %s returned %p and " - "name_to_devt failed.\n", - toi_file_target, target_file); - if (error) { - printk(KERN_INFO "Stating the file also failed." - " Nothing more we can do.\n"); - return 0; - } else - toi_file_dev_t = stat.rdev; - } - - toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); - if (IS_ERR(toi_file_target_bdev)) { - printk(KERN_INFO "Got a dev_num (%lx) but failed to " - "open it.\n", - (unsigned long) toi_file_dev_t); - toi_file_target_bdev = NULL; - return 0; - } - used_devt = 1; - target_inode = toi_file_target_bdev->bd_inode; - } else - target_inode = target_file->f_mapping->host; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); - if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || - S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { - printk(KERN_INFO "File support works with regular files," - " character files and block devices.\n"); - /* Cleanup routine will undo the above */ - return 0; - } - - if (!used_devt) { - if (S_ISBLK(target_inode->i_mode)) { - toi_file_target_bdev = I_BDEV(target_inode); - if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | - FMODE_READ, NULL)) - target_claim = 1; - } else - toi_file_target_bdev = target_inode->i_sb->s_bdev; - if (!toi_file_target_bdev) { - printk(KERN_INFO "%s is not a valid file allocator " - "target.\n", toi_file_target); - return 0; - } - toi_file_dev_t = toi_file_target_bdev->bd_dev; - } - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); - return -ENOMEM; - } - - devinfo->bdev = toi_file_target_bdev; - devinfo->allocator = &toi_fileops; - devinfo->allocator_index = 0; - - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - /* Unlike swap code, only complain if fs_info_from_block_dev returned - * -ENOMEM. The 'file' might be a full partition, so might validly not - * have an identifiable type, UUID etc. - */ - if (result) - printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", - result); - devinfo->dev_t = toi_file_dev_t; - devinfo->prio = file_target_priority; - devinfo->bmap_shift = target_inode->i_blkbits - 9; - devinfo->blocks_per_page = - (1 << (PAGE_SHIFT - target_inode->i_blkbits)); - sprintf(devinfo->name, "file %s", toi_file_target); - file_chain = devinfo; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " - "shift is %d. 
Blocks per page %d.", - devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, - devinfo->blocks_per_page); - - /* Keep one aside for the signature */ - pages_available = get_usable_pages(devinfo) - 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " - "pages.", pages_available); - - toi_bio_ops.register_storage(devinfo); - return 0; -} - -static unsigned long toi_file_storage_available(void) -{ - return pages_available; -} - -static int toi_file_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long available = pages_available - pages_allocated; - unsigned long to_add = min(available, request); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " - "is %lu. Allocating %lu pages from file.", - pages_available, pages_allocated, to_add); - pages_allocated += to_add; - - return to_add; -} - -/** - * __populate_block_list - add an extent to the chain - * @min: Start of the extent (first physical block = sector) - * @max: End of the extent (last physical block = sector) - * - * If TOI_TEST_BIO is set, print a debug message, outputting the min and max - * fs block numbers. - **/ -static int __populate_block_list(struct toi_bdev_info *chain, int min, int max) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.", - min << chain->bmap_shift, - ((max + 1) << chain->bmap_shift) - 1); - - return toi_add_to_extent_chain(&chain->blocks, min, max); -} - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0; - unsigned long pages_mapped = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks."); - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - if (!target_is_normal_file()) { - result = (pages_available > 0) ? - __populate_block_list(chain, chain->blocks_per_page, - (pages_allocated + 1) * - chain->blocks_per_page - 1) : 0; - return result; - } - - /* - * FIXME: We are assuming the first page is contiguous. Is that - * assumption always right? - */ - - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) { - sector_t new_sector; - - if (!has_contiguous_blocks(chain, i)) - continue; - - if (!have_sig_page) { - have_sig_page = 1; - sig_page_index = i; - continue; - } - - pages_mapped++; - - /* Ignore first page - it has the header */ - if (pages_mapped == 1) - continue; - - new_sector = bmap(target_inode, (i * chain->blocks_per_page)); - - /* - * I'd love to be able to fill in holes and resize - * files, but not yet... 
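-		 * Each mapped page contributes blocks_per_page consecutive
-		 * sectors: when bmap() returns a sector exactly one past
-		 * extent_max the current extent is extended below; otherwise
-		 * the finished extent is flushed to the chain and a new one
-		 * begins at new_sector.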
- */
-
-		if (new_sector == extent_max + 1)
-			extent_max += chain->blocks_per_page;
-		else {
-			if (extent_min > -1) {
-				result = __populate_block_list(chain,
-						extent_min, extent_max);
-				if (result)
-					return result;
-			}
-
-			extent_min = new_sector;
-			extent_max = extent_min +
-				chain->blocks_per_page - 1;
-		}
-
-		if (pages_mapped == pages_allocated)
-			break;
-	}
-
-	if (extent_min > -1) {
-		result = __populate_block_list(chain, extent_min, extent_max);
-		if (result)
-			return result;
-	}
-
-	return 0;
-}
-
-static void toi_file_free_storage(struct toi_bdev_info *chain)
-{
-	pages_allocated = 0;
-	file_chain = NULL;
-}
-
-/**
- * toi_file_print_debug_stats - print debug info
- * @buffer: Buffer to populate
- * @size: Size of the buffer
- **/
-static int toi_file_print_debug_stats(char *buffer, int size)
-{
-	int len = scnprintf(buffer, size, "- File Allocator active.\n");
-
-	len += scnprintf(buffer+len, size-len, " Storage available for "
-			"image: %lu pages.\n", pages_available);
-
-	return len;
-}
-
-static void toi_file_cleanup(int finishing_cycle)
-{
-	if (toi_file_target_bdev) {
-		if (target_claim) {
-			blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
-			target_claim = 0;
-		}
-
-		if (used_devt) {
-			blkdev_put(toi_file_target_bdev,
-					FMODE_READ | FMODE_NDELAY);
-			used_devt = 0;
-		}
-		toi_file_target_bdev = NULL;
-		target_inode = NULL;
-	}
-
-	if (target_file) {
-		filp_close(target_file, NULL);
-		target_file = NULL;
-	}
-
-	pages_available = 0;
-}
-
-/**
- * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target
- *
- * Test whether the target file is valid for hibernating.
- **/
-static void test_toi_file_target(void)
-{
-	int result = toi_file_register_storage();
-	sector_t sector;
-	char buf[50];
-	struct fs_info *fs_info;
-
-	if (result || !file_chain)
-		return;
-
-	/* This doesn't mean we're in business. Is any storage available?
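-	 * A target can open cleanly and still expose no usable pages
-	 * (e.g. an empty regular file); in that case resume_file is left
-	 * untouched.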
*/ - if (!pages_available) - goto out; - - toi_file_allocate_storage(file_chain, 1); - result = get_main_pool_phys_params(file_chain); - if (result) - goto out; - - - sector = bmap(target_inode, sig_page_index * - file_chain->blocks_per_page) << file_chain->bmap_shift; - - /* Use the uuid, or the dev_t if that fails */ - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (!fs_info || IS_ERR(fs_info)) { - bdevname(toi_file_target_bdev, buf); - sprintf(resume_file, "/dev/%s:%llu", buf, - (unsigned long long) sector); - } else { - int i; - hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); - - /* Remove the spaces */ - for (i = 1; i < 16; i++) { - buf[2 * i] = buf[3 * i]; - buf[2 * i + 1] = buf[3 * i + 1]; - } - buf[32] = 0; - sprintf(resume_file, "UUID=%s:0x%llx", buf, - (unsigned long long) sector); - free_fs_info(fs_info); - } - - toi_attempt_to_parse_resume_device(0); -out: - toi_file_free_storage(file_chain); - toi_bio_ops.free_storage(); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, - SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), - SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), - SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, - 4096, 0, NULL), -}; - -static struct toi_bio_allocator_ops toi_bio_fileops = { - .register_storage = toi_file_register_storage, - .storage_available = toi_file_storage_available, - .allocate_storage = toi_file_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_file_free_storage, -}; - -static struct toi_module_ops toi_fileops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "file storage", - .directory = "file", - .module = THIS_MODULE, - .print_debug_info = toi_file_print_debug_stats, - .cleanup = toi_file_cleanup, - .bio_allocator_ops = &toi_bio_fileops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_file_load(void) -{ - return toi_register_module(&toi_fileops); -} - -late_initcall(toi_file_load); diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c deleted file mode 100644 index 13bb93811..000000000 --- a/kernel/power/tuxonice_highlevel.c +++ /dev/null @@ -1,1414 +0,0 @@ -/* - * kernel/power/tuxonice_highlevel.c - */ -/** \mainpage TuxOnIce. - * - * TuxOnIce provides support for saving and restoring an image of - * system memory to an arbitrary storage device, either on the local computer, - * or across some network. The support is entirely OS based, so TuxOnIce - * works without requiring BIOS, APM or ACPI support. The vast majority of the - * code is also architecture independant, so it should be very easy to port - * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem - * and preemption. Initramfses and initrds are also supported. - * - * TuxOnIce uses a modular design, in which the method of storing the image is - * completely abstracted from the core code, as are transformations on the data - * such as compression and/or encryption (multiple 'modules' can be used to - * provide arbitrary combinations of functionality). The user interface is also - * modular, so that arbitrarily simple or complex interfaces can be used to - * provide anything from debugging information through to eye candy. - * - * \section Copyright - * - * TuxOnIce is released under the GPLv2. 
- * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR> - * - * \section Credits - * - * Nigel would like to thank the following people for their work: - * - * Bernard Blackham <bernard@blackham.com.au><BR> - * Web page & Wiki administration, some coding. A person without whom - * TuxOnIce would not be where it is. - * - * Michael Frank <mhf@linuxmail.org><BR> - * Extensive testing and help with improving stability. I was constantly - * amazed by the quality and quantity of Michael's help. - * - * Pavel Machek <pavel@ucw.cz><BR> - * Modifications, defectiveness pointing, being with Gabor at the very - * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and - * 2.5.17. Even though Pavel and I disagree on the direction suspend to - * disk should take, I appreciate the valuable work he did in helping Gabor - * get the concept working. - * - * ..and of course the myriads of TuxOnIce users who have helped diagnose - * and fix bugs, made suggestions on how to improve the code, proofread - * documentation, and donated time and money. - * - * Thanks also to corporate sponsors: - * - * <B>Redhat.</B>Sometime employer from May 2006 (my fault, not Redhat's!). - * - * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who - * allowed him to work on TuxOnIce and PM related issues on company time. - * - * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct - * 2003 to Jan 2004. - * - * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing - * maintenance of SMP and Highmem support. - * - * <B>OSDL.</B> Provided access to various hardware configurations, make - * occasional small donations to the project. - */ - -#include <linux/suspend.h> -#include <linux/module.h> -#include <linux/freezer.h> -#include <generated/utsrelease.h> -#include <linux/cpu.h> -#include <linux/console.h> -#include <linux/writeback.h> -#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */ -#include <linux/bio.h> -#include <linux/kgdb.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_storage.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_cluster.h" - -/*! Pageset metadata. 
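- *
- * Two pagesets are maintained: pageset1 holds pages that must be
- * copied atomically because they can change while the image is being
- * written, while pageset2 holds pages such as the page cache that can
- * be written in place and reloaded before the atomic restore.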
*/ -struct pagedir pagedir2 = {2}; - -static mm_segment_t oldfs; -static DEFINE_MUTEX(tuxonice_in_use); -static int block_dump_save; - -int toi_trace_index; - -/* Binary signature if an image is present */ -char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; - -unsigned long boot_kernel_data_buffer; - -static char *result_strings[] = { - "Hibernation was aborted", - "The user requested that we cancel the hibernation", - "No storage was available", - "Insufficient storage was available", - "Freezing filesystems and/or tasks failed", - "A pre-existing image was used", - "We would free memory, but image size limit doesn't allow this", - "Unable to free enough memory to hibernate", - "Unable to obtain the Power Management Semaphore", - "A device suspend/resume returned an error", - "A system device suspend/resume returned an error", - "The extra pages allowance is too small", - "We were unable to successfully prepare an image", - "TuxOnIce module initialisation failed", - "TuxOnIce module cleanup failed", - "I/O errors were encountered", - "Ran out of memory", - "An error was encountered while reading the image", - "Platform preparation failed", - "CPU Hotplugging failed", - "Architecture specific preparation failed", - "Pages needed resaving, but we were told to abort if this happens", - "We can't hibernate at the moment (invalid resume= or filewriter " - "target?)", - "A hibernation preparation notifier chain member cancelled the " - "hibernation", - "Pre-snapshot preparation failed", - "Pre-restore preparation failed", - "Failed to disable usermode helpers", - "Can't resume from alternate image", - "Header reservation too small", - "Device Power Management Preparation failed", -}; - -/** - * toi_finish_anything - cleanup after doing anything - * @hibernate_or_resume: Whether finishing a cycle or attempt at - * resuming. - * - * This is our basic clean-up routine, matching start_anything below. We - * call cleanup routines, drop module references and restore process fs and - * cpus allowed masks, together with the global block_dump variable's value. - **/ -void toi_finish_anything(int hibernate_or_resume) -{ - toi_running = 0; - toi_cleanup_modules(hibernate_or_resume); - toi_put_modules(); - if (hibernate_or_resume) { - block_dump = block_dump_save; - set_cpus_allowed_ptr(current, cpu_all_mask); - toi_alloc_print_debug_stats(); - atomic_inc(&snapshot_device_available); - unlock_system_sleep(); - } - - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); -} - -/** - * toi_start_anything - basic initialisation for TuxOnIce - * @toi_or_resume: Whether starting a cycle or attempt at resuming. - * - * Our basic initialisation routine. Take references on modules, use the - * kernel segment, recheck resume= if no active allocator is set, initialise - * modules, save and reset block_dump and ensure we're running on CPU0. 
- **/ -int toi_start_anything(int hibernate_or_resume) -{ - mutex_lock(&tuxonice_in_use); - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - toi_trace_index = 0; - - if (hibernate_or_resume) { - lock_system_sleep(); - - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - goto snapshotdevice_unavailable; - } - - if (hibernate_or_resume == SYSFS_HIBERNATE) - toi_print_modules(); - - if (toi_get_modules()) { - printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); - goto prehibernate_err; - } - - if (hibernate_or_resume) { - block_dump_save = block_dump; - block_dump = 0; - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - } - - if (toi_initialise_modules_early(hibernate_or_resume)) - goto early_init_err; - - if (!toiActiveAllocator) - toi_attempt_to_parse_resume_device(!hibernate_or_resume); - - if (!toi_initialise_modules_late(hibernate_or_resume)) { - toi_running = 1; /* For the swsusp code we use :< */ - return 0; - } - - toi_cleanup_modules(hibernate_or_resume); -early_init_err: - if (hibernate_or_resume) { - block_dump_save = block_dump; - set_cpus_allowed_ptr(current, cpu_all_mask); - } - toi_put_modules(); -prehibernate_err: - if (hibernate_or_resume) - atomic_inc(&snapshot_device_available); -snapshotdevice_unavailable: - if (hibernate_or_resume) - mutex_unlock(&pm_mutex); - release_super_lock(); - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); - return -EBUSY; -} - -/* - * Nosave page tracking. - * - * Here rather than in prepare_image because we want to do it once only at the - * start of a cycle. - */ - -/** - * mark_nosave_pages - set up our Nosave bitmap - * - * Build a bitmap of Nosave pages from the list. The bitmap allows faster - * use when preparing the image. - **/ -static void mark_nosave_pages(void) -{ - struct nosave_region *region; - - list_for_each_entry(region, &nosave_regions, list) { - unsigned long pfn; - - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - SetPageNosave(pfn_to_page(pfn)); - } - } -} - -/** - * allocate_bitmaps - allocate bitmaps used to record page states - * - * Allocate the bitmaps we use to record the various TuxOnIce related - * page states. - **/ -static int allocate_bitmaps(void) -{ - if (toi_alloc_bitmap(&pageset1_map) || - toi_alloc_bitmap(&pageset1_copy_map) || - toi_alloc_bitmap(&pageset2_map) || - toi_alloc_bitmap(&io_map) || - toi_alloc_bitmap(&nosave_map) || - toi_alloc_bitmap(&free_map) || - toi_alloc_bitmap(&compare_map) || - toi_alloc_bitmap(&page_resave_map)) - return 1; - - return 0; -} - -/** - * free_bitmaps - free the bitmaps used to record page states - * - * Free the bitmaps allocated above. It is not an error to call - * memory_bm_free on a bitmap that isn't currently allocated. - **/ -static void free_bitmaps(void) -{ - toi_free_bitmap(&pageset1_map); - toi_free_bitmap(&pageset1_copy_map); - toi_free_bitmap(&pageset2_map); - toi_free_bitmap(&io_map); - toi_free_bitmap(&nosave_map); - toi_free_bitmap(&free_map); - toi_free_bitmap(&compare_map); - toi_free_bitmap(&page_resave_map); -} - -/** - * io_MB_per_second - return the number of MB/s read or written - * @write: Whether to return the speed at which we wrote. - * - * Calculate the number of megabytes per second that were read or written. - **/ -static int io_MB_per_second(int write) -{ - return (toi_bkd.toi_io_time[write][1]) ? - MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / - toi_bkd.toi_io_time[write][1] : 0; -} - -#define SNPRINTF(a...) 
do { len += scnprintf(((char *) buffer) + len, \ - count - len - 1, ## a); } while (0) - -/** - * get_debug_info - fill a buffer with debugging information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_debug_info(const char *buffer, int count) -{ - int len = 0, i, first_result = 1; - - SNPRINTF("TuxOnIce debugging info:\n"); - SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); - SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); - SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__); - SNPRINTF("- Attempt number : %d\n", nr_hibernates); - SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", - toi_result, - toi_bkd.toi_action, - toi_bkd.toi_debug_state, - toi_bkd.toi_default_console_level, - image_size_limit, - toi_poweroff_method); - SNPRINTF("- Overall expected compression percentage: %d.\n", - 100 - toi_expected_compression_ratio()); - len += toi_print_module_debug_info(((char *) buffer) + len, - count - len - 1); - if (toi_bkd.toi_io_time[0][1]) { - if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { - SNPRINTF("- I/O speed: Write %ld KB/s", - (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld KB/s", - (KB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } else { - SNPRINTF("- I/O speed: Write %ld MB/s", - (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld MB/s", - (MB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } - SNPRINTF(".\n"); - } else - SNPRINTF("- No I/O speed stats available.\n"); - SNPRINTF("- Extra pages : %lu used/%lu.\n", - extra_pd1_pages_used, extra_pd1_pages_allowance); - - for (i = 0; i < TOI_NUM_RESULT_STATES; i++) - if (test_result_state(i)) { - SNPRINTF("%s: %s.\n", first_result ? - "- Result " : - " ", - result_strings[i]); - first_result = 0; - } - if (first_result) - SNPRINTF("- Result : %s.\n", nr_hibernates ? - "Succeeded" : - "No hibernation attempts so far"); - return len; -} - -#ifdef CONFIG_TOI_INCREMENTAL -/** - * get_toi_page_state - fill a buffer with page state information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_page_state(const char *buffer, int count) -{ - int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0; - int len = 0; - struct zone *zone; - int allocated_bitmaps = 0; - - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - - if (!free_map) { - BUG_ON(toi_alloc_bitmap(&free_map)); - allocated_bitmaps = 1; - } - - toi_generate_free_page_map(); - - for_each_populated_zone(zone) { - unsigned long loop; - - total += zone->spanned_pages; - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - /* - * If the page gets allocated, it will be need - * saving in an image. - * Don't bother with explicitly removing any - * RO protection applied below. 
- * We'll SetPageTOI_Dirty(page) if/when it - * gets allocated. - */ - free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageTOI_Untracked(page)) { - untracked++; - } else if (PageTOI_RO(page)) { - ro++; - } else if (PageTOI_Dirty(page)) { - dirty++; - } else { - printk("Page %ld state 'other'.\n", pfn); - other++; - } - } - } - - if (allocated_bitmaps) { - toi_free_bitmap(&free_map); - } - - set_cpus_allowed_ptr(current, cpu_all_mask); - - SNPRINTF("TuxOnIce page breakdown:\n"); - SNPRINTF("- Free : %d\n", free); - SNPRINTF("- Untracked : %d\n", untracked); - SNPRINTF("- Read only : %d\n", ro); - SNPRINTF("- Dirty : %d\n", dirty); - SNPRINTF("- Other : %d\n", other); - SNPRINTF("- Invalid : %d\n", invalid); - SNPRINTF("- Total : %d\n", total); - return len; -} -#endif - -/** - * do_cleanup - cleanup after attempting to hibernate or resume - * @get_debug_info: Whether to allocate and return debugging info. - * - * Cleanup after attempting to hibernate or resume, possibly getting - * debugging info as we do so. - **/ -static void do_cleanup(int get_debug_info, int restarting) -{ - int i = 0; - char *buffer = NULL; - - trap_non_toi_io = 0; - - if (get_debug_info) - toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); - - free_checksum_pages(); - - toi_cbw_restore(); - toi_free_cbw_data(); - - if (get_debug_info) - buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); - - if (buffer) - i = get_toi_debug_info(buffer, PAGE_SIZE); - - toi_free_extra_pagedir_memory(); - - pagedir1.size = 0; - pagedir2.size = 0; - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - if (boot_kernel_data_buffer) { - if (!test_toi_state(TOI_BOOT_KERNEL)) - toi_free_page(37, boot_kernel_data_buffer); - boot_kernel_data_buffer = 0; - } - - if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) { - unlock_device_hotplug(); - clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - } - - clear_toi_state(TOI_BOOT_KERNEL); - if (current->flags & PF_SUSPEND_TASK) - thaw_processes(); - - if (!restarting) - toi_stop_other_threads(); - - if (toi_keeping_image && - !test_result_state(TOI_ABORTED)) { - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - "TuxOnIce: Not invalidating the image due " - "to Keep Image or Incremental Image being enabled."); - set_result_state(TOI_KEPT_IMAGE); - - /* - * For an incremental image, free unused storage so - * swap (if any) can be used for normal system operation, - * if so desired. - */ - - toiActiveAllocator->free_unused_storage(); - } else - if (toiActiveAllocator) - toiActiveAllocator->remove_image(); - - free_bitmaps(); - usermodehelper_enable(); - - if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { - pm_notifier_call_chain(PM_POST_HIBERNATION); - clear_toi_state(TOI_NOTIFIERS_PREPARE); - } - - if (buffer && i) { - /* Printk can only handle 1023 bytes, including - * its level mangling. */ - for (i = 0; i < 3; i++) - printk(KERN_ERR "%s", buffer + (1023 * i)); - toi_free_page(20, (unsigned long) buffer); - } - - if (!restarting) - toi_cleanup_console(); - - free_attention_list(); - - if (!restarting) - toi_deactivate_storage(0); - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * check_still_keeping_image - we kept an image; check whether to reuse it. - * - * We enter this routine when we have kept an image. If the user has said they - * want to still keep it, all we need to do is powerdown. 
If powering down - * means hibernating to ram and the power doesn't run out, we'll return 1. - * If we do power off properly or the battery runs out, we'll resume via the - * normal paths. - * - * If the user has said they want to remove the previously kept image, we - * remove it, and return 0. We'll then store a new image. - **/ -static int check_still_keeping_image(void) -{ - if (toi_keeping_image) { - if (!test_action_state(TOI_INCREMENTAL_IMAGE)) { - printk(KERN_INFO "Image already stored: powering down " - "immediately."); - do_toi_step(STEP_HIBERNATE_POWERDOWN); - return 1; - } - /** - * Incremental image - need to write new part. - * We detect that we're writing an incremental image by looking - * at test_result_state(TOI_KEPT_IMAGE) - **/ - return 0; - } - - printk(KERN_INFO "Invalidating previous image.\n"); - toiActiveAllocator->remove_image(); - - return 0; -} - -/** - * toi_init - prepare to hibernate to disk - * - * Initialise variables & data structures, in preparation for - * hibernating to disk. - **/ -static int toi_init(int restarting) -{ - int result, i, j; - - toi_result = 0; - - printk(KERN_INFO "Initiating a hibernation cycle.\n"); - - nr_hibernates++; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - toi_bkd.toi_io_time[i][j] = 0; - - if (!test_toi_state(TOI_CAN_HIBERNATE) || - allocate_bitmaps()) - return 1; - - mark_nosave_pages(); - - if (!restarting) - toi_prepare_console(); - - result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (result) { - set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); - return 1; - } - set_toi_state(TOI_NOTIFIERS_PREPARE); - - if (!restarting) { - printk(KERN_ERR "Starting other threads."); - toi_start_other_threads(); - } - - result = usermodehelper_disable(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to disable usermode " - "helpers\n"); - set_result_state(TOI_USERMODE_HELPERS_ERR); - return 1; - } - - boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); - if (!boot_kernel_data_buffer) { - printk(KERN_ERR "TuxOnIce: Failed to allocate " - "boot_kernel_data_buffer.\n"); - set_result_state(TOI_OUT_OF_MEMORY); - return 1; - } - - toi_allocate_cbw_data(); - - return 0; -} - -/** - * can_hibernate - perform basic 'Can we hibernate?' tests - * - * Perform basic tests that must pass if we're going to be able to hibernate: - * Can we get the pm_mutex? Is resume= valid (we need to know where to write - * the image header). - **/ -static int can_hibernate(void) -{ - if (!test_toi_state(TOI_CAN_HIBERNATE)) - toi_attempt_to_parse_resume_device(0); - - if (!test_toi_state(TOI_CAN_HIBERNATE)) { - printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" - "This may be because you haven't put something along " - "the lines of\n\nresume=swap:/dev/hda1\n\n" - "in lilo.conf or equivalent. (Where /dev/hda1 is your " - "swap partition).\n"); - set_abort_result(TOI_CANT_SUSPEND); - return 0; - } - - if (strlen(alt_resume_param)) { - attempt_to_parse_alt_resume_param(); - - if (!strlen(alt_resume_param)) { - printk(KERN_INFO "Alternate resume parameter now " - "invalid. Aborting.\n"); - set_abort_result(TOI_CANT_USE_ALT_RESUME); - return 0; - } - } - - return 1; -} - -/** - * do_post_image_write - having written an image, figure out what to do next - * - * After writing an image, we might load an alternate image or power down. - * Powering down might involve hibernating to ram, in which case we also - * need to handle reloading pageset2. 
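
A note on the resume= parameter that can_hibernate() above insists on: it uses the allocator:location syntax shown in that function's error message. As an illustration only (the device name here is a placeholder, not a recommendation), a boot-loader kernel line might carry:

    resume=swap:/dev/sda2

Without a parseable value, toi_attempt_to_parse_resume_device() never validates a resume device, TOI_CAN_HIBERNATE stays clear, and the cycle aborts with TOI_CANT_SUSPEND.
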
- **/ -static int do_post_image_write(void) -{ - /* If switching images fails, do normal powerdown */ - if (alt_resume_param[0]) - do_toi_step(STEP_RESUME_ALT_IMAGE); - - toi_power_down(); - - barrier(); - mb(); - return 0; -} - -/** - * __save_image - do the hard work of saving the image - * - * High level routine for getting the image saved. The key assumptions made - * are that processes have been frozen and sufficient memory is available. - * - * We also exit through here at resume time, coming back from toi_hibernate - * after the atomic restore. This is the reason for the toi_in_hibernate - * test. - **/ -static int __save_image(void) -{ - int temp_result, did_copy = 0; - - toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - " - Final values: %d and %d.", - pagedir1.size, pagedir2.size); - - toi_cond_pause(1, "About to write pagedir2."); - - temp_result = write_pageset(&pagedir2); - - if (temp_result == -1 || test_result_state(TOI_ABORTED)) - return 1; - - toi_cond_pause(1, "About to copy pageset 1."); - - if (test_result_state(TOI_ABORTED)) - return 1; - - toi_deactivate_storage(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - toi_in_hibernate = 1; - - if (toi_go_atomic(PMSG_FREEZE, 1)) - goto Failed; - - temp_result = toi_hibernate(); - -#ifdef CONFIG_KGDB - if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) - kgdb_breakpoint(); -#endif - - if (!temp_result) - did_copy = 1; - - /* We return here at resume time too! */ - toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); - -Failed: - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - /* Resume time? */ - if (!toi_in_hibernate) { - copyback_post(); - return 0; - } - - /* Nope. Hibernating. So, see if we can save the image... */ - - if (temp_result || test_result_state(TOI_ABORTED)) { - if (did_copy) - goto abort_reloading_pagedir_two; - else - return 1; - } - - toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, - NULL); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write pageset1."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); - - temp_result = write_pageset(&pagedir1); - - /* We didn't overwrite any memory, so no reread needs to be done. */ - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - return 1; - - if (temp_result == 1 || test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write header."); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - temp_result = write_image_header(); - - if (!temp_result && !test_result_state(TOI_ABORTED)) - return 0; - -abort_reloading_pagedir_two: - temp_result = read_pageset2(1); - - /* If that failed, we're sunk. Panic! */ - if (temp_result) - panic("Attempt to reload pagedir 2 while aborting " - "a hibernate failed."); - - return 1; -} - -static void map_ps2_pages(int enable) -{ - unsigned long pfn = 0; - - memory_bm_position_reset(pageset2_map); - pfn = memory_bm_next_pfn(pageset2_map, 0); - - while (pfn != BM_END_OF_MAP) { - struct page *page = pfn_to_page(pfn); - kernel_map_pages(page, 1, enable); - pfn = memory_bm_next_pfn(pageset2_map, 0); - } -} - -/** - * do_save_image - save the image and handle the result - * - * Save the prepared image. If we fail or we're in the path returning - * from the atomic restore, cleanup. 
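
The "we return here at resume time too" behaviour described for __save_image() above is the classic returns-twice snapshot pattern: the atomic copy captures this thread mid-call, so the restored kernel resumes execution at the same point with different state. A minimal sketch of that control flow, with hypothetical helper names (not the TuxOnIce API) and assuming the flag lives in memory the restore does not overwrite:

    /* Sketch only: a snapshot primitive that "returns twice". */
    static int in_hibernate;

    static int save_or_resume(void)
    {
        int error;

        in_hibernate = 1;               /* recorded in the image */
        error = take_atomic_copy();     /* both kernels return here */

        if (!in_hibernate)              /* cleared by the resume path */
            return post_restore_fixups();

        return error ? error : write_the_image();
    }

toi_in_hibernate and toi_hibernate() play these two roles in __save_image() above.
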
- **/
-static int do_save_image(void)
-{
-	int result;
-	map_ps2_pages(0);
-	result = __save_image();
-	map_ps2_pages(1);
-	return result;
-}
-
-/**
- * do_prepare_image - try to prepare an image
- *
- * Attempt to initialise and prepare an image to be saved. On failure,
- * clean up.
- **/
-static int do_prepare_image(void)
-{
-	int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
-
-	if (!restarting && toi_activate_storage(0))
-		return 1;
-
-	/*
-	 * If we have a kept image, are still keeping it, and will hibernate
-	 * to RAM (the non-incremental image case), we will return 1 after
-	 * hibernating and resuming (provided the power doesn't run out). In
-	 * that case, we skip directly to cleaning up and exiting.
-	 */
-
-	if (!can_hibernate() ||
-	    (test_result_state(TOI_KEPT_IMAGE) &&
-	     check_still_keeping_image()))
-		return 1;
-
-	if (toi_init(restarting) || toi_prepare_image() ||
-	    test_result_state(TOI_ABORTED))
-		return 1;
-
-	trap_non_toi_io = 1;
-
-	return 0;
-}
-
-/**
- * do_check_can_resume - find out whether an image has been stored
- *
- * Read whether an image exists. We use the same routine as the
- * image_exists sysfs entry, and just look to see whether the
- * first character in the resulting buffer is a '1'.
- **/
-int do_check_can_resume(void)
-{
-	int result = -1;
-
-	if (toi_activate_storage(0))
-		return -1;
-
-	if (!test_toi_state(TOI_RESUME_DEVICE_OK))
-		toi_attempt_to_parse_resume_device(1);
-
-	if (toiActiveAllocator)
-		result = toiActiveAllocator->image_exists(1);
-
-	toi_deactivate_storage(0);
-	return result;
-}
-
-/**
- * do_load_atomic_copy - load the first part of an image, if it exists
- *
- * Check whether we have an image. If one exists, do sanity checking
- * (possibly invalidating the image or even rebooting if the user
- * requests that) before loading it into memory in preparation for the
- * atomic restore.
- *
- * If and only if we have an image loaded and ready to restore, we return 0.
- **/
-static int do_load_atomic_copy(void)
-{
-	int read_image_result = 0;
-
-	if (sizeof(swp_entry_t) != sizeof(long)) {
-		printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size"
-		       " of long. Please report this!\n");
-		return 1;
-	}
-
-	if (!resume_file[0])
-		printk(KERN_WARNING "TuxOnIce: "
-		       "You need to use a resume= command line parameter to "
-		       "tell TuxOnIce where to look for an image.\n");
-
-	toi_activate_storage(0);
-
-	if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) &&
-	    !toi_attempt_to_parse_resume_device(0)) {
-		/*
-		 * Without a usable storage device we can do nothing -
-		 * even if noresume is given
-		 */
-
-		if (!toiNumAllocators)
-			printk(KERN_ALERT "TuxOnIce: "
-			       "No storage allocators have been registered.\n");
-		else
-			printk(KERN_ALERT "TuxOnIce: "
-			       "Missing or invalid storage location "
-			       "(resume= parameter). Please correct and "
-			       "rerun lilo (or equivalent) before "
-			       "hibernating.\n");
-		toi_deactivate_storage(0);
-		return 1;
-	}
-
-	if (allocate_bitmaps())
-		return 1;
-
-	read_image_result = read_pageset1(); /* non-fatal error ignored */
-
-	if (test_toi_state(TOI_NORESUME_SPECIFIED))
-		clear_toi_state(TOI_NORESUME_SPECIFIED);
-
-	toi_deactivate_storage(0);
-
-	if (read_image_result)
-		return 1;
-
-	return 0;
-}
-
-/**
- * prepare_restore_load_alt_image - save & restore alt image variables
- *
- * Save and restore the pageset1 maps when loading an alternate image.
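
prepare_restore_load_alt_image() below is the save-swap-restore idiom applied to the global pageset1 bitmaps. Reduced to its shape (illustrative names only):

    /* Sketch: stash a global, substitute a temporary, then put it back. */
    static struct memory_bitmap *saved;

    static void swap_global(int prepare)
    {
        if (prepare) {
            saved = global_map;          /* stash the real map */
            global_map = NULL;           /* let the alt image build its own */
        } else {
            toi_free_bitmap(&global_map); /* drop the temporary */
            global_map = saved;           /* put the original back */
        }
    }

The statics make the helper non-reentrant, which is safe here because tuxonice_in_use serialises cycles, so only one hibernate or resume can run at a time.
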
- **/ -static void prepare_restore_load_alt_image(int prepare) -{ - static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; - - if (prepare) { - pageset1_map_save = pageset1_map; - pageset1_map = NULL; - pageset1_copy_map_save = pageset1_copy_map; - pageset1_copy_map = NULL; - set_toi_state(TOI_LOADING_ALT_IMAGE); - toi_reset_alt_image_pageset2_pfn(); - } else { - toi_free_bitmap(&pageset1_map); - pageset1_map = pageset1_map_save; - toi_free_bitmap(&pageset1_copy_map); - pageset1_copy_map = pageset1_copy_map_save; - clear_toi_state(TOI_NOW_RESUMING); - clear_toi_state(TOI_LOADING_ALT_IMAGE); - } -} - -/** - * do_toi_step - perform a step in hibernating or resuming - * - * Perform a step in hibernating or resuming an image. This abstraction - * is in preparation for implementing cluster support, and perhaps replacing - * uswsusp too (haven't looked whether that's possible yet). - **/ -int do_toi_step(int step) -{ - switch (step) { - case STEP_HIBERNATE_PREPARE_IMAGE: - return do_prepare_image(); - case STEP_HIBERNATE_SAVE_IMAGE: - return do_save_image(); - case STEP_HIBERNATE_POWERDOWN: - return do_post_image_write(); - case STEP_RESUME_CAN_RESUME: - return do_check_can_resume(); - case STEP_RESUME_LOAD_PS1: - return do_load_atomic_copy(); - case STEP_RESUME_DO_RESTORE: - /* - * If we succeed, this doesn't return. - * Instead, we return from do_save_image() in the - * hibernated kernel. - */ - return toi_atomic_restore(); - case STEP_RESUME_ALT_IMAGE: - printk(KERN_INFO "Trying to resume alternate image.\n"); - toi_in_hibernate = 0; - save_restore_alt_param(SAVE, NOQUIET); - prepare_restore_load_alt_image(1); - if (!do_check_can_resume()) { - printk(KERN_INFO "Nothing to resume from.\n"); - goto out; - } - if (!do_load_atomic_copy()) - toi_atomic_restore(); - - printk(KERN_INFO "Failed to load image.\n"); -out: - prepare_restore_load_alt_image(0); - save_restore_alt_param(RESTORE, NOQUIET); - break; - case STEP_CLEANUP: - do_cleanup(1, 0); - break; - case STEP_QUIET_CLEANUP: - do_cleanup(0, 0); - break; - } - - return 0; -} - -/* -- Functions for kickstarting a hibernate or resume --- */ - -/** - * toi_try_resume - try to do the steps in resuming - * - * Check if we have an image and if so try to resume. Clear the status - * flags too. - **/ -void toi_try_resume(void) -{ - set_toi_state(TOI_TRYING_TO_RESUME); - resume_attempted = 1; - - current->flags |= PF_MEMALLOC; - toi_start_other_threads(); - - if (do_toi_step(STEP_RESUME_CAN_RESUME) && - !do_toi_step(STEP_RESUME_LOAD_PS1)) - do_toi_step(STEP_RESUME_DO_RESTORE); - - toi_stop_other_threads(); - do_cleanup(0, 0); - - current->flags &= ~PF_MEMALLOC; - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume - * - * Wrapper for when __toi_try_resume is called from swsusp resume path, - * rather than from echo > /sys/power/tuxonice/do_resume. - **/ -static void toi_sys_power_disk_try_resume(void) -{ - resume_attempted = 1; - - /* - * There's a comment in kernel/power/disk.c that indicates - * we should be able to use mutex_lock_nested below. That - * doesn't seem to cut it, though, so let's just turn lockdep - * off for now. 
- */
-	lockdep_off();
-
-	if (toi_start_anything(SYSFS_RESUMING))
-		goto out;
-
-	toi_try_resume();
-
-	/*
-	 * For initramfs, we have to clear the boot time
-	 * flag after trying to resume
-	 */
-	clear_toi_state(TOI_BOOT_TIME);
-
-	toi_finish_anything(SYSFS_RESUMING);
-out:
-	lockdep_on();
-}
-
-/**
- * toi_try_hibernate - try to start a hibernation cycle
- *
- * Start a hibernation cycle, coming in from either
- * echo > /sys/power/tuxonice/do_suspend
- *
- * or
- *
- * echo disk > /sys/power/state
- *
- * In the latter case, we come in without pm_sem taken; in the
- * former, it has been taken.
- **/
-int toi_try_hibernate(void)
-{
-	int result = 0, sys_power_disk = 0, retries = 0;
-
-	if (!mutex_is_locked(&tuxonice_in_use)) {
-		/* Came in via /sys/power/disk */
-		if (toi_start_anything(SYSFS_HIBERNATING))
-			return -EBUSY;
-		sys_power_disk = 1;
-	}
-
-	current->flags |= PF_MEMALLOC;
-
-	if (test_toi_state(TOI_CLUSTER_MODE)) {
-		toi_initiate_cluster_hibernate();
-		goto out;
-	}
-
-prepare:
-	result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
-
-	if (result)
-		goto out;
-
-	if (test_action_state(TOI_FREEZER_TEST))
-		goto out_restore_gfp_mask;
-
-	result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
-
-	if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) {
-		if (retries < 2) {
-			do_cleanup(0, 1);
-			retries++;
-			clear_result_state(TOI_ABORTED);
-			extra_pd1_pages_allowance = extra_pd1_pages_used + 500;
-			printk(KERN_INFO "Automatically adjusting the extra"
-			       " pages allowance to %ld and restarting.\n",
-			       extra_pd1_pages_allowance);
-			pm_restore_gfp_mask();
-			goto prepare;
-		}
-
-		printk(KERN_INFO "Adjusted extra pages allowance twice and "
-		       "still couldn't hibernate successfully. Giving up.\n");
-	}
-
-	/* This code runs at resume time too! */
-	if (!result && toi_in_hibernate)
-		result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
-
-out_restore_gfp_mask:
-	pm_restore_gfp_mask();
-out:
-	do_cleanup(1, 0);
-	current->flags &= ~PF_MEMALLOC;
-
-	if (sys_power_disk)
-		toi_finish_anything(SYSFS_HIBERNATING);
-
-	return result;
-}
-
-/*
- * channel_no: If !0, -c <channel_no> is added to args (userui).
- */
-int toi_launch_userspace_program(char *command, int channel_no,
-				 int wait, int debug)
-{
-	int retval;
-	static char *envp[] = {
-		"HOME=/",
-		"TERM=linux",
-		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
-		NULL };
-	static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
-	};
-	char *channel = NULL;
-	int arg = 0, size;
-	char test_read[255];
-	char *orig_posn = command;
-
-	if (!strlen(orig_posn))
-		return 1;
-
-	if (channel_no) {
-		channel = toi_kzalloc(4, 6, GFP_KERNEL);
-		if (!channel) {
-			printk(KERN_INFO "Failed to allocate memory in "
-			       "preparing to launch userspace program.\n");
-			return 1;
-		}
-	}
-
-	/* Up to 6 args supported */
-	while (arg < 6) {
-		sscanf(orig_posn, "%s", test_read);
-		size = strlen(test_read);
-		if (!size)
-			break;
-		argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP);
-		strcpy(argv[arg], test_read);
-		orig_posn += size + 1;
-		*test_read = 0;
-		arg++;
-	}
-
-	if (channel_no) {
-		sprintf(channel, "-c%d", channel_no);
-		argv[arg] = channel;
-	} else
-		arg--;
-
-	if (debug) {
-		argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP);
-		strcpy(argv[arg], "--debug");
-	}
-
-	retval = call_usermodehelper(argv[0], argv, envp, wait);
-
-	/*
-	 * If the program reports an error, retval = 256. Don't complain
-	 * about that here.
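
The 256 mentioned above is wait()-style status encoding: when asked to wait for the helper process, call_usermodehelper() reports a normal exit code in the high byte of its return value, so a helper calling exit(1) comes back as 1 << 8 == 256. Decoding it takes manual shifts, since the userspace WEXITSTATUS macros are not available in the kernel; a sketch:

    int status = call_usermodehelper(argv[0], argv, envp, wait);
    int exit_code = (status >> 8) & 0xff;  /* exit(1) arrives as 256 */
    int termsig = status & 0x7f;           /* nonzero: killed by a signal */

That is why a bare retval of 256 is treated below as "the program ran and reported failure" rather than as a launch error.
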
- */ - if (retval && retval != 256) - printk(KERN_ERR "Failed to launch userspace program '%s': " - "Error %d\n", command, retval); - - { - int i; - for (i = 0; i < arg; i++) - if (argv[i] && argv[i] != channel) - toi_kfree(5, argv[i], sizeof(*argv[i])); - } - - toi_kfree(4, channel, sizeof(*channel)); - - return retval; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_LONG("extra_pages_allowance", SYSFS_RW, - &extra_pd1_pages_allowance, 0, LONG_MAX, 0), - SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, - image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), - SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_resume_device2), - SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_alt_resume_param), - SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, - NULL), - SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, - TOI_IGNORE_ROOTFS, 0), - SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, - INT_MAX, 0), - SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), - SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_MULTITHREADED_IO, 0), - SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_FLUSHER_THREAD, 0), - SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAGESET2_FULL, 0), - SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), - SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, - TOI_REPLACE_SWSUSP, 0), - SYSFS_STRING("resume_commandline", SYSFS_RW, - toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, - NULL), - SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), - SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, - TOI_FREEZER_TEST, 0), - SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), - SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, - TOI_TEST_FILTER_SPEED, 0), - SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PAGESET2, 0), - SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PS2_IF_UNNEEDED, 0), - SYSFS_STRING("binary_signature", SYSFS_READONLY, - tuxonice_signature, 9, 0, NULL), - SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0, - NULL), -#ifdef CONFIG_KGDB - SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, - TOI_POST_RESUME_BREAKPOINT, 0), -#endif - SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_READAHEAD, 0), - SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action, - TOI_TRACE_DEBUG_ON, 0), -#ifdef CONFIG_TOI_KEEP_IMAGE - SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, - 0), -#endif -#ifdef CONFIG_TOI_INCREMENTAL - SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0, - NULL), - SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action, - TOI_INCREMENTAL_IMAGE, 1), -#endif -}; - -static struct toi_core_fns my_fns = { - .get_nonconflicting_page = __toi_get_nonconflicting_page, - .post_context_save = __toi_post_context_save, - .try_hibernate = toi_try_hibernate, - .try_resume = toi_sys_power_disk_try_resume, -}; - -/** - * core_load - initialisation of TuxOnIce core - * - * Initialise the core, beginning with sysfs. 
Checksum and so on are part of
- * the core, but have their own initialisation routines because they either
- * aren't compiled in all the time or have their own subdirectories.
- **/
-static __init int core_load(void)
-{
-	int i,
-	    numfiles = ARRAY_SIZE(sysfs_params);
-
-	printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION
-	       " (http://tuxonice.net)\n");
-
-	if (!hibernation_available()) {
-		printk(KERN_INFO "TuxOnIce disabled: hibernation has been "
-		       "disabled in this kernel.\n");
-		return 1;
-	}
-
-	if (toi_sysfs_init())
-		return 1;
-
-	for (i = 0; i < numfiles; i++)
-		toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-
-	toi_core_fns = &my_fns;
-
-	if (toi_alloc_init())
-		return 1;
-	if (toi_checksum_init())
-		return 1;
-	if (toi_usm_init())
-		return 1;
-	if (toi_ui_init())
-		return 1;
-	if (toi_poweroff_init())
-		return 1;
-	if (toi_cluster_init())
-		return 1;
-	if (toi_cbw_init())
-		return 1;
-
-	return 0;
-}
-
-late_initcall(core_load);
diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c
deleted file mode 100644
index a8c5f3660..000000000
--- a/kernel/power/tuxonice_incremental.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * kernel/power/tuxonice_incremental.c
- *
- * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines related to storing incremental images - that
- * is, retaining an image after an initial cycle and then storing incremental
- * changes on subsequent hibernations.
- *
- * Based in part on...
- *
- * Debug helper to dump the current kernel pagetables of the system
- * so that we can see what the various memory ranges are set to.
- *
- * (C) Copyright 2008 Intel Corporation
- *
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/mm.h>
-#include <linux/tuxonice.h>
-#include <linux/sched.h>
-#include <asm/pgtable.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/page.h>
-#include "tuxonice_pageflags.h"
-#include "tuxonice_builtin.h"
-#include "power.h"
-
-int toi_do_incremental_initcall;
-
-extern void kdb_init(int level);
-extern noinline void kgdb_breakpoint(void);
-
-#undef pr_debug
-#if 0
-#define pr_debug(a, b...) do { printk(a, ##b); } while(0)
-#else
-#define pr_debug(a, b...) do { } while(0)
-#endif
-
-/* Multipliers for offsets within the PTEs */
-#define PTE_LEVEL_MULT (PAGE_SIZE)
-#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
-#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
-
-/*
- * This function gets called on a break in a continuous series
- * of PTE entries; the next one is different so we need to
- * print what we collected so far.
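
note_page() below leans on lookup_address(), the x86 helper that resolves a kernel virtual address to the PTE mapping it and reports the pagetable level. A condensed sketch of the lookup it performs per address (the pte_present() check is an extra safety added here, not present in the original):

    unsigned int level;
    pte_t *pte = lookup_address((unsigned long)addr, &level);

    if (pte && pte_present(*pte))
        /*
         * pte_page() yields the frame backing addr; since addr points
         * into a pagetable, that frame is pagetable memory and must
         * never be write-protected by the dirty tracking.
         */
        SetPageTOI_Untracked(pte_page(*pte));
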
- */ -static void note_page(void *addr) -{ - static struct page *lastpage; - struct page *page; - - page = virt_to_page(addr); - - if (page != lastpage) { - unsigned int level; - pte_t *pte = lookup_address((unsigned long) addr, &level); - struct page *pt_page2 = pte_page(*pte); - //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2)); - SetPageTOI_Untracked(pt_page2); - lastpage = page; - } -} - -static void walk_pte_level(pmd_t addr) -{ - int i; - pte_t *start; - - start = (pte_t *) pmd_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PTE; i++) { - note_page(start); - start++; - } -} - -#if PTRS_PER_PMD > 1 - -static void walk_pmd_level(pud_t addr) -{ - int i; - pmd_t *start; - - start = (pmd_t *) pud_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PMD; i++) { - if (!pmd_none(*start)) { - if (pmd_large(*start) || !pmd_present(*start)) - note_page(start); - else - walk_pte_level(*start); - } else - note_page(start); - start++; - } -} - -#else -#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a))) -#define pud_large(a) pmd_large(__pmd(pud_val(a))) -#define pud_none(a) pmd_none(__pmd(pud_val(a))) -#endif - -#if PTRS_PER_PUD > 1 - -static void walk_pud_level(pgd_t addr) -{ - int i; - pud_t *start; - - start = (pud_t *) pgd_page_vaddr(addr); - - for (i = 0; i < PTRS_PER_PUD; i++) { - if (!pud_none(*start)) { - if (pud_large(*start) || !pud_present(*start)) - note_page(start); - else - walk_pmd_level(*start); - } else - note_page(start); - - start++; - } -} - -#else -#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a))) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) -#endif - -/* - * Not static in the original at the time of writing, so needs renaming here. - */ -static void toi_ptdump_walk_pgd_level(pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_level4_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif - int i; - if (pgd) { - start = pgd; - } - - for (i = 0; i < PTRS_PER_PGD; i++) { - if (!pgd_none(*start)) { - if (pgd_large(*start) || !pgd_present(*start)) - note_page(start); - else - walk_pud_level(*start); - } else - note_page(start); - - start++; - } - - /* Flush out the last page */ - note_page(start); -} - -#ifdef CONFIG_PARAVIRT -extern struct pv_info pv_info; - -static void toi_set_paravirt_ops_untracked(void) { - int i; - - unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)), - pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end)); - //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end); - for (i = pvpfn; i <= pvpfn_end; i++) { - SetPageTOI_Untracked(pfn_to_page(i)); - } -} -#else -#define toi_set_paravirt_ops_untracked() { do { } while(0) } -#endif - -extern void toi_mark_per_cpus_pages_untracked(void); - -void toi_untrack_stack(unsigned long *stack) -{ - int i; - struct page *stack_page = virt_to_page(stack); - - for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) { - pr_debug("Untrack stack page %p.\n", page_address(stack_page + i)); - SetPageTOI_Untracked(stack_page + i); - } -} -void toi_untrack_process(struct task_struct *p) -{ - SetPageTOI_Untracked(virt_to_page(p)); - pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p))); - - toi_untrack_stack(p->stack); -} - -void toi_generate_untracked_map(void) -{ - struct task_struct *p, *t; - struct page *page; - pte_t *pte; - int i; - unsigned int level; - static int been_here = 0; - - if (been_here) - return; - - been_here = 1; - - /* Pagetable 
pages */
-	toi_ptdump_walk_pgd_level(NULL);
-
-	/* Printk buffer - not normally needed but can be helpful for debugging. */
-	//toi_set_logbuf_untracked();
-
-	/* Paravirt ops */
-	toi_set_paravirt_ops_untracked();
-
-	/* Task structs and stacks */
-	for_each_process_thread(p, t) {
-		toi_untrack_process(p);
-		//toi_untrack_stack((unsigned long *) t->thread.sp);
-	}
-
-	for (i = 0; i < NR_CPUS; i++) {
-		struct task_struct *idle = idle_task(i);
-
-		if (idle) {
-			pr_debug("Untrack idle process for CPU %d.\n", i);
-			toi_untrack_process(idle);
-		}
-
-		/* IRQ stack */
-		pr_debug("Untrack IRQ stack for CPU %d.\n", i);
-		toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
-	}
-
-	/* Per CPU data */
-	//pr_debug("Untracking per CPU variable pages.\n");
-	toi_mark_per_cpus_pages_untracked();
-
-	/* Init stack - for bringing up secondary CPUs */
-	page = virt_to_page(init_stack);
-	for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
-		SetPageTOI_Untracked(page + i);
-	}
-
-	pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
-	SetPageTOI_Untracked(pte_page(*pte));
-	SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
-}
-
-/**
- * toi_reset_dirtiness_one
- */
-
-void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
-{
-	struct page *page = pfn_to_page(pfn);
-
-	/**
-	 * Don't worry about whether the Dirty flag is
-	 * already set. If this is our first call, it
-	 * won't be.
-	 */
-
-	preempt_disable();
-
-	ClearPageTOI_Dirty(page);
-	SetPageTOI_RO(page);
-	if (verbose)
-		printk(KERN_EMERG "Making page %lu (%p|%p) read only.\n", pfn, page, page_address(page));
-
-	set_memory_ro((unsigned long) page_address(page), 1);
-
-	preempt_enable();
-}
-
-/**
- * TuxOnIce's incremental image support works by marking all memory apart from
- * the page tables read-only, then in the page-faults that result enabling
- * writing if appropriate and flagging the page as dirty. Free pages are also
- * marked as dirty and not protected so that if allocated, they will be included
- * in the image without further processing.
- *
- * toi_reset_dirtiness is called when an image exists and incremental images are
- * enabled, and each time we resume thereafter. It is not invoked on a fresh boot.
- *
- * This routine should be called from a single-cpu-running context to avoid races in setting
- * page dirty/read only flags.
- *
- * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it!
- *
- * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
- **/
-
-int toi_reset_dirtiness(int verbose)
-{
-	struct zone *zone;
-	unsigned long loop;
-	int allocated_map = 0;
-
-	toi_generate_untracked_map();
-
-	if (!free_map) {
-		if (toi_alloc_bitmap(&free_map))
-			return -ENOMEM;
-		allocated_map = 1;
-	}
-
-	toi_generate_free_page_map();
-
-	pr_debug(KERN_EMERG "Reset dirtiness.\n");
-	for_each_populated_zone(zone) {
-		// 64 bit only. No need to worry about highmem.
-		for (loop = 0; loop < zone->spanned_pages; loop++) {
-			unsigned long pfn = zone->zone_start_pfn + loop;
-			struct page *page;
-			int chunk_size;
-
-			if (!pfn_valid(pfn)) {
-				continue;
-			}
-
-			chunk_size = toi_size_of_free_region(zone, pfn);
-			if (chunk_size) {
-				loop += chunk_size - 1;
-				continue;
-			}
-
-			page = pfn_to_page(pfn);
-
-			if (PageNosave(page) || !saveable_page(zone, pfn)) {
-				continue;
-			}
-
-			if (PageTOI_Untracked(page)) {
-				continue;
-			}
-
-			/**
-			 * Do we need to (re)protect the page?
- * If it is already protected (PageTOI_RO), there is - * nothing to do - skip the following. - * If it is marked as dirty (PageTOI_Dirty), it was - * either free and has been allocated or has been - * written to and marked dirty. Reset the dirty flag - * and (re)apply the protection. - */ - if (!PageTOI_RO(page)) { - toi_reset_dirtiness_one(pfn, verbose); - } - } - } - - pr_debug(KERN_EMERG "Done resetting dirtiness.\n"); - - if (allocated_map) { - toi_free_bitmap(&free_map); - } - return 0; -} - -static int toi_reset_dirtiness_initcall(void) -{ - if (toi_do_incremental_initcall) { - pr_info("TuxOnIce: Enabling dirty page tracking.\n"); - toi_reset_dirtiness(0); - } - return 1; -} -extern void toi_generate_untracked_map(void); - -// Leave early_initcall for pages to register untracked sections. -early_initcall(toi_reset_dirtiness_initcall); - -static int __init toi_incremental_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value) && value) - toi_do_incremental_initcall = value; - - return 1; -} -__setup("toi_incremental_initcall", toi_incremental_initcall_setup); diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c deleted file mode 100644 index 2db934350..000000000 --- a/kernel/power/tuxonice_io.c +++ /dev/null @@ -1,1936 +0,0 @@ -/* - * kernel/power/tuxonice_io.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. - * - */ - -#include <linux/suspend.h> -#include <linux/version.h> -#include <linux/utsname.h> -#include <linux/mount.h> -#include <linux/highmem.h> -#include <linux/kthread.h> -#include <linux/cpu.h> -#include <linux/fs_struct.h> -#include <linux/bio.h> -#include <linux/fs_uuid.h> -#include <linux/kmod.h> -#include <asm/tlbflush.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_storage.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_extent.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_builtin.h" -#include "tuxonice_checksum.h" -#include "tuxonice_alloc.h" -char alt_resume_param[256]; - -/* Version read from image header at resume */ -static int toi_image_header_version; - -#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ - if (likely(toi_image_header_version >= VERS)) \ - if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ - (char *) &VAR, sizeof(VAR))) { \ - abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ - ERR_ACT; \ - } \ -} while(0) \ - -/* Variables shared between threads and updated under the mutex */ -static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; -static int io_index, io_nextupdate, io_pc, io_pc_step; -static DEFINE_MUTEX(io_mutex); -static DEFINE_PER_CPU(struct page *, last_sought); -static DEFINE_PER_CPU(struct page *, last_high_page); -static DEFINE_PER_CPU(char *, checksum_locn); -static DEFINE_PER_CPU(struct pbe *, last_low_page); -static atomic_t io_count; -atomic_t toi_io_workers; - -static int using_flusher; - -DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); - -int toi_bio_queue_flusher_should_finish; - -int toi_max_workers; - -static char *image_version_error = "The image header version is newer than " \ - "this kernel 
supports."; - -struct toi_module_ops *first_filter; - -static atomic_t toi_num_other_threads; -static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); -enum toi_worker_commands { - TOI_IO_WORKER_STOP, - TOI_IO_WORKER_RUN, - TOI_IO_WORKER_EXIT -}; -static enum toi_worker_commands toi_worker_command; - -/** - * toi_attempt_to_parse_resume_device - determine if we can hibernate - * - * Can we hibernate, using the current resume= parameter? - **/ -int toi_attempt_to_parse_resume_device(int quiet) -{ - struct list_head *Allocator; - struct toi_module_ops *thisAllocator; - int result, returning = 0; - - if (toi_activate_storage(0)) - return 0; - - toiActiveAllocator = NULL; - clear_toi_state(TOI_RESUME_DEVICE_OK); - clear_toi_state(TOI_CAN_RESUME); - clear_result_state(TOI_ABORTED); - - if (!toiNumAllocators) { - if (!quiet) - printk(KERN_INFO "TuxOnIce: No storage allocators have " - "been registered. Hibernating will be " - "disabled.\n"); - goto cleanup; - } - - list_for_each(Allocator, &toiAllocators) { - thisAllocator = list_entry(Allocator, struct toi_module_ops, - type_list); - - /* - * Not sure why you'd want to disable an allocator, but - * we should honour the flag if we're providing it - */ - if (!thisAllocator->enabled) - continue; - - result = thisAllocator->parse_sig_location( - resume_file, (toiNumAllocators == 1), - quiet); - - switch (result) { - case -EINVAL: - /* For this allocator, but not a valid - * configuration. Error already printed. */ - goto cleanup; - - case 0: - /* For this allocator and valid. */ - toiActiveAllocator = thisAllocator; - - set_toi_state(TOI_RESUME_DEVICE_OK); - set_toi_state(TOI_CAN_RESUME); - returning = 1; - goto cleanup; - } - } - if (!quiet) - printk(KERN_INFO "TuxOnIce: No matching enabled allocator " - "found. Resuming disabled.\n"); -cleanup: - toi_deactivate_storage(0); - return returning; -} - -void attempt_to_parse_resume_device2(void) -{ - toi_prepare_usm(); - toi_attempt_to_parse_resume_device(0); - toi_cleanup_usm(); -} - -void save_restore_alt_param(int replace, int quiet) -{ - static char resume_param_save[255]; - static unsigned long toi_state_save; - - if (replace) { - toi_state_save = toi_state; - strcpy(resume_param_save, resume_file); - strcpy(resume_file, alt_resume_param); - } else { - strcpy(resume_file, resume_param_save); - toi_state = toi_state_save; - } - toi_attempt_to_parse_resume_device(quiet); -} - -void attempt_to_parse_alt_resume_param(void) -{ - int ok = 0; - - /* Temporarily set resume_param to the poweroff value */ - if (!strlen(alt_resume_param)) - return; - - printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); - save_restore_alt_param(SAVE, NOQUIET); - if (test_toi_state(TOI_CAN_RESUME)) - ok = 1; - - printk(KERN_INFO "=== Done ===\n"); - save_restore_alt_param(RESTORE, QUIET); - - /* If not ok, clear the string */ - if (ok) - return; - - printk(KERN_INFO "Can't resume from that location; clearing " - "alt_resume_param.\n"); - alt_resume_param[0] = '\0'; -} - -/** - * noresume_reset_modules - reset data structures in case of non resuming - * - * When we read the start of an image, modules (and especially the - * active allocator) might need to reset data structures if we - * decide to remove the image rather than resuming from it. 
- **/
-static void noresume_reset_modules(void)
-{
-	struct toi_module_ops *this_filter;
-
-	list_for_each_entry(this_filter, &toi_filters, type_list)
-		if (this_filter->noresume_reset)
-			this_filter->noresume_reset();
-
-	if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
-		toiActiveAllocator->noresume_reset();
-}
-
-/**
- * fill_toi_header - fill the hibernate header structure
- * @sh: Header data structure to be filled.
- **/
-static int fill_toi_header(struct toi_header *sh)
-{
-	int i, error;
-
-	error = init_header((struct swsusp_info *) sh);
-	if (error)
-		return error;
-
-	sh->pagedir = pagedir1;
-	sh->pageset_2_size = pagedir2.size;
-	sh->param0 = toi_result;
-	sh->param1 = toi_bkd.toi_action;
-	sh->param2 = toi_bkd.toi_debug_state;
-	sh->param3 = toi_bkd.toi_default_console_level;
-	sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
-	for (i = 0; i < 4; i++)
-		sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
-	sh->bkd = boot_kernel_data_buffer;
-	return 0;
-}
-
-/**
- * rw_init_modules - initialize modules
- * @rw: Whether we are reading or writing an image.
- * @which: Section of the image being processed.
- *
- * Iterate over modules, preparing the ones that will be used to read or write
- * data.
- **/
-static int rw_init_modules(int rw, int which)
-{
-	struct toi_module_ops *this_module;
-	/* Initialise page transformers */
-	list_for_each_entry(this_module, &toi_filters, type_list) {
-		if (!this_module->enabled)
-			continue;
-		if (this_module->rw_init && this_module->rw_init(rw, which)) {
-			abort_hibernate(TOI_FAILED_MODULE_INIT,
-					"Failed to initialize the %s filter.",
-					this_module->name);
-			return 1;
-		}
-	}
-
-	/* Initialise allocator */
-	if (toiActiveAllocator->rw_init(rw, which)) {
-		abort_hibernate(TOI_FAILED_MODULE_INIT,
-				"Failed to initialise the allocator.");
-		return 1;
-	}
-
-	/* Initialise other modules */
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-		if (!this_module->enabled ||
-		    this_module->type == FILTER_MODULE ||
-		    this_module->type == WRITER_MODULE)
-			continue;
-		if (this_module->rw_init && this_module->rw_init(rw, which)) {
-			set_abort_result(TOI_FAILED_MODULE_INIT);
-			printk(KERN_INFO "Setting aborted flag due to module "
-			       "init failure.\n");
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * rw_cleanup_modules - cleanup modules
- * @rw: Whether we are reading or writing an image.
- *
- * Cleanup components after reading or writing a set of pages.
- * Only the allocator may fail.
- **/ -static int rw_cleanup_modules(int rw) -{ - struct toi_module_ops *this_module; - int result = 0; - - /* Cleanup other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - /* Flush data and cleanup */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - result |= toiActiveAllocator->rw_cleanup(rw); - - return result; -} - -static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) -{ - int index, min, max; - struct page *high_page = NULL, - **my_last_high_page = raw_cpu_ptr(&last_high_page), - **my_last_sought = raw_cpu_ptr(&last_sought); - struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page); - void *compare; - - if (is_high) { - if (*my_last_sought && *my_last_high_page && - *my_last_sought < orig_page) - high_page = *my_last_high_page; - else - high_page = (struct page *) restore_highmem_pblist; - this = (struct pbe *) kmap(high_page); - compare = orig_page; - } else { - if (*my_last_sought && *my_last_low_page && - *my_last_sought < orig_page) - this = *my_last_low_page; - else - this = restore_pblist; - compare = page_address(orig_page); - } - - *my_last_sought = orig_page; - - /* Locate page containing pbe */ - while (this[PBES_PER_PAGE - 1].next && - this[PBES_PER_PAGE - 1].orig_address < compare) { - if (is_high) { - struct page *next_high_page = (struct page *) - this[PBES_PER_PAGE - 1].next; - kunmap(high_page); - this = kmap(next_high_page); - high_page = next_high_page; - } else - this = this[PBES_PER_PAGE - 1].next; - } - - /* Do a binary search within the page */ - min = 0; - max = PBES_PER_PAGE; - index = PBES_PER_PAGE / 2; - while (max - min) { - if (!this[index].orig_address || - this[index].orig_address > compare) - max = index; - else if (this[index].orig_address == compare) { - if (is_high) { - struct page *page = this[index].address; - *my_last_high_page = high_page; - kunmap(high_page); - return page; - } - *my_last_low_page = this; - return virt_to_page(this[index].address); - } else - min = index; - index = ((max + min) / 2); - }; - - if (is_high) - kunmap(high_page); - - abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" - " orig page %p. This[min].orig_address=%p.\n", orig_page, - this[index].orig_address); - return NULL; -} - -/** - * write_next_page - write the next page in a pageset - * @data_pfn: The pfn where the next data to write is located. - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn number to write in the image (where the data belongs). - * - * Get the pfn of the next page to write, map the page if necessary and do the - * write. - **/ -static int write_next_page(unsigned long *data_pfn, int *my_io_index, - unsigned long *write_pfn) -{ - struct page *page; - char **my_checksum_locn = raw_cpu_ptr(&checksum_locn); - int result = 0, was_present; - - *data_pfn = memory_bm_next_pfn(io_map, 0); - - /* Another thread could have beaten us to it. 
*/ - if (*data_pfn == BM_END_OF_MAP) { - if (atomic_read(&io_count)) { - printk(KERN_INFO "Ran out of pfns but io_count is " - "still %d.\n", atomic_read(&io_count)); - BUG(); - } - mutex_unlock(&io_mutex); - return -ENODATA; - } - - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - memory_bm_clear_bit(io_map, 0, *data_pfn); - page = pfn_to_page(*data_pfn); - - was_present = kernel_page_present(page); - if (!was_present) - kernel_map_pages(page, 1, 1); - - if (io_pageset == 1) - *write_pfn = memory_bm_next_pfn(pageset1_map, 0); - else { - *write_pfn = *data_pfn; - *my_checksum_locn = tuxonice_get_next_checksum(); - } - - TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index); - - mutex_unlock(&io_mutex); - - if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) - return 1; - - result = first_filter->write_page(*write_pfn, TOI_PAGE, page, - PAGE_SIZE); - - if (!was_present) - kernel_map_pages(page, 1, 0); - - return result; -} - -/** - * read_next_page - read the next page in a pageset - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn in which the data belongs. - * - * Read a page of the image into our buffer. It can happen (here and in the - * write routine) that threads don't get run until after other CPUs have done - * all the work. This was the cause of the long standing issue with - * occasionally getting -ENODATA errors at the end of reading the image. We - * therefore need to check there's actually a page to read before trying to - * retrieve one. - **/ - -static int read_next_page(int *my_io_index, unsigned long *write_pfn, - struct page *buffer) -{ - unsigned int buf_size = PAGE_SIZE; - unsigned long left = atomic_read(&io_count); - - if (!left) - return -ENODATA; - - /* Start off assuming the page we read isn't resaved */ - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - mutex_unlock(&io_mutex); - - /* - * Are we aborting? If so, don't submit any more I/O as - * resetting the resume_attempted flag (from ui.c) will - * clear the bdev flags, making this thread oops. - */ - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - atomic_dec(&toi_io_workers); - if (!atomic_read(&toi_io_workers)) { - /* - * So we can be sure we'll have memory for - * marking that we haven't resumed. - */ - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - - /* - * See toi_bio_read_page in tuxonice_bio.c: - * read the next page in the image. - */ - return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); -} - -static void use_read_page(unsigned long write_pfn, struct page *buffer) -{ - struct page *final_page = pfn_to_page(write_pfn), - *copy_page = final_page; - char *virt, *buffer_virt; - int was_present, cpu = smp_processor_id(); - unsigned long idx = 0; - - if (io_pageset == 1 && (!pageset1_copy_map || - !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) { - int is_high = PageHighMem(final_page); - copy_page = copy_page_from_orig_page(is_high ? 
(void *) write_pfn : final_page, is_high); - } - - if (!memory_bm_test_bit(io_map, cpu, write_pfn)) { - int test = !memory_bm_test_bit(io_map, cpu, write_pfn); - toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test); - mutex_lock(&io_mutex); - idx = atomic_add_return(1, &io_count); - mutex_unlock(&io_mutex); - return; - } - - virt = kmap(copy_page); - buffer_virt = kmap(buffer); - was_present = kernel_page_present(copy_page); - if (!was_present) - kernel_map_pages(copy_page, 1, 1); - memcpy(virt, buffer_virt, PAGE_SIZE); - if (!was_present) - kernel_map_pages(copy_page, 1, 0); - kunmap(copy_page); - kunmap(buffer); - memory_bm_clear_bit(io_map, cpu, write_pfn); - TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset); -} - -static unsigned long status_update(int writing, unsigned long done, - unsigned long ticks) -{ - int cs_index = writing ? 0 : 1; - unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; - unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); - unsigned long pgs_per_s, estimate = 0, pages_left; - - if (msec) { - pages_left = io_barmax - done; - pgs_per_s = 1000 * done / msec; - if (pgs_per_s) - estimate = DIV_ROUND_UP(pages_left, pgs_per_s); - } - - if (estimate && ticks > HZ / 2) - return toi_update_status(done, io_barmax, - " %d/%d MB (%lu sec left)", - MB(done+1), MB(io_barmax), estimate); - - return toi_update_status(done, io_barmax, " %d/%d MB", - MB(done+1), MB(io_barmax)); -} - -/** - * worker_rw_loop - main loop to read/write pages - * - * The main I/O loop for reading or writing pages. The io_map bitmap is used to - * track the pages to read/write. - * If we are reading, the pages are loaded to their final (mapped) pfn. - * Data is non zero iff this is a thread started via start_other_threads. - * In that case, we stay in here until told to quit. - **/ -static int worker_rw_loop(void *data) -{ - unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, - jif_index = 1, start_time = jiffies, thread_num; - int result = 0, my_io_index = 0, last_worker; - struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); - cpumask_var_t orig_mask; - - if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { - printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); - result = -ENOMEM; - goto out; - } - - cpumask_copy(orig_mask, tsk_cpus_allowed(current)); - - current->flags |= PF_NOFREEZE; - -top: - mutex_lock(&io_mutex); - thread_num = atomic_read(&toi_io_workers); - - cpumask_copy(tsk_cpus_allowed(current), orig_mask); - schedule(); - - atomic_inc(&toi_io_workers); - - while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && - !(io_write && test_result_state(TOI_ABORTED)) && - toi_worker_command == TOI_IO_WORKER_RUN) { - if (!thread_num && jiffies > next_jiffies) { - next_jiffies += HZ / 4; - if (toiActiveAllocator->update_throughput_throttle) - toiActiveAllocator->update_throughput_throttle( - jif_index); - jif_index++; - } - - /* - * What page to use? If reading, don't know yet which page's - * data will be read, so always use the buffer. If writing, - * use the copy (Pageset1) or original page (Pageset2), but - * always write the pfn of the original page. - */ - if (io_write) - result = write_next_page(&data_pfn, &my_io_index, - &write_pfn); - else /* Reading */ - result = read_next_page(&my_io_index, &write_pfn, - buffer); - - if (result) { - mutex_lock(&io_mutex); - /* Nothing to do? 
*/ - if (result == -ENODATA) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Thread %d has no more work.", - smp_processor_id()); - break; - } - - io_result = result; - - if (io_write) { - printk(KERN_INFO "Write chunk returned %d.\n", - result); - abort_hibernate(TOI_FAILED_IO, - "Failed to write a chunk of the " - "image."); - break; - } - - if (io_pageset == 1) { - printk(KERN_ERR "\nBreaking out of I/O loop " - "because of result code %d.\n", result); - break; - } - panic("Read chunk returned (%d)", result); - } - - /* - * Discard reads of resaved pages while reading ps2 - * and unwanted pages while rereading ps2 when aborting. - */ - if (!io_write) { - if (!PageResave(pfn_to_page(write_pfn))) - use_read_page(write_pfn, buffer); - else { - mutex_lock(&io_mutex); - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Resaved %ld.", write_pfn); - atomic_inc(&io_count); - mutex_unlock(&io_mutex); - } - } - - if (!thread_num) { - if(my_io_index + io_base > io_nextupdate) - io_nextupdate = status_update(io_write, - my_io_index + io_base, - jiffies - start_time); - - if (my_io_index > io_pc) { - printk(KERN_CONT "...%d%%", 20 * io_pc_step); - io_pc_step++; - io_pc = io_finish_at * io_pc_step / 5; - } - } - - toi_cond_pause(0, NULL); - - /* - * Subtle: If there's less I/O still to be done than threads - * running, quit. This stops us doing I/O beyond the end of - * the image when reading. - * - * Possible race condition. Two threads could do the test at - * the same time; one should exit and one should continue. - * Therefore we take the mutex before comparing and exiting. - */ - - mutex_lock(&io_mutex); - } - - last_worker = atomic_dec_and_test(&toi_io_workers); - toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); - mutex_unlock(&io_mutex); - - if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { - /* Were we the last thread and we're using a flusher thread? */ - if (last_worker && using_flusher) { - toiActiveAllocator->finish_all_io(); - } - /* First, if we're doing I/O, wait for it to finish */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); - /* Then wait to be told what to do next */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); - if (toi_worker_command == TOI_IO_WORKER_RUN) - goto top; - } - - if (thread_num) - atomic_dec(&toi_num_other_threads); - -out: - toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); - toi__free_page(28, buffer); - free_cpumask_var(orig_mask); - - return result; -} - -int toi_start_other_threads(void) -{ - int cpu; - struct task_struct *p; - int to_start = (toi_max_workers ? 
toi_max_workers : num_online_cpus()) - 1; - unsigned long num_started = 0; - - if (test_action_state(TOI_NO_MULTITHREADED_IO)) - return 0; - - toi_worker_command = TOI_IO_WORKER_STOP; - - for_each_online_cpu(cpu) { - if (num_started == to_start) - break; - - if (cpu == smp_processor_id()) - continue; - - p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, - cpu_to_node(cpu), "ktoi_io/%d", cpu); - if (IS_ERR(p)) { - printk(KERN_ERR "ktoi_io for %i failed\n", cpu); - continue; - } - kthread_bind(p, cpu); - p->flags |= PF_MEMALLOC; - wake_up_process(p); - num_started++; - atomic_inc(&toi_num_other_threads); - } - - toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); - return num_started; -} - -void toi_stop_other_threads(void) -{ - toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); - toi_worker_command = TOI_IO_WORKER_EXIT; - wake_up(&toi_worker_wait_queue); -} - -/** - * do_rw_loop - main highlevel function for reading or writing pages - * - * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. - **/ -static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, - int base, int barmax, int pageset) -{ - int index = 0, cpu, result = 0, workers_started; - unsigned long pfn, next; - - first_filter = toi_get_next_filter(NULL); - - if (!finish_at) - return 0; - - io_write = write; - io_finish_at = finish_at; - io_base = base; - io_barmax = barmax; - io_pageset = pageset; - io_index = 0; - io_pc = io_finish_at / 5; - io_pc_step = 1; - io_result = 0; - io_nextupdate = base + 1; - toi_bio_queue_flusher_should_finish = 0; - - for_each_online_cpu(cpu) { - per_cpu(last_sought, cpu) = NULL; - per_cpu(last_low_page, cpu) = NULL; - per_cpu(last_high_page, cpu) = NULL; - } - - /* Ensure all bits clear */ - memory_bm_clear(io_map); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - - BUG_ON(next != BM_END_OF_MAP); - - /* Set the bits for the pages to write */ - memory_bm_position_reset(pageflags); - - pfn = memory_bm_next_pfn(pageflags, 0); - toi_trace_index++; - - while (pfn != BM_END_OF_MAP && index < finish_at) { - TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at); - memory_bm_set_bit(io_map, 0, pfn); - pfn = memory_bm_next_pfn(pageflags, 0); - index++; - } - - BUG_ON(next != BM_END_OF_MAP || index < finish_at); - - memory_bm_position_reset(io_map); - toi_trace_index++; - - atomic_set(&io_count, finish_at); - - memory_bm_position_reset(pageset1_map); - - mutex_lock(&io_mutex); - - clear_toi_state(TOI_IO_STOPPED); - - using_flusher = (atomic_read(&toi_num_other_threads) && - toiActiveAllocator->io_flusher && - !test_action_state(TOI_NO_FLUSHER_THREAD)); - - workers_started = atomic_read(&toi_num_other_threads); - - memory_bm_position_reset(io_map); - memory_bm_position_reset(pageset1_copy_map); - - toi_worker_command = TOI_IO_WORKER_RUN; - wake_up(&toi_worker_wait_queue); - - mutex_unlock(&io_mutex); - - if (using_flusher) - result = toiActiveAllocator->io_flusher(write); - else - worker_rw_loop(NULL); - - while (atomic_read(&toi_io_workers)) - schedule(); - - printk(KERN_CONT "\n"); - - toi_worker_command = TOI_IO_WORKER_STOP; - wake_up(&toi_worker_wait_queue); - - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - if (!atomic_read(&toi_io_workers)) { - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - set_toi_state(TOI_IO_STOPPED); - - if (!io_result && !result && !test_result_state(TOI_ABORTED)) { - unsigned long 
next; - - toi_update_status(io_base + io_finish_at, io_barmax, - " %d/%d MB ", - MB(io_base + io_finish_at), MB(io_barmax)); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - if (next != BM_END_OF_MAP) { - printk(KERN_INFO "Finished I/O loop but still work to " - "do?\nFinish at = %d. io_count = %d.\n", - finish_at, atomic_read(&io_count)); - printk(KERN_INFO "I/O bitmap still records work to do." - "%ld.\n", next); - BUG(); - do { - cpu_relax(); - } while (0); - } - } - - return io_result ? io_result : result; -} - -/** - * write_pageset - write a pageset to disk. - * @pagedir: Which pagedir to write. - * - * Returns: - * Zero on success or -1 on failure. - **/ -int write_pageset(struct pagedir *pagedir) -{ - int finish_at, base = 0; - int barmax = pagedir1.size + pagedir2.size; - long error = 0; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - /* - * Even if there is nothing to read or write, the allocator - * may need the init/cleanup for it's housekeeping. (eg: - * Pageset1 may start where pageset2 ends when writing). - */ - finish_at = pagedir->size; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Writing kernel & process data..."); - base = pagedir2.size; - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - pageflags = pageset1_map; - else - pageflags = pageset1_copy_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(WRITE, pagedir->id)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise modules for writing."); - error = 1; - } - - if (!error) - error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(WRITE) && !error) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after writing."); - error = 1; - } - - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[0][0] += finish_at, - toi_bkd.toi_io_time[0][1] += (end_time - start_time); - } - - return error; -} - -/** - * read_pageset - highlevel function to read a pageset from disk - * @pagedir: pageset to read - * @overwrittenpagesonly: Whether to read the whole pageset or - * only part of it. - * - * Returns: - * Zero on success or -1 on failure. 
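The timing bookkeeping in write_pageset() above is a simple accumulator: toi_io_time[direction][0] collects pages transferred and toi_io_time[direction][1] collects elapsed jiffies, so a later cycle can derive an average throughput. A minimal userspace sketch of that pattern follows; the tick rate and all names here are illustrative, not taken from the kernel source.

#include <stdio.h>

#define TICK_HZ 250                     /* assumed tick rate for the example */

static unsigned long io_time[2][2];     /* [write=0/read=1][pages=0/ticks=1] */

static void account_io(int dir, unsigned long pages, unsigned long ticks)
{
        io_time[dir][0] += pages;
        io_time[dir][1] += ticks;
}

static unsigned long pages_per_second(int dir)
{
        if (!io_time[dir][1])
                return 0;
        return io_time[dir][0] * TICK_HZ / io_time[dir][1];
}

int main(void)
{
        account_io(0, 2048, 500);       /* 2048 pages written in 2 seconds */
        printf("write throughput: %lu pages/s\n", pages_per_second(0));
        return 0;
}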
- **/ -static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) -{ - int result = 0, base = 0; - int finish_at = pagedir->size; - int barmax = pagedir1.size + pagedir2.size; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Reading kernel & process data..."); - pageflags = pageset1_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); - if (overwrittenpagesonly) { - barmax = min(pagedir1.size, pagedir2.size); - finish_at = min(pagedir1.size, pagedir2.size); - } else - base = pagedir1.size; - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(READ, pagedir->id)) { - toiActiveAllocator->remove_image(); - result = 1; - } else - result = do_rw_loop(READ, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(READ) && !result) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after reading."); - result = 1; - } - - /* Statistics */ - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[1][0] += finish_at, - toi_bkd.toi_io_time[1][1] += (end_time - start_time); - } - - return result; -} - -/** - * write_module_configs - store the modules configuration - * - * The configuration for each module is stored in the image header. - * Returns: Int - * Zero on success, Error value otherwise. - **/ -static int write_module_configs(void) -{ - struct toi_module_ops *this_module; - char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); - int len, index = 1; - struct toi_module_header toi_module_header; - - if (!buffer) { - printk(KERN_INFO "Failed to allocate a buffer for saving " - "module configuration info.\n"); - return -ENOMEM; - } - - /* - * We have to know which data goes with which module, so we at - * least write a length of zero for a module. Note that we are - * also assuming every module's config data takes <= PAGE_SIZE. - */ - - /* For each module (in registration order) */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->storage_needed || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - - /* Get the data from the module */ - len = 0; - if (this_module->save_config_info) - len = this_module->save_config_info(buffer); - - /* Save the details of the module */ - toi_module_header.enabled = this_module->enabled; - toi_module_header.type = this_module->type; - toi_module_header.index = index++; - strncpy(toi_module_header.name, this_module->name, - sizeof(toi_module_header.name)); - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - /* Save the size of the data and any data returned */ - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &len, sizeof(int)); - if (len) - toiActiveAllocator->rw_header_chunk( - WRITE, this_module, buffer, len); - } - - /* Write a blank header to terminate the list */ - toi_module_header.name[0] = '\0'; - toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_module_header, sizeof(toi_module_header)); - - toi_free_page(22, (unsigned long) buffer); - return 0; -} - -/** - * read_one_module_config - read and configure one module - * - * Read the configuration for one module, and configure the module - * to match if it is loaded. - * - * Returns: Int - * Zero on success, Error value otherwise. 
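The header stream produced by write_module_configs() above is a sequence of length-prefixed records -- a fixed-size module header, an int payload length, then the payload -- closed by a header whose name is empty. A userspace sketch of the same framing, with the record struct pared down and every name hypothetical:

#include <stdio.h>
#include <string.h>

struct rec_header { char name[30]; int enabled; };

static void write_record(FILE *f, const char *name, const void *data, int len)
{
        struct rec_header h = { .enabled = 1 };

        strncpy(h.name, name, sizeof(h.name) - 1);
        fwrite(&h, sizeof(h), 1, f);
        fwrite(&len, sizeof(len), 1, f);   /* payload length, then payload */
        if (len)
                fwrite(data, 1, len, f);
}

static void write_terminator(FILE *f)
{
        struct rec_header h = { { 0 } };   /* name[0] == '\0' ends the list */

        fwrite(&h, sizeof(h), 1, f);
}

int main(void)
{
        FILE *f = tmpfile();

        if (!f)
                return 1;
        write_record(f, "compressor", "lzo", 4);
        write_terminator(f);
        fclose(f);
        return 0;
}

Length-prefixing each payload is what lets the read path consume a record even when the owning module is no longer registered.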
- **/ -static int read_one_module_config(struct toi_module_header *header) -{ - struct toi_module_ops *this_module; - int result, len; - char *buffer; - - /* Find the module */ - this_module = toi_find_module_given_name(header->name); - - if (!this_module) { - if (header->enabled) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "It looks like we need module %s for reading " - "the image but it hasn't been registered.\n", - header->name); - if (!(test_toi_state(TOI_CONTINUE_REQ))) - return -EINVAL; - } else - printk(KERN_INFO "Module %s configuration data found, " - "but the module hasn't registered. Looks like " - "it was disabled, so we're ignoring its data.", - header->name); - } - - /* Get the length of the data (if any) */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, - sizeof(int)); - if (result) { - printk(KERN_ERR "Failed to read the length of the module %s's" - " configuration data.\n", - header->name); - return -EINVAL; - } - - /* Read any data and pass to the module (if we found one) */ - if (!len) - return 0; - - buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); - - if (!buffer) { - printk(KERN_ERR "Failed to allocate a buffer for reloading " - "module configuration info.\n"); - return -ENOMEM; - } - - toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); - - if (!this_module) - goto out; - - if (!this_module->save_config_info) - printk(KERN_ERR "Huh? Module %s appears to have a " - "save_config_info, but not a load_config_info " - "function!\n", this_module->name); - else - this_module->load_config_info(buffer, len); - - /* - * Now move this module to the tail of its lists. This will put it in - * order. Any new modules will end up at the top of the lists. They - * should have been set to disabled when loaded (people will - * normally not edit an initrd to load a new module and then hibernate - * without using it!). - */ - - toi_move_module_tail(this_module); - - this_module->enabled = header->enabled; - -out: - toi_free_page(23, (unsigned long) buffer); - return 0; -} - -/** - * read_module_configs - reload module configurations from the image header. - * - * Returns: Int - * Zero on success or an error code. - **/ -static int read_module_configs(void) -{ - int result = 0; - struct toi_module_header toi_module_header; - struct toi_module_ops *this_module; - - /* All modules are initially disabled. That way, if we have a module - * loaded now that wasn't loaded when we hibernated, it won't be used - * in trying to read the data. - */ - list_for_each_entry(this_module, &toi_modules, module_list) - this_module->enabled = 0; - - /* Get the first module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - if (result) { - printk(KERN_ERR "Failed to read the next module header.\n"); - return -EINVAL; - } - - /* For each module (in registration order) */ - while (toi_module_header.name[0]) { - result = read_one_module_config(&toi_module_header); - - if (result) - return -EINVAL; - - /* Get the next module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - if (result) { - printk(KERN_ERR "Failed to read the next module " - "header.\n"); - return -EINVAL; - } - } - - return 0; -} - -static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) -{ - return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 
0 : 1; -} - -int fs_info_space_needed(int reset) -{ - static int last_result = 0; - const struct super_block *sb; - int result = sizeof(int); - - if (!last_result || reset) { - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - result += 16 + sizeof(dev_t) + sizeof(int) + - fs->last_mount_size; - free_fs_info(fs); - } - last_result = result; - } - return result; -} - -static int fs_info_num_to_save(void) -{ - const struct super_block *sb; - int to_save = 0; - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - to_save++; - free_fs_info(fs); - } - - return to_save; -} - -static int fs_info_save(void) -{ - const struct super_block *sb; - int to_save = fs_info_num_to_save(); - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" - " to save."); - return -EIO; - } - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) { - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - &fs->uuid[0], 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->dev_t, sizeof(dev_t))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write dev_t."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->last_mount_size, sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write last mount length."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - fs->last_mount, fs->last_mount_size)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - } - free_fs_info(fs); - } - return 0; -} - -static int fs_info_load_and_check_one(void) -{ - char uuid[16], *last_mount; - int result = 0, ln; - dev_t dev_t; - struct block_device *dev; - struct fs_info *fs_info, seek; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); - return -EIO; - } - - read_if_version(3, dev_t, "uuid dev_t field", return -EIO); - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount size."); - return -EIO; - } - - last_mount = kzalloc(ln, GFP_KERNEL); - - if (!last_mount) - return -ENOMEM; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount timestamp."); - result = -EIO; - goto out_lmt; - } - - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = dev_t; - seek.last_mount_size = ln; - seek.last_mount = last_mount; - dev_t = blk_lookup_fs_info(&seek); - if (!dev_t) - goto out_lmt; - - dev = toi_open_by_devnum(dev_t); - - fs_info = fs_info_from_block_dev(dev); - if (fs_info && !IS_ERR(fs_info)) { - if (ln != fs_info->last_mount_size) { - printk(KERN_EMERG "Found matching uuid but last mount " - "time lengths differ?! 
" - "(%d vs %d).\n", ln, - fs_info->last_mount_size); - result = -EINVAL; - } else { - char buf[BDEVNAME_SIZE]; - result = !!memcmp(fs_info->last_mount, last_mount, ln); - if (result) - printk(KERN_EMERG "Last mount time for %s has " - "changed!\n", bdevname(dev, buf)); - } - } - toi_close_bdev(dev); - free_fs_info(fs_info); -out_lmt: - kfree(last_mount); - return result; -} - -static int fs_info_load_and_check(void) -{ - int to_do, result = 0; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " - "to load."); - return -EIO; - } - - while(to_do--) - result |= fs_info_load_and_check_one(); - - return result; -} - -/** - * write_image_header - write the image header after write the image proper - * - * Returns: Int - * Zero on success, error value otherwise. - **/ -int write_image_header(void) -{ - int ret; - int total = pagedir1.size + pagedir2.size+2; - char *header_buffer = NULL; - - /* Now prepare to write the header */ - ret = toiActiveAllocator->write_header_init(); - if (ret) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Active allocator's write_header_init" - " function failed."); - goto write_image_header_abort; - } - - /* Get a buffer */ - header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP); - if (!header_buffer) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Out of memory when trying to get page for header!"); - goto write_image_header_abort; - } - - /* Write hibernate header */ - if (fill_toi_header((struct toi_header *) header_buffer)) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to fill header information!"); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - header_buffer, sizeof(struct toi_header))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to write header info."); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_max_workers, sizeof(toi_max_workers))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to number of workers to use."); - goto write_image_header_abort; - } - - /* Write filesystem info */ - if (fs_info_save()) - goto write_image_header_abort; - - /* Write module configurations */ - ret = write_module_configs(); - if (ret) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write module configs."); - goto write_image_header_abort; - } - - if (memory_bm_write(pageset1_map, - toiActiveAllocator->rw_header_chunk)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write bitmaps."); - goto write_image_header_abort; - } - - /* Flush data and let allocator cleanup */ - if (toiActiveAllocator->write_header_cleanup()) { - abort_hibernate(TOI_FAILED_IO, - "Failed to cleanup writing header."); - goto write_image_header_abort_no_cleanup; - } - - if (test_result_state(TOI_ABORTED)) - goto write_image_header_abort_no_cleanup; - - toi_update_status(total, total, NULL); - -out: - if (header_buffer) - toi_free_page(24, (unsigned long) header_buffer); - return ret; - -write_image_header_abort: - toiActiveAllocator->write_header_cleanup(); -write_image_header_abort_no_cleanup: - ret = -1; - goto out; -} - -/** - * sanity_check - check the header - * @sh: the header which was saved at hibernate time. - * - * Perform a few checks, seeking to ensure that the kernel being - * booted matches the one hibernated. They need to match so we can - * be _sure_ things will work. It is not absolutely impossible for - * resuming from a different kernel to work, just not assured. 
- **/ -static char *sanity_check(struct toi_header *sh) -{ - char *reason = check_image_kernel((struct swsusp_info *) sh); - - if (reason) - return reason; - - if (!test_action_state(TOI_IGNORE_ROOTFS)) { - const struct super_block *sb; - list_for_each_entry(sb, &super_blocks, s_list) { - if ((!(sb->s_flags & MS_RDONLY)) && - (sb->s_type->fs_flags & FS_REQUIRES_DEV)) - return "Device backed fs has been mounted " - "rw prior to resume or initrd/ramfs " - "is mounted rw."; - } - } - - return NULL; -} - -static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); - -#define FREEZE_IN_PROGRESS (~0) - -static int freeze_result; - -static void do_freeze(struct work_struct *dummy) -{ - freeze_result = freeze_processes(); - wake_up(&freeze_wait); - trap_non_toi_io = 1; -} - -static DECLARE_WORK(freeze_work, do_freeze); - -/** - * __read_pageset1 - test for the existence of an image and attempt to load it - * - * Returns: Int - * Zero if image found and pageset1 successfully loaded. - * Error if no image found or loaded. - **/ -static int __read_pageset1(void) -{ - int i, result = 0; - char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), - *sanity_error = NULL; - struct toi_header *toi_header; - - if (!header_buffer) { - printk(KERN_INFO "Unable to allocate a page for reading the " - "signature.\n"); - return -ENOMEM; - } - - /* Check for an image */ - result = toiActiveAllocator->image_exists(1); - if (result == 3) { - result = -ENODATA; - toi_early_boot_message(1, 0, "The signature from an older " - "version of TuxOnIce has been detected."); - goto out_remove_image; - } - - if (result != 1) { - result = -ENODATA; - noresume_reset_modules(); - printk(KERN_INFO "TuxOnIce: No image found.\n"); - goto out; - } - - /* - * Prepare the active allocator for reading the image header. The - * activate allocator might read its own configuration. - * - * NB: This call may never return because there might be a signature - * for a different image such that we warn the user and they choose - * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the - * location of the image might be unavailable if it was stored on a - * network connection). - */ - - result = toiActiveAllocator->read_header_init(); - if (result) { - printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " - "image header.\n"); - goto out_remove_image; - } - - /* Check for noresume command line option */ - if (test_toi_state(TOI_NORESUME_SPECIFIED)) { - printk(KERN_INFO "TuxOnIce: Noresume on command line. 
Removed " - "image.\n"); - goto out_remove_image; - } - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) { - toi_early_boot_message(1, 0, NULL); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Tried to resume before: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - clear_toi_state(TOI_CONTINUE_REQ); - - toi_image_header_version = toiActiveAllocator->get_header_version(); - - if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { - toi_early_boot_message(1, 0, image_version_error); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Header version too new: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - /* Read hibernate header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - header_buffer, sizeof(struct toi_header)); - if (result < 0) { - printk(KERN_ERR "TuxOnIce: Failed to read the image " - "signature.\n"); - goto out_remove_image; - } - - toi_header = (struct toi_header *) header_buffer; - - /* - * NB: This call may also result in a reboot rather than returning. - */ - - sanity_error = sanity_check(toi_header); - if (sanity_error) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - sanity_error); - printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); - goto out_remove_image; - } - - /* - * We have an image and it looks like it will load okay. - * - * Get metadata from header. Don't override commandline parameters. - * - * We don't need to save the image size limit because it's not used - * during resume and will be restored with the image anyway. - */ - - memcpy((char *) &pagedir1, - (char *) &toi_header->pagedir, sizeof(pagedir1)); - toi_result = toi_header->param0; - if (!toi_bkd.toi_debug_state) { - toi_bkd.toi_action = - (toi_header->param1 & ~toi_bootflags_mask) | - (toi_bkd.toi_action & toi_bootflags_mask); - toi_bkd.toi_debug_state = toi_header->param2; - toi_bkd.toi_default_console_level = toi_header->param3; - } - clear_toi_state(TOI_IGNORE_LOGLEVEL); - pagedir2.size = toi_header->pageset_2_size; - for (i = 0; i < 4; i++) - toi_bkd.toi_io_time[i/2][i%2] = - toi_header->io_time[i/2][i%2]; - - set_toi_state(TOI_BOOT_KERNEL); - boot_kernel_data_buffer = toi_header->bkd; - - read_if_version(1, toi_max_workers, "TuxOnIce max workers", - goto out_remove_image); - - /* Read filesystem info */ - if (fs_info_load_and_check()) { - printk(KERN_EMERG "TuxOnIce: File system mount time checks " - "failed. Refusing to corrupt your filesystems!\n"); - goto out_remove_image; - } - - /* Read module configurations */ - result = read_module_configs(); - if (result) { - pagedir1.size = 0; - pagedir2.size = 0; - printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " - "configurations.\n"); - clear_action_state(TOI_KEEP_IMAGE); - goto out_remove_image; - } - - toi_prepare_console(); - - set_toi_state(TOI_NOW_RESUMING); - - result = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (result) - goto out_notifier_call_chain;; - - if (usermodehelper_disable()) - goto out_enable_usermodehelper; - - current->flags |= PF_NOFREEZE; - freeze_result = FREEZE_IN_PROGRESS; - - schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); - - toi_cond_pause(1, "About to read original pageset1 locations."); - - /* - * See _toi_rw_header_chunk in tuxonice_bio.c: - * Initialize pageset1_map by reading the map from the image. 
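Note the shape of the freeze handling above: do_freeze() runs from a work item and sets freeze_result, and the resume path blocks further below until the result leaves the FREEZE_IN_PROGRESS sentinel. The same fire-then-wait pattern in portable userspace C, with pthreads standing in for the kernel's workqueue and wait queue (all names invented):

#include <pthread.h>
#include <stdio.h>

#define IN_PROGRESS (~0)

static int result = IN_PROGRESS;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static void *do_work(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        result = 0;                     /* the deferred job succeeded */
        pthread_cond_signal(&cond);     /* wake_up(&freeze_wait) analogue */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, do_work, NULL);

        pthread_mutex_lock(&lock);
        while (result == IN_PROGRESS)   /* wait_event(...) analogue */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        printf("work finished with %d\n", result);
        return 0;
}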
- */ - if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk)) - goto out_thaw; - - /* - * See toi_rw_cleanup in tuxonice_bio.c: - * Clean up after reading the header. - */ - result = toiActiveAllocator->read_header_cleanup(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the " - "image header.\n"); - goto out_thaw; - } - - toi_cond_pause(1, "About to read pagedir."); - - /* - * Get the addresses of pages into which we will load the kernel to - * be copied back and check if they conflict with the ones we are using. - */ - if (toi_get_pageset1_load_addresses()) { - printk(KERN_INFO "TuxOnIce: Failed to get load addresses for " - "pageset1.\n"); - goto out_thaw; - } - - /* Read the original kernel back */ - toi_cond_pause(1, "About to read pageset 1."); - - /* Given the pagemap, read back the data from disk */ - if (read_pageset(&pagedir1, 0)) { - toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1."); - result = -EIO; - goto out_thaw; - } - - toi_cond_pause(1, "About to restore original kernel."); - result = 0; - - if (!toi_keeping_image && - toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(1); - - wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); -out: - current->flags &= ~PF_NOFREEZE; - toi_free_page(25, (unsigned long) header_buffer); - return result; - -out_thaw: - wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); - trap_non_toi_io = 0; - thaw_processes(); -out_enable_usermodehelper: - usermodehelper_enable(); -out_notifier_call_chain: - pm_notifier_call_chain(PM_POST_RESTORE); - toi_cleanup_console(); -out_remove_image: - result = -EINVAL; - if (!toi_keeping_image) - toiActiveAllocator->remove_image(); - toiActiveAllocator->read_header_cleanup(); - noresume_reset_modules(); - goto out; -} - -/** - * read_pageset1 - highlevel function to read the saved pages - * - * Attempt to read the header and pageset1 of a hibernate image. - * Handle the outcome, complaining where appropriate. - **/ -int read_pageset1(void) -{ - int error; - - error = __read_pageset1(); - - if (error && error != -ENODATA && error != -EINVAL && - !test_result_state(TOI_ABORTED)) - abort_hibernate(TOI_IMAGE_ERROR, - "TuxOnIce: Error %d resuming\n", error); - - return error; -} - -/** - * get_have_image_data - check the image header - **/ -static char *get_have_image_data(void) -{ - char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP); - struct toi_header *toi_header; - - if (!output_buffer) { - printk(KERN_INFO "Output buffer null.\n"); - return NULL; - } - - /* Check for an image */ - if (!toiActiveAllocator->image_exists(1) || - toiActiveAllocator->read_header_init() || - toiActiveAllocator->rw_header_chunk(READ, NULL, - output_buffer, sizeof(struct toi_header))) { - sprintf(output_buffer, "0\n"); - /* - * From an initrd/ramfs, catting have_image and - * getting a result of 0 is sufficient. - */ - clear_toi_state(TOI_BOOT_TIME); - goto out; - } - - toi_header = (struct toi_header *) output_buffer; - - sprintf(output_buffer, "1\n%s\n%s\n", - toi_header->uts.machine, - toi_header->uts.version); - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) - strcat(output_buffer, "Resumed before.\n"); - -out: - noresume_reset_modules(); - return output_buffer; -} - -/** - * read_pageset2 - read second part of the image - * @overwrittenpagesonly: Read only pages which would have been - * verwritten by pageset1? 
- * - * Read in part or all of pageset2 of an image, depending upon - * whether we are hibernating and have only overwritten a portion - * with pageset1 pages, or are resuming and need to read them - * all. - * - * Returns: Int - * Zero if no error, otherwise the error value. - **/ -int read_pageset2(int overwrittenpagesonly) -{ - int result = 0; - - if (!pagedir2.size) - return 0; - - result = read_pageset(&pagedir2, overwrittenpagesonly); - - toi_cond_pause(1, "Pagedir 2 read."); - - return result; -} - -/** - * image_exists_read - has an image been found? - * @page: Output buffer - * - * Store 0 or 1 in page, depending on whether an image is found. - * Incoming buffer is PAGE_SIZE and result is guaranteed - * to be far less than that, so we don't worry about - * overflow. - **/ -int image_exists_read(const char *page, int count) -{ - int len = 0; - char *result; - - if (toi_activate_storage(0)) - return count; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(0); - - if (!toiActiveAllocator) { - len = sprintf((char *) page, "-1\n"); - } else { - result = get_have_image_data(); - if (result) { - len = sprintf((char *) page, "%s", result); - toi_free_page(26, (unsigned long) result); - } - } - - toi_deactivate_storage(0); - - return len; -} - -/** - * image_exists_write - invalidate an image if one exists - **/ -int image_exists_write(const char *buffer, int count) -{ - if (toi_activate_storage(0)) - return count; - - if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) - toiActiveAllocator->remove_image(); - - toi_deactivate_storage(0); - - clear_result_state(TOI_KEPT_IMAGE); - - return count; -} diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h deleted file mode 100644 index 7d4d83f40..000000000 --- a/kernel/power/tuxonice_io.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * kernel/power/tuxonice_io.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. 
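image_exists_read() above follows the usual sysfs show convention: format a short answer into a page-sized buffer and return the number of bytes written, with -1, 0 and 1 encoding "no backend", "no image" and "image found". A userspace sketch of that contract, with a fixed buffer size standing in for PAGE_SIZE:

#include <stdio.h>

#define BUF_SIZE 4096   /* stands in for PAGE_SIZE */

/* -1: no backend, 0: no image, 1: image found */
static int show_image_state(char *page, int state)
{
        return snprintf(page, BUF_SIZE, "%d\n", state);
}

int main(void)
{
        char page[BUF_SIZE];
        int len = show_image_state(page, 1);

        fwrite(page, 1, len, stdout);
        return 0;
}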
- * - */ - -#include <linux/utsname.h> -#include "tuxonice_pagedir.h" - -/* Non-module data saved in our image header */ -struct toi_header { - /* - * Mirror struct swsusp_info, but without - * the page aligned attribute - */ - struct new_utsname uts; - u32 version_code; - unsigned long num_physpages; - int cpus; - unsigned long image_pages; - unsigned long pages; - unsigned long size; - - /* Our own data */ - unsigned long orig_mem_free; - int page_size; - int pageset_2_size; - int param0; - int param1; - int param2; - int param3; - int progress0; - int progress1; - int progress2; - int progress3; - int io_time[2][2]; - struct pagedir pagedir; - dev_t root_fs; - unsigned long bkd; /* Boot kernel data locn */ -}; - -extern int write_pageset(struct pagedir *pagedir); -extern int write_image_header(void); -extern int read_pageset1(void); -extern int read_pageset2(int overwrittenpagesonly); - -extern int toi_attempt_to_parse_resume_device(int quiet); -extern void attempt_to_parse_resume_device2(void); -extern void attempt_to_parse_alt_resume_param(void); -int image_exists_read(const char *page, int count); -int image_exists_write(const char *buffer, int count); -extern void save_restore_alt_param(int replace, int quiet); -extern atomic_t toi_io_workers; - -/* Args to save_restore_alt_param */ -#define RESTORE 0 -#define SAVE 1 - -#define NOQUIET 0 -#define QUIET 1 - -extern wait_queue_head_t toi_io_queue_flusher; -extern int toi_bio_queue_flusher_should_finish; - -int fs_info_space_needed(int reset); - -extern int toi_max_workers; diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c deleted file mode 100644 index a203c8fb9..000000000 --- a/kernel/power/tuxonice_modules.c +++ /dev/null @@ -1,520 +0,0 @@ -/* - * kernel/power/tuxonice_modules.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - */ - -#include <linux/suspend.h> -#include <linux/module.h> -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" - -LIST_HEAD(toi_filters); -LIST_HEAD(toiAllocators); - -LIST_HEAD(toi_modules); - -struct toi_module_ops *toiActiveAllocator; - -static int toi_num_filters; -int toiNumAllocators, toi_num_modules; - -/* - * toi_header_storage_for_modules - * - * Returns the amount of space needed to store configuration - * data needed by the modules prior to copying back the original - * kernel. We can exclude data for pageset2 because it will be - * available anyway once the kernel is copied back. 
- */ -long toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - } - } - - /* One more for the empty terminator */ - return bytes + sizeof(struct toi_module_header); -} - -void print_toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - printk(KERN_DEBUG "Header storage:\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", - this_module->name, - this_module->header_used, this); - } - } - - printk(KERN_DEBUG "+ empty terminator : %zu.\n", - sizeof(struct toi_module_header)); - printk(KERN_DEBUG " ====\n"); - printk(KERN_DEBUG " %zu\n", - bytes + sizeof(struct toi_module_header)); -} - -/* - * toi_memory_for_modules - * - * Returns the amount of memory requested by modules for - * doing their work during the cycle. - */ - -long toi_memory_for_modules(int print_parts) -{ - long bytes = 0, result; - struct toi_module_ops *this_module; - - if (print_parts) - printk(KERN_INFO "Memory for modules:\n===================\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - int this; - if (!this_module->enabled) - continue; - if (this_module->memory_needed) { - this = this_module->memory_needed(); - if (print_parts) - printk(KERN_INFO "%10d bytes (%5ld pages) for " - "module '%s'.\n", this, - DIV_ROUND_UP(this, PAGE_SIZE), - this_module->name); - bytes += this; - } - } - - result = DIV_ROUND_UP(bytes, PAGE_SIZE); - if (print_parts) - printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); - - return result; -} - -/* - * toi_expected_compression_ratio - * - * Returns the compression ratio expected when saving the image. 
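toi_header_storage_for_modules() above charges each enabled module its payload plus a fixed per-record overhead (one module header plus one length int), then adds a final empty header for the terminator. The same accounting as a userspace sketch, record struct pared down:

#include <stdio.h>

struct record_header { char name[30]; int enabled, type, index, data_length; };

static long header_storage(const int *payloads, int n)
{
        long bytes = 0;
        int i;

        for (i = 0; i < n; i++)
                bytes += payloads[i] + sizeof(struct record_header)
                         + sizeof(int);

        /* one more empty header terminates the list */
        return bytes + sizeof(struct record_header);
}

int main(void)
{
        int needs[] = { 128, 16 };      /* two modules' config payloads */

        printf("header bytes needed: %ld\n", header_storage(needs, 2));
        return 0;
}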
- */ - -int toi_expected_compression_ratio(void) -{ - int ratio = 100; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->expected_compression) - ratio = ratio * this_module->expected_compression() - / 100; - } - - return ratio; -} - -/* toi_find_module_given_dir - * Functionality : Return a module (if found), given a pointer - * to its directory name - */ - -static struct toi_module_ops *toi_find_module_given_dir(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->directory)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* toi_find_module_given_name - * Functionality : Return a module (if found), given a pointer - * to its name - */ - -struct toi_module_ops *toi_find_module_given_name(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->name)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* - * toi_print_module_debug_info - * Functionality : Get debugging info from modules into a buffer. - */ -int toi_print_module_debug_info(char *buffer, int buffer_size) -{ - struct toi_module_ops *this_module; - int len = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->print_debug_info) { - int result; - result = this_module->print_debug_info(buffer + len, - buffer_size - len); - len += result; - } - } - - /* Ensure null terminated */ - buffer[buffer_size] = 0; - - return len; -} - -/* - * toi_register_module - * - * Register a module. - */ -int toi_register_module(struct toi_module_ops *module) -{ - int i; - struct kobject *kobj; - - if (!hibernation_available()) - return -ENODEV; - - module->enabled = 1; - - if (toi_find_module_given_name(module->name)) { - printk(KERN_INFO "TuxOnIce: Trying to load module %s," - " which is already registered.\n", - module->name); - return -EBUSY; - } - - switch (module->type) { - case FILTER_MODULE: - list_add_tail(&module->type_list, &toi_filters); - toi_num_filters++; - break; - case WRITER_MODULE: - list_add_tail(&module->type_list, &toiAllocators); - toiNumAllocators++; - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return -EINVAL; - } - list_add_tail(&module->module_list, &toi_modules); - toi_num_modules++; - - if ((!module->directory && !module->shared_directory) || - !module->sysfs_data || !module->num_sysfs_entries) - return 0; - - /* - * Modules may share a directory, but those with shared_dir - * set must be loaded (via symbol dependencies) after parents - * and unloaded beforehand. 
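toi_expected_compression_ratio() above compounds per-module percentages multiplicatively, so two stages that each expect to halve the data predict 25% overall. A minimal sketch of that integer arithmetic:

#include <stdio.h>

static int compound_ratio(const int *percentages, int n)
{
        int ratio = 100, i;

        for (i = 0; i < n; i++)
                ratio = ratio * percentages[i] / 100;
        return ratio;
}

int main(void)
{
        int filters[] = { 50, 50 };

        printf("expected image size: %d%% of input\n",
               compound_ratio(filters, 2));     /* prints 25 */
        return 0;
}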
- */ - if (module->shared_directory) { - struct toi_module_ops *shared = - toi_find_module_given_dir(module->shared_directory); - if (!shared) { - printk(KERN_ERR "TuxOnIce: Module %s wants to share " - "%s's directory but %s isn't loaded.\n", - module->name, module->shared_directory, - module->shared_directory); - toi_unregister_module(module); - return -ENODEV; - } - kobj = shared->dir_kobj; - } else { - if (!strncmp(module->directory, "[ROOT]", 6)) - kobj = tuxonice_kobj; - else - kobj = make_toi_sysdir(module->directory); - } - module->dir_kobj = kobj; - for (i = 0; i < module->num_sysfs_entries; i++) { - int result = toi_register_sysfs_file(kobj, - &module->sysfs_data[i]); - if (result) - return result; - } - return 0; -} - -/* - * toi_unregister_module - * - * Remove a module. - */ -void toi_unregister_module(struct toi_module_ops *module) -{ - int i; - - if (module->dir_kobj) - for (i = 0; i < module->num_sysfs_entries; i++) - toi_unregister_sysfs_file(module->dir_kobj, - &module->sysfs_data[i]); - - if (!module->shared_directory && module->directory && - strncmp(module->directory, "[ROOT]", 6)) - remove_toi_sysdir(module->dir_kobj); - - switch (module->type) { - case FILTER_MODULE: - list_del(&module->type_list); - toi_num_filters--; - break; - case WRITER_MODULE: - list_del(&module->type_list); - toiNumAllocators--; - if (toiActiveAllocator == module) { - toiActiveAllocator = NULL; - clear_toi_state(TOI_CAN_RESUME); - clear_toi_state(TOI_CAN_HIBERNATE); - } - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - list_del(&module->module_list); - toi_num_modules--; -} - -/* - * toi_move_module_tail - * - * Rearrange modules when reloading the config. - */ -void toi_move_module_tail(struct toi_module_ops *module) -{ - switch (module->type) { - case FILTER_MODULE: - if (toi_num_filters > 1) - list_move_tail(&module->type_list, &toi_filters); - break; - case WRITER_MODULE: - if (toiNumAllocators > 1) - list_move_tail(&module->type_list, &toiAllocators); - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - if ((toi_num_filters + toiNumAllocators) > 1) - list_move_tail(&module->module_list, &toi_modules); -} - -/* - * toi_initialise_modules - * - * Get ready to do some work! - */ -int toi_initialise_modules(int starting_cycle, int early) -{ - struct toi_module_ops *this_module; - int result; - - list_for_each_entry(this_module, &toi_modules, module_list) { - this_module->header_requested = 0; - this_module->header_used = 0; - if (!this_module->enabled) - continue; - if (this_module->early != early) - continue; - if (this_module->initialise) { - result = this_module->initialise(starting_cycle); - if (result) { - toi_cleanup_modules(starting_cycle); - return result; - } - this_module->initialised = 1; - } - } - - return 0; -} - -/* - * toi_cleanup_modules - * - * Tell modules the work is done. 
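toi_initialise_modules() above marks each module initialised as it comes up, so a failure partway through can hand the whole list to the cleanup pass (just below), which undoes only what actually started. A compact userspace sketch of that init/rollback discipline, with hypothetical stage names:

#include <stdio.h>

struct stage {
        const char *name;
        int (*init)(void);
        void (*fini)(void);
        int initialised;
};

static int start_all(struct stage *s, int n)
{
        int i, err;

        for (i = 0; i < n; i++) {
                err = s[i].init ? s[i].init() : 0;
                if (err) {
                        while (i--)     /* unwind what did come up */
                                if (s[i].initialised && s[i].fini) {
                                        s[i].fini();
                                        s[i].initialised = 0;
                                }
                        return err;
                }
                s[i].initialised = 1;
        }
        return 0;
}

static int ok_init(void)  { return 0; }
static int bad_init(void) { return -1; }
static void ok_fini(void) { puts("rolled back"); }

int main(void)
{
        struct stage stages[] = {
                { "first",  ok_init,  ok_fini, 0 },
                { "second", bad_init, NULL,    0 },
        };

        return start_all(stages, 2) ? 1 : 0;
}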
- */ -void toi_cleanup_modules(int finishing_cycle) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->initialised) - continue; - if (this_module->cleanup) - this_module->cleanup(finishing_cycle); - this_module->initialised = 0; - } -} - -/* - * toi_pre_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->pre_atomic_restore) - this_module->pre_atomic_restore(bkd); - } -} - -/* - * toi_post_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->post_atomic_restore) - this_module->post_atomic_restore(bkd); - } -} - -/* - * toi_get_next_filter - * - * Get the next filter in the pipeline. - */ -struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) -{ - struct toi_module_ops *last_filter = NULL, *this_filter = NULL; - - list_for_each_entry(this_filter, &toi_filters, type_list) { - if (!this_filter->enabled) - continue; - if ((last_filter == filter_sought) || (!filter_sought)) - return this_filter; - last_filter = this_filter; - } - - return toiActiveAllocator; -} - -/** - * toi_show_modules: Printk what support is loaded. - */ -void toi_print_modules(void) -{ - struct toi_module_ops *this_module; - int prev = 0; - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->type == MISC_HIDDEN_MODULE) - continue; - printk("%s %s%s%s", prev ? "," : "", - this_module->enabled ? "" : "[", - this_module->name, - this_module->enabled ? "" : "]"); - prev = 1; - } - - printk(".\n"); -} - -/* toi_get_modules - * - * Take a reference to modules so they can't go away under us. - */ - -int toi_get_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - struct toi_module_ops *this_module2; - - if (try_module_get(this_module->module)) - continue; - - /* Failed! Reverse gets and return error */ - list_for_each_entry(this_module2, &toi_modules, - module_list) { - if (this_module == this_module2) - return -EINVAL; - module_put(this_module2->module); - } - } - return 0; -} - -/* toi_put_modules - * - * Release our references to modules we used. - */ - -void toi_put_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) - module_put(this_module->module); -} diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h deleted file mode 100644 index 44f10abb9..000000000 --- a/kernel/power/tuxonice_modules.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * kernel/power/tuxonice_modules.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations for modules. Modules are additions to - * TuxOnIce that provide facilities such as image compression or - * encryption, backends for storage of the image and user interfaces. 
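toi_get_next_filter() above turns the flat list of enabled filters into a pipeline: each caller passes its current position and receives the next enabled stage, with the active allocator as the implicit final stage. A userspace sketch of that traversal (stage names invented for the example):

#include <stdio.h>

struct stage { const char *name; int enabled; };

static struct stage stages[] = {
        { "compress", 1 }, { "encrypt", 0 }, { "checksum", 1 },
};
static struct stage allocator = { "writer", 1 };

static struct stage *next_filter(struct stage *current)
{
        struct stage *last = NULL;
        size_t i;

        for (i = 0; i < sizeof(stages) / sizeof(stages[0]); i++) {
                if (!stages[i].enabled)
                        continue;
                if (last == current || !current)
                        return &stages[i];
                last = &stages[i];
        }
        return &allocator;      /* end of pipeline: hand off to the backend */
}

int main(void)
{
        struct stage *s = NULL;

        while ((s = next_filter(s)) != &allocator)
                printf("filter: %s\n", s->name);
        printf("backend: %s\n", s->name);
        return 0;
}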
- * - */ - -#ifndef TOI_MODULES_H -#define TOI_MODULES_H - -/* This is the maximum size we store in the image header for a module name */ -#define TOI_MAX_MODULE_NAME_LENGTH 30 - -struct toi_boot_kernel_data; - -/* Per-module metadata */ -struct toi_module_header { - char name[TOI_MAX_MODULE_NAME_LENGTH]; - int enabled; - int type; - int index; - int data_length; - unsigned long signature; -}; - -enum { - FILTER_MODULE, - WRITER_MODULE, - BIO_ALLOCATOR_MODULE, - MISC_MODULE, - MISC_HIDDEN_MODULE, -}; - -enum { - TOI_ASYNC, - TOI_SYNC -}; - -enum { - TOI_VIRT, - TOI_PAGE, -}; - -#define TOI_MAP(type, addr) \ - (type == TOI_PAGE ? kmap(addr) : addr) - -#define TOI_UNMAP(type, addr) \ - do { \ - if (type == TOI_PAGE) \ - kunmap(addr); \ - } while(0) - -struct toi_module_ops { - /* Functions common to all modules */ - int type; - char *name; - char *directory; - char *shared_directory; - struct kobject *dir_kobj; - struct module *module; - int enabled, early, initialised; - struct list_head module_list; - - /* List of filters or allocators */ - struct list_head list, type_list; - - /* - * Requirements for memory and storage in - * the image header.. - */ - int (*memory_needed) (void); - int (*storage_needed) (void); - - int header_requested, header_used; - - int (*expected_compression) (void); - - /* - * Debug info - */ - int (*print_debug_info) (char *buffer, int size); - int (*save_config_info) (char *buffer); - void (*load_config_info) (char *buffer, int len); - - /* - * Initialise & cleanup - general routines called - * at the start and end of a cycle. - */ - int (*initialise) (int starting_cycle); - void (*cleanup) (int finishing_cycle); - - void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); - void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); - - /* - * Calls for allocating storage (allocators only). - * - * Header space is requested separately and cannot fail, but the - * reservation is only applied when main storage is allocated. - * The header space reservation is thus always set prior to - * requesting the allocation of storage - and prior to querying - * how much storage is available. - */ - - unsigned long (*storage_available) (void); - void (*reserve_header_space) (unsigned long space_requested); - int (*register_storage) (void); - int (*allocate_storage) (unsigned long space_requested); - unsigned long (*storage_allocated) (void); - void (*free_unused_storage) (void); - - /* - * Routines used in image I/O. 
- */ - int (*rw_init) (int rw, int stream_number); - int (*rw_cleanup) (int rw); - int (*write_page) (unsigned long index, int buf_type, void *buf, - unsigned int buf_size); - int (*read_page) (unsigned long *index, int buf_type, void *buf, - unsigned int *buf_size); - int (*io_flusher) (int rw); - - /* Reset module if image exists but reading aborted */ - void (*noresume_reset) (void); - - /* Read and write the metadata */ - int (*write_header_init) (void); - int (*write_header_cleanup) (void); - - int (*read_header_init) (void); - int (*read_header_cleanup) (void); - - /* To be called after read_header_init */ - int (*get_header_version) (void); - - int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, - char *buffer_start, int buffer_size); - - int (*rw_header_chunk_noreadahead) (int rw, - struct toi_module_ops *owner, char *buffer_start, - int buffer_size); - - /* Attempt to parse an image location */ - int (*parse_sig_location) (char *buffer, int only_writer, int quiet); - - /* Throttle I/O according to throughput */ - void (*update_throughput_throttle) (int jif_index); - - /* Flush outstanding I/O */ - int (*finish_all_io) (void); - - /* Determine whether image exists that we can restore */ - int (*image_exists) (int quiet); - - /* Mark the image as having tried to resume */ - int (*mark_resume_attempted) (int); - - /* Destroy image if one exists */ - int (*remove_image) (void); - - /* Sysfs Data */ - struct toi_sysfs_data *sysfs_data; - int num_sysfs_entries; - - /* Block I/O allocator */ - struct toi_bio_allocator_ops *bio_allocator_ops; -}; - -extern int toi_num_modules, toiNumAllocators; - -extern struct toi_module_ops *toiActiveAllocator; -extern struct list_head toi_filters, toiAllocators, toi_modules; - -extern void toi_prepare_console_modules(void); -extern void toi_cleanup_console_modules(void); - -extern struct toi_module_ops *toi_find_module_given_name(char *name); -extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); - -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_move_module_tail(struct toi_module_ops *module); - -extern long toi_header_storage_for_modules(void); -extern long toi_memory_for_modules(int print_parts); -extern void print_toi_header_storage_for_modules(void); -extern int toi_expected_compression_ratio(void); - -extern int toi_print_module_debug_info(char *buffer, int buffer_size); -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_unregister_module(struct toi_module_ops *module); - -extern int toi_initialise_modules(int starting_cycle, int early); -#define toi_initialise_modules_early(starting) \ - toi_initialise_modules(starting, 1) -#define toi_initialise_modules_late(starting) \ - toi_initialise_modules(starting, 0) -extern void toi_cleanup_modules(int finishing_cycle); - -extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); -extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); - -extern void toi_print_modules(void); - -int toi_get_modules(void); -void toi_put_modules(void); -#endif diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c deleted file mode 100644 index 78bd31b05..000000000 --- a/kernel/power/tuxonice_netlink.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
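toi_module_ops above is the kernel's usual ops-table idiom: a struct of mostly optional function pointers that the core probes with NULL checks, so a module implements only the hooks it needs -- compare the storage_needed and save_config_info checks earlier in this diff. A minimal userspace sketch of the pattern, names hypothetical:

#include <stdio.h>

struct engine_ops {
        const char *name;
        int (*storage_needed)(void);    /* optional hook */
        int (*save_config)(char *buf);  /* optional hook */
};

static int null_engine_storage(void) { return 64; }

static int query_storage(const struct engine_ops *ops)
{
        /* optional hooks are probed before use, as in the module core */
        return ops->storage_needed ? ops->storage_needed() : 0;
}

int main(void)
{
        struct engine_ops full = {
                .name = "full", .storage_needed = null_engine_storage,
        };
        struct engine_ops quiet = { .name = "quiet" };

        printf("%s needs %d bytes\n", full.name, query_storage(&full));
        printf("%s needs %d bytes\n", quiet.name, query_storage(&quiet));
        return 0;
}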
- * - * Functions for communicating with a userspace helper via netlink. - */ - -#include <linux/suspend.h> -#include <linux/sched.h> -#include <linux/kmod.h> -#include "tuxonice_netlink.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct user_helper_data *uhd_list; - -/* - * Refill our pool of SKBs for use in emergencies (eg, when eating memory and - * none can be allocated). - */ -static void toi_fill_skb_pool(struct user_helper_data *uhd) -{ - while (uhd->pool_level < uhd->pool_limit) { - struct sk_buff *new_skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (!new_skb) - break; - - new_skb->next = uhd->emerg_skbs; - uhd->emerg_skbs = new_skb; - uhd->pool_level++; - } -} - -/* - * Try to allocate a single skb. If we can't get one, try to use one from - * our pool. - */ -static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) -{ - struct sk_buff *skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (skb) - return skb; - - skb = uhd->emerg_skbs; - if (skb) { - uhd->pool_level--; - uhd->emerg_skbs = skb->next; - skb->next = NULL; - } - - return skb; -} - -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *dest; - struct task_struct *t; - - if (uhd->pid == -1) - return; - - if (uhd->debug) - printk(KERN_ERR "toi_send_netlink_message: Send " - "message type %d.\n", type); - - skb = toi_get_skb(uhd); - if (!skb) { - printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); - return; - } - - nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); - uhd->sock_seq++; - - dest = NLMSG_DATA(nlh); - if (params && len > 0) - memcpy(dest, params, len); - - netlink_unicast(uhd->nl, skb, uhd->pid, 0); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - if (uhd->pid > -1) - printk(KERN_INFO "Hmm. Can't find the userspace task" - " %d.\n", uhd->pid); - return; - } - wake_up_process(t); - toi_read_unlock_tasklist(); - - yield(); -} - -static void send_whether_debugging(struct user_helper_data *uhd) -{ - static u8 is_debugging = 1; - - toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, - &is_debugging, sizeof(u8)); -} - -/* - * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we - * are hibernating. - */ -static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) -{ - struct task_struct *t; - - if (uhd->debug) - printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", - pid); - return -EINVAL; - } - - t->flags |= PF_NOFREEZE; - - toi_read_unlock_tasklist(); - uhd->pid = pid; - - toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); - - return 0; -} - -/* - * Called when the userspace process has informed us that it's ready to roll. - */ -static int nl_ready(struct user_helper_data *uhd, u32 version) -{ - if (version != uhd->interface_version) { - printk(KERN_INFO "%s userspace process using invalid interface" - " version (%d - kernel wants %d). 
Trying to " - "continue without it.\n", - uhd->name, version, uhd->interface_version); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - - complete(&uhd->wait_for_process); - - return 0; -} - -void toi_netlink_close_complete(struct user_helper_data *uhd) -{ - if (uhd->nl) { - netlink_kernel_release(uhd->nl); - uhd->nl = NULL; - } - - while (uhd->emerg_skbs) { - struct sk_buff *next = uhd->emerg_skbs->next; - kfree_skb(uhd->emerg_skbs); - uhd->emerg_skbs = next; - } - - uhd->pid = -1; -} - -static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, - struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type = nlh->nlmsg_type; - int *data; - int err; - - if (uhd->debug) - printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", - type); - - /* Let the more specific handler go first. It returns - * 1 for valid messages that it doesn't know. */ - err = uhd->rcv_msg(skb, nlh); - if (err != 1) - return err; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { - printk(KERN_INFO "Received extra nofreeze me requests.\n"); - return -EBUSY; - } - - data = NLMSG_DATA(nlh); - - switch (type) { - case NETLINK_MSG_NOFREEZE_ME: - return nl_set_nofreeze(uhd, nlh->nlmsg_pid); - case NETLINK_MSG_GET_DEBUGGING: - send_whether_debugging(uhd); - return 0; - case NETLINK_MSG_READY: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { - printk(KERN_INFO "Invalid ready mesage.\n"); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - return nl_ready(uhd, (u32) *data); - case NETLINK_MSG_CLEANUP: - toi_netlink_close_complete(uhd); - return 0; - } - - return -EINVAL; -} - -static void toi_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - struct user_helper_data *uhd = uhd_list; - - while (uhd && uhd->netlink_id != skb->sk->sk_protocol) - uhd = uhd->next; - - if (!uhd) - return; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - - err = toi_nl_gen_rcv_msg(uhd, skb, nlh); - if (err) - netlink_ack(skb, nlh, err); - else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } -} - -static int netlink_prepare(struct user_helper_data *uhd) -{ - struct netlink_kernel_cfg cfg = { - .groups = 0, - .input = toi_user_rcv_skb, - }; - - uhd->next = uhd_list; - uhd_list = uhd; - - uhd->sock_seq = 0x42c0ffee; - uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); - if (!uhd->nl) { - printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", - uhd->name); - return -ENOMEM; - } - - toi_fill_skb_pool(uhd); - - return 0; -} - -void toi_netlink_close(struct user_helper_data *uhd) -{ - struct task_struct *t; - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (t) - t->flags &= ~PF_NOFREEZE; - toi_read_unlock_tasklist(); - - toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); -} -int toi_netlink_setup(struct user_helper_data *uhd) -{ - /* In case userui didn't cleanup properly on us */ - toi_netlink_close_complete(uhd); - - if (netlink_prepare(uhd) < 0) { - printk(KERN_INFO "Netlink prepare failed.\n"); - return 1; - } - - if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, - UMH_WAIT_EXEC, uhd->debug) < 0) { - printk(KERN_INFO "Launch userspace program failed.\n"); - toi_netlink_close_complete(uhd); - return 1; 
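toi_user_rcv_skb() above is a standard netlink receive loop: validate the claimed message length against the bytes remaining, consume the length rounded up to the protocol alignment, repeat. A userspace sketch of the same framing walk, assuming the 4-byte alignment of NLMSG_ALIGN and a simplified header:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define ALIGN4(x) (((x) + 3u) & ~3u)

struct msg_header { uint32_t len; uint16_t type; };

static void walk_messages(const unsigned char *buf, size_t avail)
{
        while (avail >= sizeof(struct msg_header)) {
                struct msg_header h;
                uint32_t rlen;

                memcpy(&h, buf, sizeof(h));
                if (h.len < sizeof(h) || h.len > avail)
                        return;         /* malformed: stop parsing */

                printf("message type %u, %u bytes\n", h.type, h.len);

                rlen = ALIGN4(h.len);
                if (rlen > avail)
                        rlen = avail;   /* last record may be unpadded */
                buf += rlen;
                avail -= rlen;
        }
}

int main(void)
{
        unsigned char buf[16] = { 0 };
        struct msg_header h = { .len = 8, .type = 0x10 };

        memcpy(buf, &h, sizeof(h));
        walk_messages(buf, sizeof(buf));
        return 0;
}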
- } - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); - - if (uhd->pid == -1) { - printk(KERN_INFO "%s: Failed to contact userspace process.\n", - uhd->name); - toi_netlink_close_complete(uhd); - return 1; - } - - return 0; -} diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h deleted file mode 100644 index 6613c8eaa..000000000 --- a/kernel/power/tuxonice_netlink.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for functions for communicating with a userspace helper - * via netlink. - */ - -#include <linux/netlink.h> -#include <net/sock.h> - -#define NETLINK_MSG_BASE 0x10 - -#define NETLINK_MSG_READY 0x10 -#define NETLINK_MSG_NOFREEZE_ME 0x16 -#define NETLINK_MSG_GET_DEBUGGING 0x19 -#define NETLINK_MSG_CLEANUP 0x24 -#define NETLINK_MSG_NOFREEZE_ACK 0x27 -#define NETLINK_MSG_IS_DEBUGGING 0x28 - -struct user_helper_data { - int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); - void (*not_ready) (void); - struct sock *nl; - u32 sock_seq; - pid_t pid; - char *comm; - char program[256]; - int pool_level; - int pool_limit; - struct sk_buff *emerg_skbs; - int skb_size; - int netlink_id; - char *name; - struct user_helper_data *next; - struct completion wait_for_process; - u32 interface_version; - int must_init; - int debug; -}; - -#ifdef CONFIG_NET -int toi_netlink_setup(struct user_helper_data *uhd); -void toi_netlink_close(struct user_helper_data *uhd); -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len); -void toi_netlink_close_complete(struct user_helper_data *uhd); -#else -static inline int toi_netlink_setup(struct user_helper_data *uhd) -{ - return 0; -} - -static inline void toi_netlink_close(struct user_helper_data *uhd) { }; -static inline void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) { }; -static inline void toi_netlink_close_complete(struct user_helper_data *uhd) - { }; -#endif diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c deleted file mode 100644 index d469f3d2d..000000000 --- a/kernel/power/tuxonice_pagedir.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for handling pagesets. - * Note that pbes aren't actually stored as such. They're stored as - * bitmaps and extents. 
- */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/hardirq.h> -#include <linux/sched.h> -#include <linux/cpu.h> -#include <asm/tlbflush.h> - -#include "tuxonice_pageflags.h" -#include "tuxonice_ui.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" - -static int ptoi_pfn; -static struct pbe *this_low_pbe; -static struct pbe **last_low_pbe_ptr; - -void toi_reset_alt_image_pageset2_pfn(void) -{ - memory_bm_position_reset(pageset2_map); -} - -static struct page *first_conflicting_page; - -/* - * free_conflicting_pages - */ - -static void free_conflicting_pages(void) -{ - while (first_conflicting_page) { - struct page *next = - *((struct page **) kmap(first_conflicting_page)); - kunmap(first_conflicting_page); - toi__free_page(29, first_conflicting_page); - first_conflicting_page = next; - } -} - -/* __toi_get_nonconflicting_page - * - * Description: Gets order zero pages that won't be overwritten - * while copying the original pages. - */ - -struct page *___toi_get_nonconflicting_page(int can_be_highmem) -{ - struct page *page; - gfp_t flags = TOI_ATOMIC_GFP; - if (can_be_highmem) - flags |= __GFP_HIGHMEM; - - - if (test_toi_state(TOI_LOADING_ALT_IMAGE) && - pageset2_map && ptoi_pfn) { - do { - ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0); - if (ptoi_pfn != BM_END_OF_MAP) { - page = pfn_to_page(ptoi_pfn); - if (!PagePageset1(page) && - (can_be_highmem || !PageHighMem(page))) - return page; - } - } while (ptoi_pfn); - } - - do { - page = toi_alloc_page(29, flags | __GFP_ZERO); - if (!page) { - printk(KERN_INFO "Failed to get nonconflicting " - "page.\n"); - return NULL; - } - if (PagePageset1(page)) { - struct page **next = (struct page **) kmap(page); - *next = first_conflicting_page; - first_conflicting_page = page; - kunmap(page); - } - } while (PagePageset1(page)); - - return page; -} - -unsigned long __toi_get_nonconflicting_page(void) -{ - struct page *page = ___toi_get_nonconflicting_page(0); - return page ? (unsigned long) page_address(page) : 0; -} - -static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, - int highmem) -{ - if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) - + 2 * sizeof(struct pbe)) > PAGE_SIZE) { - struct page *new_page = - ___toi_get_nonconflicting_page(highmem); - if (!new_page) - return ERR_PTR(-ENOMEM); - this_pbe = (struct pbe *) kmap(new_page); - memset(this_pbe, 0, PAGE_SIZE); - *page_ptr = new_page; - } else - this_pbe++; - - return this_pbe; -} - -/** - * get_pageset1_load_addresses - generate pbes for conflicting pages - * - * We check here that pagedir & pages it points to won't collide - * with pages where we're going to restore from the loaded pages - * later. - * - * Returns: - * Zero on success, one if couldn't find enough pages (shouldn't - * happen). 
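The allocate-and-chain trick above deserves spelling out: pages that would collide with the image are kept, linked through their own contents (kmapped in the kernel case), so the allocator cannot hand them out again, then freed in one pass once a safe page is found. A runnable userspace model, where conflicts() is an arbitrary stand-in for the PagePageset1() test:

#include <stdlib.h>

#define PAGE_SZ 4096

static void *rejects;                     /* chain of conflicting blocks */

static int conflicts(void *p)             /* stand-in predicate */
{
    return ((unsigned long)p >> 12) & 1;  /* arbitrary, for the demo */
}

static void *get_nonconflicting(void)
{
    void *p;

    while ((p = malloc(PAGE_SZ)) != NULL && conflicts(p)) {
        *(void **)p = rejects;            /* reuse the block as a link */
        rejects = p;
    }
    return p;                             /* NULL on allocation failure */
}

static void free_rejects(void)
{
    while (rejects) {
        void *next = *(void **)rejects;
        free(rejects);
        rejects = next;
    }
}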
- **/ -int toi_get_pageset1_load_addresses(void) -{ - int pfn, highallocd = 0, lowallocd = 0; - int low_needed = pagedir1.size - get_highmem_size(pagedir1); - int high_needed = get_highmem_size(pagedir1); - int low_pages_for_highmem = 0; - gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; - struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, - *low_pbe_page, *last_low_pbe_page = NULL; - struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, - *this_high_pbe = NULL; - unsigned long orig_low_pfn, orig_high_pfn; - int high_pbes_done = 0, low_pbes_done = 0; - int low_direct = 0, high_direct = 0, result = 0, i; - int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; - - toi_trace_index++; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - last_low_pbe_ptr = &restore_pblist; - - /* First, allocate pages for the start of our pbe lists. */ - if (high_needed) { - high_pbe_page = ___toi_get_nonconflicting_page(1); - if (!high_pbe_page) { - result = -ENOMEM; - goto out; - } - this_high_pbe = (struct pbe *) kmap(high_pbe_page); - memset(this_high_pbe, 0, PAGE_SIZE); - } - - low_pbe_page = ___toi_get_nonconflicting_page(0); - if (!low_pbe_page) { - result = -ENOMEM; - goto out; - } - this_low_pbe = (struct pbe *) page_address(low_pbe_page); - - /* - * Next, allocate the number of pages we need. - */ - - i = low_needed + high_needed; - - do { - int is_high; - - if (i == low_needed) - flags &= ~__GFP_HIGHMEM; - - page = toi_alloc_page(30, flags); - BUG_ON(!page); - - SetPagePageset1Copy(page); - is_high = PageHighMem(page); - - if (PagePageset1(page)) { - if (is_high) - high_direct++; - else - low_direct++; - } else { - if (is_high) - highallocd++; - else - lowallocd++; - } - } while (--i); - - high_needed -= high_direct; - low_needed -= low_direct; - - /* - * Do we need to use some lowmem pages for the copies of highmem - * pages? - */ - if (high_needed > highallocd) { - low_pages_for_highmem = high_needed - highallocd; - high_needed -= low_pages_for_highmem; - low_needed += low_pages_for_highmem; - } - - /* - * Now generate our pbes (which will be used for the atomic restore), - * and free unneeded pages. - */ - memory_bm_position_reset(pageset1_copy_map); - for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) { - int is_high; - page = pfn_to_page(pfn); - is_high = PageHighMem(page); - - if (PagePageset1(page)) - continue; - - /* Nope. We're going to use this page. Add a pbe. 
*/ - if (is_high || low_pages_for_highmem) { - struct page *orig_page; - high_pbes_done++; - if (!is_high) - low_pages_for_highmem--; - do { - orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_high_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_high_pfn); - } while (!PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_high_pbe->orig_address = (void *) orig_high_pfn; - this_high_pbe->address = page; - this_high_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", - high_page, high_offset, page, orig_high_pfn, orig_page); - if (last_high_pbe_page != high_pbe_page) { - *last_high_pbe_ptr = - (struct pbe *) high_pbe_page; - if (last_high_pbe_page) { - kunmap(last_high_pbe_page); - high_page++; - high_offset = 0; - } else - high_offset++; - last_high_pbe_page = high_pbe_page; - } else { - *last_high_pbe_ptr = this_high_pbe; - high_offset++; - } - last_high_pbe_ptr = &this_high_pbe->next; - this_high_pbe = get_next_pbe(&high_pbe_page, - this_high_pbe, 1); - if (IS_ERR(this_high_pbe)) { - printk(KERN_INFO - "This high pbe is an error.\n"); - return -ENOMEM; - } - } else { - struct page *orig_page; - low_pbes_done++; - do { - orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_low_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_low_pfn); - } while (PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_low_pbe->orig_address = page_address(orig_page); - this_low_pbe->address = page_address(page); - this_low_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", - low_page, low_offset, this_low_pbe->orig_address, - orig_low_pfn, this_low_pbe->address); - TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address); - *last_low_pbe_ptr = this_low_pbe; - last_low_pbe_ptr = &this_low_pbe->next; - this_low_pbe = get_next_pbe(&low_pbe_page, - this_low_pbe, 0); - if (low_pbe_page != last_low_pbe_page) { - if (last_low_pbe_page) { - low_page++; - low_offset = 0; - } else { - low_offset++; - } - last_low_pbe_page = low_pbe_page; - } else - low_offset++; - if (IS_ERR(this_low_pbe)) { - printk(KERN_INFO "this_low_pbe is an error.\n"); - return -ENOMEM; - } - } - } - - if (high_pbe_page) - kunmap(high_pbe_page); - - if (last_high_pbe_page != high_pbe_page) { - if (last_high_pbe_page) - kunmap(last_high_pbe_page); - toi__free_page(29, high_pbe_page); - } - - free_conflicting_pages(); - -out: - return result; -} - -int add_boot_kernel_data_pbe(void) -{ - this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); - if (!this_low_pbe->address) { - printk(KERN_INFO "Failed to get bkd atomic restore buffer."); - return -ENOMEM; - } - - toi_bkd.size = sizeof(toi_bkd); - memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); - - *last_low_pbe_ptr = this_low_pbe; - this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; - this_low_pbe->next = NULL; - return 0; -} diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h deleted file mode 100644 index 046535918..000000000 --- a/kernel/power/tuxonice_pagedir.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for routines for handling pagesets. 
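The pbe chains built above are consumed at atomic-restore time by a walk of roughly the following shape. This is a simplified, lowmem-only model: as the code above shows, the highmem list stores a pfn in orig_address and a struct page pointer in address instead.

#include <string.h>

#define PAGE_SZ 4096

struct pbe {
    void *address;        /* where the loaded copy currently sits */
    void *orig_address;   /* where the page must end up */
    struct pbe *next;
};

static void restore_pblist_walk(struct pbe *list)
{
    struct pbe *p;

    for (p = list; p; p = p->next)
        memcpy(p->orig_address, p->address, PAGE_SZ);
}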
- */ - -#ifndef KERNEL_POWER_PAGEDIR_H -#define KERNEL_POWER_PAGEDIR_H - -/* Pagedir - * - * Contains the metadata for a set of pages saved in the image. - */ - -struct pagedir { - int id; - unsigned long size; -#ifdef CONFIG_HIGHMEM - unsigned long size_high; -#endif -}; - -#ifdef CONFIG_HIGHMEM -#define get_highmem_size(pagedir) (pagedir.size_high) -#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) -#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) -#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) -#else -#define get_highmem_size(pagedir) (0) -#define set_highmem_size(pagedir, sz) do { } while (0) -#define inc_highmem_size(pagedir) do { } while (0) -#define get_lowmem_size(pagedir) (pagedir.size) -#endif - -extern struct pagedir pagedir1, pagedir2; - -extern void toi_copy_pageset1(void); - -extern int toi_get_pageset1_load_addresses(void); - -extern unsigned long __toi_get_nonconflicting_page(void); -struct page *___toi_get_nonconflicting_page(int can_be_highmem); - -extern void toi_reset_alt_image_pageset2_pfn(void); -extern int add_boot_kernel_data_pbe(void); -#endif diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c deleted file mode 100644 index 0fe92edd7..000000000 --- a/kernel/power/tuxonice_pageflags.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for serialising and relocating pageflags in which we - * store our image metadata. - */ - -#include "tuxonice_pageflags.h" -#include "power.h" - -int toi_pageflags_space_needed(void) -{ - return memory_bm_space_needed(pageset1_map); -} diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h deleted file mode 100644 index ddeeaf1e7..000000000 --- a/kernel/power/tuxonice_pageflags.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- */ - -#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H -#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H - -struct memory_bitmap; -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -void memory_bm_clear(struct memory_bitmap *bm); - -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index); -void memory_bm_position_reset(struct memory_bitmap *bm); -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -int toi_alloc_bitmap(struct memory_bitmap **bm); -void toi_free_bitmap(struct memory_bitmap **bm); -void memory_bm_clear(struct memory_bitmap *bm); -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); - -struct toi_module_ops; -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_space_needed(struct memory_bitmap *bm); - -extern struct memory_bitmap *pageset1_map; -extern struct memory_bitmap *pageset1_copy_map; -extern struct memory_bitmap *pageset2_map; -extern struct memory_bitmap *page_resave_map; -extern struct memory_bitmap *io_map; -extern struct memory_bitmap *nosave_map; -extern struct memory_bitmap *free_map; -extern struct memory_bitmap *compare_map; - -#define PagePageset1(page) \ - (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1(page) \ - (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1(page) \ - (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset1Copy(page) \ - (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1Copy(page) \ - (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1Copy(page) \ - (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset2(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset2(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset2(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageWasRW(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPageWasRW(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageWasRW(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageResave(page) (page_resave_map ? 
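All of these wrappers share one construction: one bit per pfn in an external bitmap, so the image metadata never needs space in struct page itself. The core of the idea, minus the per-cpu index and dynamic sizing, as a self-contained sketch:

#include <limits.h>

#define MAX_PFN   (1UL << 20)                        /* demo: 4 GiB of 4 KiB pages */
#define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

static unsigned long pageset1_bits[MAX_PFN / WORD_BITS];

static int test_pfn_bit(unsigned long pfn)
{
    return (pageset1_bits[pfn / WORD_BITS] >> (pfn % WORD_BITS)) & 1;
}

static void set_pfn_bit(unsigned long pfn)
{
    pageset1_bits[pfn / WORD_BITS] |= 1UL << (pfn % WORD_BITS);
}

static void clear_pfn_bit(unsigned long pfn)
{
    pageset1_bits[pfn / WORD_BITS] &= ~(1UL << (pfn % WORD_BITS));
}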
\ - memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageResave(page) \ - (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageResave(page) \ - (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosave(page) (nosave_map ? \ - memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosave(page) \ - (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosave(page) \ - (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosaveFree(page) (free_map ? \ - memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosaveFree(page) \ - (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosaveFree(page) \ - (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page))) - -#define PageCompareChanged(page) (compare_map ? \ - memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageCompareChanged(page) \ - (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageCompareChanged(page) \ - (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page))) - -extern void save_pageflags(struct memory_bitmap *pagemap); -extern int load_pageflags(struct memory_bitmap *pagemap); -extern int toi_pageflags_space_needed(void); -#endif diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c deleted file mode 100644 index 7c78773cf..000000000 --- a/kernel/power/tuxonice_power_off.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for powering down. - */ - -#include <linux/device.h> -#include <linux/suspend.h> -#include <linux/mm.h> -#include <linux/pm.h> -#include <linux/reboot.h> -#include <linux/cpu.h> -#include <linux/console.h> -#include <linux/fs.h> -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" - -unsigned long toi_poweroff_method; /* 0 - Kernel power off */ - -static int wake_delay; -static char lid_state_file[256], wake_alarm_dir[256]; -static struct file *lid_file, *alarm_file, *epoch_file; -static int post_wake_state = -1; - -static int did_suspend_to_both; - -/* - * __toi_power_down - * Functionality : Powers down or reboots the computer once the image - * has been written to disk. - * Key Assumptions : Able to reboot/power down via code called or that - * the warning emitted if the calls fail will be visible - * to the user (ie printk resumes devices). - */ - -static void __toi_power_down(int method) -{ - int error; - - toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : - "Powering down."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (test_action_state(TOI_REBOOT)) - kernel_restart(NULL); - - switch (method) { - case 0: - break; - case 3: - /* - * Re-read the overwritten part of pageset2 to make post-resume - * faster. - */ - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. 
" - "Try rebooting."); - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (!error) { - pm_restore_gfp_mask(); - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - pm_restrict_gfp_mask(); - if (!error) - did_suspend_to_both = 1; - } - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - - /* Success - we're now post-resume-from-ram */ - if (did_suspend_to_both) - return; - - /* Failed to suspend to ram - do normal power off */ - break; - case 4: - /* - * If succeeds, doesn't return. If fails, do a simple - * powerdown. - */ - hibernation_platform_enter(); - break; - case 5: - /* Historic entry only now */ - break; - } - - if (method && method != 5) - toi_cond_pause(1, - "Falling back to alternate power off method."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (pm_power_off) - kernel_power_off(); - kernel_halt(); - toi_cond_pause(1, "Powerdown failed."); - while (1) - cpu_relax(); - -out: - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. Try rebooting."); - return; -} - -#define CLOSE_FILE(file) \ - if (file) { \ - filp_close(file, NULL); file = NULL; \ - } - -static void powerdown_cleanup(int toi_or_resume) -{ - if (!toi_or_resume) - return; - - CLOSE_FILE(lid_file); - CLOSE_FILE(alarm_file); - CLOSE_FILE(epoch_file); -} - -static void open_file(char *format, char *arg, struct file **var, int mode, - char *desc) -{ - char buf[256]; - - if (strlen(arg)) { - sprintf(buf, format, arg); - *var = filp_open(buf, mode, 0); - if (IS_ERR(*var) || !*var) { - printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", - desc, buf, *var); - *var = NULL; - } - } -} - -static int powerdown_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - did_suspend_to_both = 0; - - open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, - O_RDONLY, "lid"); - - if (strlen(wake_alarm_dir)) { - open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, - &alarm_file, O_WRONLY, "alarm"); - - open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, - &epoch_file, O_RDONLY, "epoch"); - } - - return 0; -} - -static int lid_closed(void) -{ - char array[25]; - ssize_t size; - loff_t pos = 0; - - if (!lid_file) - return 0; - - size = vfs_read(lid_file, (char __user *) array, 25, &pos); - if ((int) size < 1) { - printk(KERN_INFO "Failed to read lid state file (%d).\n", - (int) size); - return 0; - } - - if (!strcmp(array, "state: closed\n")) - return 1; - - return 0; -} - -static void write_alarm_file(int value) -{ - ssize_t size; - char buf[40]; - loff_t pos = 0; - - if (!alarm_file) - return; - - sprintf(buf, "%d\n", value); - - size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); - - if (size < 0) - printk(KERN_INFO "Error %d writing alarm value %s.\n", - (int) size, buf); -} - -/** - * toi_check_resleep: See whether to powerdown again after waking. - * - * After waking, check whether we should powerdown again in a (usually - * different) way. We only do this if the lid switch is still closed. - */ -void toi_check_resleep(void) -{ - /* We only return if we suspended to ram and woke. 
*/ - if (lid_closed() && post_wake_state >= 0) - __toi_power_down(post_wake_state); -} - -void toi_power_down(void) -{ - if (alarm_file && wake_delay) { - char array[25]; - loff_t pos = 0; - size_t size = vfs_read(epoch_file, (char __user *) array, 25, - &pos); - - if (((int) size) < 1) - printk(KERN_INFO "Failed to read epoch file (%d).\n", - (int) size); - else { - unsigned long since_epoch; - if (!kstrtoul(array, 0, &since_epoch)) { - /* Clear any wakeup time. */ - write_alarm_file(0); - - /* Set new wakeup time. */ - write_alarm_file(since_epoch + wake_delay); - } - } - } - - __toi_power_down(toi_poweroff_method); - - toi_check_resleep(); -} - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_ACPI) - SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), - SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), - SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), - SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, - NULL), - SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), - SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, - 0, 0, 0, NULL) -#endif -}; - -static struct toi_module_ops powerdown_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "poweroff", - .initialise = powerdown_init, - .cleanup = powerdown_cleanup, - .directory = "[ROOT]", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_poweroff_init(void) -{ - return toi_register_module(&powerdown_ops); -} - -void toi_poweroff_exit(void) -{ - toi_unregister_module(&powerdown_ops); -} diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h deleted file mode 100644 index 6e1d8bb39..000000000 --- a/kernel/power/tuxonice_power_off.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for the powering down. - */ - -int toi_pm_state_finish(void); -void toi_power_down(void); -extern unsigned long toi_poweroff_method; -int toi_poweroff_init(void); -void toi_poweroff_exit(void); -void toi_check_resleep(void); - -extern int platform_begin(int platform_mode); -extern int platform_pre_snapshot(int platform_mode); -extern void platform_leave(int platform_mode); -extern void platform_end(int platform_mode); -extern void platform_finish(int platform_mode); -extern int platform_pre_restore(int platform_mode); -extern void platform_restore_cleanup(int platform_mode); diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c deleted file mode 100644 index df37cc805..000000000 --- a/kernel/power/tuxonice_prepare_image.c +++ /dev/null @@ -1,1089 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * We need to eat memory until we can: - * 1. Perform the save without changing anything (RAM_NEEDED < #pages) - * 2. Fit it all in available space (toiActiveAllocator->available_space() >= - * main_storage_needed()) - * 3. Reload the pagedir and pageset1 to places that don't collide with their - * final destinations, not knowing to what extent the resumed kernel will - * overlap with the one loaded at boot time. 
I think the resumed kernel - * should overlap completely, but I don't want to rely on this as it is - * an unproven assumption. We therefore assume there will be no overlap at - * all (worse case). - * 4. Meet the user's requested limit (if any) on the size of the image. - * The limit is in MB, so pages/256 (assuming 4K pages). - * - */ - -#include <linux/highmem.h> -#include <linux/freezer.h> -#include <linux/hardirq.h> -#include <linux/mmzone.h> -#include <linux/console.h> -#include <linux/tuxonice.h> - -#include "tuxonice_pageflags.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_checksum.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_builtin.h" - -static unsigned long num_nosave, main_storage_allocated, storage_limit, - header_storage_needed; -unsigned long extra_pd1_pages_allowance = - CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; -long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; -static int no_ps2_needed; - -struct attention_list { - struct task_struct *task; - struct attention_list *next; -}; - -static struct attention_list *attention_list; - -#define PAGESET1 0 -#define PAGESET2 1 - -void free_attention_list(void) -{ - struct attention_list *last = NULL; - - while (attention_list) { - last = attention_list; - attention_list = attention_list->next; - toi_kfree(6, last, sizeof(*last)); - } -} - -static int build_attention_list(void) -{ - int i, task_count = 0; - struct task_struct *p; - struct attention_list *next; - - /* - * Count all userspace process (with task->mm) marked PF_NOFREEZE. - */ - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) - task_count++; - toi_read_unlock_tasklist(); - - /* - * Allocate attention list structs. - */ - for (i = 0; i < task_count; i++) { - struct attention_list *this = - toi_kzalloc(6, sizeof(struct attention_list), - TOI_WAIT_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate slab for " - "attention list.\n"); - free_attention_list(); - return 1; - } - this->next = NULL; - if (attention_list) - this->next = attention_list; - attention_list = this; - } - - next = attention_list; - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) { - next->task = p; - next = next->next; - } - toi_read_unlock_tasklist(); - return 0; -} - -static void pageset2_full(void) -{ - struct zone *zone; - struct page *page; - unsigned long flags; - int i; - - toi_trace_index++; - - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lru_lock, flags); - for_each_lru(i) { - if (!zone_page_state(zone, NR_LRU_BASE + i)) - continue; - - list_for_each_entry(page, &zone->lruvec.lists[i], lru) { - struct address_space *mapping; - - mapping = page_mapping(page); - if (!mapping || !mapping->host || - !(mapping->host->i_flags & S_ATOMIC_COPY)) { - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified."); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full."); - SetPagePageset2(page); - } - } - } - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - } -} - -/* - * toi_mark_task_as_pageset - * Functionality : Marks all the saveable pages belonging to a given process - * as belonging to a particular pageset. 
- */ - -static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) -{ - struct vm_area_struct *vma; - struct mm_struct *mm; - - mm = t->active_mm; - - if (!mm || !mm->mmap) - return; - - toi_trace_index++; - - if (!irqs_disabled()) - down_read(&mm->mmap_sem); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long posn; - - if (!vma->vm_start || - vma->vm_flags & VM_PFNMAP) - continue; - - for (posn = vma->vm_start; posn < vma->vm_end; - posn += PAGE_SIZE) { - struct page *page = follow_page(vma, posn, 0); - struct address_space *mapping; - - if (!page || !pfn_valid(page_to_pfn(page))) - continue; - - mapping = page_mapping(page); - if (mapping && mapping->host && - mapping->host->i_flags & S_ATOMIC_COPY && pageset2) - continue; - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2); - continue; - } - - if (pageset2) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1"); - SetPagePageset2(page); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2"); - ClearPagePageset2(page); - SetPagePageset1(page); - } - } - } - - if (!irqs_disabled()) - up_read(&mm->mmap_sem); -} - -static void mark_tasks(int pageset) -{ - struct task_struct *p; - - toi_read_lock_tasklist(); - for_each_process(p) { - if (!p->mm) - continue; - - if (p->flags & PF_KTHREAD) - continue; - - toi_mark_task_as_pageset(p, pageset); - } - toi_read_unlock_tasklist(); - -} - -/* mark_pages_for_pageset2 - * - * Description: Mark unshared pages in processes not needed for hibernate as - * being able to be written out in a separate pagedir. - * HighMem pages are simply marked as pageset2. They won't be - * needed during hibernate. - */ - -static void toi_mark_pages_for_pageset2(void) -{ - struct attention_list *this = attention_list; - - memory_bm_clear(pageset2_map); - - if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) - return; - - if (test_action_state(TOI_PAGESET2_FULL)) - pageset2_full(); - else - mark_tasks(PAGESET2); - - /* - * Because the tasks in attention_list are ones related to hibernating, - * we know that they won't go away under us. - */ - - while (this) { - if (!test_result_state(TOI_ABORTED)) - toi_mark_task_as_pageset(this->task, PAGESET1); - this = this->next; - } -} - -/* - * The atomic copy of pageset1 is stored in pageset2 pages. - * But if pageset1 is larger (normally only just after boot), - * we need to allocate extra pages to store the atomic copy. - * The following data struct and functions are used to handle - * the allocation and freeing of that memory. - */ - -static unsigned long extra_pages_allocated; - -struct extras { - struct page *page; - int order; - struct extras *next; -}; - -static struct extras *extras_list; - -/* toi_free_extra_pagedir_memory - * - * Description: Free previously allocated extra pagedir memory. - */ -void toi_free_extra_pagedir_memory(void) -{ - /* Free allocated pages */ - while (extras_list) { - struct extras *this = extras_list; - int i; - - extras_list = this->next; - - for (i = 0; i < (1 << this->order); i++) - ClearPageNosave(this->page + i); - - toi_free_pages(9, this->page, this->order); - toi_kfree(7, this, sizeof(*this)); - } - - extra_pages_allocated = 0; -} - -/* toi_allocate_extra_pagedir_memory - * - * Description: Allocate memory for making the atomic copy of pagedir1 in the - * case where it is bigger than pagedir2. - * Arguments: int num_to_alloc: Number of extra pages needed. - * Result: int. 
Number of extra pages we now have allocated. - */ -static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) -{ - int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; - gfp_t flags = TOI_ATOMIC_GFP; - - if (num_to_alloc < 1) - return 0; - - order = fls(num_to_alloc); - if (order >= MAX_ORDER) - order = MAX_ORDER - 1; - - while (num_to_alloc) { - struct page *newpage; - unsigned long virt; - struct extras *extras_entry; - - while ((1 << order) > num_to_alloc) - order--; - - extras_entry = (struct extras *) toi_kzalloc(7, - sizeof(struct extras), TOI_ATOMIC_GFP); - - if (!extras_entry) - return extra_pages_allocated; - - virt = toi_get_free_pages(9, flags, order); - while (!virt && order) { - order--; - virt = toi_get_free_pages(9, flags, order); - } - - if (!virt) { - toi_kfree(7, extras_entry, sizeof(*extras_entry)); - return extra_pages_allocated; - } - - newpage = virt_to_page(virt); - - extras_entry->page = newpage; - extras_entry->order = order; - extras_entry->next = extras_list; - - extras_list = extras_entry; - - for (j = 0; j < (1 << order); j++) { - SetPageNosave(newpage + j); - SetPagePageset1Copy(newpage + j); - } - - extra_pages_allocated += (1 << order); - num_to_alloc -= (1 << order); - } - - return extra_pages_allocated; -} - -/* - * real_nr_free_pages: Count pcp pages for a zone type or all zones - * (-1 for all, otherwise zone_idx() result desired). - */ -unsigned long real_nr_free_pages(unsigned long zone_idx_mask) -{ - struct zone *zone; - int result = 0, cpu; - - /* PCP lists */ - for_each_populated_zone(zone) { - if (!(zone_idx_mask & (1 << zone_idx(zone)))) - continue; - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - result += pcp->count; - } - - result += zone_page_state(zone, NR_FREE_PAGES); - } - return result; -} - -/* - * Discover how much extra memory will be required by the drivers - * when they're asked to hibernate. We can then ensure that amount - * of memory is available when we really want it. - */ -static void get_extra_pd1_allowance(void) -{ - unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; - - toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); - - if (toi_go_atomic(PMSG_FREEZE, 1)) - return; - - final = real_nr_free_pages(all_zones_mask); - toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); - - extra_pd1_pages_allowance = (orig_num_free > final) ? - orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : - MIN_EXTRA_PAGES_ALLOWANCE; -} - -/* - * Amount of storage needed, possibly taking into account the - * expected compression ratio and possibly also ignoring our - * allowance for extra pages. - */ -static unsigned long main_storage_needed(int use_ecr, - int ignore_extra_pd1_allow) -{ - return (pagedir1.size + pagedir2.size + - (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * - (use_ecr ? toi_expected_compression_ratio() : 100) / 100; -} - -/* - * Storage needed for the image header, in bytes until the return. - * - * fs_info_space_needed is saved in a static variable unless we - * explicitly want to reset the value (done at the start of a cycle) - * as it requires memory allocation that may result in a hang if we're - * also trying to free memory. 
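toi_allocate_extra_pagedir_memory() above is a buddy-order fallback loop: request the largest power-of-two block that is still useful, and step the order down whenever an allocation fails. A userspace model of just the control flow; the toy deliberately does not track the blocks, whereas the kernel records each one on extras_list:

#include <stdlib.h>

#define PAGE_SZ 4096

/* Allocate roughly num_needed pages in power-of-two chunks;
 * returns how many pages were actually obtained. */
static long alloc_in_chunks(long num_needed)
{
    long got = 0;
    int order = 10;                        /* like MAX_ORDER - 1 */

    while (num_needed > 0 && order >= 0) {
        void *p;

        while (order > 0 && (1L << order) > num_needed)
            order--;                       /* don't overshoot the need */

        p = malloc((size_t)PAGE_SZ << order);
        if (!p) {
            order--;                       /* fall back to smaller blocks */
            continue;
        }
        /* the kernel would flag each page Nosave/Pageset1Copy here */
        got += 1L << order;
        num_needed -= 1L << order;
    }
    return got;
}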
- */ -unsigned long get_header_storage_needed(int reset) -{ - unsigned long bytes = sizeof(struct toi_header) + - toi_header_storage_for_modules() + - toi_pageflags_space_needed() + - fs_info_space_needed(0); - - return DIV_ROUND_UP(bytes, PAGE_SIZE); -} - -/* - * When freeing memory, pages from either pageset might be freed. - * - * When seeking to free memory to be able to hibernate, for every ps1 page - * freed, we need 2 less pages for the atomic copy because there is one less - * page to copy and one more page into which data can be copied. - * - * Freeing ps2 pages saves us nothing directly. No more memory is available - * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but - * that's too much work to figure out. - * - * => ps1_to_free functions - * - * Of course if we just want to reduce the image size, because of storage - * limitations or an image size limit either ps will do. - * - * => any_to_free function - */ - -static unsigned long lowpages_usable_for_highmem_copy(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return available > needed ? available - needed : 0; -} - -static unsigned long highpages_ps1_to_free(void) -{ - unsigned long need = get_highmem_size(pagedir1), - available = get_highmem_size(pagedir2) + - real_nr_free_high_pages() + - lowpages_usable_for_highmem_copy(); - - return need > available ? DIV_ROUND_UP(need - available, 2) : 0; -} - -static unsigned long lowpages_ps1_to_free(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0; -} - -static unsigned long current_image_size(void) -{ - return pagedir1.size + pagedir2.size + header_storage_needed; -} - -static unsigned long storage_still_required(void) -{ - unsigned long needed = main_storage_needed(1, 1); - return needed > storage_limit ? needed - storage_limit : 0; -} - -static unsigned long ram_still_required(void) -{ - unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + - 2 * extra_pd1_pages_allowance, - available = real_nr_free_low_pages() + extra_pages_allocated; - return needed > available ? needed - available : 0; -} - -unsigned long any_to_free(int use_image_size_limit) -{ - int use_soft_limit = use_image_size_limit && image_size_limit > 0; - unsigned long current_size = current_image_size(), - soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, - to_free = use_soft_limit ? (current_size > soft_limit ? - current_size - soft_limit : 0) : 0, - storage_limit = storage_still_required(), - ram_limit = ram_still_required(), - first_max = max(to_free, storage_limit); - - return max(first_max, ram_limit); -} - -static int need_pageset2(void) -{ - return (real_nr_free_low_pages() + extra_pages_allocated - - 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - - toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; -} - -/* amount_needed - * - * Calculates the amount by which the image size needs to be reduced to meet - * our constraints. 
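A note on the arithmetic in any_to_free() above: image_size_limit is in megabytes and pages are 4 KiB, so megabytes-to-pages is a multiply by 256, which is the "<< 8". Worked through with example numbers (only the values are invented):

#include <stdio.h>

int main(void)
{
    long image_size_limit = 512;                 /* MB, example value */
    unsigned long soft_limit = image_size_limit << 8;  /* 131072 pages */
    unsigned long current_size = 150000;         /* pages, example */
    unsigned long to_free =
        current_size > soft_limit ? current_size - soft_limit : 0;

    /* 512 MB -> 131072 pages; 150000 - 131072 = 18928 pages to free */
    printf("soft limit %lu pages, free %lu pages\n", soft_limit, to_free);
    return 0;
}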
- */ -static unsigned long amount_needed(int use_image_size_limit) -{ - return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), - any_to_free(use_image_size_limit)); -} - -static int image_not_ready(int use_image_size_limit) -{ - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Amount still needed (%lu) > 0:%u," - " Storage allocd: %lu < %lu: %u.\n", - amount_needed(use_image_size_limit), - (amount_needed(use_image_size_limit) > 0), - main_storage_allocated, - main_storage_needed(1, 1), - main_storage_allocated < main_storage_needed(1, 1)); - - toi_cond_pause(0, NULL); - - return (amount_needed(use_image_size_limit) > 0) || - main_storage_allocated < main_storage_needed(1, 1); -} - -static void display_failure_reason(int tries_exceeded) -{ - unsigned long storage_required = storage_still_required(), - ram_required = ram_still_required(), - high_ps1 = highpages_ps1_to_free(), - low_ps1 = lowpages_ps1_to_free(); - - printk(KERN_INFO "Failed to prepare the image because...\n"); - - if (!storage_limit) { - printk(KERN_INFO "- You need some storage available to be " - "able to hibernate.\n"); - return; - } - - if (tries_exceeded) - printk(KERN_INFO "- The maximum number of iterations was " - "reached without successfully preparing the " - "image.\n"); - - if (storage_required) { - printk(KERN_INFO " - We need at least %lu pages of storage " - "(ignoring the header), but only have %lu.\n", - main_storage_needed(1, 1), - main_storage_allocated); - set_abort_result(TOI_INSUFFICIENT_STORAGE); - } - - if (ram_required) { - printk(KERN_INFO " - We need %lu more free pages of low " - "memory.\n", ram_required); - printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); - printk(KERN_INFO " + Reqd. by modules : %8lu\n", - toi_memory_for_modules(0)); - printk(KERN_INFO " + 2 * extra allow : %8lu\n", - 2 * extra_pd1_pages_allowance); - printk(KERN_INFO " - Currently free : %8lu\n", - real_nr_free_low_pages()); - printk(KERN_INFO " - Pages allocd : %8lu\n", - extra_pages_allocated); - printk(KERN_INFO " : ========\n"); - printk(KERN_INFO " Still needed : %8lu\n", - ram_required); - - /* Print breakdown of memory needed for modules */ - toi_memory_for_modules(1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (high_ps1) { - printk(KERN_INFO "- We need to free %lu highmem pageset 1 " - "pages.\n", high_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (low_ps1) { - printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " - "pages.\n", low_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } -} - -static void display_stats(int always, int sub_extra_pd1_allow) -{ - char buffer[255]; - snprintf(buffer, 254, - "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " - "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). " - "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", - - /* Free */ - real_nr_free_pages(all_zones_mask), - real_nr_free_low_pages(), - - /* Sets */ - pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), - pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), - - /* Nosave */ - num_nosave, extra_pages_allocated, - num_nosave - extra_pages_allocated, - - /* Storage */ - main_storage_allocated, - storage_limit, - main_storage_needed(1, sub_extra_pd1_allow), - main_storage_needed(1, 1), - - /* Needed */ - lowpages_ps1_to_free(), highpages_ps1_to_free(), - any_to_free(1), - MIN_FREE_RAM, toi_memory_for_modules(0), - extra_pd1_pages_allowance, - image_size_limit, - - need_pageset2() ? 
"yes" : "no"); - - if (always) - printk("%s", buffer); - else - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); -} - -/* flag_image_pages - * - * This routine generates our lists of pages to be stored in each - * pageset. Since we store the data using extents, and adding new - * extents might allocate a new extent page, this routine may well - * be called more than once. - */ -static void flag_image_pages(int atomic_copy) -{ - int num_free = 0, num_unmodified = 0; - unsigned long loop; - struct zone *zone; - - pagedir1.size = 0; - pagedir2.size = 0; - - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - num_nosave = 0; - toi_trace_index++; - - memory_bm_clear(pageset1_map); - - toi_generate_free_page_map(); - - /* - * Pages not to be saved are marked Nosave irrespective of being - * reserved. - */ - for_each_populated_zone(zone) { - int highmem = is_highmem(zone); - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - TOI_TRACE_DEBUG(pfn, "_Flag Invalid"); - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - unsigned long y; - for (y = pfn; y < pfn + chunk_size; y++) { - page = pfn_to_page(y); - TOI_TRACE_DEBUG(y, "_Flag Free"); - ClearPagePageset1(page); - ClearPagePageset2(page); - } - num_free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page)) { - char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave"; - TOI_TRACE_DEBUG(pfn, "_Flag %s", desc); - num_nosave++; - continue; - } - - page = highmem ? saveable_highmem_page(zone, pfn) : - saveable_page(zone, pfn); - - if (!page) { - TOI_TRACE_DEBUG(pfn, "_Flag Nosave2"); - num_nosave++; - continue; - } - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(pfn, "_Unmodified"); - num_unmodified++; - continue; - } - - if (PagePageset2(page)) { - pagedir2.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS2"); - if (PageHighMem(page)) - inc_highmem_size(pagedir2); - else - SetPagePageset1Copy(page); - if (PageResave(page)) { - SetPagePageset1(page); - ClearPagePageset1Copy(page); - pagedir1.size++; - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } else { - pagedir1.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS1"); - SetPagePageset1(page); - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } - } - - if (!atomic_copy) - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, - "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" - " + Unmodified (%d) + NumFree (%d) = %d.\n", - pagedir1.size, pagedir2.size, num_nosave, num_unmodified, - num_free, pagedir1.size + pagedir2.size + num_nosave + num_free); -} - -void toi_recalculate_image_contents(int atomic_copy) -{ - memory_bm_clear(pageset1_map); - if (!atomic_copy) { - unsigned long pfn; - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); - pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) - ClearPagePageset1Copy(pfn_to_page(pfn)); - /* Need to call this before getting pageset1_size! 
*/ - toi_mark_pages_for_pageset2(); - } - memory_bm_position_reset(pageset2_map); - flag_image_pages(atomic_copy); - - if (!atomic_copy) { - storage_limit = toiActiveAllocator->storage_available(); - display_stats(0, 0); - } -} - -int try_allocate_extra_memory(void) -{ - unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance - - get_lowmem_size(pagedir2); - if (wanted > extra_pages_allocated) { - unsigned long got = toi_allocate_extra_pagedir_memory(wanted); - if (wanted < got) { - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Want %d extra pages for pageset1, got %d.\n", - wanted, got); - return 1; - } - } - return 0; -} - -/* update_image - * - * Allocate [more] memory and storage for the image. - */ -static void update_image(int ps2_recalc) -{ - int old_header_req; - unsigned long seek; - - if (try_allocate_extra_memory()) - return; - - if (ps2_recalc) - goto recalc; - - thaw_kernel_threads(); - - /* - * Allocate remaining storage space, if possible, up to the - * maximum we know we'll need. It's okay to allocate the - * maximum if the writer is the swapwriter, but - * we don't want to grab all available space on an NFS share. - * We therefore ignore the expected compression ratio here, - * thereby trying to allocate the maximum image size we could - * need (assuming compression doesn't expand the image), but - * don't complain if we can't get the full amount we're after. - */ - - do { - int result; - - old_header_req = header_storage_needed; - toiActiveAllocator->reserve_header_space(header_storage_needed); - - /* How much storage is free with the reservation applied? */ - storage_limit = toiActiveAllocator->storage_available(); - seek = min(storage_limit, main_storage_needed(0, 0)); - - result = toiActiveAllocator->allocate_storage(seek); - if (result) - printk("Failed to allocate storage (%d).\n", result); - - main_storage_allocated = - toiActiveAllocator->storage_allocated(); - - /* Need more header because more storage allocated? */ - header_storage_needed = get_header_storage_needed(0); - - } while (header_storage_needed > old_header_req); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - -recalc: - toi_recalculate_image_contents(0); -} - -/* attempt_to_freeze - * - * Try to freeze processes. - */ - -static int attempt_to_freeze(void) -{ - int result; - - /* Stop processes before checking again */ - toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing " - "filesystems."); - result = freeze_processes(); - - if (result) - set_abort_result(TOI_FREEZING_FAILED); - - result = freeze_kernel_threads(); - - if (result) - set_abort_result(TOI_FREEZING_FAILED); - - return result; -} - -/* eat_memory - * - * Try to free some memory, either to meet hard or soft constraints on the image - * characteristics. - * - * Hard constraints: - * - Pageset1 must be < half of memory; - * - We must have enough memory free at resume time to have pageset1 - * be able to be loaded in pages that don't conflict with where it has to - * be restored. - * Soft constraints - * - User specificied image size limit. - */ -static void eat_memory(void) -{ - unsigned long amount_wanted = 0; - int did_eat_memory = 0; - - /* - * Note that if we have enough storage space and enough free memory, we - * may exit without eating anything. We give up when the last 10 - * iterations ate no extra pages because we're not going to get much - * more anyway, but the few pages we get will take a lot of time. 
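update_image() above hides a small fixed-point iteration: reserving header space consumes storage, and allocating storage can grow the header requirement, so it repeats until the requirement stops growing. The shape of the loop with stub numbers (both helper and values are invented for illustration):

#include <stdio.h>

static unsigned long header_needed_for(unsigned long allocated)
{
    /* stand-in: header grows slowly with allocated storage */
    return 10 + allocated / 1000;
}

int main(void)
{
    unsigned long header_req = 10, old_req;
    unsigned long allocated;

    do {
        old_req = header_req;
        allocated = 100000 - header_req;     /* stub allocator */
        header_req = header_needed_for(allocated);
    } while (header_req > old_req);          /* repeat while it grew */

    printf("settled: header %lu, storage %lu\n", header_req, allocated);
    return 0;
}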
- * - * We freeze processes before beginning, and then unfreeze them if we - * need to eat memory until we think we have enough. If our attempts - * to freeze fail, we give up and abort. - */ - - amount_wanted = amount_needed(1); - - switch (image_size_limit) { - case -1: /* Don't eat any memory */ - if (amount_wanted > 0) { - set_abort_result(TOI_WOULD_EAT_MEMORY); - return; - } - break; - case -2: /* Free caches only */ - drop_pagecache(); - toi_recalculate_image_contents(0); - amount_wanted = amount_needed(1); - break; - default: - break; - } - - if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) && - image_size_limit != -1) { - unsigned long request = amount_wanted; - unsigned long high_req = max(highpages_ps1_to_free(), - any_to_free(1)); - unsigned long low_req = lowpages_ps1_to_free(); - unsigned long got = 0; - - toi_prepare_status(CLEAR_BAR, - "Seeking to free %ldMB of memory.", - MB(amount_wanted)); - - thaw_kernel_threads(); - - /* - * Ask for too many because shrink_memory_mask doesn't - * currently return enough most of the time. - */ - - if (low_req) - got = shrink_memory_mask(low_req, GFP_KERNEL); - if (high_req) - shrink_memory_mask(high_req - got, GFP_HIGHUSER); - - did_eat_memory = 1; - - toi_recalculate_image_contents(0); - - amount_wanted = amount_needed(1); - - printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &" - " %ld pages from anywhere, got %ld.\n", - high_req, low_req, - request - amount_wanted); - - toi_cond_pause(0, NULL); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - } - - if (did_eat_memory) - toi_recalculate_image_contents(0); -} - -/* toi_prepare_image - * - * Entry point to the whole image preparation section. - * - * We do four things: - * - Freeze processes; - * - Ensure image size constraints are met; - * - Complete all the preparation for saving the image, - * including allocation of storage. The only memory - * that should be needed when we're finished is that - * for actually storing the image (and we know how - * much is needed for that because the modules tell - * us). - * - Make sure that all dirty buffers are written out. - */ -#define MAX_TRIES 2 -int toi_prepare_image(void) -{ - int result = 1, tries = 1; - - main_storage_allocated = 0; - - // Force recalculation of the amount of header storage needed for fs info. - fs_info_space_needed(1); - - no_ps2_needed = 0; - - if (attempt_to_freeze()) - return 1; - - lock_device_hotplug(); - set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - - if (!extra_pd1_pages_allowance) - get_extra_pd1_allowance(); - - storage_limit = toiActiveAllocator->storage_available(); - - if (!storage_limit) { - printk(KERN_INFO "No storage available. Didn't try to prepare " - "an image.\n"); - display_failure_reason(0); - set_abort_result(TOI_NOSTORAGE_AVAILABLE); - return 1; - } - - if (build_attention_list()) { - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - return 1; - } - - toi_recalculate_image_contents(0); - - do { - toi_prepare_status(CLEAR_BAR, - "Preparing Image. Try %d.", tries); - - eat_memory(); - - if (test_result_state(TOI_ABORTED)) - break; - - update_image(0); - - tries++; - - } while (image_not_ready(1) && tries <= MAX_TRIES && - !test_result_state(TOI_ABORTED)); - - result = image_not_ready(0); - - /* TODO: Handle case where need to remove existing image and resave - * instead of adding to incremental image. 
*/ - - if (!test_result_state(TOI_ABORTED)) { - if (result) { - display_stats(1, 0); - display_failure_reason(tries > MAX_TRIES); - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - } else { - /* Pageset 2 needed? */ - if (!need_pageset2() && - test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { - no_ps2_needed = 1; - toi_recalculate_image_contents(0); - update_image(1); - } - - toi_cond_pause(1, "Image preparation complete."); - } - } - - return result ? result : allocate_checksum_pages(); -} diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h deleted file mode 100644 index f7f2b695c..000000000 --- a/kernel/power/tuxonice_prepare_image.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include <asm/sections.h> - -extern int toi_prepare_image(void); -extern void toi_recalculate_image_contents(int storage_available); -extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); -extern long image_size_limit; -extern void toi_free_extra_pagedir_memory(void); -extern unsigned long extra_pd1_pages_allowance; -extern void free_attention_list(void); - -#define MIN_FREE_RAM 100 -#define MIN_EXTRA_PAGES_ALLOWANCE 500 - -#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) -#ifdef CONFIG_HIGHMEM -#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ - (1 << ZONE_HIGHMEM))) -#else -#define real_nr_free_high_pages() (0) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) - -/* For eat_memory function */ -#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) -#endif - -unsigned long get_header_storage_needed(int reset); -unsigned long any_to_free(int use_image_size_limit); -int try_allocate_extra_memory(void); diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c deleted file mode 100644 index 5bc56d3a1..000000000 --- a/kernel/power/tuxonice_prune.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * kernel/power/tuxonice_prune.c - * - * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file implements a TuxOnIce module that seeks to prune the - * amount of data written to disk. It builds a table of hashes - * of the uncompressed data, and writes the pfn of the previous page - * with the same contents instead of repeating the data when a match - * is found. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> -#include <crypto/hash.h> - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -/* - * We never write a page bigger than PAGE_SIZE, so use a large number - * to indicate that data is a PFN. 
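The prune module that follows is content deduplication: hash each page, remember the first pfn seen for each digest, and write a short back-reference instead of the data on a repeat. The idea as a runnable toy, with FNV-1a standing in for the module's configurable crypto hash:

#include <stdint.h>
#include <string.h>

#define PAGE_SZ  4096
#define TABLE_SZ (1 << 16)

static long first_pfn[TABLE_SZ];           /* 0 = empty slot */

static uint32_t fnv1a(const unsigned char *p, size_t n)
{
    uint32_t h = 2166136261u;
    while (n--)
        h = (h ^ *p++) * 16777619u;
    return h;
}

/* Returns the pfn of an earlier identical page, or -1 if this page
 * is new and its contents must be written out in full. */
static long prune_lookup(long pfn, const unsigned char page[PAGE_SZ])
{
    uint32_t slot = fnv1a(page, PAGE_SZ) % TABLE_SZ;

    if (first_pfn[slot]) {
        /* a real implementation must also compare contents:
         * distinct pages can land in the same slot */
        return first_pfn[slot] - 1;
    }
    first_pfn[slot] = pfn + 1;              /* bias so 0 means empty */
    return -1;
}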
- */ -#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100) - -static unsigned long toi_pruned_pages; - -static struct toi_module_ops toi_prune_ops; -static struct toi_module_ops *next_driver; - -static char toi_prune_hash_algo_name[32] = "sha1"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - struct shash_desc desc; - char *digest; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. - */ -static int toi_prune_crypto_prepare(void) -{ - int cpu, ret, digestsize; - - if (!*toi_prune_hash_algo_name) { - printk(KERN_INFO "TuxOnIce: Pruning enabled but no " - "hash algorithm set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0); - if (IS_ERR(this->desc.tfm)) { - printk(KERN_INFO "TuxOnIce: Failed to allocate the " - "%s prune hash algorithm.\n", - toi_prune_hash_algo_name); - this->desc.tfm = NULL; - return 1; - } - - if (!digestsize) - digestsize = crypto_shash_digestsize(this->desc.tfm); - - this->digest = kmalloc(digestsize, GFP_KERNEL); - if (!this->digest) { - printk(KERN_INFO "TuxOnIce: Failed to allocate space " - "for digest output.\n"); - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - } - - this->desc.flags = 0; - - ret = crypto_shash_init(&this->desc); - if (ret < 0) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s prune hash algorithm.\n", - toi_prune_hash_algo_name); - kfree(this->digest); - this->digest = NULL; - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - return 1; - } - } - - return 0; -} - -static int toi_prune_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->desc.tfm) { - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - } - - if (this->digest) { - kfree(this->digest); - this->digest = NULL; - } - } - - return 0; -} - -/* - * toi_prune_init - */ - -static int toi_prune_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_pruned_pages = 0; - - next_driver = toi_get_next_filter(&toi_prune_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_prune_rw_init() - */ - -static int toi_prune_rw_init(int rw, int stream_number) -{ - if (toi_prune_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise prune " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "pruning the image.\n"); - toi_prune_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_prune_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be checked. - * - * Returns: 0 on success. Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. 
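The per-cpu tfm and digest setup above follows the kernel's synchronous-hash API. For reference, a one-shot digest with that API reduces to roughly the sketch below; this is a hedged illustration with error handling trimmed, not the module's code.

#include <crypto/hash.h>
#include <linux/err.h>

static int digest_buffer(const void *data, unsigned int len,
                         u8 *out /* crypto_shash_digestsize() bytes */)
{
    struct crypto_shash *tfm = crypto_alloc_shash("sha1", 0, 0);
    int ret;

    if (IS_ERR(tfm))
        return PTR_ERR(tfm);

    {
        SHASH_DESC_ON_STACK(desc, tfm);

        desc->tfm = tfm;
        ret = crypto_shash_digest(desc, data, len, out);
    }

    crypto_free_shash(tfm);
    return ret;
}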
- */ -static int toi_prune_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(), write_data = 1; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - void *buffer_start; - u32 buf[4]; - - if (ctx->desc.tfm) { - - buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, &ctx->digest); - if (ret) { - printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret); - } else { - mutex_lock(&stats_lock); - - toi_pruned_pages++; - - mutex_unlock(&stats_lock); - - } - - TOI_UNMAP(buf_type, buffer_page); - } - - if (write_data) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - else - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_prune_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules or from a previously loaded page and - * fill the input buffer. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_prune_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->desc.tfm) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't handle - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, buf_type, buffer_page, &len); - - if (len == PRUNE_DATA_IS_PFN) { - buffer_start = kmap(buffer_page); - } - - return ret; -} - -/* - * toi_prune_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_prune_print_debug_stats(char *buffer, int size) -{ - int len; - - /* Output the number of pages pruned. */ - if (*toi_prune_hash_algo_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_prune_hash_algo_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (toi_pruned_pages) - len += scnprintf(buffer+len, size - len, " Pruned " - "%lu pages).\n", - toi_pruned_pages); - return len; -} - -/* - * toi_prune_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_prune_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_prune_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_prune_hash_algo_name) + 1; -} - -/* - * toi_prune_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. 
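toi_prune_write_page() and toi_prune_read_page() above share a framing convention: a stored size can never legitimately exceed PAGE_SIZE, so PRUNE_DATA_IS_PFN (PAGE_SIZE + 100) is free to act as an in-band marker meaning "the payload is a PFN reference, not page data". In the deleted write path the marker is never actually emitted (write_data stays 1 and both branches of the final if/else are identical), so the record layout below is a guess at the intent rather than the original format. A sketch of the reader-side dispatch:

#include <string.h>

#define PAGE_SIZE          4096
#define PRUNE_DATA_IS_PFN  (PAGE_SIZE + 100)    /* matches the define above */

struct prune_record {
        unsigned int size;              /* real length, or the marker */
        unsigned char payload[PAGE_SIZE];
};

/* Returns 1 and extracts the pfn when the record is only a reference
 * to an earlier, identical page; 0 when the payload holds real data. */
static int is_pfn_reference(const struct prune_record *rec, unsigned long *pfn)
{
        if (rec->size != PRUNE_DATA_IS_PFN)
                return 0;
        memcpy(pfn, rec->payload, sizeof(*pfn));  /* avoids alignment casts */
        return 1;
}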
- */ -static int toi_prune_save_config_info(char *buffer) -{ - int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_pruned_pages; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_prune_hash_algo_name, len); - return offset + len; -} - -/* toi_prune_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for passing back to the - * resumed kernel. - */ -static void toi_prune_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_pruned_pages = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_prune_hash_algo_name, buffer + offset, len); -} - -static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->pruned_pages = toi_pruned_pages; -} - -static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_pruned_pages = bkd->pruned_pages; -} - -/* - * toi_expected_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 - we have no idea how many pages will be pruned. - */ - -static int toi_prune_expected_ratio(void) -{ - return 100; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_prune_ops = { - .type = FILTER_MODULE, - .name = "prune", - .directory = "prune", - .module = THIS_MODULE, - .initialise = toi_prune_init, - .memory_needed = toi_prune_memory_needed, - .print_debug_info = toi_prune_print_debug_stats, - .save_config_info = toi_prune_save_config_info, - .load_config_info = toi_prune_load_config_info, - .storage_needed = toi_prune_storage_needed, - .expected_compression = toi_prune_expected_ratio, - - .pre_atomic_restore = toi_prune_pre_atomic_restore, - .post_atomic_restore = toi_prune_post_atomic_restore, - - .rw_init = toi_prune_rw_init, - .rw_cleanup = toi_prune_rw_cleanup, - - .write_page = toi_prune_write_page, - .read_page = toi_prune_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_prune_load(void) -{ - return toi_register_module(&toi_prune_ops); -} - -late_initcall(toi_prune_load); diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c deleted file mode 100644 index d8539c275..000000000 --- a/kernel/power/tuxonice_storage.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * kernel/power/tuxonice_storage.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for talking to a userspace program that manages storage. 
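The prune module's save/load pair above round-trips its state through the image header as [unsigned long pruned_pages][int len][len bytes of algorithm name]. A standalone sketch of the same layout; memcpy replaces the pointer casts to sidestep alignment assumptions, and unlike the original load (which strncpy's a length taken straight from the buffer into a 32-byte array) this reader clamps the length:

#include <string.h>

static int prune_pack(char *buf, unsigned long pruned, const char *algo)
{
        int len = (int)strlen(algo) + 1, off = 0;

        memcpy(buf + off, &pruned, sizeof(pruned));
        off += sizeof(pruned);
        memcpy(buf + off, &len, sizeof(len));
        off += sizeof(len);
        memcpy(buf + off, algo, len);
        return off + len;
}

static void prune_unpack(const char *buf, unsigned long *pruned,
                         char *algo, int algo_max)
{
        int len, off = 0;

        memcpy(pruned, buf + off, sizeof(*pruned));
        off += sizeof(*pruned);
        memcpy(&len, buf + off, sizeof(len));
        off += sizeof(len);
        if (len > algo_max)
                len = algo_max;         /* don't trust the saved length */
        memcpy(algo, buf + off, len);
        algo[algo_max - 1] = '\0';
}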
- * - * The kernel side: - * - starts the userspace program; - * - sends messages telling it when to open and close the connection; - * - tells it when to quit; - * - * The user space side: - * - passes messages regarding status; - * - */ - -#include <linux/suspend.h> -#include <linux/freezer.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_netlink.h" -#include "tuxonice_storage.h" -#include "tuxonice_ui.h" - -static struct user_helper_data usm_helper_data; -static struct toi_module_ops usm_ops; -static int message_received, usm_prepare_count; -static int storage_manager_last_action, storage_manager_action; - -static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USM_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) - return -EBUSY; - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USM_MSG_SUCCESS: - case USM_MSG_FAILED: - message_received = type; - complete(&usm_helper_data.wait_for_process); - break; - default: - printk(KERN_INFO "Storage manager doesn't recognise " - "message %d.\n", type); - } - - return 1; -} - -#ifdef CONFIG_NET -static int activations; - -int toi_activate_storage(int force) -{ - int tries = 1; - - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations++; - - if (activations > 1 && !force) - return 0; - - while ((!message_received || message_received == USM_MSG_FAILED) && - tries < 2) { - toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " - "%d.\n", tries); - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_CONNECT, - NULL, 0); - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&usm_helper_data.wait_for_process, - 2*HZ); - - tries++; - } - - return 0; -} - -int toi_deactivate_storage(int force) -{ - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations--; - - if (activations && !force) - return 0; - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_DISCONNECT, - NULL, 0); - - wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); - - if (!message_received || message_received == USM_MSG_FAILED) { - printk(KERN_INFO "Returning failure disconnecting storage.\n"); - return 1; - } - - return 0; -} -#endif - -static void storage_manager_simulate(void) -{ - printk(KERN_INFO "--- Storage manager simulate ---\n"); - toi_prepare_usm(); - schedule(); - printk(KERN_INFO "--- Activate storage 1 ---\n"); - toi_activate_storage(1); - schedule(); - printk(KERN_INFO "--- Deactivate storage 1 ---\n"); - toi_deactivate_storage(1); - schedule(); - printk(KERN_INFO "--- Cleanup usm ---\n"); - toi_cleanup_usm(); - schedule(); - printk(KERN_INFO "--- Storage manager simulate ends ---\n"); -} - -static int usm_storage_needed(void) -{ - return sizeof(int) + strlen(usm_helper_data.program) + 1; -} - -static int usm_save_config_info(char *buf) -{ - int len = strlen(usm_helper_data.program); - memcpy(buf, usm_helper_data.program, len + 1); - 
return sizeof(int) + len + 1; -} - -static void usm_load_config_info(char *buf, int size) -{ - /* Don't load the saved path if one has already been set */ - if (usm_helper_data.program[0]) - return; - - memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); -} - -static int usm_memory_needed(void) -{ - /* ball park figure of 32 pages */ - return 32 * PAGE_SIZE; -} - -/* toi_prepare_usm - */ -int toi_prepare_usm(void) -{ - usm_prepare_count++; - - if (usm_prepare_count > 1 || !usm_ops.enabled) - return 0; - - usm_helper_data.pid = -1; - - if (!*usm_helper_data.program) - return 0; - - toi_netlink_setup(&usm_helper_data); - - if (usm_helper_data.pid == -1) - printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" - " start it.\n"); - - toi_activate_storage(0); - - return usm_helper_data.pid != -1; -} - -void toi_cleanup_usm(void) -{ - usm_prepare_count--; - - if (usm_helper_data.pid > -1 && !usm_prepare_count) { - toi_deactivate_storage(0); - toi_netlink_close(&usm_helper_data); - } -} - -static void storage_manager_activate(void) -{ - if (storage_manager_action == storage_manager_last_action) - return; - - if (storage_manager_action) - toi_prepare_usm(); - else - toi_cleanup_usm(); - - storage_manager_last_action = storage_manager_action; -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), - SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, - NULL), - SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, - 0, storage_manager_activate) -}; - -static struct toi_module_ops usm_ops = { - .type = MISC_MODULE, - .name = "usm", - .directory = "storage_manager", - .module = THIS_MODULE, - .storage_needed = usm_storage_needed, - .save_config_info = usm_save_config_info, - .load_config_info = usm_load_config_info, - .memory_needed = usm_memory_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* toi_usm_sysfs_init - * Description: Boot time initialisation for user interface. - */ -int toi_usm_init(void) -{ - usm_helper_data.nl = NULL; - usm_helper_data.program[0] = '\0'; - usm_helper_data.pid = -1; - usm_helper_data.skb_size = 0; - usm_helper_data.pool_limit = 6; - usm_helper_data.netlink_id = NETLINK_TOI_USM; - usm_helper_data.name = "userspace storage manager"; - usm_helper_data.rcv_msg = usm_user_rcv_msg; - usm_helper_data.interface_version = 2; - usm_helper_data.must_init = 0; - init_completion(&usm_helper_data.wait_for_process); - - return toi_register_module(&usm_ops); -} - -void toi_usm_exit(void) -{ - toi_netlink_close_complete(&usm_helper_data); - toi_unregister_module(&usm_ops); -} diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h deleted file mode 100644 index 0189c8888..000000000 --- a/kernel/power/tuxonice_storage.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_storage.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
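Note that the two sides of the usm config round-trip above do not agree: usm_save_config_info() copies the program string to offset 0 (while reserving sizeof(int) in its return value), yet usm_load_config_info() reads a length from the leading int and the string from buf + sizeof(int). A consistent length-prefixed encoding would look something like this (illustrative, not the original layout):

#include <string.h>

/* Writer and reader agree on [int len][len bytes of program path]. */
static int usm_save(char *buf, const char *program)
{
        int len = (int)strlen(program) + 1;

        memcpy(buf, &len, sizeof(len));
        memcpy(buf + sizeof(len), program, len);
        return (int)sizeof(len) + len;
}

static void usm_load(const char *buf, char *program, int max)
{
        int len;

        memcpy(&len, buf, sizeof(len));
        if (len > max)
                len = max;
        memcpy(program, buf + sizeof(len), len);
        program[max - 1] = '\0';
}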
- */ - -#ifdef CONFIG_NET -int toi_prepare_usm(void); -void toi_cleanup_usm(void); - -int toi_activate_storage(int force); -int toi_deactivate_storage(int force); -extern int toi_usm_init(void); -extern void toi_usm_exit(void); -#else -static inline int toi_usm_init(void) { return 0; } -static inline void toi_usm_exit(void) { } - -static inline int toi_activate_storage(int force) -{ - return 0; -} - -static inline int toi_deactivate_storage(int force) -{ - return 0; -} - -static inline int toi_prepare_usm(void) { return 0; } -static inline void toi_cleanup_usm(void) { } -#endif - -enum { - USM_MSG_BASE = 0x10, - - /* Kernel -> Userspace */ - USM_MSG_CONNECT = 0x30, - USM_MSG_DISCONNECT = 0x31, - USM_MSG_SUCCESS = 0x40, - USM_MSG_FAILED = 0x41, - - USM_MSG_MAX, -}; diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c deleted file mode 100644 index 9f555c932..000000000 --- a/kernel/power/tuxonice_swap.c +++ /dev/null @@ -1,474 +0,0 @@ -/* - * kernel/power/tuxonice_swap.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of swap space as a - * backing store. - */ - -#include <linux/suspend.h> -#include <linux/blkdev.h> -#include <linux/swapops.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_extent.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct toi_module_ops toi_swapops; - -/* For swapfile automatically swapon/off'd. */ -static char swapfilename[255] = ""; -static int toi_swapon_status; - -/* Swap Pages */ -static unsigned long swap_allocated; - -static struct sysinfo swapinfo; - -static int is_ram_backed(struct swap_info_struct *si) -{ - if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || - !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) - return 1; - - return 0; -} - -/** - * enable_swapfile: Swapon the user specified swapfile prior to hibernating. - * - * Activate the given swapfile if it wasn't already enabled. Remember whether - * we really did swapon it for swapoffing later. - */ -static void enable_swapfile(void) -{ - int activateswapresult = -EINVAL; - - if (swapfilename[0]) { - /* Attempt to swap on with maximum priority */ - activateswapresult = sys_swapon(swapfilename, 0xFFFF); - if (activateswapresult && activateswapresult != -EBUSY) - printk(KERN_ERR "TuxOnIce: The swapfile/partition " - "specified by /sys/power/tuxonice/swap/swapfile" - " (%s) could not be turned on (error %d). " - "Attempting to continue.\n", - swapfilename, activateswapresult); - if (!activateswapresult) - toi_swapon_status = 1; - } -} - -/** - * disable_swapfile: Swapoff any file swaponed at the start of the cycle. - * - * If we did successfully swapon a file at the start of the cycle, swapoff - * it now (finishing up). 
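The sys_swapon(swapfilename, 0xFFFF) call in enable_swapfile() above packs two things into its flags argument: SWAP_FLAG_PREFER (0x8000) and the maximum 15-bit priority. The same request made from userspace through the swapon(2) wrapper:

#include <errno.h>
#include <stdio.h>
#include <sys/swap.h>

static int swapon_max_prio(const char *path)
{
        /* 0x8000 | 0x7fff == the 0xFFFF used above */
        int flags = SWAP_FLAG_PREFER |
                    ((32767 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

        if (swapon(path, flags) && errno != EBUSY) {
                perror("swapon");
                return -1;
        }
        return 0;       /* EBUSY (already active) is tolerated, as above */
}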
- */ -static void disable_swapfile(void) -{ - if (!toi_swapon_status) - return; - - sys_swapoff(swapfilename); - toi_swapon_status = 0; -} - -static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, - unsigned long start, unsigned long end) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " - "chain %p.", start << chain->bmap_shift, - end << chain->bmap_shift, chain); - - return toi_add_to_extent_chain(&chain->blocks, start, end); -} - - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long address, extent_min = 0, extent_max = 0; - int empty = 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " - "chain %d.", chain->allocator_index); - - if (!chain->allocations.first) - return 0; - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - toi_extent_for_each(&chain->allocations, extentpointer, address) { - swp_entry_t swap_address = (swp_entry_t) { address }; - struct block_device *bdev; - sector_t new_sector = map_swap_entry(swap_address, &bdev); - - if (empty) { - empty = 0; - extent_min = extent_max = new_sector; - continue; - } - - if (new_sector == extent_max + 1) { - extent_max++; - continue; - } - - if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block " - "chains.\n"); - return -ENOMEM; - } - - extent_min = new_sector; - extent_max = new_sector; - } - - if (!empty && - add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block chains.\n"); - return -ENOMEM; - } - - return 0; -} - -/* - * Like si_swapinfo, except that we don't include ram backed swap (compcache!) - * and don't need to use the spinlocks (userspace is stopped when this - * function is called). - */ -void si_swapinfo_no_compcache(void) -{ - unsigned int i; - - si_swapinfo(&swapinfo); - swapinfo.freeswap = 0; - swapinfo.totalswap = 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { - swapinfo.totalswap += si->inuse_pages; - swapinfo.freeswap += si->pages - si->inuse_pages; - } - } -} -/* - * We can't just remember the value from allocation time, because other - * processes might have allocated swap in the mean time. 
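get_main_pool_phys_params() above turns the per-page swap map into extents by coalescing runs of consecutive sectors, flushing a [min, max] pair whenever the run breaks. The loop's shape, isolated into a standalone helper:

/* Coalesce an ordered stream of sector numbers into [min,max] extents. */
static void build_extents(const unsigned long *sectors, int n,
                          void (*emit)(unsigned long min, unsigned long max))
{
        unsigned long min = 0, max = 0;
        int i, empty = 1;

        for (i = 0; i < n; i++) {
                if (empty) {
                        min = max = sectors[i];
                        empty = 0;
                } else if (sectors[i] == max + 1) {
                        max++;                  /* extend the current run */
                } else {
                        emit(min, max);         /* flush, start a new run */
                        min = max = sectors[i];
                }
        }
        if (!empty)
                emit(min, max);
}

On an unfragmented swap partition a whole allocation collapses to one pair, which is what keeps the block chains small.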
- */ -static unsigned long toi_swap_storage_available(void) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); - si_swapinfo_no_compcache(); - return swapinfo.freeswap + swap_allocated; -} - -static int toi_swap_initialise(int starting_cycle) -{ - if (!starting_cycle) - return 0; - - enable_swapfile(); - return 0; -} - -static void toi_swap_cleanup(int ending_cycle) -{ - if (!ending_cycle) - return; - - disable_swapfile(); -} - -static void toi_swap_free_storage(struct toi_bdev_info *chain) -{ - /* Free swap entries */ - struct hibernate_extent *extentpointer; - unsigned long extentvalue; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", - chain); - - swap_allocated -= chain->allocations.size; - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) - swap_free((swp_entry_t) { extentvalue }); - - toi_put_extent_chain(&chain->allocations); -} - -static void free_swap_range(unsigned long min, unsigned long max) -{ - int j; - - for (j = min; j <= max; j++) - swap_free((swp_entry_t) { j }); - swap_allocated -= (max - min + 1); -} - -/* - * Allocation of a single swap type. Swap priorities are handled at the higher - * level. - */ -static int toi_swap_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long gotten = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" - " allocate %lu pages from device %d.", request, - chain->allocator_index); - - while (gotten < request) { - swp_entry_t start, end; - if (0) { - /* Broken at the moment for SSDs */ - get_swap_range_of_type(chain->allocator_index, &start, &end, - request - gotten + 1); - } else { - start = end = get_swap_page_of_type(chain->allocator_index); - } - if (start.val) { - int added = end.val - start.val + 1; - if (toi_add_to_extent_chain(&chain->allocations, - start.val, end.val)) { - printk(KERN_INFO "Failed to allocate extent for " - "%lu-%lu.\n", start.val, end.val); - free_swap_range(start.val, end.val); - break; - } - gotten += added; - swap_allocated += added; - } else - break; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); - return gotten; -} - -static int toi_swap_register_storage(void) -{ - int i, result = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - struct toi_bdev_info *devinfo; - unsigned char *p; - unsigned char buf[256]; - struct fs_info *fs_info; - - if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) - continue; - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), - GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate devinfo struct for swap " - "device %d.\n", i); - return -ENOMEM; - } - - devinfo->bdev = si->bdev; - devinfo->allocator = &toi_swapops; - devinfo->allocator_index = i; - - fs_info = fs_info_from_block_dev(si->bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!fs_info) - printk("fs_info from block dev returned %d.\n", result); - devinfo->dev_t = si->bdev->bd_dev; - devinfo->prio = si->prio; - devinfo->bmap_shift = 3; - devinfo->blocks_per_page = 1; - - p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); - sprintf(devinfo->name, "swap on %s", p); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" - " Device %d (%lx), prio %d.", i, - (unsigned long) devinfo->dev_t, devinfo->prio); 
- toi_bio_ops.register_storage(devinfo); - } - - return 0; -} - -static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long extentvalue; - unsigned long i = 0, first_freed = 0; - - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) { - i++; - if (i > used) { - swap_free((swp_entry_t) { extentvalue }); - if (!first_freed) - first_freed = extentvalue; - } - } - - return first_freed; -} - -/* - * workspace_size - * - * Description: - * Returns the number of bytes of RAM needed for this - * code to do its work. (Used when calculating whether - * we have enough memory to be able to hibernate & resume). - * - */ -static int toi_swap_memory_needed(void) -{ - return 1; -} - -/* - * Print debug info - * - * Description: - */ -static int toi_swap_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); - if (swapfilename[0]) - len += scnprintf(buffer+len, size-len, - " Attempting to automatically swapon: %s.\n", - swapfilename); - - si_swapinfo_no_compcache(); - - len += scnprintf(buffer+len, size-len, - " Swap available for image: %lu pages.\n", - swapinfo.freeswap + swap_allocated); - - return len; -} - -static int header_locations_read_sysfs(const char *page, int count) -{ - int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; - struct inode *swapf = NULL; - int zone; - char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); - char *path, *output = (char *) page; - int path_len; - - if (!page) - return 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - - if (!si || !(si->flags & SWP_WRITEOK)) - continue; - - if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { - haveswap = 1; - if (!printedpartitionsmessage) { - len += sprintf(output + len, - "For swap partitions, simply use the " - "format: resume=swap:/dev/hda1.\n"); - printedpartitionsmessage = 1; - } - } else { - path_len = 0; - - path = d_path(&si->swap_file->f_path, path_page, - PAGE_SIZE); - path_len = snprintf(path_page, PAGE_SIZE, "%s", path); - - haveswap = 1; - swapf = si->swap_file->f_mapping->host; - zone = bmap(swapf, 0); - if (!zone) { - len += sprintf(output + len, - "Swapfile %s has been corrupted. 
Reuse" - " mkswap on it and try again.\n", - path_page); - } else { - char name_buffer[BDEVNAME_SIZE]; - len += sprintf(output + len, - "For swapfile `%s`," - " use resume=swap:/dev/%s:0x%x.\n", - path_page, - bdevname(si->bdev, name_buffer), - zone << (swapf->i_blkbits - 9)); - } - } - } - - if (!haveswap) - len = sprintf(output, "You need to turn on swap partitions " - "before examining this file.\n"); - - toi_free_page(10, (unsigned long) path_page); - return len; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), - SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, - header_locations_read_sysfs, NULL, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, - attempt_to_parse_resume_device2), -}; - -static struct toi_bio_allocator_ops toi_bio_swapops = { - .register_storage = toi_swap_register_storage, - .storage_available = toi_swap_storage_available, - .allocate_storage = toi_swap_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_swap_free_storage, - .free_unused_storage = toi_swap_free_unused_storage, -}; - -static struct toi_module_ops toi_swapops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "swap storage", - .directory = "swap", - .module = THIS_MODULE, - .memory_needed = toi_swap_memory_needed, - .print_debug_info = toi_swap_print_debug_stats, - .initialise = toi_swap_initialise, - .cleanup = toi_swap_cleanup, - .bio_allocator_ops = &toi_bio_swapops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_swap_load(void) -{ - return toi_register_module(&toi_swapops); -} - -late_initcall(toi_swap_load); diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c deleted file mode 100644 index 77f36dbeb..000000000 --- a/kernel/power/tuxonice_sysfs.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.c - * - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains support for sysfs entries for tuning TuxOnIce. - * - * We have a generic handler that deals with the most common cases, and - * hooks for special handlers to use. - */ - -#include <linux/suspend.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_alloc.h" - -static int toi_sysfs_initialised; - -static void toi_initialise_sysfs(void); - -static struct toi_sysfs_data sysfs_params[]; - -#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) - -static void toi_main_wrapper(void) -{ - toi_try_hibernate(); -} - -static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - int len = 0; - int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; - - if (full_prep && toi_start_anything(0)) - return -EBUSY; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - len = (sysfs_data->data.special.read_sysfs) ? 
- (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) - : 0; - break; - case TOI_SYSFS_DATA_BIT: - len = sprintf(page, "%d\n", - -test_bit(sysfs_data->data.bit.bit, - sysfs_data->data.bit.bit_vector)); - break; - case TOI_SYSFS_DATA_INTEGER: - len = sprintf(page, "%d\n", - *(sysfs_data->data.integer.variable)); - break; - case TOI_SYSFS_DATA_LONG: - len = sprintf(page, "%ld\n", - *(sysfs_data->data.a_long.variable)); - break; - case TOI_SYSFS_DATA_UL: - len = sprintf(page, "%lu\n", - *(sysfs_data->data.ul.variable)); - break; - case TOI_SYSFS_DATA_STRING: - len = sprintf(page, "%s\n", - sysfs_data->data.string.variable); - break; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_cleanup_usm(); - - if (full_prep) - toi_finish_anything(0); - - return len; -} - -#define BOUND(_variable, _type) do { \ - if (*_variable < sysfs_data->data._type.minimum) \ - *_variable = sysfs_data->data._type.minimum; \ - else if (*_variable > sysfs_data->data._type.maximum) \ - *_variable = sysfs_data->data._type.maximum; \ -} while (0) - -static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, - const char *my_buf, size_t count) -{ - int assigned_temp_buffer = 0, result = count; - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - - if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) - return -EBUSY; - - ((char *) my_buf)[count] = 0; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - if (sysfs_data->data.special.write_sysfs) - result = (sysfs_data->data.special.write_sysfs)(my_buf, - count); - break; - case TOI_SYSFS_DATA_BIT: - { - unsigned long value; - result = kstrtoul(my_buf, 0, &value); - if (result) - break; - if (value) - set_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - else - clear_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - } - break; - case TOI_SYSFS_DATA_INTEGER: - { - long temp; - result = kstrtol(my_buf, 0, &temp); - if (result) - break; - *(sysfs_data->data.integer.variable) = (int) temp; - BOUND(sysfs_data->data.integer.variable, integer); - break; - } - case TOI_SYSFS_DATA_LONG: - { - long *variable = - sysfs_data->data.a_long.variable; - result = kstrtol(my_buf, 0, variable); - if (result) - break; - BOUND(variable, a_long); - break; - } - case TOI_SYSFS_DATA_UL: - { - unsigned long *variable = - sysfs_data->data.ul.variable; - result = kstrtoul(my_buf, 0, variable); - if (result) - break; - BOUND(variable, ul); - break; - } - break; - case TOI_SYSFS_DATA_STRING: - { - int copy_len = count; - char *variable = - sysfs_data->data.string.variable; - - if (sysfs_data->data.string.max_length && - (copy_len > sysfs_data->data.string.max_length)) - copy_len = sysfs_data->data.string.max_length; - - if (!variable) { - variable = (char *) toi_get_zeroed_page(31, - TOI_ATOMIC_GFP); - sysfs_data->data.string.variable = variable; - assigned_temp_buffer = 1; - } - strncpy(variable, my_buf, copy_len); - if (copy_len && my_buf[copy_len - 1] == '\n') - variable[count - 1] = 0; - variable[count] = 0; - } - break; - } - - if (!result) - result = count; - - /* Side effect routine? 
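The show/store handlers above drive everything from one tagged-union table: the attribute's type field selects the union member, and the BOUND macro clamps numeric writes into the entry's [minimum, maximum]. The same pattern in miniature, reduced to two types with invented names:

#include <stdlib.h>

enum attr_type { ATTR_INT, ATTR_UL };

struct attr {
        const char *name;
        enum attr_type type;
        union {
                struct { int *var; int min, max; } i;
                struct { unsigned long *var; unsigned long min, max; } ul;
        } u;
};

#define CLAMP(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

static void attr_store(struct attr *a, const char *buf)
{
        switch (a->type) {
        case ATTR_INT:
                *a->u.i.var = CLAMP((int)strtol(buf, NULL, 0),
                                    a->u.i.min, a->u.i.max);
                break;
        case ATTR_UL:
                *a->u.ul.var = CLAMP(strtoul(buf, NULL, 0),
                                     a->u.ul.min, a->u.ul.max);
                break;
        }
}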
*/ - if (result == count && sysfs_data->write_side_effect) - sysfs_data->write_side_effect(); - - /* Free temporary buffers */ - if (assigned_temp_buffer) { - toi_free_page(31, - (unsigned long) sysfs_data->data.string.variable); - sysfs_data->data.string.variable = NULL; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_cleanup_usm(); - - toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); - - return result; -} - -static struct sysfs_ops toi_sysfs_ops = { - .show = &toi_attr_show, - .store = &toi_attr_store, -}; - -static struct kobj_type toi_ktype = { - .sysfs_ops = &toi_sysfs_ops, -}; - -struct kobject *tuxonice_kobj; - -/* Non-module sysfs entries. - * - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_HIBERNATING, toi_main_wrapper), - SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_RESUMING, toi_try_resume) -}; - -void remove_toi_sysdir(struct kobject *kobj) -{ - if (!kobj) - return; - - kobject_put(kobj); -} - -struct kobject *make_toi_sysdir(char *name) -{ - struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); - - if (!kobj) { - printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " - "dir!\n"); - return NULL; - } - - kobj->ktype = &toi_ktype; - - return kobj; -} - -/* toi_register_sysfs_file - * - * Helper for registering a new /sysfs/tuxonice entry. - */ - -int toi_register_sysfs_file( - struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - int result; - - if (!toi_sysfs_initialised) - toi_initialise_sysfs(); - - result = sysfs_create_file(kobj, &toi_sysfs_data->attr); - if (result) - printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " - "returned %d.\n", - toi_sysfs_data->attr.name, result); - kobj->ktype = &toi_ktype; - - return result; -} - -/* toi_unregister_sysfs_file - * - * Helper for removing unwanted /sys/power/tuxonice entries. - * - */ -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - sysfs_remove_file(kobj, &toi_sysfs_data->attr); -} - -void toi_cleanup_sysfs(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (!toi_sysfs_initialised) - return; - - for (i = 0; i < numfiles; i++) - toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - kobject_put(tuxonice_kobj); - toi_sysfs_initialised = 0; -} - -/* toi_initialise_sysfs - * - * Initialise the /sysfs/tuxonice directory. 
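One detail worth flagging in the string case of toi_attr_store() above: the copy is clamped to max_length, but the terminators are then written at the unclamped count (variable[count - 1] and variable[count]), which for a long enough write lands past the clamped buffer. A bounded copy that trims the trailing newline at the clamped length instead (a sketch, not the original code):

#include <string.h>

static void store_string(char *dst, size_t dst_max,
                         const char *src, size_t count)
{
        size_t n = count < dst_max - 1 ? count : dst_max - 1;

        memcpy(dst, src, n);
        if (n && dst[n - 1] == '\n')
                n--;                    /* strip one trailing newline */
        dst[n] = '\0';
}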
- */ - -static void toi_initialise_sysfs(void) -{ - int i; - int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (toi_sysfs_initialised) - return; - - /* Make our TuxOnIce directory a child of /sys/power */ - tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); - if (!tuxonice_kobj) - return; - - toi_sysfs_initialised = 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); -} - -int toi_sysfs_init(void) -{ - toi_initialise_sysfs(); - return 0; -} - -void toi_sysfs_exit(void) -{ - toi_cleanup_sysfs(); -} diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h deleted file mode 100644 index 1de954ce1..000000000 --- a/kernel/power/tuxonice_sysfs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#include <linux/sysfs.h> - -struct toi_sysfs_data { - struct attribute attr; - int type; - int flags; - union { - struct { - unsigned long *bit_vector; - int bit; - } bit; - struct { - int *variable; - int minimum; - int maximum; - } integer; - struct { - long *variable; - long minimum; - long maximum; - } a_long; - struct { - unsigned long *variable; - unsigned long minimum; - unsigned long maximum; - } ul; - struct { - char *variable; - int max_length; - } string; - struct { - int (*read_sysfs) (const char *buffer, int count); - int (*write_sysfs) (const char *buffer, int count); - void *data; - } special; - } data; - - /* Side effects routine. Used, eg, for reparsing the - * resume= entry when it changes */ - void (*write_side_effect) (void); - struct list_head sysfs_data_list; -}; - -enum { - TOI_SYSFS_DATA_NONE = 1, - TOI_SYSFS_DATA_CUSTOM, - TOI_SYSFS_DATA_BIT, - TOI_SYSFS_DATA_INTEGER, - TOI_SYSFS_DATA_UL, - TOI_SYSFS_DATA_LONG, - TOI_SYSFS_DATA_STRING -}; - -#define SYSFS_WRITEONLY 0200 -#define SYSFS_READONLY 0444 -#define SYSFS_RW 0644 - -#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_BIT, \ - .flags = _flags, \ - .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } - -#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_INTEGER, \ - .flags = _flags, \ - .data = { .integer = { .variable = _int, .minimum = _min, \ - .maximum = _max } }, \ - .write_side_effect = _wse } - -#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_UL, \ - .flags = _flags, \ - .data = { .ul = { .variable = _ul, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_LONG, \ - .flags = _flags, \ - .data = { .a_long = { .variable = _long, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_STRING, \ - .flags = _flags, \ - .data = { .string = { .variable = _string, .max_length = _max_len } }, \ - .write_side_effect = _wse } - -#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_CUSTOM, \ - .flags = _flags, \ - .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \ - 
.write_side_effect = _wse } - -#define SYSFS_NONE(_name, _wse) { \ - .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ - .type = TOI_SYSFS_DATA_NONE, \ - .write_side_effect = _wse, \ -} - -/* Flags */ -#define SYSFS_NEEDS_SM_FOR_READ 1 -#define SYSFS_NEEDS_SM_FOR_WRITE 2 -#define SYSFS_HIBERNATE 4 -#define SYSFS_RESUME 8 -#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) -#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_NEEDS_SM_FOR_BOTH \ - (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) - -int toi_register_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); - -extern struct kobject *tuxonice_kobj; - -struct kobject *make_toi_sysdir(char *name); -void remove_toi_sysdir(struct kobject *obj); -extern void toi_cleanup_sysfs(void); - -extern int toi_sysfs_init(void); -extern void toi_sysfs_exit(void); diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c deleted file mode 100644 index 76152f3ff..000000000 --- a/kernel/power/tuxonice_ui.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * kernel/power/tuxonice_ui.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include <linux/reboot.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" -#include "tuxonice_builtin.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ -struct ui_ops *toi_current_ui; - -/** - * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress, either from userui or /dev/console if userui isn't - * available. The non-userui path is particularly for at boot-time, prior - * to userui being started, when we have an important warning to give to - * the user. - */ -static char toi_wait_for_keypress(int timeout) -{ - if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) - return ' '; - - return toi_wait_for_keypress_dev_console(timeout); -} - -/* toi_early_boot_message() - * Description: Handle errors early in the process of booting. - * The user may press C to continue booting, perhaps - * invalidating the image, or space to reboot. - * This works from either the serial console or normally - * attached keyboard. - * - * Note that we come in here from init, while the kernel is - * locked. If we want to get events from the serial console, - * we need to temporarily unlock the kernel. - * - * toi_early_boot_message may also be called post-boot. - * In this case, it simply printks the message and returns. - * - * Arguments: int Whether we are able to erase the image. - * int default_answer. 
What to do when we timeout. This - * will normally be continue, but the user might - * provide command line options (__setup) to override - * particular cases. - * Char *. Pointer to a string explaining why we're moaning. - */ - -#define say(message, a...) printk(KERN_EMERG message, ##a) - -void toi_early_boot_message(int message_detail, int default_answer, - char *warning_reason, ...) -{ -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - unsigned long orig_state = get_toi_state(), continue_req = 0; - unsigned long orig_loglevel = console_loglevel; - int can_ask = 1; -#else - int can_ask = 0; -#endif - - va_list args; - int printed_len; - - if (!toi_wait) { - set_toi_state(TOI_CONTINUE_REQ); - can_ask = 0; - } - - if (warning_reason) { - va_start(args, warning_reason); - printed_len = vsnprintf(local_printf_buf, - sizeof(local_printf_buf), - warning_reason, - args); - va_end(args); - } - - if (!test_toi_state(TOI_BOOT_TIME)) { - printk("TuxOnIce: %s\n", local_printf_buf); - return; - } - - if (!can_ask) { - continue_req = !!default_answer; - goto post_ask; - } - -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - console_loglevel = 7; - - say("=== TuxOnIce ===\n\n"); - if (warning_reason) { - say("BIG FAT WARNING!! %s\n\n", local_printf_buf); - switch (message_detail) { - case 0: - say("If you continue booting, note that any image WILL" - "NOT BE REMOVED.\nTuxOnIce is unable to do so " - "because the appropriate modules aren't\n" - "loaded. You should manually remove the image " - "to avoid any\npossibility of corrupting your " - "filesystem(s) later.\n"); - break; - case 1: - say("If you want to use the current TuxOnIce image, " - "reboot and try\nagain with the same kernel " - "that you hibernated from. If you want\n" - "to forget that image, continue and the image " - "will be erased.\n"); - break; - } - say("Press SPACE to reboot or C to continue booting with " - "this kernel\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", - toi_wait, - default_answer == TOI_CONTINUE_REQ ? - "continue booting" : "reboot"); - } else { - say("BIG FAT WARNING!!\n\n" - "You have tried to resume from this image before.\n" - "If it failed once, it may well fail again.\n" - "Would you like to remove the image and boot " - "normally?\nThis will be equivalent to entering " - "noresume on the\nkernel command line.\n\n" - "Press SPACE to remove the image or C to continue " - "resuming.\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", toi_wait, - !!default_answer ? - "continue resuming" : "remove the image"); - } - console_loglevel = orig_loglevel; - - set_toi_state(TOI_SANITY_CHECK_PROMPT); - clear_toi_state(TOI_CONTINUE_REQ); - - if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ - continue_req = !!default_answer; - else - continue_req = test_toi_state(TOI_CONTINUE_REQ); - -#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ - -post_ask: - if ((warning_reason) && (!continue_req)) - kernel_restart(NULL); - - restore_toi_state(orig_state); - if (continue_req) - set_toi_state(TOI_CONTINUE_REQ); -} - -#undef say - -/* - * User interface specific /sys/power/tuxonice entries. 
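The sysfs_params table that follows is built from the SYSFS_* designated-initializer macros declared in tuxonice_sysfs.h above. For reference, its first entry expands to approximately this (SYSFS_RW already substituted as 0644):

/* SYSFS_INT("default_console_level", SYSFS_RW,
 *           &toi_bkd.toi_default_console_level, 0, 7, 0, NULL) */
struct toi_sysfs_data expanded = {
        .attr  = { .name = "default_console_level", .mode = 0644 },
        .type  = TOI_SYSFS_DATA_INTEGER,
        .flags = 0,
        .data  = { .integer = { .variable = &toi_bkd.toi_default_console_level,
                                .minimum  = 0,
                                .maximum  = 7 } },
        .write_side_effect = NULL,
};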
- */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_INT("default_console_level", SYSFS_RW, - &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), - SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, - 1 << 30, 0), - SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, - 0) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "printk ui", - .directory = "user_interface", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_register_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui) { - printk(KERN_INFO "Only one TuxOnIce user interface module can " - "be loaded at a time."); - return -EBUSY; - } - - toi_current_ui = this_ui; - - return 0; -} - -void toi_remove_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui != this_ui) - return; - - toi_current_ui = NULL; -} - -/* toi_console_sysfs_init - * Description: Boot time initialisation for user interface. - */ - -int toi_ui_init(void) -{ - return toi_register_module(&userui_ops); -} - -void toi_ui_exit(void) -{ - toi_unregister_module(&userui_ops); -} diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h deleted file mode 100644 index 4934e3a91..000000000 --- a/kernel/power/tuxonice_ui.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * kernel/power/tuxonice_ui.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - */ - -enum { - DONT_CLEAR_BAR, - CLEAR_BAR -}; - -enum { - /* Userspace -> Kernel */ - USERUI_MSG_ABORT = 0x11, - USERUI_MSG_SET_STATE = 0x12, - USERUI_MSG_GET_STATE = 0x13, - USERUI_MSG_GET_DEBUG_STATE = 0x14, - USERUI_MSG_SET_DEBUG_STATE = 0x15, - USERUI_MSG_SPACE = 0x18, - USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, - USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, - USERUI_MSG_GET_LOGLEVEL = 0x1C, - USERUI_MSG_SET_LOGLEVEL = 0x1D, - USERUI_MSG_PRINTK = 0x1E, - - /* Kernel -> Userspace */ - USERUI_MSG_MESSAGE = 0x21, - USERUI_MSG_PROGRESS = 0x22, - USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, - - USERUI_MSG_MAX, -}; - -struct userui_msg_params { - u32 a, b, c, d; - char text[255]; -}; - -struct ui_ops { - char (*wait_for_key) (int timeout); - u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); - void (*prepare_status) (int clearbar, const char *fmt, ...); - void (*cond_pause) (int pause, char *message); - void (*abort)(int result_code, const char *fmt, ...); - void (*prepare)(void); - void (*cleanup)(void); - void (*message)(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...); -}; - -extern struct ui_ops *toi_current_ui; - -#define toi_update_status(val, max, fmt, args...) \ - (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \ - max) - -#define toi_prepare_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->prepare)(); \ - } while (0) - -#define toi_cleanup_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->cleanup)(); \ - } while (0) - -#define abort_hibernate(result, fmt, args...) \ - do { if (toi_current_ui) \ - (toi_current_ui->abort)(result, fmt, ##args); \ - else { \ - set_abort_result(result); \ - } \ - } while (0) - -#define toi_cond_pause(pause, message) \ - do { if (toi_current_ui) \ - (toi_current_ui->cond_pause)(pause, message); \ - } while (0) - -#define toi_prepare_status(clear, fmt, args...) 
\ - do { if (toi_current_ui) \ - (toi_current_ui->prepare_status)(clear, fmt, ##args); \ - else \ - printk(KERN_INFO fmt "%s", ##args, "\n"); \ - } while (0) - -#define toi_message(sn, lev, log, fmt, a...) \ -do { \ - if (toi_current_ui && (!sn || test_debug_state(sn))) \ - toi_current_ui->message(sn, lev, log, fmt, ##a); \ -} while (0) - -__exit void toi_ui_cleanup(void); -extern int toi_ui_init(void); -extern void toi_ui_exit(void); -extern int toi_register_ui_ops(struct ui_ops *this_ui); -extern void toi_remove_ui_ops(struct ui_ops *this_ui); diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c deleted file mode 100644 index 6aa5ac3eb..000000000 --- a/kernel/power/tuxonice_userui.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * kernel/power/user_ui.c - * - * Copyright (C) 2005-2007 Bernard Blackham - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include <linux/suspend.h> -#include <linux/freezer.h> -#include <linux/console.h> -#include <linux/ctype.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> -#include <linux/reboot.h> -#include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/vt.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ - -static struct user_helper_data ui_helper_data; -static struct toi_module_ops userui_ops; -static int orig_kmsg; - -static char lastheader[512]; -static int lastheader_message_len; -static int ui_helper_changed; /* Used at resume-time so don't overwrite value - set from initrd/ramfs. */ - -/* Number of distinct progress amounts that userspace can display */ -static int progress_granularity = 30; - -static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); -static int userui_wait_should_wake; - -#define toi_stop_waiting_for_userui_key() \ -{ \ - userui_wait_should_wake = true; \ - wake_up_interruptible(&userui_wait_for_key); \ -} - -/** - * ui_nl_set_state - Update toi_action based on a message from userui. - * - * @n: The bit (1 << bit) to set. - */ -static void ui_nl_set_state(int n) -{ - /* Only let them change certain settings */ - static const u32 toi_action_mask = - (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | - (1 << TOI_LOGALL) | - (1 << TOI_SINGLESTEP) | - (1 << TOI_PAUSE_NEAR_PAGESET_END); - static unsigned long new_action; - - new_action = (toi_bkd.toi_action & (~toi_action_mask)) | - (n & toi_action_mask); - - printk(KERN_DEBUG "n is %x. Action flags being changed from %lx " - "to %lx.", n, toi_bkd.toi_action, new_action); - toi_bkd.toi_action = new_action; - - if (!test_action_state(TOI_PAUSE) && - !test_action_state(TOI_SINGLESTEP)) - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_post_atomic_restore - Tell userui that atomic restore just happened. - * - * Tell userui that atomic restore just occured, so that it can do things like - * redrawing the screen, re-getting settings and so on. 
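ui_nl_set_state() above merges a userspace request into toi_action while letting only whitelisted bits change: the kept bits are old & ~mask, the taken bits are request & mask. The arithmetic on small example values:

#include <stdio.h>

int main(void)
{
        unsigned long mask    = 0x1a;   /* bits userspace may change */
        unsigned long action  = 0xf0;   /* current flags */
        unsigned long request = 0x0f;   /* flags asked for */

        unsigned long merged = (action & ~mask) | (request & mask);

        /* 0xe0 (kept) | 0x0a (taken) == 0xea */
        printf("%#lx\n", merged);
        return 0;
}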
- */ -static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); -} - -/** - * userui_storage_needed - Report how much memory in image header is needed. - */ -static int userui_storage_needed(void) -{ - return sizeof(ui_helper_data.program) + 1 + sizeof(int); -} - -/** - * userui_save_config_info - Fill buffer with config info for image header. - * - * @buf: Buffer into which to put the config info we want to save. - */ -static int userui_save_config_info(char *buf) -{ - *((int *) buf) = progress_granularity; - memcpy(buf + sizeof(int), ui_helper_data.program, - sizeof(ui_helper_data.program)); - return sizeof(ui_helper_data.program) + sizeof(int) + 1; -} - -/** - * userui_load_config_info - Restore config info from buffer. - * - * @buf: Buffer containing header info loaded. - * @size: Size of data loaded for this module. - */ -static void userui_load_config_info(char *buf, int size) -{ - progress_granularity = *((int *) buf); - size -= sizeof(int); - - /* Don't load the saved path if one has already been set */ - if (ui_helper_changed) - return; - - if (size > sizeof(ui_helper_data.program)) - size = sizeof(ui_helper_data.program); - - memcpy(ui_helper_data.program, buf + sizeof(int), size); - ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; -} - -/** - * set_ui_program_set: Record that userui program was changed. - * - * Side effect routine for when the userui program is set. In an initrd or - * ramfs, the user may set a location for the userui program. If this happens, - * we don't want to reload the value that was saved in the image header. This - * routine allows us to flag that we shouldn't restore the program name from - * the image header. - */ -static void set_ui_program_set(void) -{ - ui_helper_changed = 1; -} - -/** - * userui_memory_needed - Tell core how much memory to reserve for us. - */ -static int userui_memory_needed(void) -{ - /* ball park figure of 128 pages */ - return 128 * PAGE_SIZE; -} - -/** - * userui_update_status - Update the progress bar and (if on) in-bar message. - * - * @value: Current progress percentage numerator. - * @maximum: Current progress percentage denominator. - * @fmt: Message to be displayed in the middle of the progress bar. - * - * Note that a NULL message does not mean that any previous message is erased! - * For that, you need toi_prepare_status with clearbar on. - * - * Returns an unsigned long, being the next numerator (as determined by the - * maximum and progress granularity) where status needs to be updated. - * This is to reduce unnecessary calls to update_status. - */ -static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) -{ - static u32 last_step = 9999; - struct userui_msg_params msg; - u32 this_step, next_update; - int bitshift; - - if (ui_helper_data.pid == -1) - return 0; - - if ((!maximum) || (!progress_granularity)) - return maximum; - - if (value < 0) - value = 0; - - if (value > maximum) - value = maximum; - - /* Try to avoid math problems - we can't do 64 bit math here - * (and shouldn't need it - anyone got screen resolution - * of 65536 pixels or more?) 
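The toi_update_status/toi_prepare_status/abort_hibernate macros in tuxonice_ui.h above all share one dispatch idiom: call through toi_current_ui when a UI module has registered, otherwise fall back to printk-level behaviour, with registration limited to one UI at a time. Reduced to plain C with invented names:

#include <stdio.h>

struct ui_ops {
        void (*prepare_status)(int clearbar, const char *msg);
};

static struct ui_ops *current_ui;       /* NULL until a UI registers */

static int register_ui_ops(struct ui_ops *ops)
{
        if (current_ui)
                return -1;              /* only one UI at a time (-EBUSY) */
        current_ui = ops;
        return 0;
}

static void prepare_status(int clearbar, const char *msg)
{
        if (current_ui)
                current_ui->prepare_status(clearbar, msg);
        else
                printf("%s\n", msg);    /* the printk fallback */
}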
*/ - bitshift = fls(maximum) - 16; - if (bitshift > 0) { - u32 temp_maximum = maximum >> bitshift; - u32 temp_value = value >> bitshift; - this_step = (u32) - (temp_value * progress_granularity / temp_maximum); - next_update = (((this_step + 1) * temp_maximum / - progress_granularity) + 1) << bitshift; - } else { - this_step = (u32) (value * progress_granularity / maximum); - next_update = ((this_step + 1) * maximum / - progress_granularity) + 1; - } - - if (this_step == last_step) - return next_update; - - memset(&msg, 0, sizeof(msg)); - - msg.a = this_step; - msg.b = progress_granularity; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, - &msg, sizeof(msg)); - last_step = this_step; - - return next_update; -} - -/** - * userui_message - Display a message without necessarily logging it. - * - * @section: Type of message. Messages can be filtered by type. - * @level: Degree of importance of the message. Lower values = higher priority. - * @normally_logged: Whether logged even if log_everything is off. - * @fmt: Message (and parameters). - * - * This function is intended to do the same job as printk, but without normally - * logging what is printed. The point is to be able to get debugging info on - * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" - * - * It may be called from an interrupt context - can't sleep! - */ -static void userui_message(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...) -{ - struct userui_msg_params msg; - - if ((level) && (level > console_loglevel)) - return; - - memset(&msg, 0, sizeof(msg)); - - msg.a = section; - msg.b = level; - msg.c = normally_logged; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - if (test_action_state(TOI_LOGALL)) - printk(KERN_INFO "%s\n", msg.text); - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, - &msg, sizeof(msg)); -} - -/** - * wait_for_key_via_userui - Wait for userui to receive a keypress. - */ -static void wait_for_key_via_userui(void) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&userui_wait_for_key, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake); - userui_wait_should_wake = false; - - set_current_state(TASK_RUNNING); - remove_wait_queue(&userui_wait_for_key, &wait); -} - -/** - * userui_prepare_status - Display high level messages. - * - * @clearbar: Whether to clear the progress bar. - * @fmt...: New message for the title. - * - * Prepare the 'nice display', drawing the header and version, along with the - * current action and perhaps also resetting the progress bar. - */ -static void userui_prepare_status(int clearbar, const char *fmt, ...) -{ - va_list args; - - if (fmt) { - va_start(args, fmt); - lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); - va_end(args); - } - - if (clearbar) - toi_update_status(0, 1, NULL); - - if (ui_helper_data.pid == -1) - printk(KERN_EMERG "%s\n", lastheader); - else - toi_message(0, TOI_STATUS, 1, lastheader, NULL); -} - -/** - * toi_wait_for_keypress - Wait for keypress via userui. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress from userui. - * - * FIXME: Implement timeout? 
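The bitshift arithmetic in userui_update_status() above exists to keep value * progress_granularity inside 32 bits: if maximum needs more than 16 bits, both operands are shifted down first, trading a little precision for never needing 64-bit math. The same computation as a standalone function:

#include <stdint.h>
#include <stdio.h>

/* Portable stand-in for the kernel's fls(): 1-based index of the
 * highest set bit, 0 for no bits set. */
static int fls32(uint32_t x)
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

static uint32_t progress_step(uint32_t value, uint32_t maximum, uint32_t gran)
{
        int shift = fls32(maximum) - 16;

        /* Downshift so maximum fits in 16 bits; gran is small, so the
         * product then stays comfortably within 32 bits. */
        if (shift > 0)
                return (value >> shift) * gran / (maximum >> shift);
        return value * gran / maximum;
}

int main(void)
{
        /* 3/4 of the way through 2^30 pages, 30 steps: prints 22. */
        printf("%u\n", progress_step(3u << 28, 4u << 28, 30));
        return 0;
}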
- */ -static char userui_wait_for_keypress(int timeout) -{ - char key = '\0'; - - if (ui_helper_data.pid != -1) { - wait_for_key_via_userui(); - key = ' '; - } - - return key; -} - -/** - * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. - * - * @result_code: Reason why we're aborting (1 << bit). - * @fmt: Message to display if telling the user what's going on. - * - * Abort a cycle. If this wasn't at the user's request (and we're displaying - * output), tell the user why and wait for them to acknowledge the message. - */ -static void userui_abort_hibernate(int result_code, const char *fmt, ...) -{ - va_list args; - int printed_len = 0; - - set_result_state(result_code); - - if (test_result_state(TOI_ABORTED)) - return; - - set_result_state(TOI_ABORTED); - - if (test_result_state(TOI_ABORT_REQUESTED)) - return; - - va_start(args, fmt); - printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), - fmt, args); - va_end(args); - if (ui_helper_data.pid != -1) - printed_len = sprintf(local_printf_buf + printed_len, - " (Press SPACE to continue)"); - - toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); - - if (ui_helper_data.pid != -1) - userui_wait_for_keypress(0); -} - -/** - * request_abort_hibernate - Abort hibernating or resuming at user request. - * - * Handle the user requesting the cancellation of a hibernation or resume by - * pressing escape. - */ -static void request_abort_hibernate(void) -{ - if (test_result_state(TOI_ABORT_REQUESTED) || - !test_action_state(TOI_CAN_CANCEL)) - return; - - if (test_toi_state(TOI_NOW_RESUMING)) { - toi_prepare_status(CLEAR_BAR, "Escape pressed. " - "Powering down again."); - set_toi_state(TOI_STOP_RESUME); - while (!test_toi_state(TOI_IO_STOPPED)) - schedule(); - if (toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(0); - toi_power_down(); - } - - toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" - " ABORTING HIBERNATION ---"); - set_abort_result(TOI_ABORT_REQUESTED); - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_user_rcv_msg - Receive a netlink message from userui. - * - * @skb: skb received. - * @nlh: Netlink header received. 
- */ -static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USERUI_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { - printk(KERN_INFO "Got NOFREEZE_ME request when " - "ui_helper_data.pid is %d.\n", ui_helper_data.pid); - return -EBUSY; - } - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USERUI_MSG_ABORT: - request_abort_hibernate(); - return 0; - case USERUI_MSG_GET_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_STATE, &toi_bkd.toi_action, - sizeof(toi_bkd.toi_action)); - return 0; - case USERUI_MSG_GET_DEBUG_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_DEBUG_STATE, - &toi_bkd.toi_debug_state, - sizeof(toi_bkd.toi_debug_state)); - return 0; - case USERUI_MSG_SET_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - ui_nl_set_state(*data); - return 0; - case USERUI_MSG_SET_DEBUG_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_debug_state = (*data); - return 0; - case USERUI_MSG_SPACE: - toi_stop_waiting_for_userui_key(); - return 0; - case USERUI_MSG_GET_POWERDOWN_METHOD: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_POWERDOWN_METHOD, - &toi_poweroff_method, - sizeof(toi_poweroff_method)); - return 0; - case USERUI_MSG_SET_POWERDOWN_METHOD: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) - return -EINVAL; - toi_poweroff_method = (unsigned long)(*data); - return 0; - case USERUI_MSG_GET_LOGLEVEL: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_LOGLEVEL, - &toi_bkd.toi_default_console_level, - sizeof(toi_bkd.toi_default_console_level)); - return 0; - case USERUI_MSG_SET_LOGLEVEL: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_default_console_level = (*data); - return 0; - case USERUI_MSG_PRINTK: - printk(KERN_INFO "%s", (char *) data); - return 0; - } - - /* Unhandled here */ - return 1; -} - -/** - * userui_cond_pause - Possibly pause at user request. - * - * @pause: Whether to pause or just display the message. - * @message: Message to display at the start of pausing. - * - * Potentially pause and wait for the user to tell us to continue. We normally - * only pause when @pause is set. While paused, the user can do things like - * changing the loglevel, toggling the display of debugging sections and such - * like. - */ -static void userui_cond_pause(int pause, char *message) -{ - int displayed_message = 0, last_key = 0; - - while (last_key != 32 && - ui_helper_data.pid != -1 && - ((test_action_state(TOI_PAUSE) && pause) || - (test_action_state(TOI_SINGLESTEP)))) { - if (!displayed_message) { - toi_prepare_status(DONT_CLEAR_BAR, - "%s Press SPACE to continue.%s", - message ? message : "", - (test_action_state(TOI_SINGLESTEP)) ? - " Single step on." : ""); - displayed_message = 1; - } - last_key = userui_wait_for_keypress(0); - } - schedule(); -} - -/** - * userui_prepare_console - Prepare the console for use. - * - * Prepare a console for use, saving current kmsg settings and attempting to - * start userui. Console loglevel changes are handled by userui. 
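The dispatch above follows the usual netlink-handler discipline: ignore control messages below the message base, reject unknown types, require privilege before acting, and validate nlmsg_len before dereferencing the payload. A condensed sketch of that ordering with hypothetical MSG_* constants (not the patch's values):

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>

#define MSG_BASE 0x10   /* hypothetical */
#define MSG_MAX  0x20   /* hypothetical */

static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        int *data;

        if (nlh->nlmsg_type < MSG_BASE)         /* control message: ignore */
                return 0;
        if (nlh->nlmsg_type >= MSG_MAX)         /* unknown type: refuse */
                return -EINVAL;
        if (!capable(CAP_NET_ADMIN))            /* privilege before any action */
                return -EPERM;
        if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
                return -EINVAL;                 /* length before dereference */
        data = (int *)NLMSG_DATA(nlh);
        /* ... act on *data ... */
        return 0;
}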
- */ -static void userui_prepare_console(void) -{ - orig_kmsg = vt_kmsg_redirect(fg_console + 1); - - ui_helper_data.pid = -1; - - if (!userui_ops.enabled) { - printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); - return; - } - - if (*ui_helper_data.program) - toi_netlink_setup(&ui_helper_data); - else - printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); -} - -/** - * userui_cleanup_console - Cleanup after a cycle. - * - * Tell userui to cleanup, and restore kmsg_redirect to its original value. - */ - -static void userui_cleanup_console(void) -{ - if (ui_helper_data.pid > -1) - toi_netlink_close(&ui_helper_data); - - vt_kmsg_redirect(orig_kmsg); -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, - TOI_CAN_CANCEL, 0), - SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAUSE, 0), - SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), - SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, - 2048, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, - set_ui_program_set), - SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_MODULE, - .name = "userui", - .shared_directory = "user_interface", - .module = THIS_MODULE, - .storage_needed = userui_storage_needed, - .save_config_info = userui_save_config_info, - .load_config_info = userui_load_config_info, - .memory_needed = userui_memory_needed, - .post_atomic_restore = userui_post_atomic_restore, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -static struct ui_ops my_ui_ops = { - .update_status = userui_update_status, - .message = userui_message, - .prepare_status = userui_prepare_status, - .abort = userui_abort_hibernate, - .cond_pause = userui_cond_pause, - .prepare = userui_prepare_console, - .cleanup = userui_cleanup_console, - .wait_for_key = userui_wait_for_keypress, -}; - -/** - * toi_user_ui_init - Boot time initialisation for user interface. - * - * Invoked from the core init routine. 
- */ -static __init int toi_user_ui_init(void) -{ - int result; - - ui_helper_data.nl = NULL; - strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); - ui_helper_data.pid = -1; - ui_helper_data.skb_size = sizeof(struct userui_msg_params); - ui_helper_data.pool_limit = 6; - ui_helper_data.netlink_id = NETLINK_TOI_USERUI; - ui_helper_data.name = "userspace ui"; - ui_helper_data.rcv_msg = userui_user_rcv_msg; - ui_helper_data.interface_version = 8; - ui_helper_data.must_init = 0; - ui_helper_data.not_ready = userui_cleanup_console; - init_completion(&ui_helper_data.wait_for_process); - result = toi_register_module(&userui_ops); - if (!result) { - result = toi_register_ui_ops(&my_ui_ops); - if (result) - toi_unregister_module(&userui_ops); - } - - return result; -} - -late_initcall(toi_user_ui_init); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8c1204fa3..a787aa942 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,8 +15,8 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -ifdef CONFIG_SCHED_BFS -obj-y += bfs.o clock.o +ifdef CONFIG_SCHED_MUQSS +obj-y += MuQSS.o clock.o else obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o diff --git a/kernel/sched/bfs.c b/kernel/sched/MuQSS.c index bb5bac4b2..6a656ad4b 100644 --- a/kernel/sched/bfs.c +++ b/kernel/sched/MuQSS.c @@ -1,5 +1,5 @@ /* - * kernel/sched/bfs.c, was kernel/sched.c + * kernel/sched/MuQSS.c, was kernel/sched.c * * Kernel scheduler and related syscalls * @@ -26,6 +26,8 @@ * Thomas Gleixner, Mike Kravetz * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes * a whole lot of those previous things. + * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS + * scheduler by Con Kolivas. */ #include <linux/kasan.h> @@ -74,7 +76,7 @@ #include <linux/context_tracking.h> #include <linux/sched/prio.h> #include <linux/tick.h> -#include <linux/skip_lists.h> +#include <linux/skip_list.h> #include <asm/irq_regs.h> #include <asm/switch_to.h> @@ -92,11 +94,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -#include "bfs_sched.h" +#include "MuQSS.h" #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) -#define rt_queue(rq) rt_prio((rq)->rq_prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ (policy) == SCHED_RR) @@ -105,29 +106,26 @@ #define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) #define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) #define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy))) #define is_iso_policy(policy) ((policy) == SCHED_ISO) #define iso_task(p) unlikely(is_iso_policy((p)->policy)) -#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy)) #define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO) #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) +#define ISO_PERIOD (5 * HZ) -#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) #define STOP_PRIO (MAX_RT_PRIO - 1) /* * Some helpers for converting to/from various scales. Use shifts to get * approximate multiples of ten for less overhead. 
*/ -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define JIFFY_NS (1000000000 / HZ) -#define HALF_JIFFY_NS (1000000000 / HZ / 2) -#define HALF_JIFFY_US (1000000 / HZ / 2) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ)) +#define JIFFY_NS (1073741824 / HZ) +#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) +#define HALF_JIFFY_NS (1073741824 / HZ / 2) +#define HALF_JIFFY_US (1048576 / HZ / 2) #define MS_TO_NS(TIME) ((TIME) << 20) #define MS_TO_US(TIME) ((TIME) << 10) #define NS_TO_MS(TIME) ((TIME) >> 20) @@ -137,7 +135,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.114 by Con Kolivas.\n"); } /* @@ -182,30 +180,24 @@ static inline int timeslice(void) } /* - * The global runqueue data that all CPUs work off. Data is protected either - * by the global grq lock, or the discrete lock that precedes the data in this - * struct. + * The global runqueue data that all CPUs work off. Contains either atomic + * variables and a cpu bitmap set atomically. */ struct global_rq { - raw_spinlock_t lock; - unsigned long nr_running; - unsigned long nr_uninterruptible; - unsigned long long nr_switches; - unsigned long qnr; /* queued not running */ +#ifdef CONFIG_SMP + atomic_t nr_running ____cacheline_aligned_in_smp; + atomic_t nr_uninterruptible ____cacheline_aligned_in_smp; + atomic64_t nr_switches ____cacheline_aligned_in_smp; + atomic_t qnr ____cacheline_aligned_in_smp; /* queued not running */ +#else + atomic_t nr_running ____cacheline_aligned; + atomic_t nr_uninterruptible ____cacheline_aligned; + atomic64_t nr_switches ____cacheline_aligned; + atomic_t qnr ____cacheline_aligned; /* queued not running */ +#endif #ifdef CONFIG_SMP cpumask_t cpu_idle_map; - bool idle_cpus; #endif - int noc; /* num_online_cpus stored and updated when it changes */ - u64 niffies; /* Nanosecond jiffies */ - unsigned long last_jiffy; /* Last jiffy we updated niffies */ - - raw_spinlock_t iso_lock; - int iso_ticks; - bool iso_refractory; - - skiplist_node node; - skiplist *sl; }; #ifdef CONFIG_SMP @@ -241,7 +233,11 @@ static struct root_domain def_root_domain; #endif /* CONFIG_SMP */ /* There can be only one */ -static struct global_rq grq; +#ifdef CONFIG_SMP +static struct global_rq grq ____cacheline_aligned_in_smp; +#else +static struct global_rq grq ____cacheline_aligned; +#endif static DEFINE_MUTEX(sched_hotcpu_mutex); @@ -276,77 +272,16 @@ int __weak arch_sd_sibling_asym_packing(void) struct rq *uprq; #endif /* CONFIG_SMP */ -static inline void update_rq_clock(struct rq *rq); - -/* - * Sanity check should sched_clock return bogus values. We make sure it does - * not appear to go backwards, and use jiffies to determine the maximum and - * minimum it could possibly have increased, and round down to the nearest - * jiffy when it falls outside this. - */ -static inline void niffy_diff(s64 *niff_diff, int jiff_diff) -{ - unsigned long min_diff, max_diff; - - if (jiff_diff > 1) - min_diff = JIFFIES_TO_NS(jiff_diff - 1); - else - min_diff = 1; - /* Round up to the nearest tick for maximum */ - max_diff = JIFFIES_TO_NS(jiff_diff + 1); - - if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff)) - *niff_diff = min_diff; -} - #ifdef CONFIG_SMP static inline int cpu_of(struct rq *rq) { return rq->cpu; } - -/* - * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the - * niffies value. 
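The replacement constants are what the shift comment above is pointing at: 2^30 (1073741824) stands in for 10^9 and 2^20 (1048576) for 10^6, so every conversion reduces to shift-friendly multiplies at the cost of being roughly 7% coarse. A standalone check, assuming HZ=1000:

#include <stdio.h>

#define HZ 1000                     /* assumed for the demo */
#define JIFFIES_TO_NS(t) ((t) * (1073741824 / HZ))
#define MS_TO_NS(t)      ((t) << 20)

int main(void)
{
        /* 2^30/HZ = 1073741 units per jiffy vs the exact 1000000 ns:
         * about 7.4% high, but a power-of-two derived constant
         * instead of a division-heavy exact form. */
        printf("1 jiffy -> %d\n", JIFFIES_TO_NS(1)); /* 1073741 */
        printf("10 ms   -> %d\n", MS_TO_NS(10));     /* 10485760 */
        return 0;
}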
Any CPU can update it by adding how much its clock has - * increased since it last updated niffies, minus any added niffies by other - * CPUs. - */ -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - /* old_clock is only updated when we are updating niffies */ - rq->old_clock = rq->clock; - ndiff -= grq.niffies - rq->last_niffy; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; - rq->last_niffy = grq.niffies; -} #else /* CONFIG_SMP */ static inline int cpu_of(struct rq *rq) { return 0; } - -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - rq->old_clock = rq->clock; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; -} #endif #include "stats.h" @@ -362,10 +297,10 @@ static inline void update_clocks(struct rq *rq) #endif /* - * All common locking functions performed on grq.lock. rq->clock is local to + * All common locking functions performed on rq->lock. rq->clock is local to * the CPU accessing it so it can be modified just with interrupts disabled * when we're not updating niffies. - * Looking up task_rq must be done under grq.lock to be safe. + * Looking up task_rq must be done under rq->lock to be safe. */ static void update_rq_clock_task(struct rq *rq, s64 delta); @@ -379,102 +314,501 @@ static inline void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } -static inline bool task_running(struct task_struct *p) +/* + * Niffies are a globally increasing nanosecond counter. They're only used by + * update_load_avg and time_slice_expired, however deadlines are based on them + * across CPUs. Update them whenever we will call one of those functions, and + * synchronise them across CPUs whenever we hold both runqueue locks. 
+ */ +static inline void update_clocks(struct rq *rq) { + s64 ndiff, minndiff; + long jdiff; + + update_rq_clock(rq); + ndiff = rq->clock - rq->old_clock; + rq->old_clock = rq->clock; + jdiff = jiffies - rq->last_jiffy; + + /* Subtract any niffies added by balancing with other rqs */ + ndiff -= rq->niffies - rq->last_niffy; + minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; + if (minndiff < 0) + minndiff = 0; + ndiff = max(ndiff, minndiff); + rq->niffies += ndiff; + rq->last_niffy = rq->niffies; + if (jdiff) { + rq->last_jiffy += jdiff; + rq->last_jiffy_niffies = rq->niffies; + } +} + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP return p->on_cpu; +#else + return task_current(rq, p); +#endif } -static inline void grq_lock(void) - __acquires(grq.lock) +static inline int task_on_rq_queued(struct task_struct *p) { - raw_spin_lock(&grq.lock); + return p->on_rq == TASK_ON_RQ_QUEUED; } -static inline void grq_unlock(void) - __releases(grq.lock) +static inline int task_on_rq_migrating(struct task_struct *p) { - raw_spin_unlock(&grq.lock); + return p->on_rq == TASK_ON_RQ_MIGRATING; } -static inline void grq_lock_irq(void) - __acquires(grq.lock) +static inline void rq_lock(struct rq *rq) + __acquires(rq->lock) { - raw_spin_lock_irq(&grq.lock); + raw_spin_lock(&rq->lock); } -static inline void time_lock_grq(struct rq *rq) +static inline int rq_trylock(struct rq *rq) + __acquires(rq->lock) { - grq_lock(); - update_clocks(rq); + return raw_spin_trylock(&rq->lock); +} + +static inline void rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); } -static inline void grq_unlock_irq(void) - __releases(grq.lock) +static inline struct rq *this_rq_lock(void) + __acquires(rq->lock) { - raw_spin_unlock_irq(&grq.lock); + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; } -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) +/* + * Any time we have two runqueues locked we use that as an opportunity to + * synchronise niffies to the highest value as idle ticks may have artificially + * kept niffies low on one CPU and the truth can only be later. + */ +static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) { - raw_spin_lock_irqsave(&grq.lock, *flags); + if (rq1->niffies > rq2->niffies) + rq2->niffies = rq1->niffies; + else + rq1->niffies = rq2->niffies; } -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. 
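The minndiff clamp in the new update_clocks() keeps each rq's niffies monotonic and roughly jiffy-accurate: however little the sched clock claims has passed, the delta may not fall below what elapsed jiffies guarantee. A simplified standalone model of just the clamp, assuming no niffies were credited since the last jiffy sync (nanosecond figures made up):

#include <stdio.h>

typedef long long s64;

/* Simplified clamp from update_clocks(): the niffy delta never
 * falls behind the floor implied by elapsed jiffies. */
static s64 clamp_ndiff(s64 ndiff, s64 jiffy_ns_elapsed)
{
        s64 minndiff = jiffy_ns_elapsed;

        if (minndiff < 0)
                minndiff = 0;
        return ndiff > minndiff ? ndiff : minndiff;
}

int main(void)
{
        /* clock claims 0.4ms passed but a full 1ms jiffy elapsed:
         * trust the jiffy floor */
        printf("%lld\n", clamp_ndiff(400000, 1000000));  /* 1000000 */
        /* clock ran ahead of the jiffy floor: keep the clock value */
        printf("%lld\n", clamp_ndiff(1500000, 1000000)); /* 1500000 */
        return 0;
}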
+ */ + +/* For when we know rq1 != rq2 */ +static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) { - raw_spin_unlock_irqrestore(&grq.lock, *flags); + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } } -static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else + __double_rq_lock(rq1, rq2); + synchronise_niffies(rq1, rq2); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* Must be sure rq1 != rq2 and irqs are disabled */ +static inline void lock_second_rq(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (unlikely(!raw_spin_trylock(&rq2->lock))) { + raw_spin_unlock(&rq1->lock); + __double_rq_lock(rq1, rq2); + } + synchronise_niffies(rq1, rq2); +} + +static inline void lock_all_rqs(void) +{ + int cpu; + + preempt_disable(); + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_lock(&rq->lock); + } +} + +static inline void unlock_all_rqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_unlock(&rq->lock); + } + preempt_enable(); +} + +/* Specially nest trylock an rq */ +static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) +{ + if (unlikely(!do_raw_spin_trylock(&rq->lock))) + return false; + spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); + synchronise_niffies(this_rq, rq); + return true; +} + +/* Unlock a specially nested trylocked rq */ +static inline void unlock_rq(struct rq *rq) +{ + spin_release(&rq->lock.dep_map, 1, _RET_IP_); + do_raw_spin_unlock(&rq->lock); +} + +static inline void rq_lock_irq(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); +} + +static inline void rq_unlock_irq(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock_irq(&rq->lock); +} + +static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) + __acquires(rq->lock) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - grq_lock(); - return task_rq(p); + raw_spin_lock_irqsave(&rq->lock, *flags); +} + +static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) + __releases(rq->lock) +{ + raw_spin_unlock_irqrestore(&rq->lock, *flags); } static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) +*task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) { - struct rq *rq = task_grq_lock(p, flags); + struct rq *rq; - update_clocks(rq); + while (42) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + 
raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } return rq; } -static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags) +static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) __releases(p->pi_lock) { - grq_unlock(); + rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + while (42) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + } + return rq; +} + +static inline void __task_rq_unlock(struct rq *rq) +{ + rq_unlock(rq); +} + +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, mask) \ + ({ \ + typeof(ptr) _ptr = (ptr); \ + typeof(mask) _mask = (mask); \ + typeof(*_ptr) _old, _val = *_ptr; \ + \ + for (;;) { \ + _old = cmpxchg(_ptr, _val, _val | _mask); \ + if (_old == _val) \ + break; \ + _val = _old; \ + } \ + _old; \ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif +#endif + +void wake_q_add(struct wake_q_head *head, struct task_struct *task) { - local_irq_save(*flags); - time_lock_grq(rq); + struct wake_q_node *node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means + * its already queued (either by us or someone else) and will get the + * wakeup due to that. + * + * This cmpxchg() implies a full barrier, which pairs with the write + * barrier implied by the wakeup in wake_up_q(). + */ + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + return; + + get_task_struct(task); + + /* + * The head is context local, there can be no concurrency. + */ + *head->lastp = node; + head->lastp = &node->next; +} + +void wake_up_q(struct wake_q_head *head) +{ + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + + task = container_of(node, struct task_struct, wake_q); + BUG_ON(!task); + /* task can safely be re-inserted now */ + node = node->next; + task->wake_q.next = NULL; + + /* + * wake_up_process() implies a wmb() to pair with the queueing + * in wake_q_add() so as not to miss wakeups. 
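fetch_or() above is a cmpxchg retry loop whose point is returning the pre-OR word, which lets set_nr_and_not_polling() set TIF_NEED_RESCHED and learn whether TIF_POLLING_NRFLAG was already set in one atomic step. A userspace model in C11 atomics (C11 has atomic_fetch_or natively; the loop just mirrors the macro's shape, and the FLAG_* bits are stand-ins):

#include <stdatomic.h>
#include <stdio.h>

#define FLAG_NEED_RESCHED (1u << 0) /* stand-in for _TIF_NEED_RESCHED */
#define FLAG_POLLING      (1u << 1) /* stand-in for _TIF_POLLING_NRFLAG */

static unsigned fetch_or(_Atomic unsigned *ptr, unsigned mask)
{
        unsigned old = atomic_load(ptr);

        /* retry until old | mask is installed; `old` ends up holding
         * the value from just before our OR took effect */
        while (!atomic_compare_exchange_weak(ptr, &old, old | mask))
                ;
        return old;
}

int main(void)
{
        _Atomic unsigned flags = FLAG_POLLING;
        unsigned old = fetch_or(&flags, FLAG_NEED_RESCHED);

        /* the idle loop was polling, so it will notice the flag
         * by itself and no IPI is needed */
        printf("ipi needed: %d\n", !(old & FLAG_POLLING)); /* 0 */
        return 0;
}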
+ */ + wake_up_process(task); + put_task_struct(task); + } } static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { + next->on_cpu = 1; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +void resched_task(struct task_struct *p) +{ + int cpu; +#ifdef CONFIG_LOCKDEP + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); +#endif + if (test_tsk_need_resched(p)) + return; + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); + set_preempt_need_resched(); + return; + } + + if (set_nr_and_not_polling(p)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + +/* + * A task that is not running or queued will not have a node set. + * A task that is queued but not running will have a node set. + * A task that is currently running will have ->on_cpu set but no node set. + */ +static inline bool task_queued(struct task_struct *p) +{ + return !skiplist_node_empty(&p->node); } +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +static inline void resched_if_idle(struct rq *rq); + static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - grq.lock.owner = current; + rq->lock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - grq_unlock_irq(); +#ifdef CONFIG_SMP + /* + * If prev was marked as migrating to another CPU in return_task, drop + * the local runqueue lock but leave interrupts disabled and grab the + * remote lock we're migrating it to before enabling them. + */ + if (unlikely(task_on_rq_migrating(prev))) { + sched_info_dequeued(rq, prev); + /* + * We move the ownership of prev to the new cpu now. ttwu can't + * activate prev to the wrong cpu since it has to grab this + * runqueue in ttwu_remote. + */ + task_thread_info(prev)->cpu = prev->wake_cpu; + raw_spin_unlock(&rq->lock); + + raw_spin_lock(&prev->pi_lock); + rq = __task_rq_lock(prev); + /* Check that someone else hasn't already queued prev */ + if (likely(!task_queued(prev))) { + enqueue_task(rq, prev, 0); + prev->on_rq = TASK_ON_RQ_QUEUED; + /* Wake up the CPU if it's not already running */ + resched_if_idle(rq); + } + raw_spin_unlock(&prev->pi_lock); + } +#endif + raw_spin_unlock_irq(&rq->lock); } static inline bool deadline_before(u64 deadline, u64 time) @@ -482,11 +816,6 @@ static inline bool deadline_before(u64 deadline, u64 time) return (deadline < time); } -static inline bool deadline_after(u64 deadline, u64 time) -{ - return (deadline > time); -} - /* * Deadline is "now" in niffies + (offset by priority). Setting the deadline * is the key to everything. 
It distributes cpu fairly amongst tasks of the @@ -521,26 +850,54 @@ static inline int ms_longest_deadline_diff(void) return NS_TO_MS(longest_deadline_diff()); } +static inline int rq_load(struct rq *rq) +{ + return rq->sl->entries + !rq_idle(rq); +} + +static inline bool rq_local(struct rq *rq); + /* - * A task that is not running or queued will not have a node set. - * A task that is queued but not running will have a node set. - * A task that is currently running will have ->on_cpu set but no node set. + * Update the load average for feeding into cpu frequency governors. Use a + * rough estimate of a rolling average with ~ time constant of 32ms. + * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 + * Make sure a call to update_clocks has been made before calling this to get + * an updated rq->niffies. */ -static inline bool task_queued(struct task_struct *p) +static void update_load_avg(struct rq *rq) { - return !skiplist_node_empty(&p->node); + /* rq clock can go backwards so skip update if that happens */ + if (likely(rq->clock > rq->load_update)) { + unsigned long us_interval = (rq->clock - rq->load_update) >> 10; + long load, curload = rq_load(rq); + + load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); + if (unlikely(load < 0)) + load = 0; + load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; + rq->load_avg = load; + } else + return; + + rq->load_update = rq->clock; + if (likely(rq_local(rq))) + cpufreq_trigger(rq->niffies, rq->load_avg); } /* - * Removing from the global runqueue. Enter with grq locked. Deleting a task + * Removing from the runqueue. Enter with rq locked. Deleting a task * from the skip list is done via the stored node reference in the task struct * and does not require a full look up. Thus it occurs in O(k) time where k - * is the "level" of the list the task was stored at - usually < 4, max 16. + * is the "level" of the list the task was stored at - usually < 4, max 8. */ -static void dequeue_task(struct task_struct *p) +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - skiplist_delete(grq.sl, &p->node); - sched_info_dequeued(task_rq(p), p); + skiplist_delete(rq->sl, &p->node); + rq->best_key = rq->node.next[0]->key; + update_clocks(rq); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(task_rq(p), p); + update_load_avg(rq); } #ifdef CONFIG_PREEMPT_RCU @@ -566,15 +923,15 @@ static bool idleprio_suitable(struct task_struct *p) * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check * that the iso_refractory flag is not set. */ -static bool isoprio_suitable(void) +static inline bool isoprio_suitable(struct rq *rq) { - return !grq.iso_refractory; + return !rq->iso_refractory; } /* - * Adding to the global runqueue. Enter with grq locked. + * Adding to the runqueue. Enter with rq locked. 
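The 5/262144 factor in update_load_avg() above folds the documented 80/128 decay and the 32768us window into one multiply: a full window removes 5 * 32768 / 262144 = 62.5% (80/128) of the old load. A standalone check of that decay step:

#include <stdio.h>

/* Decay step from update_load_avg(): per us_interval microseconds,
 * load loses load * us_interval * 5 / 262144. */
static long decay(long load_avg, unsigned long us_interval)
{
        return load_avg - load_avg * us_interval * 5 / 262144;
}

int main(void)
{
        long load = 1024;

        /* one full ~32ms window: 1024 -> 384, i.e. 62.5% gone */
        printf("%ld\n", decay(load, 32768));
        /* two 16ms half-windows leave a bit more behind (484) */
        printf("%ld\n", decay(decay(load, 16384), 16384));
        return 0;
}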
*/ -static void enqueue_task(struct task_struct *p, struct rq *rq) +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { unsigned int randseed; u64 sl_id; @@ -582,7 +939,7 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) if (!rt_task(p)) { /* Check it hasn't gotten rt from PI */ if ((idleprio_task(p) && idleprio_suitable(p)) || - (iso_task(p) && isoprio_suitable())) + (iso_task(p) && isoprio_suitable(rq))) p->prio = p->normal_prio; else p->prio = NORMAL_PRIO; @@ -604,9 +961,8 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) else { sl_id = p->deadline; if (idleprio_task(p)) { - /* Set it to cope with 4 left shifts with locality_diff */ if (p->prio == IDLE_PRIO) - sl_id |= 0x00FF000000000000; + sl_id |= 0xF000000000000000; else sl_id += longest_deadline_diff(); } @@ -615,14 +971,13 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) * Some architectures don't have better than microsecond resolution * so mask out ~microseconds as the random seed for skiplist insertion. */ - randseed = (grq.niffies >> 10) & 0xFFFFFFFF; - skiplist_insert(grq.sl, &p->node, sl_id, p, randseed); - sched_info_queued(rq, p); -} - -static inline void requeue_task(struct task_struct *p) -{ - sched_info_queued(task_rq(p), p); + update_clocks(rq); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); + randseed = (rq->niffies >> 10) & 0xFFFFFFFF; + skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); + rq->best_key = rq->node.next[0]->key; + update_load_avg(rq); } /* @@ -644,13 +999,6 @@ static inline int task_timeslice(struct task_struct *p) return (rr_interval * task_prio_ratio(p) / 128); } -static void resched_task(struct task_struct *p); - -static inline void resched_curr(struct rq *rq) -{ - resched_task(rq->curr); -} - /* * qnr is the "queued but not running" count which is the total number of * tasks on the global runqueue list waiting for cpu time but not actually @@ -658,24 +1006,31 @@ static inline void resched_curr(struct rq *rq) */ static inline void inc_qnr(void) { - grq.qnr++; + atomic_inc(&grq.qnr); } static inline void dec_qnr(void) { - grq.qnr--; + atomic_dec(&grq.qnr); } static inline int queued_notrunning(void) { - return grq.qnr; + return atomic_read(&grq.qnr); } -static unsigned long rq_load_avg(struct rq *rq) +#ifdef CONFIG_SMP +/* Entered with rq locked */ +static inline void resched_if_idle(struct rq *rq) { - return rq->soft_affined * SCHED_CAPACITY_SCALE; + if (rq_idle(rq)) + resched_task(rq->curr); } +static inline bool rq_local(struct rq *rq) +{ + return (rq->cpu == smp_processor_id()); +} #ifdef CONFIG_SMT_NICE static const cpumask_t *thread_cpumask(int cpu); @@ -751,35 +1106,70 @@ static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) #else /* CONFIG_SMT_NICE */ #define smt_schedule(p, this_rq) (true) #endif /* CONFIG_SMT_NICE */ -#ifdef CONFIG_SMP + +static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) +{ + set_bit(cpu, (volatile unsigned long *)cpumask); +} + /* * The cpu_idle_map stores a bitmap of all the CPUs currently idle to * allow easy lookup of whether any suitable idle CPUs are available. * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the - * idle_cpus variable than to do a full bitmask check when we are busy. + * idle_cpus variable than to do a full bitmask check when we are busy. The + * bits are set atomically but read locklessly as occasional false positive / + * negative is harmless. 
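The sl_id computed in enqueue_task() above is simply a 64-bit sort key: normal and ISO tasks sort by deadline, while tasks already demoted to IDLE_PRIO get the top nibble forced so they compare greater than any plausible deadline. A minimal illustration with an arbitrary deadline value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t deadline = 123456789ULL;
        uint64_t normal_key = deadline;
        uint64_t idle_key = deadline | 0xF000000000000000ULL;

        /* idle tasks land behind everything else in the skiplist,
         * so they are only picked when nothing else is queued */
        printf("normal < idle: %d\n", normal_key < idle_key); /* 1 */
        return 0;
}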
*/ static inline void set_cpuidle_map(int cpu) { - if (likely(cpu_online(cpu))) { - cpumask_set_cpu(cpu, &grq.cpu_idle_map); - grq.idle_cpus = true; - } + if (likely(cpu_online(cpu))) + atomic_set_cpu(cpu, &grq.cpu_idle_map); +} + +static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) +{ + clear_bit(cpu, (volatile unsigned long *)cpumask); } static inline void clear_cpuidle_map(int cpu) { - cpumask_clear_cpu(cpu, &grq.cpu_idle_map); - if (cpumask_empty(&grq.cpu_idle_map)) - grq.idle_cpus = false; + atomic_clear_cpu(cpu, &grq.cpu_idle_map); } static bool suitable_idle_cpus(struct task_struct *p) { - if (!grq.idle_cpus) - return false; return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); } +/* + * Resched current on rq. We don't know if rq is local to this CPU nor if it + * is locked so we do not use an intermediate variable for the task to avoid + * having it dereferenced. + */ +static void resched_curr(struct rq *rq) +{ + int cpu; + + if (test_tsk_need_resched(rq->curr)) + return; + + rq->preempt = rq->curr; + cpu = rq->cpu; + + /* We're doing this without holding the rq lock if it's not task_rq */ + + if (cpu == smp_processor_id()) { + set_tsk_need_resched(rq->curr); + set_preempt_need_resched(); + return; + } + + if (set_nr_and_not_polling(rq->curr)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + #define CPUIDLE_DIFF_THREAD (1) #define CPUIDLE_DIFF_CORE (2) #define CPUIDLE_CACHE_BUSY (4) @@ -855,30 +1245,48 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return (this_rq->cpu_locality[that_cpu] < 3); } -static bool resched_best_idle(struct task_struct *p) +/* As per resched_curr but only will resched idle task */ +static inline void resched_idle(struct rq *rq) +{ + if (test_tsk_need_resched(rq->idle)) + return; + + rq->preempt = rq->idle; + + set_tsk_need_resched(rq->idle); + + if (rq_local(rq)) { + set_preempt_need_resched(); + return; + } + + smp_send_reschedule(rq->cpu); +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) { cpumask_t tmpmask; struct rq *rq; int best_cpu; cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); - best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask); + best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); rq = cpu_rq(best_cpu); if (!smt_schedule(p, rq)) - return false; - resched_curr(rq); - return true; + return NULL; + resched_idle(rq); + return rq; } static inline void resched_suitable_idle(struct task_struct *p) { if (suitable_idle_cpus(p)) - resched_best_idle(p); + resched_best_idle(p, task_cpu(p)); } -static inline int locality_diff(int cpu, struct rq *rq) +static inline struct rq *rq_order(struct rq *rq, int cpu) { - return rq->cpu_locality[cpu]; + return rq->rq_order[cpu]; } #else /* CONFIG_SMP */ static inline void set_cpuidle_map(int cpu) @@ -898,9 +1306,28 @@ static inline void resched_suitable_idle(struct task_struct *p) { } -static inline int locality_diff(int cpu, struct rq *rq) +static inline void resched_curr(struct rq *rq) { - return 0; + resched_task(rq->curr); +} + +static inline void resched_if_idle(struct rq *rq) +{ +} + +static inline bool rq_local(struct rq *rq) +{ + return true; +} + +static inline struct rq *rq_order(struct rq *rq, int cpu) +{ + return rq; +} + +static inline bool smt_schedule(struct task_struct *p, struct rq *rq) +{ + return true; } #endif /* CONFIG_SMP */ @@ -935,32 +1362,11 @@ static int effective_prio(struct task_struct *p) } /* - * Update the load average for feeding into cpu frequency governors. 
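The CPUIDLE_DIFF_* values above read naturally as ranking weights for choosing among idle CPUs: each less-desirable property of a candidate contributes a higher-order bit, and the candidate with the numerically smallest composite score is preferred. A toy ordering check under that assumption (the full ranking in best_mask_cpu() is more involved; the comments here are interpretive):

#include <stdio.h>

#define CPUIDLE_DIFF_THREAD (1) /* only an SMT sibling is idle */
#define CPUIDLE_DIFF_CORE   (2) /* idle cpu is a different core */
#define CPUIDLE_CACHE_BUSY  (4) /* its shared cache is busy */

int main(void)
{
        int same_core = 0;
        int smt_sibling = CPUIDLE_DIFF_THREAD;
        int far_and_busy = CPUIDLE_DIFF_CORE | CPUIDLE_CACHE_BUSY;

        /* smaller score wins: 0 < 1 < 6 */
        printf("%d < %d < %d\n", same_core, smt_sibling, far_and_busy);
        return 0;
}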
Use a - * rough estimate of a rolling average with ~ time constant of 32ms. - * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 - */ -static void update_load_avg(struct rq *rq) -{ - /* rq clock can go backwards so skip update if that happens */ - if (likely(rq->clock > rq->load_update)) { - unsigned long us_interval = (rq->clock - rq->load_update) >> 10; - long load; - - load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); - if (unlikely(load < 0)) - load = 0; - load += rq->soft_affined * rq_load_avg(rq) * us_interval * 5 / 262144; - rq->load_avg = load; - } - rq->load_update = rq->clock; -} - -/* - * activate_task - move a task to the runqueue. Enter with grq locked. + * activate_task - move a task to the runqueue. Enter with rq locked. */ static void activate_task(struct task_struct *p, struct rq *rq) { - update_clocks(rq); + resched_if_idle(rq); /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -970,134 +1376,137 @@ static void activate_task(struct task_struct *p, struct rq *rq) if (unlikely(prof_on == SLEEP_PROFILING)) { if (p->state == TASK_UNINTERRUPTIBLE) profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (rq->clock_task - p->last_ran) >> 20); + (rq->niffies - p->last_ran) >> 20); } p->prio = effective_prio(p); if (task_contributes_to_load(p)) - grq.nr_uninterruptible--; - enqueue_task(p, rq); - rq->soft_affined++; - p->on_rq = 1; - grq.nr_running++; + atomic_dec(&grq.nr_uninterruptible); + + enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + atomic_inc(&grq.nr_running); inc_qnr(); - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); } /* - * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. Enter with grq locked. + * deactivate_task - If it's running, it's not on the runqueue and we can just + * decrement the nr_running. Enter with rq locked. */ static inline void deactivate_task(struct task_struct *p, struct rq *rq) { if (task_contributes_to_load(p)) - grq.nr_uninterruptible++; - rq->soft_affined--; + atomic_inc(&grq.nr_uninterruptible); + p->on_rq = 0; - grq.nr_running--; - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + atomic_dec(&grq.nr_running); + sched_info_dequeued(rq, p); } #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int cpu) { + struct rq *rq = task_rq(p); + bool queued; + #ifdef CONFIG_LOCKDEP /* - * The caller should hold either p->pi_lock or grq lock, when changing - * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks. + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * Furthermore, all task_rq users should acquire both locks, see - * task_grq_lock(). + * task_rq_lock(). */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&grq.lock))); + lockdep_is_held(&task_rq(p)->lock))); #endif - if (task_cpu(p) == cpu) + if (p->wake_cpu == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); - if (p->on_rq) { - struct rq *rq = task_rq(p); + if (task_running(rq, p)) { + /* + * We should only be calling this on a running task if we're + * holding rq lock. 
+ */ + lockdep_assert_held(&rq->lock); - rq->soft_affined--; - update_load_avg(rq); - rq = cpu_rq(cpu); - rq->soft_affined++; - update_load_avg(rq); + /* + * We can't change the task_thread_info cpu on a running task + * as p will still be protected by the rq lock of the cpu it + * is still running on so we set the wake_cpu for it to be + * lazily updated once off the cpu. + */ + p->wake_cpu = cpu; + return; } - task_thread_info(p)->cpu = cpu; + + if ((queued = task_queued(p))) + dequeue_task(rq, p, 0); + task_thread_info(p)->cpu = p->wake_cpu = cpu; + if (queued) + enqueue_task(cpu_rq(cpu), p, 0); } #endif /* CONFIG_SMP */ /* - * Move a task off the global queue and take it to a cpu for it will + * Move a task off the runqueue and take it to a cpu for it will * become the running task. */ -static inline void take_task(int cpu, struct task_struct *p) +static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) { + struct rq *p_rq = task_rq(p); + + dequeue_task(p_rq, p, DEQUEUE_SAVE); + if (p_rq != rq) { + sched_info_dequeued(p_rq, p); + sched_info_queued(rq, p); + } set_task_cpu(p, cpu); - dequeue_task(p); dec_qnr(); } /* - * Returns a descheduling task to the grq runqueue unless it is being + * Returns a descheduling task to the runqueue unless it is being * deactivated. */ -static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate) +static inline void return_task(struct task_struct *p, struct rq *rq, + int cpu, bool deactivate) { if (deactivate) deactivate_task(p, rq); else { inc_qnr(); - enqueue_task(p, rq); +#ifdef CONFIG_SMP + /* + * set_task_cpu was called on the running task that doesn't + * want to deactivate so it has to be enqueued to a different + * CPU and we need its lock. Tag it to be moved with as the + * lock is dropped in finish_lock_switch. + */ + if (unlikely(p->wake_cpu != cpu)) + p->on_rq = TASK_ON_RQ_MIGRATING; + else +#endif + enqueue_task(rq, p, ENQUEUE_RESTORE); } } -/* Enter with grq lock held. We know p is on the local cpu */ +/* Enter with rq lock held. We know p is on the local cpu */ static inline void __set_tsk_resched(struct task_struct *p) { set_tsk_need_resched(p); set_preempt_need_resched(); } -/* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -void resched_task(struct task_struct *p) -{ - int cpu; - - lockdep_assert_held(&grq.lock); - - if (test_tsk_need_resched(p)) - return; - - set_tsk_need_resched(p); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) { - set_preempt_need_resched(); - return; - } - - smp_send_reschedule(cpu); -} - /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@ -1128,8 +1537,8 @@ inline int task_curr(const struct task_struct *p) */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { + int running, queued; unsigned long flags; - bool running, on_rq; unsigned long ncsw; struct rq *rq; @@ -1147,25 +1556,25 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * if the runqueue has changed and p is actually now * running somewhere else! */ - while (task_running(p) && p == rq->curr) { + while (task_running(rq, p)) { if (match_state && unlikely(p->state != match_state)) return 0; cpu_relax(); } /* - * Ok, time to look more closely! We need the grq + * Ok, time to look more closely! We need the rq * lock now, to be *sure*. 
If we're wrong, we'll * just go back and repeat. */ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); - running = task_running(p); - on_rq = p->on_rq; + running = task_running(rq, p); + queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -1193,7 +1602,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(queued)) { ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); set_current_state(TASK_UNINTERRUPTIBLE); @@ -1252,32 +1661,15 @@ can_preempt(struct task_struct *p, int prio, u64 deadline) return true; if (p->prio > prio) return false; - /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ + if (p->policy == SCHED_BATCH) + return false; + /* SCHED_NORMAL and ISO will preempt based on deadline */ if (!deadline_before(p->deadline, deadline)) return false; return true; } #ifdef CONFIG_SMP -#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -#ifdef CONFIG_HOTPLUG_CPU -/* - * Check to see if there is a task that is affined only to offline CPUs but - * still wants runtime. This happens to kernel threads during suspend/halt and - * disabling of CPUs. - */ -static inline bool online_cpus(struct task_struct *p) -{ - return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed))); -} -#else /* CONFIG_HOTPLUG_CPU */ -/* All available CPUs are always online without hotplug. */ -static inline bool online_cpus(struct task_struct *p) -{ - return true; -} -#endif - /* * Check to see if p can run on cpu, and if not, whether there are any online * CPUs it can run on instead. @@ -1288,13 +1680,14 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu) return true; return false; } +#define cpu_online_map (*(cpumask_t *)cpu_online_mask) static void try_preempt(struct task_struct *p, struct rq *this_rq) { - int i, this_entries = this_rq->soft_affined; + int i, this_entries = rq_load(this_rq); cpumask_t tmp; - if (suitable_idle_cpus(p) && resched_best_idle(p)) + if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) return; /* IDLEPRIO tasks never preempt anything but idle */ @@ -1303,26 +1696,15 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq) cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); - /* - * We iterate over CPUs in locality order using rq_order, finding the - * first one we can preempt if possible, thus staying closest in - * locality. - */ for (i = 0; i < num_possible_cpus(); i++) { struct rq *rq = this_rq->rq_order[i]; if (!cpumask_test_cpu(rq->cpu, &tmp)) continue; - if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries) + if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) continue; if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match thereby speeding up matching - * this task in earliest_deadline_task. 
- */ - set_task_cpu(p, rq->cpu); resched_curr(rq); return; } @@ -1352,6 +1734,13 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, } #endif /* CONFIG_SMP */ +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x04 /* internal use, task got migrated */ + static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -1382,27 +1771,96 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #endif /* CONFIG_SCHEDSTATS */ } -void wake_up_if_idle(int cpu) +static inline void ttwu_activate(struct rq *rq, struct task_struct *p) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; + activate_task(p, rq); - rcu_read_lock(); + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); +} - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; +/* + * Mark the task runnable and perform wakeup-preemption. + */ +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption if there are no idle cpus, + * instead waiting for current to deschedule. + */ + if (wake_flags & WF_SYNC) + resched_suitable_idle(p); + else + try_preempt(p, rq); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); +} - grq_lock_irqsave(&flags); - if (likely(is_idle_task(rq->curr))) - smp_send_reschedule(cpu); - /* Else cpu is not in idle, do nothing here */ - grq_unlock_irqrestore(&flags); +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ + lockdep_assert_held(&rq->lock); -out: - rcu_read_unlock(); +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + atomic_dec(&grq.nr_uninterruptible); +#endif + + ttwu_activate(rq, p); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (likely(task_on_rq_queued(p))) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; } #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + +void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; + unsigned long flags; + + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + + while (llist) { + int wake_flags = 0; + + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); + + ttwu_do_activate(rq, p, wake_flags); + } + + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + void scheduler_ipi(void) { /* @@ -1411,45 +1869,152 @@ void scheduler_ipi(void) * this IPI. */ preempt_fold_need_resched(); -} -#endif -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync) -{ - activate_task(p, rq); + if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) + return; /* - * Sync wakeups (i.e. 
those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption if there are no idle cpus, - * instead waiting for current to deschedule. + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. */ - if (!is_sync || suitable_idle_cpus(p)) - try_preempt(p, rq); + irq_enter(); + sched_ttwu_pending(); + irq_exit(); } -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - bool success) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { - trace_sched_wakeup(p); - p->state = TASK_RUNNING; + struct rq *rq = cpu_rq(cpu); - /* - * if a worker is waking up, notify workqueue. Note that on BFS, we - * don't really know what cpu it will be, so we fake it for - * wq_worker_waking_up :/ - */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } +} + +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + rq_lock_irqsave(rq, &flags); + if (likely(is_idle_task(rq->curr))) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + rq_unlock_irqrestore(rq, &flags); + } + +out: + rcu_read_unlock(); +} + +static int valid_task_cpu(struct task_struct *p) +{ + cpumask_t valid_mask; + + if (p->flags & PF_KTHREAD) + cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_online_mask); + else + cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask); + + if (unlikely(!cpumask_weight(&valid_mask))) { + /* Hotplug boot threads do this before the CPU is up */ + WARN_ON(sched_smp_initialized); + return cpumask_any(tsk_cpus_allowed(p)); + } + return cpumask_any(&valid_mask); } /* - * wake flags + * For a task that's just being woken up we have a valuable balancing + * opportunity so choose the nearest cache most lightly loaded runqueue. + * Entered with rq locked and returns with the chosen runqueue locked. 
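ttwu_queue_remote() and sched_ttwu_pending() above form a lock-free producer / locked consumer pair: wakers push onto the target rq's wake_list and kick the CPU only when the push found the list empty (llist_add() returns true exactly then), while the target drains the whole list in one exchange. A userspace model of that shape in C11 atomics (not the kernel's llist API):

#include <stdatomic.h>
#include <stdio.h>

struct node {
        struct node *next;
        int task_id;
};

static _Atomic(struct node *) wake_list;

/* Returns 1 when the list was empty, i.e. when an IPI would be sent */
static int push(struct node *n)
{
        struct node *first = atomic_load(&wake_list);

        do {
                n->next = first;
        } while (!atomic_compare_exchange_weak(&wake_list, &first, n));
        return first == NULL;
}

static struct node *drain(void)
{
        return atomic_exchange(&wake_list, NULL);
}

int main(void)
{
        struct node a = { .task_id = 1 }, b = { .task_id = 2 };

        printf("ipi needed: %d\n", push(&a)); /* 1: list was empty */
        printf("ipi needed: %d\n", push(&b)); /* 0: wakeup already pending */
        for (struct node *n = drain(); n; n = n->next)
                printf("waking task %d\n", n->task_id);
        return 0;
}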
*/ -#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x4 /* internal use, task got migrated */ +static inline int select_best_cpu(struct task_struct *p) +{ + unsigned int idlest = ~0U; + struct rq *rq = NULL; + int i; + + if (suitable_idle_cpus(p)) { + int cpu = task_cpu(p); + + if (unlikely(needs_other_cpu(p, cpu))) + cpu = valid_task_cpu(p); + rq = resched_best_idle(p, cpu); + if (likely(rq)) + return rq->cpu; + } + + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = task_rq(p)->rq_order[i]; + int entries; + + if (!other_rq->online) + continue; + if (needs_other_cpu(p, other_rq->cpu)) + continue; + entries = rq_load(other_rq); + if (entries >= idlest) + continue; + idlest = entries; + rq = other_rq; + } + if (unlikely(!rq)) + return smp_processor_id(); + return rq->cpu; +} +#else /* CONFIG_SMP */ +static int valid_task_cpu(struct task_struct *p) +{ + return 0; +} + +static inline int select_best_cpu(struct task_struct *p) +{ + return 0; +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) +{ + return NULL; +} +#endif /* CONFIG_SMP */ + +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) +{ + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) + if (!cpus_share_cache(smp_processor_id(), cpu)) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ + ttwu_queue_remote(p, cpu, wake_flags); + return; + } +#endif + rq_lock(rq); + ttwu_do_activate(rq, p, wake_flags); + rq_unlock(rq); +} /*** * try_to_wake_up - wake up a thread @@ -1466,13 +2031,11 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, * Return: %true if @p was woken up, %false if it was already running. * or @state didn't match @p's state. */ -static bool try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - bool success = false; unsigned long flags; - struct rq *rq; - int cpu; + int cpu, success = 0; /* * If we are going to wake up a thread waiting for CONDITION we @@ -1481,68 +2044,133 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state, * set_current_state() the waiting thread does. */ smp_mb__before_spinlock(); - - /* - * No need to do time_lock_grq as we only need to update the rq clock - * if we activate the task - */ - rq = task_grq_lock(p, &flags); - cpu = task_cpu(p); - + raw_spin_lock_irqsave(&p->pi_lock, flags); /* state is a volatile long, どうして、分からない */ if (!((unsigned int)p->state & state)) - goto out_unlock; + goto out; trace_sched_waking(p); - if (task_queued(p) || task_running(p)) - goto out_running; + success = 1; /* we're going to change ->state */ + cpu = task_cpu(p); - ttwu_activate(p, rq, wake_flags & WF_SYNC); - success = true; + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. 
+ */ + smp_rmb(); + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; -out_running: - ttwu_post_activation(p, rq, success); -out_unlock: - task_grq_unlock(p, &flags); +#ifdef CONFIG_SMP + /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. + */ + smp_rmb(); + + /* + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); + + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + + cpu = select_best_cpu(p); + if (task_cpu(p) != cpu) + set_task_cpu(p, cpu); +#endif /* CONFIG_SMP */ + ttwu_queue(p, cpu, wake_flags); +stat: if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); +out: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return success; } /** - * try_to_wake_up_local - try to wake up a local task with grq lock held + * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * * Put @p on the run-queue if it's not already there. The caller must - * ensure that grq is locked and, @p is not the current task. - * grq stays locked over invocation. + * ensure that rq is locked and, @p is not the current task. + * rq stays locked over invocation. */ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - bool success = false; - lockdep_assert_held(&grq.lock); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task. + */ + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } if (!(p->state & TASK_NORMAL)) - return; + goto out; trace_sched_waking(p); - if (!task_queued(p)) { - if (likely(!task_running(p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false); - if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); - success = true; - } - ttwu_post_activation(p, rq, success); + if (!task_on_rq_queued(p)) + ttwu_activate(rq, p); + + ttwu_do_wakeup(rq, p, 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -1568,7 +2196,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } -static void time_slice_expired(struct task_struct *p); +static void time_slice_expired(struct task_struct *p, struct rq *rq); /* * Perform scheduler related setup for a newly forked process p. 
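try_to_wake_up_local() above is entered with the rq lock already held, while the documented order is p->pi_lock outside rq->lock, hence the trylock-or-relock dance. The shape of that dance as a small pthread model (a sketch only; the kernel code additionally argues why dropping rq->lock here is safe):

#include <pthread.h>

/* Caller holds rq. pi must nest outside rq, so on contention drop rq,
 * take pi first, then retake rq. */
static void lock_pi_with_rq_held(pthread_mutex_t *pi, pthread_mutex_t *rq)
{
	if (pthread_mutex_trylock(pi) != 0) {
		pthread_mutex_unlock(rq);
		pthread_mutex_lock(pi);
		pthread_mutex_lock(rq);
	}
}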
@@ -1576,35 +2204,39 @@ static void time_slice_expired(struct task_struct *p); */ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) { + unsigned long flags; + int cpu = get_cpu(); + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif /* + * We mark the process as NEW here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_NEW; + + /* * The process state is set to the same value of the process executing * do_fork() code. That is running. This guarantees that nobody will * actually run it, and a signal or other external event cannot wake * it up and insert it on the runqueue either. */ - /* Should be reset in fork.c but done here for ease of bfs patching */ + /* Should be reset in fork.c but done here for ease of MuQSS patching */ + p->on_cpu = p->on_rq = p->utime = p->stime = p->utimescaled = p->stimescaled = p->sched_time = - p->stime_pc = - p->utime_pc = 0; + p->stime_ns = + p->utime_ns = 0; skiplist_node_init(&p->node); /* - * We mark the process as NEW here. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_NEW; - - /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { @@ -1625,12 +2257,20 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } + /* + * Silence PROVE_RCU. + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + #ifdef CONFIG_SCHED_INFO if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif - p->on_cpu = false; init_task_preempt_count(p); + + put_cpu(); return 0; } @@ -1725,24 +2365,19 @@ void wake_up_new_task(struct task_struct *p) unsigned long flags; parent = p->parent; - rq = task_grq_lock(p, &flags); - if (unlikely(needs_other_cpu(p, task_cpu(p)))) - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); - rq_curr = rq->curr; - p->state = TASK_RUNNING; - - /* - * Reinit new task deadline as its creator deadline could have changed - * since call to dup_task_struct(). - */ - p->deadline = rq->rq_deadline; - /* The new task might not be able to run on the same CPU as rq->curr */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; + /* Task_rq can't change yet on a new task */ + new_rq = rq = task_rq(p); if (unlikely(needs_other_cpu(p, task_cpu(p)))) { - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); + set_task_cpu(p, valid_task_cpu(p)); new_rq = task_rq(p); - } else - new_rq = rq; + } + + double_rq_lock(rq, new_rq); + update_clocks(rq); + rq_curr = rq->curr; /* * Make sure we do not leak PI boosting priority to the child. @@ -1756,30 +2391,29 @@ void wake_up_new_task(struct task_struct *p) * Share the timeslice between parent and child, thus the * total amount of pending timeslices in the system doesn't change, * resulting in more scheduling fairness. If it's negative, it won't - * matter since that's the same as being 0. current's time_slice is - * actually in rq_time_slice when it's running, as is its last_ran - * value. rq->rq_deadline is only modified within schedule() so it - * is always equal to current->deadline. + * matter since that's the same as being 0. 
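wake_up_new_task() above can end up holding the parent's and the child's runqueue locks at once via double_rq_lock(), which is defined outside this hunk. The conventional deadlock-free way to take two locks of equal rank is to impose a global order, e.g. by address; a sketch under that assumption (pthreads, illustrative):

#include <pthread.h>
#include <stdint.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	/* Lower address first: two threads taking the same pair in
	 * opposite order can then never deadlock. */
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}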
rq->rq_deadline is only + * modified within schedule() so it is always equal to + * current->deadline. */ - p->last_ran = rq->rq_last_ran; + p->last_ran = rq_curr->last_ran; if (likely(rq_curr->policy != SCHED_FIFO)) { - rq->rq_time_slice /= 2; - if (unlikely(rq->rq_time_slice < RESCHED_US)) { + rq_curr->time_slice /= 2; + if (unlikely(rq_curr->time_slice < RESCHED_US)) { /* * Forking task has run out of timeslice. Reschedule it and * start its child with a new time slice and deadline. The * child will end up running first because its deadline will * be slightly earlier. */ - rq->rq_time_slice = 0; + rq_curr->time_slice = 0; __set_tsk_resched(rq_curr); - time_slice_expired(p); + time_slice_expired(p, new_rq); if (suitable_idle_cpus(p)) - resched_best_idle(p); + resched_best_idle(p, task_cpu(p)); else if (unlikely(rq != new_rq)) try_preempt(p, new_rq); } else { - p->time_slice = rq->rq_time_slice; + p->time_slice = rq_curr->time_slice; if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { /* * The VM isn't cloned, so we're in a good position to @@ -1791,10 +2425,11 @@ void wake_up_new_task(struct task_struct *p) try_preempt(p, new_rq); } } else { - time_slice_expired(p); + time_slice_expired(p, new_rq); try_preempt(p, new_rq); } - task_grq_unlock(p, &flags); + double_rq_unlock(rq, new_rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1928,7 +2563,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * because prev may have moved to another CPU. */ static struct rq *finish_task_switch(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -1988,7 +2623,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) * @prev: the thread we just switched away from. */ asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq; @@ -2045,7 +2680,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2058,26 +2693,16 @@ context_switch(struct rq *rq, struct task_struct *prev, * nr_running, nr_uninterruptible and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. All are - * measured without grabbing the grq lock but the occasional inaccurate result - * doesn't matter so long as it's positive. + * threads, total number of context switches performed since bootup. 
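The slice split above conserves the total runnable time in the system: with 6000us remaining, the parent drops to 3000us and the child starts with the same 3000us; only when the parent's half has fallen below RESCHED_US is the parent rescheduled and the child given a fresh slice and deadline. The arithmetic in isolation (values in microseconds; function name illustrative):

/* Returns the child's slice; halves the parent's remaining slice in
 * place, so parent + child together hold the original amount. */
static int fork_split_timeslice(int *parent_us)
{
	*parent_us /= 2;
	return *parent_us; /* e.g. 6000 -> 3000 for each */
}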
 */ unsigned long nr_running(void) { - long nr = grq.nr_running; - - if (unlikely(nr < 0)) - nr = 0; - return (unsigned long)nr; + return atomic_read(&grq.nr_running); } static unsigned long nr_uninterruptible(void) { - long nu = grq.nr_uninterruptible; - - if (unlikely(nu < 0)) - nu = 0; - return nu; + return atomic_read(&grq.nr_uninterruptible); } /* @@ -2095,7 +2720,9 @@ static unsigned long nr_uninterruptible(void) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->soft_affined == 1) + struct rq *rq = cpu_rq(smp_processor_id()); + + if (rq_load(rq) == 1) return true; else return false; @@ -2104,12 +2731,7 @@ EXPORT_SYMBOL(single_task_running); unsigned long long nr_context_switches(void) { - long long ns = grq.nr_switches; - - /* This is of course impossible */ - if (unlikely(ns < 0)) - ns = 1; - return (unsigned long long)ns; + return (unsigned long long)atomic64_read(&grq.nr_switches); } unsigned long nr_iowait(void) @@ -2133,15 +2755,16 @@ unsigned long nr_active(void) return nr_running() + nr_uninterruptible(); } -/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we - * base it on the number of running or queued tasks with their ->rq pointer - * set to this cpu as being the CPU they're more likely to run on. */ +/* + * Load is the number of running or queued tasks with their ->rq pointer + * set to this cpu as being the CPU they're more likely to run on. + */ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->soft_affined; + *load = rq_load(rq); } /* Variables and functions for calc_load */ @@ -2364,7 +2987,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= steal; } #endif - rq->clock_task += delta; } @@ -2456,86 +3078,89 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) } /* - * On each tick, see what percentage of that tick was attributed to each - * component and add the percentage to the _pc values. Once a _pc value has - * accumulated one tick's worth, account for that. This means the total - * percentage of load components will always be 128 (pseudo 100) per tick. + * On each tick, add the number of nanoseconds to the unbanked variables and + * once one tick's worth has accumulated, account it allowing for accurate + * sub-tick accounting and totals.
*/ -static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc) +static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) { u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; if (atomic_read(&rq->nr_iowait) > 0) { - rq->iowait_pc += pc; - if (rq->iowait_pc >= 128) { - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128; - rq->iowait_pc %= 128; + rq->iowait_ns += ns; + if (rq->iowait_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->iowait_ns); + cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks; + rq->iowait_ns %= JIFFY_NS; } } else { - rq->idle_pc += pc; - if (rq->idle_pc >= 128) { - cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128; - rq->idle_pc %= 128; + rq->idle_ns += ns; + if (rq->idle_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->idle_ns); + cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks; + rq->idle_ns %= JIFFY_NS; } } acct_update_integrals(idle); } -static void -pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset, - unsigned long pc, unsigned long ns) +static void pc_system_time(struct rq *rq, struct task_struct *p, + int hardirq_offset, unsigned long ns) { - u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; - p->stime_pc += pc; - if (p->stime_pc >= 128) { - int jiffs = p->stime_pc / 128; - - p->stime_pc %= 128; - p->stime += (__force u64)cputime_one_jiffy * jiffs; - p->stimescaled += one_jiffy_scaled * jiffs; - account_group_system_time(p, cputime_one_jiffy * jiffs); + p->stime_ns += ns; + if (p->stime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->stime_ns); + p->stime_ns %= JIFFY_NS; + p->stime += (__force u64)cputime_one_jiffy * ticks; + p->stimescaled += one_jiffy_scaled * ticks; + account_group_system_time(p, cputime_one_jiffy * ticks); } p->sched_time += ns; account_group_exec_runtime(p, ns); if (hardirq_count() - hardirq_offset) { - rq->irq_pc += pc; - if (rq->irq_pc >= 128) { - cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128; - rq->irq_pc %= 128; + rq->irq_ns += ns; + if (rq->irq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->irq_ns); + cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->irq_ns %= JIFFY_NS; } } else if (in_serving_softirq()) { - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; } } else { - rq->system_pc += pc; - if (rq->system_pc >= 128) { - cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128; - rq->system_pc %= 128; + rq->system_ns += ns; + if (rq->system_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->system_ns); + cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks; + rq->system_ns %= JIFFY_NS; } } acct_update_integrals(p); } -static void pc_user_time(struct rq *rq, struct task_struct *p, - unsigned long pc, unsigned long ns) +static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) { - u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned 
long ticks; - p->utime_pc += pc; - if (p->utime_pc >= 128) { - int jiffs = p->utime_pc / 128; - - p->utime_pc %= 128; - p->utime += (__force u64)cputime_one_jiffy * jiffs; - p->utimescaled += one_jiffy_scaled * jiffs; - account_group_user_time(p, cputime_one_jiffy * jiffs); + p->utime_ns += ns; + if (p->utime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->utime_ns); + p->utime_ns %= JIFFY_NS; + p->utime += (__force u64)cputime_one_jiffy * ticks; + p->utimescaled += one_jiffy_scaled * ticks; + account_group_user_time(p, cputime_one_jiffy * ticks); } p->sched_time += ns; account_group_exec_runtime(p, ns); @@ -2545,36 +3170,33 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. */ - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; } } if (task_nice(p) > 0 || idleprio_task(p)) { - rq->nice_pc += pc; - if (rq->nice_pc >= 128) { - cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128; - rq->nice_pc %= 128; + rq->nice_ns += ns; + if (rq->nice_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->nice_ns); + cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks; + rq->nice_ns %= JIFFY_NS; } } else { - rq->user_pc += pc; - if (rq->user_pc >= 128) { - cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128; - rq->user_pc %= 128; + rq->user_ns += ns; + if (rq->user_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->user_ns); + cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks; + rq->user_ns %= JIFFY_NS; } } acct_update_integrals(p); } /* - * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast - * shifts instead of 100 - */ -#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS) - -/* * This is called on clock ticks. * Bank in p->sched_time the ns elapsed since the last tick or switch. * CPU scheduler quota accounting is also performed here in microseconds. 
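Every *_ns field above follows the same banking pattern: accumulate raw nanoseconds, and once at least one jiffy's worth is banked, pay out whole ticks to the statistics and keep the remainder. A standalone model (the JIFFY_NS value here assumes HZ=250 and is illustrative):

#include <stdint.h>

#define JIFFY_NS 4000000ULL /* one tick at HZ=250; illustrative */

/* Add ns to the bucket; return the number of whole ticks to account. */
static uint64_t bank_ns(uint64_t *bucket, uint64_t ns)
{
	uint64_t ticks;

	*bucket += ns;
	ticks = *bucket / JIFFY_NS; /* NS_TO_JIFFIES() */
	*bucket %= JIFFY_NS;
	return ticks;
}

Unlike the old pseudo-percentage scheme, nothing is rounded away; the sub-tick remainder simply stays banked for the next update.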
@@ -2582,38 +3204,29 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) { - long account_ns = rq->clock_task - rq->rq_last_ran; + s64 account_ns = rq->niffies - p->last_ran; struct task_struct *idle = rq->idle; - unsigned long account_pc; - if (unlikely(account_ns < 0) || steal_account_process_tick()) + if (steal_account_process_tick()) goto ts_account; - account_pc = NS_TO_PC(account_ns); - /* Accurate tick timekeeping */ if (user_mode(get_irq_regs())) - pc_user_time(rq, p, account_pc, account_ns); - else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) - pc_system_time(rq, p, HARDIRQ_OFFSET, - account_pc, account_ns); - else - pc_idle_time(rq, idle, account_pc); + pc_user_time(rq, p, account_ns); + else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { + pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); + } else + pc_idle_time(rq, idle, account_ns); if (sched_clock_irqtime) irqtime_account_hi_si(); ts_account: /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } + if (p->policy != SCHED_FIFO && p != idle) + p->time_slice -= NS_TO_US(account_ns); - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; + p->last_ran = rq->niffies; } /* @@ -2624,40 +3237,25 @@ ts_account: static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) { - long account_ns = rq->clock_task - rq->rq_last_ran; + s64 account_ns = rq->niffies - p->last_ran; struct task_struct *idle = rq->idle; - unsigned long account_pc; - - if (unlikely(account_ns < 0)) - goto ts_account; - - account_pc = NS_TO_PC(account_ns); /* Accurate subtick timekeeping */ - if (p != idle) { - pc_user_time(rq, p, account_pc, account_ns); - } + if (p != idle) + pc_user_time(rq, p, account_ns); else - pc_idle_time(rq, idle, account_pc); + pc_idle_time(rq, idle, account_ns); -ts_account: /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } - - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; + if (p->policy != SCHED_FIFO && p != idle) + p->time_slice -= NS_TO_US(account_ns); } /* * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. * - * Called with task_grq_lock() held. + * Called with task_rq_lock(p) held. */ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { @@ -2668,11 +3266,9 @@ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ - if (p == rq->curr && p->on_rq) { + if (p == rq->curr && task_on_rq_queued(p)) { update_clocks(rq); - ns = rq->clock_task - rq->rq_last_ran; - if (unlikely((s64)ns < 0)) - ns = 0; + ns = rq->niffies - p->last_ran; } return ns; @@ -2702,13 +3298,13 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. 
 */ - if (!p->on_cpu || !p->on_rq) + if (!p->on_cpu || !task_on_rq_queued(p)) return tsk_seruntime(p); #endif - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -2841,37 +3437,12 @@ void account_idle_ticks(unsigned long ticks) } #endif -static inline void grq_iso_lock(void) - __acquires(grq.iso_lock) -{ - raw_spin_lock(&grq.iso_lock); -} - -static inline void grq_iso_unlock(void) - __releases(grq.iso_lock) -{ - raw_spin_unlock(&grq.iso_lock); -} - /* * Functions to test for when SCHED_ISO tasks have used their allocated - * quota as real time scheduling and convert them back to SCHED_NORMAL. - * Where possible, the data is tested lockless, to avoid grabbing iso_lock - * because the occasional inaccurate result won't matter. However the - * tick data is only ever modified under lock. iso_refractory is only simply - * set to 0 or 1 so it's not worth grabbing the lock yet again for that. + * quota as real time scheduling and convert them back to SCHED_NORMAL. All + * data is modified only by the local runqueue during scheduler_tick with + * interrupts disabled. */ -static bool set_iso_refractory(void) -{ - grq.iso_refractory = true; - return grq.iso_refractory; -} - -static bool clear_iso_refractory(void) -{ - grq.iso_refractory = false; - return grq.iso_refractory; -} /* * Test if SCHED_ISO tasks have run longer than their allotted period as RT @@ -2879,97 +3450,82 @@ static bool clear_iso_refractory(void) * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a * slow division. */ -static bool test_ret_isorefractory(struct rq *rq) +static inline void iso_tick(struct rq *rq) { - if (likely(!grq.iso_refractory)) { - if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu) - return set_iso_refractory(); - } else { - if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) - return clear_iso_refractory(); + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + rq->iso_ticks += 100; + if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { + rq->iso_refractory = true; + if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) + rq->iso_ticks = ISO_PERIOD * 100; } - return grq.iso_refractory; -} - -static void iso_tick(void) -{ - grq_iso_lock(); - grq.iso_ticks += 100; - grq_iso_unlock(); } /* No SCHED_ISO task was running so decrease rq->iso_ticks */ -static inline void no_iso_tick(void) -{ - if (grq.iso_ticks) { - grq_iso_lock(); - grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; - if (unlikely(grq.iso_refractory && grq.iso_ticks < - ISO_PERIOD * (sched_iso_cpu * 115 / 128))) - clear_iso_refractory(); - grq_iso_unlock(); +static inline void no_iso_tick(struct rq *rq, int ticks) +{ + if (rq->iso_ticks > 0 || rq->iso_refractory) { + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; + if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { + rq->iso_refractory = false; + if (unlikely(rq->iso_ticks < 0)) + rq->iso_ticks = 0; + } } } /* This manages tasks that have run out of timeslice during a scheduler_tick */ static void task_running_tick(struct rq *rq) { - struct task_struct *p; + struct task_struct *p = rq->curr; /* * If a SCHED_ISO task is running we increment the iso_ticks. In * order to prevent SCHED_ISO tasks from causing starvation in the * presence of true RT tasks we account those as iso_ticks as well.
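iso_tick()/no_iso_tick() above keep iso_ticks as a decaying average over ISO_PERIOD ticks, now entirely per-runqueue. Stripped to the arithmetic (the ISO_PERIOD value is illustrative; the real constant is defined elsewhere in the scheduler):

#define ISO_PERIOD 128 /* illustrative */

/* A SCHED_ISO task ran this tick. */
static int iso_busy(int iso_ticks)
{
	return iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD + 100;
}

/* No SCHED_ISO task ran for 'ticks' ticks. */
static int iso_idle(int iso_ticks, int ticks)
{
	return iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
}

A CPU permanently busy with ISO tasks converges on iso_ticks == ISO_PERIOD * 100 (the fixed point of iso_busy()), which is why the refractory threshold can be written as ISO_PERIOD * sched_iso_cpu with sched_iso_cpu expressed as a percentage.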
*/ - if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) { - if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128) - iso_tick(); - } else - no_iso_tick(); + if (rt_task(p) || task_running_iso(p)) + iso_tick(rq); + else + no_iso_tick(rq, 1); - if (iso_queue(rq)) { - if (unlikely(test_ret_isorefractory(rq))) { - if (rq_running_iso(rq)) { + /* SCHED_FIFO tasks never run out of timeslice. */ + if (p->policy == SCHED_FIFO) + return; + + if (iso_task(p)) { + if (task_running_iso(p)) { + if (rq->iso_refractory) { /* * SCHED_ISO task is running as RT and limit * has been hit. Force it to reschedule as * SCHED_NORMAL by zeroing its time_slice */ - rq->rq_time_slice = 0; + p->time_slice = 0; } + } else if (!rq->iso_refractory) { + /* Can now run again ISO. Reschedule to pick up prio */ + goto out_resched; } } - /* SCHED_FIFO tasks never run out of timeslice. */ - if (rq->rq_policy == SCHED_FIFO) - return; /* * Tasks that were scheduled in the first half of a tick are not * allowed to run into the 2nd half of the next tick if they will * run out of time slice in the interim. Otherwise, if they have * less than RESCHED_US μs of time slice left they will be rescheduled. */ - if (rq->dither) { - if (rq->rq_time_slice > HALF_JIFFY_US) - return; - else - rq->rq_time_slice = 0; - } else if (rq->rq_time_slice >= RESCHED_US) - return; - - /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ - p = rq->curr; - - grq_lock(); - requeue_task(p); - resched_task(p); - grq_unlock(); + if (p->time_slice - rq->dither >= RESCHED_US) + return; +out_resched: + rq_lock(rq); + __set_tsk_resched(p); + rq_unlock(rq); } /* * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. The data modified is all - * local to struct rq so we don't need to grab grq lock. + * We call it with interrupts disabled. */ void scheduler_tick(void) { @@ -2977,15 +3533,14 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); sched_clock_tick(); - /* grq lock not grabbed, so only update rq clock */ update_rq_clock(rq); - update_cpu_clock_tick(rq, rq->curr); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + update_cpu_clock_tick(rq, rq->curr); if (!rq_idle(rq)) task_running_tick(rq); else - no_iso_tick(); + no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy); + rq->last_scheduler_tick = rq->last_jiffy; rq->last_tick = rq->clock; perf_event_task_tick(); } @@ -3068,12 +3623,13 @@ static inline void preempt_latency_stop(int val) { } /* * The time_slice is only refilled when it is empty and that is when we set a - * new deadline. + * new deadline. Make sure update_clocks has been called recently to update + * rq->niffies. */ -static void time_slice_expired(struct task_struct *p) +static void time_slice_expired(struct task_struct *p, struct rq *rq) { p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); + p->deadline = rq->niffies + task_deadline_diff(p); #ifdef CONFIG_SMT_NICE if (!p->mm) p->smt_bias = 0; @@ -3100,107 +3656,107 @@ static void time_slice_expired(struct task_struct *p) * SCHED_NORMAL tasks. */ -static inline void check_deadline(struct task_struct *p) +static inline void check_deadline(struct task_struct *p, struct rq *rq) { if (p->time_slice < RESCHED_US || batch_task(p)) - time_slice_expired(p); + time_slice_expired(p, rq); } #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /* - * Scheduler queue bitmap specific find next bit. 
- */ -static inline unsigned long -next_sched_bit(const unsigned long *addr, unsigned long offset) -{ - const unsigned long *p; - unsigned long result; - unsigned long size; - unsigned long tmp; - - size = PRIO_LIMIT; - if (offset >= size) - return size; - - p = addr + BITOP_WORD(offset); - result = offset & ~(BITS_PER_LONG-1); - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = *p; - -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __ffs(tmp); -} - -/* * Task selection with skiplists is a simple matter of picking off the first - * task in the sorted list, an O(1) operation. The only time it takes longer - * is if tasks do not have suitable affinity and then we iterate over entries - * till we find the first that does. Worst case here is no tasks with suitable - * affinity and taking O(n). + * task in the sorted list, an O(1) operation. The lookup is amortised O(1) + * being bound to the number of processors. + * + * Runqueues are selectively locked based on their unlocked data and then + * unlocked if not needed. At most 3 locks will be held at any time and are + * released as soon as they're no longer needed. All balancing between CPUs + * is thus done here in an extremely simple first come best fit manner. + * + * This iterates over runqueues in cache locality order. In interactive mode + * it iterates over all CPUs and finds the task with the best key/deadline. + * In non-interactive mode it will only take a task if it's from the current + * runqueue or a runqueue with more tasks than the current one with a better + * key/deadline. */ static inline struct task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { - skiplist_node *node = &grq.node; struct task_struct *edt = idle; - u64 earliest_deadline = ~0ULL; + struct rq *locked = NULL; + int i, best_entries = 0; + u64 best_key = ~0ULL; - while ((node = node->next[0]) != &grq.node) { - struct task_struct *p = node->value; + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = rq_order(rq, i); + int entries = other_rq->sl->entries; + struct task_struct *p; + u64 key; - /* Make sure affinity is ok */ - if (needs_other_cpu(p, cpu)) + /* + * Check for queued entries lockless first. The local runqueue + * is locked so entries will always be accurate.
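earliest_deadline_task() above treats a peer runqueue's entries and best_key as unlocked hints: cheap to read, possibly stale, and only trusted after a trylock plus re-check. The pattern in a userspace sketch (struct and field names illustrative):

#include <pthread.h>
#include <stdbool.h>

struct peer_rq {
	pthread_mutex_t lock;
	int entries;
	unsigned long long best_key;
};

/* Returns true with p->lock held if p still beats best_key. */
static bool try_claim(struct peer_rq *p, unsigned long long best_key)
{
	if (p->entries == 0 || p->best_key >= best_key) /* lockless hint */
		return false;
	if (pthread_mutex_trylock(&p->lock) != 0) /* busy peer: skip it */
		return false;
	if (p->entries == 0 || p->best_key >= best_key) { /* re-check */
		pthread_mutex_unlock(&p->lock);
		return false;
	}
	return true;
}

Trylock keeps the pick-next path non-blocking: a contended peer is skipped rather than waited on, at worst costing a slightly less ideal pick.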
+ */ + if (!sched_interactive) { + if (entries <= best_entries) + continue; + } else if (!entries) continue; - if (!smt_schedule(p, rq)) - continue; + /* if (i) implies other_rq != rq */ + if (i) { + /* Check for best id queued lockless first */ + if (other_rq->best_key >= best_key) + continue; - if (!sched_interactive) { - int tcpu; + if (unlikely(!trylock_rq(rq, other_rq))) + continue; - if ((tcpu = task_cpu(p)) != cpu) { - u64 dl = p->deadline << locality_diff(tcpu, rq); + /* Need to reevaluate entries after locking */ + entries = other_rq->sl->entries; + if (unlikely(!entries)) { + unlock_rq(other_rq); + continue; + } + } + key = other_rq->node.next[0]->key; + /* Reevaluate key after locking */ + if (unlikely(key >= best_key)) { + if (i) + unlock_rq(other_rq); + continue; + } - if (!deadline_before(dl, earliest_deadline)) - continue; - earliest_deadline = dl; - edt = p; - /* We continue even though we've found the earliest - * deadline task as the locality offset means there - * may be a better candidate after it. */ + p = other_rq->node.next[0]->value; + if (!smt_schedule(p, rq)) { + if (i) + unlock_rq(other_rq); + continue; + } + + /* Make sure affinity is ok */ + if (i) { + if (needs_other_cpu(p, cpu)) { + unlock_rq(other_rq); continue; } + if (locked) + unlock_rq(locked); + locked = other_rq; } - /* We've encountered the best deadline local task */ + + best_entries = entries; + best_key = key; edt = p; - break; } + if (likely(edt != idle)) - take_task(cpu, edt); + take_task(rq, cpu, edt); + + if (locked) + unlock_rq(locked); + return edt; } @@ -3226,9 +3782,6 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif - if (panic_on_warn) - panic("scheduling while atomic\n"); - dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -3256,15 +3809,11 @@ static inline void schedule_debug(struct task_struct *prev) /* * The currently running task's information is all stored in rq local data - * which is only modified by the local CPU, thereby allowing the data to be - * changed without grabbing the grq lock. + * which is only modified by the local CPU. */ static inline void set_rq_task(struct rq *rq, struct task_struct *p) { - rq->rq_time_slice = p->time_slice; rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran = rq->clock_task; - rq->rq_policy = p->policy; rq->rq_prio = p->prio; #ifdef CONFIG_SMT_NICE rq->rq_mm = p->mm; @@ -3272,15 +3821,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p) #endif } -static void reset_rq_task(struct rq *rq, struct task_struct *p) -{ - rq->rq_policy = p->policy; - rq->rq_prio = p->prio; -#ifdef CONFIG_SMT_NICE - rq->rq_smt_bias = p->smt_bias; -#endif -} - #ifdef CONFIG_SMT_NICE static void check_no_siblings(struct rq __maybe_unused *this_rq) {} static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} @@ -3381,11 +3921,13 @@ static void __sched notrace __schedule(bool preempt) unsigned long *switch_count; bool deactivate = false; struct rq *rq; + u64 niffies; int cpu; cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; + idle = rq->idle; /* * do_exit() calls schedule() with preemption disabled as an exception; @@ -3409,7 +3951,23 @@ static void __sched notrace __schedule(bool preempt) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - grq_lock(); + rq_lock(rq); +#ifdef CONFIG_SMP + if (rq->preempt) { + /* + * Make sure resched_curr hasn't triggered a preemption + * locklessly on a task that has since scheduled away. 
Spurious + * wakeup of idle is okay though. + */ + if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { + rq->preempt = NULL; + clear_preempt_need_resched(); + rq_unlock_irq(rq); + return; + } + rq->preempt = NULL; + } +#endif switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3431,7 +3989,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) { /* This shouldn't happen, but does */ - if (unlikely(to_wakeup == prev)) + if (WARN_ONCE((to_wakeup == prev), "Waking up prev as worker\n")) deactivate = false; else try_to_wake_up_local(to_wakeup); @@ -3441,64 +3999,64 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nvcsw; } + /* + * Store the niffy value here for use by the next task's last_ran + * below to avoid losing niffies due to update_clocks being called + * again after this point. + */ update_clocks(rq); + niffies = rq->niffies; update_cpu_clock_switch(rq, prev); if (rq->clock - rq->last_tick > HALF_JIFFY_NS) - rq->dither = false; + rq->dither = 0; else - rq->dither = true; + rq->dither = HALF_JIFFY_US; clear_tsk_need_resched(prev); clear_preempt_need_resched(); - idle = rq->idle; if (idle != prev) { - /* Update all the information stored on struct rq */ - prev->time_slice = rq->rq_time_slice; - prev->deadline = rq->rq_deadline; - check_deadline(prev); - prev->last_ran = rq->clock_task; - return_task(prev, rq, deactivate); + check_deadline(prev, rq); + return_task(prev, rq, cpu, deactivate); } if (unlikely(!queued_notrunning())) { - /* - * This CPU is now truly idle as opposed to when idle is - * scheduled as a high priority task in its own right. - */ next = idle; schedstat_inc(rq, sched_goidle); set_cpuidle_map(cpu); + update_load_avg(rq); } else { next = earliest_deadline_task(rq, cpu, idle); if (likely(next->prio != PRIO_LIMIT)) clear_cpuidle_map(cpu); - else + else { set_cpuidle_map(cpu); + update_load_avg(rq); + } } + set_rq_task(rq, next); + next->last_ran = niffies; + if (likely(prev != next)) { /* * Don't reschedule an idle task or deactivated tasks */ if (prev != idle && !deactivate) resched_suitable_idle(prev); - set_rq_task(rq, next); if (next != idle) check_siblings(rq); else wake_siblings(rq); - grq.nr_switches++; - prev->on_cpu = false; - next->on_cpu = true; + atomic64_inc(&grq.nr_switches); rq->curr = next; ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the grq */ + rq = context_switch(rq, prev, next); /* unlocks the rq */ } else { check_siblings(rq); - grq_unlock_irq(); + rq_unlock_irq(rq); } } @@ -3713,13 +4271,12 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; struct rq *rq; int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_grq_lock(p, &flags); + rq = __task_rq_lock(p); /* * Idle task boosting is a nono in general. 
There is one @@ -3742,17 +4299,17 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; p->prio = prio; - if (task_running(p)){ + if (task_running(rq, p)){ if (prio > oldprio) resched_task(p); } else if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (prio < oldprio) try_preempt(p, rq); } out_unlock: - task_grq_unlock(p, &flags); + __task_rq_unlock(rq); } #endif @@ -3779,7 +4336,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = time_task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3797,17 +4354,17 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (new_static < old_static) try_preempt(p, rq); - } else if (task_running(p)) { - reset_rq_task(rq, p); + } else if (task_running(rq, p)) { + set_rq_task(rq, p); if (old_static < new_static) resched_task(p); } out_unlock: - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -3878,7 +4435,7 @@ int task_prio(const struct task_struct *p) goto out; /* Convert to ms to avoid overflows */ - delta = NS_TO_MS(p->deadline - grq.niffies); + delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); delta = delta * 40 / ms_longest_deadline_diff(); if (delta > 0 && delta <= 80) prio += delta; @@ -3921,7 +4478,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold grq lock. */ +/* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio, bool keep_boost) { @@ -3948,12 +4505,12 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, } else p->prio = p->normal_prio; - if (task_running(p)) { - reset_rq_task(rq, p); + if (task_running(rq, p)) { + set_rq_task(rq, p); resched_task(p); } else if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (p->prio < oldprio || p->rt_priority > oldrtprio) try_preempt(p, rq); } @@ -4055,7 +4612,7 @@ recheck: case SCHED_ISO: if (policy == SCHED_ISO) goto out; - if (policy == SCHED_NORMAL) + if (policy != SCHED_NORMAL) return -EPERM; break; case SCHED_BATCH: @@ -4092,16 +4649,16 @@ recheck: * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: * - * To be able to change p->policy safely, the grunqueue lock must be + * To be able to change p->policy safely, the runqueue lock must be * held. 
*/ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -4110,22 +4667,20 @@ recheck: */ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); return 0; } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); goto recheck; } - update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; __setscheduler(p, rq, policy, param->sched_priority, pi); - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); if (pi) rt_mutex_adjust_pi(p); @@ -4625,9 +5180,9 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); - grq_unlock_irqrestore(&flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -4642,8 +5197,7 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: size of CPU mask copied to user_mask_ptr on success. An - * error code otherwise. + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -4685,19 +5239,20 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, SYSCALL_DEFINE0(sched_yield) { struct task_struct *p; + struct rq *rq; p = current; - grq_lock_irq(); + rq = this_rq_lock(); + time_slice_expired(p, rq); schedstat_inc(task_rq(p), yld_count); - requeue_task(p); /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); sched_preempt_enable_no_resched(); schedule(); @@ -4803,29 +5358,44 @@ EXPORT_SYMBOL(yield); */ int __sched yield_to(struct task_struct *p, bool preempt) { + struct task_struct *rq_p; struct rq *rq, *p_rq; unsigned long flags; int yielded = 0; + local_irq_save(flags); rq = this_rq(); - grq_lock_irqsave(&flags); - if (task_running(p) || p->state) { + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. 
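The again: loop in yield_to() above exists because task_rq(p) can change between reading the pointer and acquiring the lock. The canonical lock-and-revalidate loop, modeled in userspace (illustrative names; the kernel version also deals with pi_lock and IRQ state):

#include <pthread.h>

struct mrq {
	pthread_mutex_t lock;
};

struct mtask {
	struct mrq *rq;
};

static struct mrq *lock_task_rq_model(struct mtask *t)
{
	for (;;) {
		struct mrq *rq = t->rq; /* racy read */
		pthread_mutex_lock(&rq->lock);
		if (rq == t->rq) /* task didn't migrate while we waited */
			return rq;
		pthread_mutex_unlock(&rq->lock);
	}
}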
+ */ + if (task_running(p_rq, p) || p->state) { yielded = -ESRCH; - goto out_unlock; + goto out_irq; + } + + double_rq_lock(rq, p_rq); + if (unlikely(task_rq(p) != p_rq)) { + double_rq_unlock(rq, p_rq); + goto again; } - p_rq = task_rq(p); yielded = 1; - if (p->deadline > rq->rq_deadline) - p->deadline = rq->rq_deadline; - p->time_slice += rq->rq_time_slice; - rq->rq_time_slice = 0; + rq_p = rq->curr; + if (p->deadline > rq_p->deadline) + p->deadline = rq_p->deadline; + p->time_slice += rq_p->time_slice; if (p->time_slice > timeslice()) p->time_slice = timeslice(); + time_slice_expired(rq_p, rq); if (preempt && rq != p_rq) - resched_curr(p_rq); -out_unlock: - grq_unlock_irqrestore(&flags); + resched_task(p_rq->curr); + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); if (yielded > 0) schedule(); @@ -4931,8 +5501,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct task_struct *p; unsigned int time_slice; unsigned long flags; - int retval; struct timespec t; + struct rq *rq; + int retval; if (pid < 0) return -EINVAL; @@ -4947,9 +5518,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + rq = task_rq_lock(p, &flags); time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); t = ns_to_timespec(time_slice); @@ -5049,9 +5620,21 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { + struct rq *rq = task_rq(p); + + lockdep_assert_held(&p->pi_lock); + cpumask_copy(tsk_cpus_allowed(p), new_mask); + + if (task_queued(p)) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + } if (needs_other_cpu(p, task_cpu(p))) - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); + set_task_cpu(p, valid_task_cpu(p)); } #endif @@ -5069,8 +5652,8 @@ void init_idle(struct task_struct *idle, int cpu) unsigned long flags; raw_spin_lock_irqsave(&idle->pi_lock, flags); - time_lock_grq(rq); - idle->last_ran = rq->clock_task; + raw_spin_lock(&rq->lock); + idle->last_ran = rq->niffies; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ idle->prio = PRIO_LIMIT; @@ -5097,8 +5680,8 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_cpu = 1; - grq_unlock(); + idle->on_rq = TASK_ON_RQ_QUEUED; + raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -5136,59 +5719,14 @@ int task_can_attach(struct task_struct *p, return ret; } -void wake_q_add(struct wake_q_head *head, struct task_struct *task) -{ - struct wake_q_node *node = &task->wake_q; - - /* - * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the - * wakeup due to that. - * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). - */ - if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) - return; - - get_task_struct(task); - - /* - * The head is context local, there can be no concurrency. 
- */ - *head->lastp = node; - head->lastp = &node->next; -} - -void wake_up_q(struct wake_q_head *head) -{ - struct wake_q_node *node = head->first; - - while (node != WAKE_Q_TAIL) { - struct task_struct *task; - - task = container_of(node, struct task_struct, wake_q); - BUG_ON(!task); - /* task can safely be re-inserted now */ - node = node->next; - task->wake_q.next = NULL; - - /* - * wake_up_process() implies a wmb() to pair with the queueing - * in wake_q_add() so as not to miss wakeups. - */ - wake_up_process(task); - put_task_struct(task); - } -} - void resched_cpu(int cpu) { + struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); } #ifdef CONFIG_SMP @@ -5262,7 +5800,7 @@ int get_nohz_timer_target(void) continue; if (!idle_cpu(i) && is_housekeeping_cpu(i)) { - cpu = i; + cpu = i; goto unlock; } @@ -5291,8 +5829,10 @@ void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; - set_tsk_need_resched(cpu_rq(cpu)->idle); - smp_send_reschedule(cpu); + if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void wake_up_nohz_cpu(int cpu) @@ -5321,7 +5861,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq *rq; int ret = 0; - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); if (p->flags & PF_KTHREAD) { /* @@ -5366,22 +5906,30 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (task_running(p)) { + if (task_running(rq, p)) { /* Task is running on the wrong cpu now, reschedule it. */ if (rq == this_rq()) { set_tsk_need_resched(p); running_wrong = true; } else resched_task(p); - } else - set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); + } else { + int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + struct rq *dest_rq = cpu_rq(dest_cpu); + + /* Switch rq locks here */ + lock_second_rq(rq, dest_rq); + set_task_cpu(p, dest_cpu); + rq_unlock(rq); + rq = dest_rq; + } out: if (queued && !cpumask_subset(new_mask, &old_mask)) try_preempt(p, rq); if (running_wrong) preempt_disable(); - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); if (running_wrong) { __schedule(true); @@ -5397,11 +5945,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -static bool sched_smp_initialized __read_mostly; - #ifdef CONFIG_HOTPLUG_CPU -/* Run through task list and find tasks affined to the dead cpu, then remove - * that cpu from the list, enable cpu0 and set the zerobound flag. */ +/* + * Run through task list and find tasks affined to the dead cpu, then remove + * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold + * cpu 0 and src_cpu's runqueue locks.
+ */ static void bind_zero(int src_cpu) { struct task_struct *p, *t; @@ -5412,15 +5961,17 @@ static void bind_zero(int src_cpu) do_each_thread(t, p) { if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) { - cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p)); - cpumask_set_cpu(0, tsk_cpus_allowed(p)); + bool local = (task_cpu(p) == src_cpu); + + /* task_running is the cpu stopper thread */ + if (local && task_running(task_rq(p), p)) + continue; + atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p)); + atomic_set_cpu(0, tsk_cpus_allowed(p)); p->zerobound = true; bound++; - if (task_cpu(p) == src_cpu) { + if (local) set_task_cpu(p, 0); - if (task_running(p)) - resched_task(p); - } } } while_each_thread(t, p); @@ -5834,7 +6385,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { old_rd = rq->rd; @@ -5860,7 +6411,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); @@ -6839,14 +7390,13 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } unbind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); return 0; } @@ -6894,14 +7444,15 @@ int sched_cpu_dying(unsigned int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + local_irq_save(flags); + double_rq_lock(rq, cpu_rq(0)); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } bind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + double_rq_unlock(rq, cpu_rq(0)); + local_irq_restore(flags); return 0; } @@ -6958,9 +7509,8 @@ void __init sched_init_smp(void) #ifdef CONFIG_SCHED_SMT bool smt_threads = false; #endif - struct rq *rq; - cpumask_var_t non_isolated_cpus; + struct rq *rq; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -6985,7 +7535,8 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); mutex_lock(&sched_domains_mutex); - grq_lock_irq(); + local_irq_disable(); + lock_all_rqs(); /* * Set up the relative cache distance of each online cpu from each * other in a simple array for quick lookup. 
Locality is determined @@ -7025,21 +7576,21 @@ void __init sched_init_smp(void) } #endif #ifdef CONFIG_SCHED_SMT - for_each_cpu(other_cpu, thread_cpumask(cpu)) - rq->cpu_locality[other_cpu] = 1; if (cpumask_weight(thread_cpumask(cpu)) > 1) { cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); cpumask_clear_cpu(cpu, &rq->thread_mask); + for_each_cpu(other_cpu, thread_cpumask(cpu)) + rq->cpu_locality[other_cpu] = 1; rq->siblings_idle = siblings_cpu_idle; smt_threads = true; } #endif } for_each_possible_cpu(cpu) { - int total_cpus = 0, locality; + int total_cpus = 1, locality; rq = cpu_rq(cpu); - for (locality = 0; locality <= 4; locality++) { + for (locality = 1; locality <= 4; locality++) { for_each_possible_cpu(other_cpu) { if (rq->cpu_locality[other_cpu] == locality) rq->rq_order[total_cpus++] = cpu_rq(other_cpu); @@ -7053,7 +7604,8 @@ void __init sched_init_smp(void) smt_schedule = &smt_should_schedule; } #endif - grq_unlock_irq(); + unlock_all_rqs(); + local_irq_enable(); mutex_unlock(&sched_domains_mutex); for_each_online_cpu(cpu) { @@ -7062,7 +7614,7 @@ void __init sched_init_smp(void) for_each_online_cpu(other_cpu) { if (other_cpu <= cpu) continue; - printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); + printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); } } sched_smp_initialized = true; @@ -7116,21 +7668,14 @@ void __init sched_init(void) for (i = 1 ; i < NICE_WIDTH ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - raw_spin_lock_init(&grq.lock); - grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; - grq.niffies = 0; - grq.last_jiffy = jiffies; - raw_spin_lock_init(&grq.iso_lock); - grq.iso_ticks = 0; - grq.iso_refractory = false; - grq.noc = 1; - skiplist_init(&grq.node); - grq.sl = new_skiplist(&grq.node); + atomic_set(&grq.nr_running, 0); + atomic_set(&grq.nr_uninterruptible, 0); + atomic64_set(&grq.nr_switches, 0); skiplist_node_init(&init_task.node); #ifdef CONFIG_SMP init_defrootdomain(); - grq.qnr = grq.idle_cpus = 0; + atomic_set(&grq.qnr, 0); cpumask_clear(&grq.cpu_idle_map); #else uprq = &per_cpu(runqueues, 0); @@ -7145,12 +7690,18 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { rq = cpu_rq(i); - rq->grq_lock = &grq.lock; - rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = - rq->iowait_pc = rq->idle_pc = 0; - rq->dither = false; + skiplist_init(&rq->node); + rq->sl = new_skiplist(&rq->node); + raw_spin_lock_init(&rq->lock); + rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; + rq->last_jiffy = jiffies; + rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = + rq->iowait_ns = rq->idle_ns = 0; + rq->dither = 0; + set_rq_task(rq, &init_task); + rq->iso_ticks = 0; + rq->iso_refractory = false; #ifdef CONFIG_SMP - rq->last_niffy = 0; rq->sd = NULL; rq->rd = NULL; rq->online = false; @@ -7302,9 +7853,9 @@ static inline void normalise_rt_tasks(void) if (!rt_task(p) && !iso_task(p)) continue; - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); __setscheduler(p, rq, SCHED_NORMAL, 0, false); - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); } read_unlock(&tasklist_lock); } @@ -7367,6 +7918,25 @@ void set_curr_task(int cpu, struct task_struct *p) /* * Use precise platform statistics if available: */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_common_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + 
vtime_account_idle(prev); + else + vtime_account_system(prev); + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + vtime_account_user(prev); +#endif + arch_vtime_task_switch(prev); +} +#endif + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -7395,20 +7965,26 @@ void vtime_account_system_irqsafe(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); else - vtime_account_system(prev); - - vtime_account_user(prev); - arch_vtime_task_switch(prev); + vtime_account_system(tsk); } -#endif +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#else +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ /* * Perform (stime * rtime) / total, but avoid multiplication overflow by * losing precision when the numbers are big. @@ -7530,7 +8106,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ void init_idle_bootup_task(struct task_struct *idle) {} diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/MuQSS.h index 00a16ba0a..4e3115dac 100644 --- a/kernel/sched/bfs_sched.h +++ b/kernel/sched/MuQSS.h @@ -1,9 +1,14 @@ #include <linux/sched.h> #include <linux/cpuidle.h> +#include <linux/skip_list.h> #include <linux/stop_machine.h> -#ifndef BFS_SCHED_H -#define BFS_SCHED_H +#ifndef MUQSS_SCHED_H +#define MUQSS_SCHED_H + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 /* * This is the main, per-CPU runqueue data structure. 
diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/MuQSS.h
index 00a16ba0a..4e3115dac 100644
--- a/kernel/sched/bfs_sched.h
+++ b/kernel/sched/MuQSS.h
@@ -1,9 +1,14 @@
 #include <linux/sched.h>
 #include <linux/cpuidle.h>
+#include <linux/skip_list.h>
 #include <linux/stop_machine.h>
 
-#ifndef BFS_SCHED_H
-#define BFS_SCHED_H
+#ifndef MUQSS_SCHED_H
+#define MUQSS_SCHED_H
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED	1
+#define TASK_ON_RQ_MIGRATING	2
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -13,16 +18,21 @@ struct rq {
 	struct task_struct *curr, *idle, *stop;
 	struct mm_struct *prev_mm;
 
-	/* Pointer to grq spinlock */
-	raw_spinlock_t *grq_lock;
+	raw_spinlock_t lock;
 
-	/* Stored data about rq->curr to work outside grq lock */
+	/* Stored data about rq->curr to work outside rq lock */
 	u64 rq_deadline;
-	unsigned int rq_policy;
-	int rq_time_slice;
-	u64 rq_last_ran;
 	int rq_prio;
-	int soft_affined; /* Running or queued tasks with this set as their rq */
+
+	/* Best queued id for use outside lock */
+	u64 best_key;
+
+	unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */
+	unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */
+	u64 niffies; /* Last time this RQ updated rq clock */
+	u64 last_niffy; /* Last niffies as updated by local clock */
+	u64 last_jiffy_niffies; /* Niffies @ last_jiffy */
+
+	u64 load_update; /* When we last updated load */
 	unsigned long load_avg; /* Rolling load average */
 #ifdef CONFIG_SMT_NICE
@@ -30,12 +40,15 @@ struct rq {
 	int rq_smt_bias; /* Policy/nice level bias across smt siblings */
 #endif
 	/* Accurate timekeeping data */
-	u64 timekeep_clock;
-	unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
-		      iowait_pc, idle_pc;
+	unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns,
+		      iowait_ns, idle_ns;
 	atomic_t nr_iowait;
 
+	skiplist_node node;
+	skiplist *sl;
 #ifdef CONFIG_SMP
+	struct task_struct *preempt; /* Preempt triggered on this task */
+
 	int cpu;		/* cpu of this runqueue */
 	bool online;
@@ -43,6 +56,7 @@ struct rq {
 	struct sched_domain *sd;
 	int *cpu_locality; /* CPU relative cache distance */
 	struct rq **rq_order; /* RQs ordered by relative cache distance */
+
 #ifdef CONFIG_SCHED_SMT
 	cpumask_t thread_mask;
 	bool (*siblings_idle)(struct rq *rq);
@@ -53,7 +67,6 @@ struct rq {
 	bool (*cache_idle)(struct rq *rq); /* See if all cache siblings are idle */
 #endif /* CONFIG_SCHED_MC */
-	u64 last_niffy; /* Last time this RQ updated grq.niffies */
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64 prev_irq_time;
@@ -67,7 +80,10 @@ struct rq {
 	u64 clock, old_clock, last_tick;
 	u64 clock_task;
-	bool dither;
+	int dither;
+
+	int iso_ticks;
+	bool iso_refractory;
 
 #ifdef CONFIG_SCHEDSTATS
@@ -88,6 +104,11 @@ struct rq {
 	unsigned int ttwu_count;
 	unsigned int ttwu_local;
 #endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_SMP
+	struct llist_head wake_list;
+#endif
+
 #ifdef CONFIG_CPU_IDLE
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state *idle_state;
@@ -111,6 +132,41 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 #define raw_rq()	raw_cpu_ptr(&runqueues)
 #endif /* CONFIG_SMP */
 
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP  - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ *                are in a known state which allows modification. Such pairs
+ *                should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ *        in the runqueue.
+ *
+ * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_MIGRATED  - the task was migrated during wakeup
+ *
+ */
+
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
+
+#define ENQUEUE_WAKEUP		0x01
+#define ENQUEUE_RESTORE		0x02
+#define ENQUEUE_MOVE		0x04
+
+#define ENQUEUE_HEAD		0x08
+#define ENQUEUE_REPLENISH	0x10
+#ifdef CONFIG_SMP
+#define ENQUEUE_MIGRATED	0x20
+#else
+#define ENQUEUE_MIGRATED	0x00
+#endif
+
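
The SAVE/RESTORE pair documented above is the conventional pattern for modifying a queued task: dequeue, change state, re-queue, so the queue ordering stays consistent. A sketch of the pattern under assumed helper names (dequeue_task()/enqueue_task()/task_on_rq_queued() are stand-ins here, not signatures taken from this patch):

	/* Sketch: change a property that affects queue position. */
	static void set_task_prio_sketch(struct rq *rq, struct task_struct *p, int prio)
	{
		bool queued = task_on_rq_queued(p);

		if (queued)
			dequeue_task(rq, p, DEQUEUE_SAVE);
		p->prio = prio;		/* modify in a known state */
		if (queued)
			enqueue_task(rq, p, ENQUEUE_RESTORE);
	}
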
 static inline u64 __rq_clock_broken(struct rq *rq)
 {
 	return READ_ONCE(rq->clock);
@@ -118,13 +174,13 @@ static inline u64 __rq_clock_broken(struct rq *rq)
 
 static inline u64 rq_clock(struct rq *rq)
 {
-	lockdep_assert_held(rq->grq_lock);
+	lockdep_assert_held(&rq->lock);
 	return rq->clock;
 }
 
 static inline u64 rq_clock_task(struct rq *rq)
 {
-	lockdep_assert_held(rq->grq_lock);
+	lockdep_assert_held(&rq->lock);
 	return rq->clock_task;
 }
 
@@ -157,17 +213,11 @@ static inline void unregister_sched_domain_sysctl(void)
 }
 #endif
 
-static inline void sched_ttwu_pending(void) { }
-
-static inline int task_on_rq_queued(struct task_struct *p)
-{
-	return p->on_rq;
-}
-
 #ifdef CONFIG_SMP
-
+extern void sched_ttwu_pending(void);
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
-
+#else
+static inline void sched_ttwu_pending(void) { }
 #endif
 
 #ifdef CONFIG_CPU_IDLE
@@ -221,4 +271,4 @@ static inline void cpufreq_trigger(u64 time, unsigned long util)
 #define arch_scale_freq_invariant()	(false)
 #endif
 
-#endif /* BFS_SCHED_H */
+#endif /* MUQSS_SCHED_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 2ebd7b0c4..2a644b6be 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -9,8 +9,8 @@
  * published by the Free Software Foundation.
  */
 
-#ifdef CONFIG_SCHED_BFS
-#include "bfs_sched.h"
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
 #else
 #include "sched.h"
 #endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index eba226d7c..a2bf1ea69 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -16,8 +16,8 @@
 #include <linux/slab.h>
 #include <trace/events/power.h>
 
-#ifdef CONFIG_SCHED_BFS
-#include "bfs_sched.h"
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
 #else
 #include "sched.h"
 #endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1e855dcbd..060b76d85 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -14,8 +14,8 @@
 
 #include <trace/events/power.h>
 
-#ifdef CONFIG_SCHED_BFS
-#include "bfs_sched.h"
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
 #else
 #include "sched.h"
 #endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 7466a0bb2..ba7b137c3 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,10 +4,10 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 
-#ifndef CONFIG_SCHED_BFS
+#ifndef CONFIG_SCHED_MUQSS
 #include "sched.h"
 #else
-#include "bfs_sched.h"
+#include "MuQSS.h"
 #endif
 
 /*
diff --git a/kernel/skip_lists.c b/kernel/skip_list.c
index 40b7ba240..5c66067f2 100644
--- a/kernel/skip_lists.c
+++ b/kernel/skip_list.c
@@ -26,17 +26,17 @@ init: defines slnode
 
 new_skiplist: returns a new, empty list
 
 randomLevel: Returns a random level based on a u64 random seed passed to it.
-In BFS, the "niffy" time is used for this purpose.
+In MuQSS, the "niffy" time is used for this purpose.
 
 insert(l,key, value): inserts the binding (key, value) into l. This operation
 occurs in O(log n) time.
 
 delnode(slnode, l, node): deletes any binding of key from the l based on the
 actual node value. This operation occurs in O(k) time where k is the
-number of levels of the node in question (max 16). The original delete
+number of levels of the node in question (max 8). The original delete
 function occurred in O(log n) time and involved a search.
 
-BFS Notes: In this implementation of skiplists, there are bidirectional
+MuQSS Notes: In this implementation of skiplists, there are bidirectional
 next/prev pointers and the insert function returns a pointer to the actual
 node the value is stored. The key here is chosen by the scheduler so as to
 sort tasks according to the priority list requirements and is no longer used
@@ -49,9 +49,9 @@ aid of prev<->next pointer manipulation and no searching.
 */
 
 #include <linux/slab.h>
-#include <linux/skip_lists.h>
+#include <linux/skip_list.h>
 
-#define MaxNumberOfLevels 16
+#define MaxNumberOfLevels 8
 #define MaxLevel (MaxNumberOfLevels - 1)
 
 void skiplist_init(skiplist_node *slnode)
@@ -111,9 +111,7 @@ static inline unsigned int randomLevel(int entries, unsigned int randseed)
 {
 	unsigned int mask;
 
-	if (entries > 31)
-		mask = 0xF;
-	else if (entries > 15)
+	if (entries > 15)
 		mask = 0x7;
 	else if (entries > 7)
 		mask = 0x3;
@@ -139,6 +137,8 @@ void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType va
 	} while (--k >= 0);
 
 	k = randomLevel(++l->entries, randseed);
+	if (k > MaxLevel)
+		k = MaxLevel;
 	if (k > l->level) {
 		k = ++l->level;
 		update[k] = l->header;
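
Putting the documented operations together, a usage sketch of this API as described by the comments above and the sched_init() hunk earlier (the delete call follows the "delnode(slnode, l, node)" form from the comment; exact names and the keyType/valueType typedefs should be checked against include/linux/skip_list.h):

	static skiplist_node sl_head;	/* header node */
	static skiplist *sl;

	skiplist_init(&sl_head);
	sl = new_skiplist(&sl_head);	/* returns a new, empty list */

	/* O(log n): the key sorts the task; niffies seeds randomLevel(). */
	skiplist_insert(sl, &p->node, key, p, niffies);

	/* O(k), k = level of this node (at most 8 after this patch);
	 * the embedded node means deletion needs no search. */
	skiplist_delnode(&sl_head, sl, &p->node);
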
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index fc0d8270f..13bc43d1f 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data)
 
 		if (kthread_should_park()) {
 			__set_current_state(TASK_RUNNING);
+			preempt_enable();
 			if (ht->park && td->status == HP_THREAD_ACTIVE) {
 				BUG_ON(td->cpu != smp_processor_id());
 				ht->park(td->cpu);
 				td->status = HP_THREAD_PARKED;
 			}
-			preempt_enable();
 			kthread_parkme();
 			/* We might have been woken for stop */
 			continue;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ca8093ed7..4bd185938 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -127,7 +127,7 @@ static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
 static int __read_mostly one_hundred = 100;
 static int __read_mostly one_thousand = 1000;
-#ifdef CONFIG_SCHED_BFS
+#ifdef CONFIG_SCHED_MUQSS
 extern int rr_interval;
 extern int sched_interactive;
 extern int sched_iso_cpu;
@@ -269,7 +269,7 @@ static struct ctl_table sysctl_base_table[] = {
 	{ }
 };
 
-#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS)
 static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
@@ -286,7 +286,7 @@ static int max_extfrag_threshold = 1000;
 #endif
 
 static struct ctl_table kern_table[] = {
-#ifndef CONFIG_SCHED_BFS
+#ifndef CONFIG_SCHED_MUQSS
 	{
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
@@ -455,7 +455,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
-#endif /* !CONFIG_SCHED_BFS */
+#endif /* !CONFIG_SCHED_MUQSS */
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
@@ -1020,7 +1020,7 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_SCHED_BFS
+#ifdef CONFIG_SCHED_MUQSS
 	{
 		.procname	= "rr_interval",
 		.data		= &rr_interval,
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 6931b6e3c..10e18d267 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -89,7 +89,7 @@ config NO_HZ_IDLE
 config NO_HZ_FULL
 	bool "Full dynticks system (tickless)"
 	# NO_HZ_COMMON dependency
-	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS
+	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_MUQSS
 	# We need at least one periodic CPU for timekeeping
 	depends on SMP
 	depends on HAVE_CONTEXT_TRACKING
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 287cf721c..69ee53a67 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1039,8 +1039,8 @@ static int trace_wakeup_test_thread(void *data)
 {
 	/* Make this a -deadline thread */
 	static const struct sched_attr attr = {
-#ifdef CONFIG_SCHED_BFS
-		/* No deadline on BFS, use RR */
+#ifdef CONFIG_SCHED_MUQSS
+		/* No deadline on MuQSS, use RR */
 		.sched_policy = SCHED_RR,
 #else
 		.sched_policy = SCHED_DEADLINE,
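
The rr_interval entry registered in kern_table above is exposed through procfs in the usual way. A quick userspace check (assuming the standard /proc/sys mapping and that the value is in milliseconds, as BFS documented it):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/rr_interval", "r");
		int ms;

		if (f && fscanf(f, "%d", &ms) == 1)
			printf("rr_interval: %d ms\n", ms);
		if (f)
			fclose(f);
		return 0;
	}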