From 670027c507e99521d416994a18a498def9ef2ea3 Mon Sep 17 00:00:00 2001
From: André Fabian Silva Delgado
Date: Sat, 22 Oct 2016 19:31:08 -0300
Subject: Linux-libre 4.8.3-gnu

---
 kernel/Makefile                            |    2 +-
 kernel/power/tuxonice.h                    |  260 -
 kernel/power/tuxonice_alloc.c              |  308 --
 kernel/power/tuxonice_alloc.h              |   54 -
 kernel/power/tuxonice_atomic_copy.c        |  469 --
 kernel/power/tuxonice_atomic_copy.h        |   25 -
 kernel/power/tuxonice_bio.h                |   80 -
 kernel/power/tuxonice_bio_chains.c         | 1121 ----
 kernel/power/tuxonice_bio_core.c           | 1937 -------
 kernel/power/tuxonice_bio_internal.h       |  101 -
 kernel/power/tuxonice_bio_signature.c      |  403 --
 kernel/power/tuxonice_builtin.c            |  498 --
 kernel/power/tuxonice_builtin.h            |   41 -
 kernel/power/tuxonice_checksum.c           |  392 --
 kernel/power/tuxonice_checksum.h           |   31 -
 kernel/power/tuxonice_cluster.c            | 1058 ----
 kernel/power/tuxonice_cluster.h            |   18 -
 kernel/power/tuxonice_compress.c           |  452 --
 kernel/power/tuxonice_copy_before_write.c  |  240 -
 kernel/power/tuxonice_extent.c             |  144 -
 kernel/power/tuxonice_extent.h             |   45 -
 kernel/power/tuxonice_file.c               |  484 --
 kernel/power/tuxonice_highlevel.c          | 1414 -----
 kernel/power/tuxonice_incremental.c        |  402 --
 kernel/power/tuxonice_io.c                 | 1936 -------
 kernel/power/tuxonice_io.h                 |   72 -
 kernel/power/tuxonice_modules.c            |  520 --
 kernel/power/tuxonice_modules.h            |  212 -
 kernel/power/tuxonice_netlink.c            |  324 --
 kernel/power/tuxonice_netlink.h            |   62 -
 kernel/power/tuxonice_pagedir.c            |  345 --
 kernel/power/tuxonice_pagedir.h            |   50 -
 kernel/power/tuxonice_pageflags.c          |   18 -
 kernel/power/tuxonice_pageflags.h          |  106 -
 kernel/power/tuxonice_power_off.c          |  286 -
 kernel/power/tuxonice_power_off.h          |   24 -
 kernel/power/tuxonice_prepare_image.c      | 1089 ----
 kernel/power/tuxonice_prepare_image.h      |   38 -
 kernel/power/tuxonice_prune.c              |  406 --
 kernel/power/tuxonice_storage.c            |  282 -
 kernel/power/tuxonice_storage.h            |   45 -
 kernel/power/tuxonice_swap.c               |  474 --
 kernel/power/tuxonice_sysfs.c              |  333 --
 kernel/power/tuxonice_sysfs.h              |  137 -
 kernel/power/tuxonice_ui.c                 |  247 -
 kernel/power/tuxonice_ui.h                 |   97 -
 kernel/power/tuxonice_userui.c             |  658 ---
 kernel/sched/Makefile                      |    4 +-
 kernel/sched/MuQSS.c                       | 8247 +++++++++++++++++++++++++++++
 kernel/sched/MuQSS.h                       |  274 +
 kernel/sched/bfs.c                         | 7671 ---------------------------
 kernel/sched/bfs_sched.h                   |  224 -
 kernel/sched/cpufreq.c                     |    4 +-
 kernel/sched/cpufreq_schedutil.c           |    4 +-
 kernel/sched/idle.c                        |    4 +-
 kernel/sched/stats.c                       |    4 +-
 kernel/skip_list.c                         |  174 +
 kernel/skip_lists.c                        |  174 -
 kernel/smpboot.c                           |    2 +-
 kernel/sysctl.c                            |   10 +-
 kernel/time/Kconfig                        |    2 +-
 kernel/trace/trace_selftest.c              |    4 +-
 62 files changed, 8715 insertions(+), 25827 deletions(-)
 delete mode 100644 kernel/power/tuxonice.h
 delete mode 100644 kernel/power/tuxonice_alloc.c
 delete mode 100644 kernel/power/tuxonice_alloc.h
 delete mode 100644 kernel/power/tuxonice_atomic_copy.c
 delete mode 100644 kernel/power/tuxonice_atomic_copy.h
 delete mode 100644 kernel/power/tuxonice_bio.h
 delete mode 100644 kernel/power/tuxonice_bio_chains.c
 delete mode 100644 kernel/power/tuxonice_bio_core.c
 delete mode 100644 kernel/power/tuxonice_bio_internal.h
 delete mode 100644 kernel/power/tuxonice_bio_signature.c
 delete mode 100644 kernel/power/tuxonice_builtin.c
 delete mode 100644 kernel/power/tuxonice_builtin.h
 delete mode 100644 kernel/power/tuxonice_checksum.c
 delete mode 100644 kernel/power/tuxonice_checksum.h
 delete mode 100644 kernel/power/tuxonice_cluster.c
 delete mode 100644 kernel/power/tuxonice_cluster.h
 delete mode 100644 kernel/power/tuxonice_compress.c
 delete mode 100644 kernel/power/tuxonice_copy_before_write.c
 delete mode 100644 kernel/power/tuxonice_extent.c
 delete mode 100644 kernel/power/tuxonice_extent.h
 delete mode 100644 kernel/power/tuxonice_file.c
 delete mode 100644 kernel/power/tuxonice_highlevel.c
 delete mode 100644 kernel/power/tuxonice_incremental.c
 delete mode 100644 kernel/power/tuxonice_io.c
 delete mode 100644 kernel/power/tuxonice_io.h
 delete mode 100644 kernel/power/tuxonice_modules.c
 delete mode 100644 kernel/power/tuxonice_modules.h
 delete mode 100644 kernel/power/tuxonice_netlink.c
 delete mode 100644 kernel/power/tuxonice_netlink.h
 delete mode 100644 kernel/power/tuxonice_pagedir.c
 delete mode 100644 kernel/power/tuxonice_pagedir.h
 delete mode 100644 kernel/power/tuxonice_pageflags.c
 delete mode 100644 kernel/power/tuxonice_pageflags.h
 delete mode 100644 kernel/power/tuxonice_power_off.c
 delete mode 100644 kernel/power/tuxonice_power_off.h
 delete mode 100644 kernel/power/tuxonice_prepare_image.c
 delete mode 100644 kernel/power/tuxonice_prepare_image.h
 delete mode 100644 kernel/power/tuxonice_prune.c
 delete mode 100644 kernel/power/tuxonice_storage.c
 delete mode 100644 kernel/power/tuxonice_storage.h
 delete mode 100644 kernel/power/tuxonice_swap.c
 delete mode 100644 kernel/power/tuxonice_sysfs.c
 delete mode 100644 kernel/power/tuxonice_sysfs.h
 delete mode 100644 kernel/power/tuxonice_ui.c
 delete mode 100644 kernel/power/tuxonice_ui.h
 delete mode 100644 kernel/power/tuxonice_userui.c
 create mode 100644 kernel/sched/MuQSS.c
 create mode 100644 kernel/sched/MuQSS.h
 delete mode 100644 kernel/sched/bfs.c
 delete mode 100644 kernel/sched/bfs_sched.h
 create mode 100644 kernel/skip_list.c
 delete mode 100644 kernel/skip_lists.c

diff --git a/kernel/Makefile b/kernel/Makefile
index dd84575cb..7241e3459 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
 	extable.o params.o \
 	kthread.o sys_ni.o nsproxy.o \
 	notifier.o ksysfs.o cred.o reboot.o \
-	async.o range.o smpboot.o skip_lists.o
+	async.o range.o smpboot.o skip_list.o
 
 obj-$(CONFIG_MULTIUSER) += groups.o

diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
deleted file mode 100644
index 10b65633f..000000000
--- a/kernel/power/tuxonice.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * kernel/power/tuxonice.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations used throughout swsusp.
- *
- */
-
-#ifndef KERNEL_POWER_TOI_H
-#define KERNEL_POWER_TOI_H
-
-#include
-#include
-#include
-#include
-#include
-#include "tuxonice_pageflags.h"
-#include "power.h"
-
-#define TOI_CORE_VERSION "3.3"
-#define TOI_HEADER_VERSION 3
-#define MY_BOOT_KERNEL_DATA_VERSION 4
-
-struct toi_boot_kernel_data {
-	int version;
-	int size;
-	unsigned long toi_action;
-	unsigned long toi_debug_state;
-	u32 toi_default_console_level;
-	int toi_io_time[2][2];
-	char toi_nosave_commandline[COMMAND_LINE_SIZE];
-	unsigned long pages_used[33];
-	unsigned long incremental_bytes_in;
-	unsigned long incremental_bytes_out;
-	unsigned long compress_bytes_in;
-	unsigned long compress_bytes_out;
-	unsigned long pruned_pages;
-};
-
-extern struct toi_boot_kernel_data toi_bkd;
-
-/* Location of boot kernel data struct in kernel being resumed */
-extern unsigned long boot_kernel_data_buffer;
-
-/* == Action states == */
-
-enum {
-	TOI_REBOOT,
-	TOI_PAUSE,
-	TOI_LOGALL,
-	TOI_CAN_CANCEL,
-	TOI_KEEP_IMAGE,
-	TOI_FREEZER_TEST,
-	TOI_SINGLESTEP,
-	TOI_PAUSE_NEAR_PAGESET_END,
-	TOI_TEST_FILTER_SPEED,
-	TOI_TEST_BIO,
-	TOI_NO_PAGESET2,
-	TOI_IGNORE_ROOTFS,
-	TOI_REPLACE_SWSUSP,
-	TOI_PAGESET2_FULL,
-	TOI_ABORT_ON_RESAVE_NEEDED,
-	TOI_NO_MULTITHREADED_IO,
-	TOI_NO_DIRECT_LOAD,	/* Obsolete */
-	TOI_LATE_CPU_HOTPLUG,	/* Obsolete */
-	TOI_GET_MAX_MEM_ALLOCD,
-	TOI_NO_FLUSHER_THREAD,
-	TOI_NO_PS2_IF_UNNEEDED,
-	TOI_POST_RESUME_BREAKPOINT,
-	TOI_NO_READAHEAD,
-	TOI_TRACE_DEBUG_ON,
-	TOI_INCREMENTAL_IMAGE,
-};
-
-extern unsigned long toi_bootflags_mask;
-
-#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
-
-/* == Result states == */
-
-enum {
-	TOI_ABORTED,
-	TOI_ABORT_REQUESTED,
-	TOI_NOSTORAGE_AVAILABLE,
-	TOI_INSUFFICIENT_STORAGE,
-	TOI_FREEZING_FAILED,
-	TOI_KEPT_IMAGE,
-	TOI_WOULD_EAT_MEMORY,
-	TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
-	TOI_PM_SEM,
-	TOI_DEVICE_REFUSED,
-	TOI_SYSDEV_REFUSED,
-	TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
-	TOI_UNABLE_TO_PREPARE_IMAGE,
-	TOI_FAILED_MODULE_INIT,
-	TOI_FAILED_MODULE_CLEANUP,
-	TOI_FAILED_IO,
-	TOI_OUT_OF_MEMORY,
-	TOI_IMAGE_ERROR,
-	TOI_PLATFORM_PREP_FAILED,
-	TOI_CPU_HOTPLUG_FAILED,
-	TOI_ARCH_PREPARE_FAILED,	/* Removed Linux-3.0 */
-	TOI_RESAVE_NEEDED,
-	TOI_CANT_SUSPEND,
-	TOI_NOTIFIERS_PREPARE_FAILED,
-	TOI_PRE_SNAPSHOT_FAILED,
-	TOI_PRE_RESTORE_FAILED,
-	TOI_USERMODE_HELPERS_ERR,
-	TOI_CANT_USE_ALT_RESUME,
-	TOI_HEADER_TOO_BIG,
-	TOI_WAKEUP_EVENT,
-	TOI_SYSCORE_REFUSED,
-	TOI_DPM_PREPARE_FAILED,
-	TOI_DPM_SUSPEND_FAILED,
-	TOI_NUM_RESULT_STATES	/* Used in printing debug info only */
-};
-
-extern unsigned long toi_result;
-
-#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
-#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
-				test_and_set_bit(bit, &toi_result))
-#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
-#define test_result_state(bit) (test_bit(bit, &toi_result))
-
-/* == Debug sections and levels == */
-
-/* debugging levels.
*/ -enum { - TOI_STATUS = 0, - TOI_ERROR = 2, - TOI_LOW, - TOI_MEDIUM, - TOI_HIGH, - TOI_VERBOSE, -}; - -enum { - TOI_ANY_SECTION, - TOI_EAT_MEMORY, - TOI_IO, - TOI_HEADER, - TOI_WRITER, - TOI_MEMORY, - TOI_PAGEDIR, - TOI_COMPRESS, - TOI_BIO, -}; - -#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) -#define clear_debug_state(bit) \ - (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) -#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) - -/* == Steps in hibernating == */ - -enum { - STEP_HIBERNATE_PREPARE_IMAGE, - STEP_HIBERNATE_SAVE_IMAGE, - STEP_HIBERNATE_POWERDOWN, - STEP_RESUME_CAN_RESUME, - STEP_RESUME_LOAD_PS1, - STEP_RESUME_DO_RESTORE, - STEP_RESUME_READ_PS2, - STEP_RESUME_GO, - STEP_RESUME_ALT_IMAGE, - STEP_CLEANUP, - STEP_QUIET_CLEANUP -}; - -/* == TuxOnIce states == - (see also include/linux/suspend.h) */ - -#define get_toi_state() (toi_state) -#define restore_toi_state(saved_state) \ - do { toi_state = saved_state; } while (0) - -/* == Module support == */ - -struct toi_core_fns { - int (*post_context_save)(void); - unsigned long (*get_nonconflicting_page)(void); - int (*try_hibernate)(void); - void (*try_resume)(void); -}; - -extern struct toi_core_fns *toi_core_fns; - -/* == All else == */ -#define KB(x) ((x) << (PAGE_SHIFT - 10)) -#define MB(x) ((x) >> (20 - PAGE_SHIFT)) - -extern int toi_start_anything(int toi_or_resume); -extern void toi_finish_anything(int toi_or_resume); - -extern int save_image_part1(void); -extern int toi_atomic_restore(void); - -extern int toi_try_hibernate(void); -extern void toi_try_resume(void); - -extern int __toi_post_context_save(void); - -extern unsigned int nr_hibernates; -extern char alt_resume_param[256]; - -extern void copyback_post(void); -extern int toi_hibernate(void); -extern unsigned long extra_pd1_pages_used; - -#define SECTOR_SIZE 512 - -extern void toi_early_boot_message(int can_erase_image, int default_answer, - char *warning_reason, ...); - -extern int do_check_can_resume(void); -extern int do_toi_step(int step); -extern int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug); - -extern char tuxonice_signature[9]; - -extern int toi_start_other_threads(void); -extern void toi_stop_other_threads(void); - -extern int toi_trace_index; -#define TOI_TRACE_DEBUG(PFN, DESC, ...) 
\ - do { \ - if (test_action_state(TOI_TRACE_DEBUG_ON)) { \ - printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \ - } \ - } while(0) - -#ifdef CONFIG_TOI_KEEP_IMAGE -#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE)) -#else -#define toi_keeping_image (0) -#endif - -#ifdef CONFIG_TOI_INCREMENTAL -extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose); -extern int toi_reset_dirtiness(int verbose); -extern void toi_cbw_write(void); -extern void toi_cbw_restore(void); -extern int toi_allocate_cbw_data(void); -extern void toi_free_cbw_data(void); -extern int toi_cbw_init(void); -extern void toi_mark_tasks_cbw(void); -#else -static inline int toi_reset_dirtiness(int verbose) { return 0; } -#define toi_cbw_write() do { } while(0) -#define toi_cbw_restore() do { } while(0) -#define toi_allocate_cbw_data() do { } while(0) -#define toi_free_cbw_data() do { } while(0) -static inline int toi_cbw_init(void) { return 0; } -#endif -#endif diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c deleted file mode 100644 index 1d8b1cbda..000000000 --- a/kernel/power/tuxonice_alloc.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.c - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include -#include -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -#define TOI_ALLOC_PATHS 41 - -static DEFINE_MUTEX(toi_alloc_mutex); - -static struct toi_module_ops toi_alloc_ops; - -static int toi_fail_num; - -static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], - toi_free_count[TOI_ALLOC_PATHS], - toi_test_count[TOI_ALLOC_PATHS], - toi_fail_count[TOI_ALLOC_PATHS]; -static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; -static int cur_allocd, max_allocd; - -static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { - "", /* 0 */ - "get_io_info_struct", - "extent", - "extent (loading chain)", - "userui channel", - "userui arg", /* 5 */ - "attention list metadata", - "extra pagedir memory metadata", - "bdev metadata", - "extra pagedir memory", - "header_locations_read", /* 10 */ - "bio queue", - "prepare_readahead", - "i/o buffer", - "writer buffer in bio_init", - "checksum buffer", /* 15 */ - "compression buffer", - "filewriter signature op", - "set resume param alloc1", - "set resume param alloc2", - "debugging info buffer", /* 20 */ - "check can resume buffer", - "write module config buffer", - "read module config buffer", - "write image header buffer", - "read pageset1 buffer", /* 25 */ - "get_have_image_data buffer", - "checksum page", - "worker rw loop", - "get nonconflicting page", - "ps1 load addresses", /* 30 */ - "remove swap image", - "swap image exists", - "swap parse sig location", - "sysfs kobj", - "swap mark resume attempted buffer", /* 35 */ - "cluster member", - "boot kernel data buffer", - "setting swap signature", - "block i/o bdev struct", - "copy before write", /* 40 */ -}; - -#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ - do { \ - BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ - \ - if (FAIL_NUM == toi_fail_num) { \ - atomic_inc(&toi_test_count[FAIL_NUM]); \ - toi_fail_num = 0; \ - return FAIL_VAL; \ - } \ - } while (0) - -static void alloc_update_stats(int fail_num, void *result, int size) -{ - if (!result) { - atomic_inc(&toi_fail_count[fail_num]); - return; - } - - atomic_inc(&toi_alloc_count[fail_num]); - if 
(unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - toi_cur_allocd[fail_num]++; - cur_allocd += size; - if (unlikely(cur_allocd > max_allocd)) { - int i; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - toi_max_allocd[i] = toi_cur_allocd[i]; - max_allocd = cur_allocd; - } - mutex_unlock(&toi_alloc_mutex); - } -} - -static void free_update_stats(int fail_num, int size) -{ - BUG_ON(fail_num >= TOI_ALLOC_PATHS); - atomic_inc(&toi_free_count[fail_num]); - if (unlikely(atomic_read(&toi_free_count[fail_num]) > - atomic_read(&toi_alloc_count[fail_num]))) - dump_stack(); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - cur_allocd -= size; - toi_cur_allocd[fail_num]--; - mutex_unlock(&toi_alloc_mutex); - } -} - -void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) -{ - void *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - result = kzalloc(size, flags); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, result, size); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order) -{ - unsigned long result; - - mask |= ___GFP_TOI_NOTRACK; - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - result = __get_free_pages(mask, order); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, - PAGE_SIZE << order); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -struct page *toi_alloc_page(int fail_num, gfp_t mask) -{ - struct page *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - mask |= ___GFP_TOI_NOTRACK; - result = alloc_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) -{ - unsigned long result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - mask |= ___GFP_TOI_NOTRACK; - result = get_zeroed_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -void toi_kfree(int fail_num, const void *arg, int size) -{ - if (arg && toi_alloc_ops.enabled) - free_update_stats(fail_num, size); - - if (fail_num == toi_trace_allocs) - dump_stack(); - kfree(arg); -} - -void toi_free_page(int fail_num, unsigned long virt) -{ - if (virt && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - free_page(virt); -} - -void toi__free_page(int fail_num, struct page *page) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_page(page); -} - -void toi_free_pages(int fail_num, struct page *page, int order) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE << order); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_pages(page, order); -} - -void toi_alloc_print_debug_stats(void) -{ - int i, header_done = 0; - - if (!toi_alloc_ops.enabled) - return; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - if (atomic_read(&toi_alloc_count[i]) != - atomic_read(&toi_free_count[i])) { - if (!header_done) { - printk(KERN_INFO "Idx Allocs Frees Tests " - " Fails Max Description\n"); - header_done = 1; - } - - printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, - 
atomic_read(&toi_alloc_count[i]), - atomic_read(&toi_free_count[i]), - atomic_read(&toi_test_count[i]), - atomic_read(&toi_fail_count[i]), - toi_max_allocd[i], - toi_alloc_desc[i]); - } -} - -static int toi_alloc_initialise(int starting_cycle) -{ - int i; - - if (!starting_cycle) - return 0; - - if (toi_trace_allocs) - dump_stack(); - - for (i = 0; i < TOI_ALLOC_PATHS; i++) { - atomic_set(&toi_alloc_count[i], 0); - atomic_set(&toi_free_count[i], 0); - atomic_set(&toi_test_count[i], 0); - atomic_set(&toi_fail_count[i], 0); - toi_cur_allocd[i] = 0; - toi_max_allocd[i] = 0; - }; - - max_allocd = 0; - cur_allocd = 0; - return 0; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), - SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, - NULL), - SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, - TOI_GET_MAX_MEM_ALLOCD, 0), - SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, - NULL) -}; - -static struct toi_module_ops toi_alloc_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "allocation debugging", - .directory = "alloc", - .module = THIS_MODULE, - .early = 1, - .initialise = toi_alloc_initialise, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_alloc_init(void) -{ - int result = toi_register_module(&toi_alloc_ops); - return result; -} - -void toi_alloc_exit(void) -{ - toi_unregister_module(&toi_alloc_ops); -} diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h deleted file mode 100644 index 0cd6b686f..000000000 --- a/kernel/power/tuxonice_alloc.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.h - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - */ - -#include -#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) -#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) - -#ifdef CONFIG_PM_DEBUG -extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); -extern void toi_kfree(int fail_num, const void *arg, int size); - -extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order); -#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) -extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); -extern void toi_free_page(int fail_num, unsigned long buf); -extern void toi__free_page(int fail_num, struct page *page); -extern void toi_free_pages(int fail_num, struct page *page, int order); -extern struct page *toi_alloc_page(int fail_num, gfp_t mask); -extern int toi_alloc_init(void); -extern void toi_alloc_exit(void); - -extern void toi_alloc_print_debug_stats(void); - -#else /* CONFIG_PM_DEBUG */ - -#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) -#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) - -#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) -#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) -#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) -#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) -#define toi__free_page(FAIL, PAGE) __free_page(PAGE) -#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) -#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) -static inline int toi_alloc_init(void) -{ - return 0; -} - -static inline void toi_alloc_exit(void) { } - -static inline void toi_alloc_print_debug_stats(void) { } - -#endif - -extern int toi_trace_allocs; diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c deleted file mode 100644 index 5845217f8..000000000 --- a/kernel/power/tuxonice_atomic_copy.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.c - * - * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_power_off.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" - -unsigned long extra_pd1_pages_used; - -/** - * free_pbe_list - free page backup entries used by the atomic copy code. - * @list: List to free. - * @highmem: Whether the list is in highmem. - * - * Normally, this function isn't used. If, however, we need to abort before - * doing the atomic copy, we use this to free the pbes previously allocated. 
- **/
-static void free_pbe_list(struct pbe **list, int highmem)
-{
-	while (*list) {
-		int i;
-		struct pbe *free_pbe, *next_page = NULL;
-		struct page *page;
-
-		if (highmem) {
-			page = (struct page *) *list;
-			free_pbe = (struct pbe *) kmap(page);
-		} else {
-			page = virt_to_page(*list);
-			free_pbe = *list;
-		}
-
-		for (i = 0; i < PBES_PER_PAGE; i++) {
-			if (!free_pbe)
-				break;
-			if (highmem)
-				toi__free_page(29, free_pbe->address);
-			else
-				toi_free_page(29,
-					(unsigned long) free_pbe->address);
-			free_pbe = free_pbe->next;
-		}
-
-		if (highmem) {
-			if (free_pbe)
-				next_page = free_pbe;
-			kunmap(page);
-		} else {
-			if (free_pbe)
-				next_page = free_pbe;
-		}
-
-		toi__free_page(29, page);
-		*list = (struct pbe *) next_page;
-	};
-}
-
-/**
- * copyback_post - post atomic-restore actions
- *
- * After doing the atomic restore, we have a few more things to do:
- * 1) We want to retain some values across the restore, so we now copy
- * these from the nosave variables to the normal ones.
- * 2) Set the status flags.
- * 3) Resume devices.
- * 4) Tell userui so it can redraw & restore settings.
- * 5) Reread the page cache.
- **/
-void copyback_post(void)
-{
-	struct toi_boot_kernel_data *bkd =
-		(struct toi_boot_kernel_data *) boot_kernel_data_buffer;
-
-	if (toi_activate_storage(1))
-		panic("Failed to reactivate our storage.");
-
-	toi_post_atomic_restore_modules(bkd);
-
-	toi_cond_pause(1, "About to reload secondary pagedir.");
-
-	if (read_pageset2(0))
-		panic("Unable to successfully reread the page cache.");
-
-	/*
-	 * If the user wants to sleep again after resuming from full-off,
-	 * it's most likely to be in order to suspend to ram, so we'll
-	 * do this check after loading pageset2, to give them the fastest
-	 * wakeup when they are ready to use the computer again.
-	 */
-	toi_check_resleep();
-
-	if (test_action_state(TOI_INCREMENTAL_IMAGE))
-		toi_reset_dirtiness(1);
-}
-
-/**
- * toi_copy_pageset1 - do the atomic copy of pageset1
- *
- * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
- * because we can't be sure what side effects it has. On my old Duron, with
- * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
- * count at resume time 4 instead of 3.
- *
- * We don't want to call kmap_atomic unconditionally because it has the side
- * effect of incrementing the preempt count, which will leave it one too high
- * post resume (the page containing the preempt count will be copied after
- * it's incremented. This is essentially the same problem.)
- **/
-void toi_copy_pageset1(void)
-{
-	int i;
-	unsigned long source_index, dest_index;
-
-	memory_bm_position_reset(pageset1_map);
-	memory_bm_position_reset(pageset1_copy_map);
-
-	source_index = memory_bm_next_pfn(pageset1_map, 0);
-	dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
-
-	for (i = 0; i < pagedir1.size; i++) {
-		unsigned long *origvirt, *copyvirt;
-		struct page *origpage, *copypage;
-		int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1,
-		    was_present1, was_present2;
-
-		origpage = pfn_to_page(source_index);
-		copypage = pfn_to_page(dest_index);
-
-		origvirt = PageHighMem(origpage) ?
-			kmap_atomic(origpage) :
-			page_address(origpage);
-
-		copyvirt = PageHighMem(copypage) ?
- kmap_atomic(copypage) : - page_address(copypage); - - was_present1 = kernel_page_present(origpage); - if (!was_present1) - kernel_map_pages(origpage, 1, 1); - - was_present2 = kernel_page_present(copypage); - if (!was_present2) - kernel_map_pages(copypage, 1, 1); - - while (loop >= 0) { - *(copyvirt + loop) = *(origvirt + loop); - loop--; - } - - if (!was_present1) - kernel_map_pages(origpage, 1, 0); - - if (!was_present2) - kernel_map_pages(copypage, 1, 0); - - if (PageHighMem(origpage)) - kunmap_atomic(origvirt); - - if (PageHighMem(copypage)) - kunmap_atomic(copyvirt); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - } -} - -/** - * __toi_post_context_save - steps after saving the cpu context - * - * Steps taken after saving the CPU state to make the actual - * atomic copy. - * - * Called from swsusp_save in snapshot.c via toi_post_context_save. - **/ -int __toi_post_context_save(void) -{ - unsigned long old_ps1_size = pagedir1.size; - - check_checksums(); - - free_checksum_pages(); - - toi_recalculate_image_contents(1); - - extra_pd1_pages_used = pagedir1.size > old_ps1_size ? - pagedir1.size - old_ps1_size : 0; - - if (extra_pd1_pages_used > extra_pd1_pages_allowance) { - printk(KERN_INFO "Pageset1 has grown by %lu pages. " - "extra_pages_allowance is currently only %lu.\n", - pagedir1.size - old_ps1_size, - extra_pd1_pages_allowance); - - /* - * Highlevel code will see this, clear the state and - * retry if we haven't already done so twice. - */ - if (any_to_free(1)) { - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - if (try_allocate_extra_memory()) { - printk(KERN_INFO "Failed to allocate the extra memory" - " needed. Restarting the process."); - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - printk(KERN_INFO "However it looks like there's enough" - " free ram and storage to handle this, so " - " continuing anyway."); - /* - * What if try_allocate_extra_memory above calls - * toi_allocate_extra_pagedir_memory and it allocs a new - * slab page via toi_kzalloc which should be in ps1? So... - */ - toi_recalculate_image_contents(1); - } - - if (!test_action_state(TOI_TEST_FILTER_SPEED) && - !test_action_state(TOI_TEST_BIO)) - toi_copy_pageset1(); - - return 0; -} - -/** - * toi_hibernate - high level code for doing the atomic copy - * - * High-level code which prepares to do the atomic copy. Loosely based - * on the swsusp version, but with the following twists: - * - We set toi_running so the swsusp code uses our code paths. - * - We give better feedback regarding what goes wrong if there is a - * problem. - * - We use an extra function to call the assembly, just in case this code - * is in a module (return address). - **/ -int toi_hibernate(void) -{ - int error; - - error = toi_lowlevel_builtin(); - - if (!error) { - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - /* - * The boot kernel's data may be larger (newer version) or - * smaller (older version) than ours. Copy the minimum - * of the two sizes, so that we don't overwrite valid values - * from pre-atomic copy. - */ - - memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, - min_t(int, sizeof(struct toi_boot_kernel_data), - bkd->size)); - } - - return error; -} - -/** - * toi_atomic_restore - prepare to do the atomic restore - * - * Get ready to do the atomic restore. 
This part gets us into the same
- * state we are in prior to calling do_toi_lowlevel while
- * hibernating: hot-unplugging secondary cpus and freezing processes,
- * before starting the thread that will do the restore.
- **/
-int toi_atomic_restore(void)
-{
-	int error;
-
-	toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore.");
-
-	memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line,
-		strlen(saved_command_line));
-
-	toi_pre_atomic_restore_modules(&toi_bkd);
-
-	if (add_boot_kernel_data_pbe())
-		goto Failed;
-
-	toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
-
-	if (toi_go_atomic(PMSG_QUIESCE, 0))
-		goto Failed;
-
-	/* We'll ignore saved state, but this gets preempt count (etc) right */
-	save_processor_state();
-
-	error = swsusp_arch_resume();
-	/*
-	 * Code below is only ever reached in case of failure. Otherwise
-	 * execution continues at the place where swsusp_arch_suspend was
-	 * called.
-	 *
-	 * We don't know whether it's safe to continue (this shouldn't happen),
-	 * so let's err on the side of caution.
-	 */
-	BUG();
-
-Failed:
-	free_pbe_list(&restore_pblist, 0);
-#ifdef CONFIG_HIGHMEM
-	free_pbe_list(&restore_highmem_pblist, 1);
-#endif
-	return 1;
-}
-
-/**
- * toi_go_atomic - do the actual atomic copy/restore
- * @state: The state to use for dpm_suspend_start & power_down calls.
- * @suspend_time: Whether we're suspending or resuming.
- **/
-int toi_go_atomic(pm_message_t state, int suspend_time)
-{
-	if (suspend_time) {
-		if (platform_begin(1)) {
-			set_abort_result(TOI_PLATFORM_PREP_FAILED);
-			toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
-			return 1;
-		}
-
-		if (dpm_prepare(PMSG_FREEZE)) {
-			set_abort_result(TOI_DPM_PREPARE_FAILED);
-			dpm_complete(PMSG_RECOVER);
-			toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
-			return 1;
-		}
-	}
-
-	suspend_console();
-	pm_restrict_gfp_mask();
-
-	if (suspend_time) {
-		if (dpm_suspend(state)) {
-			set_abort_result(TOI_DPM_SUSPEND_FAILED);
-			toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
-			return 1;
-		}
-	} else {
-		if (dpm_suspend_start(state)) {
-			set_abort_result(TOI_DPM_SUSPEND_FAILED);
-			toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
-			return 1;
-		}
-	}
-
-	/* At this point, dpm_suspend_start() has been called, but *not*
-	 * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now.
-	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
-	 * become desynchronized with the actual state of the hardware
-	 * at resume time, and evil weirdness ensues.
- */ - - if (dpm_suspend_end(state)) { - set_abort_result(TOI_DEVICE_REFUSED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); - return 1; - } - - if (suspend_time) { - if (platform_pre_snapshot(1)) - set_abort_result(TOI_PRE_SNAPSHOT_FAILED); - } else { - if (platform_pre_restore(1)) - set_abort_result(TOI_PRE_RESTORE_FAILED); - } - - if (test_result_state(TOI_ABORTED)) { - toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); - return 1; - } - - if (disable_nonboot_cpus()) { - set_abort_result(TOI_CPU_HOTPLUG_FAILED); - toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, - suspend_time, 1); - return 1; - } - - local_irq_disable(); - - if (syscore_suspend()) { - set_abort_result(TOI_SYSCORE_REFUSED); - toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); - return 1; - } - - if (suspend_time && pm_wakeup_pending()) { - set_abort_result(TOI_WAKEUP_EVENT); - toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); - return 1; - } - return 0; -} - -/** - * toi_end_atomic - post atomic copy/restore routines - * @stage: What step to start at. - * @suspend_time: Whether we're suspending or resuming. - * @error: Whether we're recovering from an error. - **/ -void toi_end_atomic(int stage, int suspend_time, int error) -{ - pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) : - PMSG_RESTORE; - - switch (stage) { - case ATOMIC_ALL_STEPS: - if (!suspend_time) { - events_check_enabled = false; - } - platform_leave(1); - case ATOMIC_STEP_SYSCORE_RESUME: - syscore_resume(); - case ATOMIC_STEP_IRQS: - local_irq_enable(); - case ATOMIC_STEP_CPU_HOTPLUG: - enable_nonboot_cpus(); - case ATOMIC_STEP_PLATFORM_FINISH: - if (!suspend_time && error & 2) - platform_restore_cleanup(1); - else - platform_finish(1); - dpm_resume_start(msg); - case ATOMIC_STEP_DEVICE_RESUME: - if (suspend_time && (error & 2)) - platform_recover(1); - dpm_resume(msg); - if (!toi_in_suspend()) { - dpm_resume_end(PMSG_RECOVER); - } - if (error || !toi_in_suspend()) { - pm_restore_gfp_mask(); - } - resume_console(); - case ATOMIC_STEP_DPM_COMPLETE: - dpm_complete(msg); - case ATOMIC_STEP_PLATFORM_END: - platform_end(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); - } -} diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h deleted file mode 100644 index e2d2b4fb3..000000000 --- a/kernel/power/tuxonice_atomic_copy.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.h - * - * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -enum { - ATOMIC_ALL_STEPS, - ATOMIC_STEP_SYSCORE_RESUME, - ATOMIC_STEP_IRQS, - ATOMIC_STEP_CPU_HOTPLUG, - ATOMIC_STEP_PLATFORM_FINISH, - ATOMIC_STEP_DEVICE_RESUME, - ATOMIC_STEP_DPM_COMPLETE, - ATOMIC_STEP_PLATFORM_END, -}; - -int toi_go_atomic(pm_message_t state, int toi_time); -void toi_end_atomic(int stage, int toi_time, int error); - -extern void platform_recover(int platform_mode); diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h deleted file mode 100644 index 2f717f5c5..000000000 --- a/kernel/power/tuxonice_bio.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * kernel/power/tuxonice_bio.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
- */ - -#include -#include "tuxonice_extent.h" - -void toi_put_extent_chain(struct hibernate_extent_chain *chain); -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end); - -struct hibernate_extent_saved_state { - int extent_num; - struct hibernate_extent *extent_ptr; - unsigned long offset; -}; - -struct toi_bdev_info { - struct toi_bdev_info *next; - struct hibernate_extent_chain blocks; - struct block_device *bdev; - struct toi_module_ops *allocator; - int allocator_index; - struct hibernate_extent_chain allocations; - char name[266]; /* "swap on " or "file " + up to 256 chars */ - - /* Saved in header */ - char uuid[17]; - dev_t dev_t; - int prio; - int bmap_shift; - int blocks_per_page; - unsigned long pages_used; - struct hibernate_extent_saved_state saved_state[4]; -}; - -struct toi_extent_iterate_state { - struct toi_bdev_info *current_chain; - int num_chains; - int saved_chain_number[4]; - struct toi_bdev_info *saved_chain_ptr[4]; -}; - -/* - * Our exported interface so the swapwriter and filewriter don't - * need these functions duplicated. - */ -struct toi_bio_ops { - int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, - struct page *page); - int (*register_storage)(struct toi_bdev_info *new); - void (*free_storage)(void); -}; - -struct toi_allocator_ops { - unsigned long (*toi_swap_storage_available) (void); -}; - -extern struct toi_bio_ops toi_bio_ops; - -extern char *toi_writer_buffer; -extern int toi_writer_buffer_posn; - -struct toi_bio_allocator_ops { - int (*register_storage) (void); - unsigned long (*storage_available)(void); - int (*allocate_storage) (struct toi_bdev_info *, unsigned long); - int (*bmap) (struct toi_bdev_info *); - void (*free_storage) (struct toi_bdev_info *); - unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used); -}; - -extern int toi_bio_register_storage(void); diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c deleted file mode 100644 index 11cd37f77..000000000 --- a/kernel/power/tuxonice_bio_chains.c +++ /dev/null @@ -1,1121 +0,0 @@ -/* - * kernel/power/tuxonice_bio_devinfo.c - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - */ - -#include -#include "tuxonice_bio.h" -#include "tuxonice_bio_internal.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" -#include "tuxonice_io.h" - -static struct toi_bdev_info *prio_chain_head; -static int num_chains; - -/* Pointer to current entry being loaded/saved. */ -struct toi_extent_iterate_state toi_writer_posn; - -#define metadata_size (sizeof(struct toi_bdev_info) - \ - offsetof(struct toi_bdev_info, uuid)) - -/* - * After section 0 (header) comes 2 => next_section[0] = 2 - */ -static int next_section[3] = { 2, 3, 1 }; - -/** - * dump_block_chains - print the contents of the bdev info array. - **/ -void dump_block_chains(void) -{ - int i = 0; - int j; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - struct hibernate_extent *this = cur_chain->blocks.first; - - printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); - - while (this) { - printk(KERN_CONT " [%lu-%lu]%s", this->start, - this->end, this->next ? 
"," : ""); - this = this->next; - } - - printk("\n"); - cur_chain = cur_chain->next; - i++; - } - - printk(KERN_DEBUG "Saved states:\n"); - for (i = 0; i < 4; i++) { - printk(KERN_DEBUG "Slot %d: Chain %d.\n", - i, toi_writer_posn.saved_chain_number[i]); - - cur_chain = prio_chain_head; - j = 0; - while (cur_chain) { - printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", - j, cur_chain->saved_state[i].extent_num, - cur_chain->saved_state[i].offset); - cur_chain = cur_chain->next; - j++; - } - printk(KERN_CONT "\n"); - } -} - -/** - * - **/ -static void toi_extent_chain_next(void) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain; - - if (!this->blocks.current_extent) - return; - - if (this->blocks.current_offset == this->blocks.current_extent->end) { - if (this->blocks.current_extent->next) { - this->blocks.current_extent = - this->blocks.current_extent->next; - this->blocks.current_offset = - this->blocks.current_extent->start; - } else { - this->blocks.current_extent = NULL; - this->blocks.current_offset = 0; - } - } else - this->blocks.current_offset++; -} - -/** - * - */ - -static struct toi_bdev_info *__find_next_chain_same_prio(void) -{ - struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; - struct toi_bdev_info *this = start_chain; - int orig_prio = this->prio; - - do { - this = this->next; - - if (!this) - this = prio_chain_head; - - /* Back on original chain? Use it again. */ - if (this == start_chain) - return start_chain; - - } while (!this->blocks.current_extent || this->prio != orig_prio); - - return this; -} - -static void find_next_chain(void) -{ - struct toi_bdev_info *this; - - this = __find_next_chain_same_prio(); - - /* - * If we didn't get another chain of the same priority that we - * can use, look for the next priority. - */ - while (this && !this->blocks.current_extent) - this = this->next; - - toi_writer_posn.current_chain = this; -} - -/** - * toi_extent_state_next - go to the next extent - * @blocks: The number of values to progress. - * @stripe_mode: Whether to spread usage across all chains. - * - * Given a state, progress to the next valid entry. We may begin in an - * invalid state, as we do when invoked after extent_state_goto_start below. - * - * When using compression and expected_compression > 0, we let the image size - * be larger than storage, so we can validly run out of data to return. - **/ -static unsigned long toi_extent_state_next(int blocks, int current_stream) -{ - int i; - - if (!toi_writer_posn.current_chain) - return -ENOSPC; - - /* Assume chains always have lengths that are multiples of @blocks */ - for (i = 0; i < blocks; i++) - toi_extent_chain_next(); - - /* The header stream is not striped */ - if (current_stream || - !toi_writer_posn.current_chain->blocks.current_extent) - find_next_chain(); - - return toi_writer_posn.current_chain ? 
0 : -ENOSPC; -} - -static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) -{ - struct toi_bdev_info **prev_ptr; - struct toi_bdev_info *cur; - - /* Loop through the existing chain, finding where to insert it */ - prev_ptr = &prio_chain_head; - cur = prio_chain_head; - - while (cur && cur->prio >= this->prio) { - prev_ptr = &cur->next; - cur = cur->next; - } - - this->next = *prev_ptr; - *prev_ptr = this; - - this = prio_chain_head; - while (this) - this = this->next; - num_chains++; -} - -/** - * toi_extent_state_goto_start - reinitialize an extent chain iterator - * @state: Iterator to reinitialize - **/ -void toi_extent_state_goto_start(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current extent to %p.", this->blocks.first); - this->blocks.current_extent = this->blocks.first; - if (this->blocks.current_extent) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current offset to %lu.", - this->blocks.current_extent->start); - this->blocks.current_offset = - this->blocks.current_extent->start; - } - - this = this->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", - prio_chain_head); - toi_writer_posn.current_chain = prio_chain_head; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); -} - -/** - * toi_extent_state_save - save state of the iterator - * @state: Current state of the chain - * @saved_state: Iterator to populate - * - * Given a state and a struct hibernate_extent_state_store, save the current - * position in a format that can be used with relocated chains (at - * resume time). - **/ -void toi_extent_state_save(int slot) -{ - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent *extent; - struct hibernate_extent_saved_state *chain_state; - int i = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", - slot); - - if (!toi_writer_posn.current_chain) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " - "chain_num = -1."); - toi_writer_posn.saved_chain_number[slot] = -1; - return; - } - - while (cur_chain) { - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - chain_state->offset = cur_chain->blocks.current_offset; - - if (toi_writer_posn.current_chain == cur_chain) { - toi_writer_posn.saved_chain_number[slot] = i; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " - "we were on => chain_num is %d.", i); - } - - if (!cur_chain->blocks.current_extent) { - chain_state->extent_num = 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " - "for this chain => extent_num %d is 0.", - i); - cur_chain = cur_chain->next; - continue; - } - - extent = cur_chain->blocks.first; - chain_state->extent_num = 1; - - while (extent != cur_chain->blocks.current_extent) { - chain_state->extent_num++; - extent = extent->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, - chain_state->extent_num); - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Completed saving extent state slot %d.", slot); -} - -/** - * toi_extent_state_restore - restore the position saved by extent_state_save - * @state: State to populate - * @saved_state: Iterator saved to restore - **/ -void toi_extent_state_restore(int slot) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent_saved_state 
*chain_state; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "toi_extent_state_restore - slot %d.", slot); - - if (toi_writer_posn.saved_chain_number[slot] == -1) { - toi_writer_posn.current_chain = NULL; - return; - } - - while (cur_chain) { - int posn; - int j; - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - posn = chain_state->extent_num; - - cur_chain->blocks.current_extent = cur_chain->blocks.first; - cur_chain->blocks.current_offset = chain_state->offset; - - if (i == toi_writer_posn.saved_chain_number[slot]) { - toi_writer_posn.current_chain = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found current chain."); - } - - for (j = 0; j < 4; j++) - if (i == toi_writer_posn.saved_chain_number[j]) { - toi_writer_posn.saved_chain_ptr[j] = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found saved chain ptr %d (%p) (offset" - " %d).", j, cur_chain, - cur_chain->saved_state[j].offset); - } - - if (posn) { - while (--posn) - cur_chain->blocks.current_extent = - cur_chain->blocks.current_extent->next; - } else - cur_chain->blocks.current_extent = NULL; - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); -} - -/* - * Storage needed - * - * Returns amount of space in the image header required - * for the chain data. This ignores the links between - * pages, which we factor in when allocating the space. - */ -int toi_bio_devinfo_storage_needed(void) -{ - int result = sizeof(num_chains); - struct toi_bdev_info *chain = prio_chain_head; - - while (chain) { - result += metadata_size; - - /* Chain size */ - result += sizeof(int); - - /* Extents */ - result += (2 * sizeof(unsigned long) * - chain->blocks.num_extents); - - chain = chain->next; - } - - result += 4 * sizeof(int); - return result; -} - -static unsigned long chain_pages_used(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this = chain->blocks.first; - struct hibernate_extent_saved_state *state = &chain->saved_state[3]; - unsigned long size = 0; - int extent_idx = 1; - - if (!state->extent_num) { - if (!this) - return 0; - else - return chain->blocks.size; - } - - while (extent_idx < state->extent_num) { - size += (this->end - this->start + 1); - this = this->next; - extent_idx++; - } - - /* We didn't use the one we're sitting on, so don't count it */ - return size + state->offset - this->start; -} - -void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain) -{ - unsigned long used = chain_pages_used(chain); - - /* Free the storage */ - unsigned long first_freed = 0; - - if (chain->allocator->bio_allocator_ops->free_unused_storage) - first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used); - - printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed); - - /* Adjust / free the extents. */ - toi_put_extent_chain_from(&chain->blocks, first_freed); - - { - struct hibernate_extent *this = chain->blocks.first; - while (this) { - printk("Extent %lx-%lx.\n", this->start, this->end); - this = this->next; - } - } -} - -/** - * toi_serialise_extent_chain - write a chain in the image - * @chain: Chain to write. 
- **/ -static int toi_serialise_extent_chain(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this; - int ret; - int i = 1; - - chain->pages_used = chain_pages_used(chain); - - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", - chain->dev_t); - /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->uuid, metadata_size); - if (ret) - return ret; - - /* Num extents */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) - return ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - this = chain->blocks.first; - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); - ret = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) this, 2 * sizeof(this->start)); - if (ret) - return ret; - this = this->next; - i++; - } - - return ret; -} - -int toi_serialise_extent_chains(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - /* Write the number of chains */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", - num_chains); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, (char *) &num_chains, - sizeof(int)); - if (result) - return result; - - /* Then the chains themselves */ - while (this) { - result = toi_serialise_extent_chain(this); - if (result) - return result; - this = this->next; - } - - /* - * Finally, the chain we should be on at the start of each - * section. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -int toi_register_storage_chain(struct toi_bdev_info *new) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", - new); - toi_insert_chain_in_prio_list(new); - return 0; -} - -static void free_bdev_info(struct toi_bdev_info *chain) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); - toi_put_extent_chain(&chain->blocks); - - /* - * The allocator may need to do more than just free the chains - * (swap_free, for example). Don't call from boot kernel. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); - if (chain->allocator) - chain->allocator->bio_allocator_ops->free_storage(chain); - - /* - * Dropping out of reading atomic copy? Need to undo - * toi_open_by_devnum. 
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); - if (chain->bdev && !IS_ERR(chain->bdev) && - chain->bdev != resume_block_device && - chain->bdev != header_block_device && - test_toi_state(TOI_TRYING_TO_RESUME)) - toi_close_bdev(chain->bdev); - - /* Poison */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); - toi_kfree(39, chain, sizeof(*chain)); - - if (prio_chain_head == chain) - prio_chain_head = NULL; - - num_chains--; -} - -void free_all_bdev_info(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - struct toi_bdev_info *next = this->next; - free_bdev_info(this); - this = next; - } - - memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); - prio_chain_head = NULL; -} - -static void set_up_start_position(void) -{ - toi_writer_posn.current_chain = prio_chain_head; - go_next_page(0, 0); -} - -/** - * toi_load_extent_chain - read back a chain saved in the image - * @chain: Chain to load - * - * The linked list of extents is reconstructed from the disk. chain will point - * to the first entry. - **/ -int toi_load_extent_chain(int index, int *num_loaded) -{ - struct toi_bdev_info *chain = toi_kzalloc(39, - sizeof(struct toi_bdev_info), GFP_ATOMIC); - struct hibernate_extent *this, *last = NULL; - int i, ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); - /* Get dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->uuid, metadata_size); - - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_bkd.pages_used[index] = chain->pages_used; - - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - for (i = 0; i < chain->blocks.num_extents; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); - - this = toi_kzalloc(2, sizeof(struct hibernate_extent), - TOI_ATOMIC_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate a new extent.\n"); - free_bdev_info(chain); - return -ENOMEM; - } - this->next = NULL; - /* Get the next page */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - NULL, (char *) this, 2 * sizeof(this->start)); - if (ret) { - printk(KERN_INFO "Failed to read an extent.\n"); - toi_kfree(2, this, sizeof(struct hibernate_extent)); - free_bdev_info(chain); - return 1; - } - - if (last) - last->next = this; - else { - char b1[32], b2[32], b3[32]; - /* - * Open the bdev - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Chain dev_t is %s. Resume dev t is %s. 
Header" - " bdev_t is %s.\n", - format_dev_t(b1, chain->dev_t), - format_dev_t(b2, resume_dev_t), - format_dev_t(b3, toi_sig_data->header_dev_t)); - - if (chain->dev_t == resume_dev_t) - chain->bdev = resume_block_device; - else if (chain->dev_t == toi_sig_data->header_dev_t) - chain->bdev = header_block_device; - else { - chain->bdev = toi_open_bdev(chain->uuid, - chain->dev_t, 1); - if (IS_ERR(chain->bdev)) { - free_bdev_info(chain); - return -ENODEV; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " - "is %d and blocks per page is %d.", - chain->bmap_shift, - chain->blocks_per_page); - - chain->blocks.first = this; - - /* - * Couldn't do this earlier, but can't do - * goto_start now - we may have already used blocks - * in the first chain. - */ - chain->blocks.current_extent = this; - chain->blocks.current_offset = this->start; - - /* - * Can't wait until we've read the whole chain - * before we insert it in the list. We might need - * this chain to read the next page in the header - */ - toi_insert_chain_in_prio_list(chain); - } - - /* - * We have to wait until 2 extents are loaded before setting up - * properly because if the first extent has only one page, we - * will need to put the position on the second extent. Sounds - * obvious, but it wasn't! - */ - (*num_loaded)++; - if ((*num_loaded) == 2) - set_up_start_position(); - last = this; - } - - /* - * Shouldn't get empty chains, but it's not impossible. Link them in so - * they get freed properly later. - */ - if (!chain->blocks.num_extents) - toi_insert_chain_in_prio_list(chain); - - if (!chain->blocks.current_extent) { - chain->blocks.current_extent = chain->blocks.first; - if (chain->blocks.current_extent) - chain->blocks.current_offset = - chain->blocks.current_extent->start; - } - return 0; -} - -int toi_load_extent_chains(void) -{ - int result; - int to_load; - int i; - int extents_loaded = 0; - - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &to_load, - sizeof(int)); - if (result) - return result; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); - - for (i = 0; i < to_load; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", - i, to_load); - result = toi_load_extent_chain(i, &extents_loaded); - if (result) - return result; - } - - /* If we never got to a second extent, we still need to do this. */ - if (extents_loaded == 1) - set_up_start_position(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -static int toi_end_of_stream(int writing, int section_barrier) -{ - struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; - int compare_to = next_section[current_stream]; - struct toi_bdev_info *compare_chain = - toi_writer_posn.saved_chain_ptr[compare_to]; - int compare_offset = compare_chain ? 
-		compare_chain->saved_state[compare_to].offset : 0;
-
-	if (!section_barrier)
-		return 0;
-
-	if (!cur_chain)
-		return 1;
-
-	if (cur_chain == compare_chain &&
-	    cur_chain->blocks.current_offset == compare_offset) {
-		if (writing) {
-			if (!current_stream) {
-				debug_broken_header();
-				return 1;
-			}
-		} else {
-			more_readahead = 0;
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Reached the end of stream %d "
-					"(not an error).", current_stream);
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * go_next_page - skip blocks to the start of the next page
- * @writing: Whether we're reading or writing the image.
- * @section_barrier: Whether to stop at the end of the current section.
- *
- * Go forward one page.
- **/
-int go_next_page(int writing, int section_barrier)
-{
-	struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
-	int max = cur_chain ? cur_chain->blocks_per_page : 1;
-
-	/* Go forward a page - or maybe two. Don't stripe the header,
-	 * so that bad fragmentation doesn't put the extent data containing
-	 * the location of the second page out of the first header page.
-	 */
-	if (toi_extent_state_next(max, current_stream)) {
-		/* Don't complain if readahead falls off the end */
-		if (writing && section_barrier) {
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
-					"Expected compression ratio too optimistic?");
-			if (test_action_state(TOI_LOGALL))
-				dump_block_chains();
-		}
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
-				"read/write. (Not necessarily a fatal error.)");
-		return -ENOSPC;
-	}
-
-	return 0;
-}
-
-int devices_of_same_priority(struct toi_bdev_info *this)
-{
-	struct toi_bdev_info *check = prio_chain_head;
-	int i = 0;
-
-	while (check) {
-		if (check->prio == this->prio)
-			i++;
-		check = check->next;
-	}
-
-	return i;
-}
-
-/**
- * toi_bio_rw_page - do i/o on the next disk page in the image
- * @writing: Whether reading or writing.
- * @page: Page to do i/o on.
- * @is_readahead: Whether we're doing readahead.
- * @free_group: The group used in allocating the page.
- *
- * Submit a page for reading or writing, possibly readahead.
- * Pass the group used in allocating the page as well, as it should
- * be freed on completion of the bio if we're writing the page.
- **/
-int toi_bio_rw_page(int writing, struct page *page,
-		int is_readahead, int free_group)
-{
-	int result = toi_end_of_stream(writing, 1);
-	struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
-
-	if (result) {
-		if (writing)
-			abort_hibernate(TOI_INSUFFICIENT_STORAGE,
-				"Insufficient storage for your image.");
-		else
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
-				"read/write another page when stream has "
-				"ended.");
-		return -ENOSPC;
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0,
-		"%s %lx:%ld",
-		writing ?
"Write" : "Read", - dev_info->dev_t, dev_info->blocks.current_offset); - - result = toi_do_io(writing, dev_info->bdev, - dev_info->blocks.current_offset << dev_info->bmap_shift, - page, is_readahead, 0, free_group); - - /* Ignore the result here - will check end of stream if come in again */ - go_next_page(writing, 1); - - if (result) - printk(KERN_ERR "toi_do_io returned %d.\n", result); - return result; -} - -dev_t get_header_dev_t(void) -{ - return prio_chain_head->dev_t; -} - -struct block_device *get_header_bdev(void) -{ - return prio_chain_head->bdev; -} - -unsigned long get_headerblock(void) -{ - return prio_chain_head->blocks.first->start << - prio_chain_head->bmap_shift; -} - -int get_main_pool_phys_params(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - while (this) { - result = this->allocator->bio_allocator_ops->bmap(this); - if (result) - return result; - this = this->next; - } - - return 0; -} - -static int apply_header_reservation(void) -{ - int i; - - if (!header_pages_reserved) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "No header pages reserved at the moment."); - return 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); - - /* Apply header space reservation */ - toi_extent_state_goto_start(); - - for (i = 0; i < header_pages_reserved; i++) - if (go_next_page(1, 0)) - return -ENOSPC; - - /* The end of header pages will be the start of pageset 2 */ - toi_extent_state_save(2); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Finished applying header reservation."); - return 0; -} - -int toi_bio_register_storage(void) -{ - int result = 0; - struct toi_module_ops *this_module; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Registering storage."); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Registering storage from %s.", - this_module->name); - result = this_module->bio_allocator_ops->register_storage(); - if (result) - break; - } - - return result; -} - -void toi_bio_free_unused_storage(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_bio_free_unused_storage_chain(this); - this = this->next; - } -} - -int toi_bio_allocate_storage(unsigned long request) -{ - struct toi_bdev_info *chain = prio_chain_head; - unsigned long to_get = request; - unsigned long extra_pages, needed; - int no_free = 0; - - if (!chain) { - printk("TuxOnIce: No storage was registered.\n"); - return 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Request is %lu pages.", request); - extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) - + sizeof(int)), PAGE_SIZE); - needed = request + extra_pages + header_pages_reserved; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " - "for header => %lu.", - extra_pages, header_pages_reserved, needed); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.", - raw_pages_allocd); - - to_get = needed > raw_pages_allocd ? 
needed - raw_pages_allocd : 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); - - if (!to_get) - return apply_header_reservation(); - - while (to_get && chain) { - int num_group = devices_of_same_priority(chain); - int divisor = num_group - no_free; - int i; - unsigned long portion = DIV_ROUND_UP(to_get, divisor); - unsigned long got = 0; - unsigned long got_this_round = 0; - struct toi_bdev_info *top = chain; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Start of loop. To get is %lu. Divisor is %d.", - to_get, divisor); - no_free = 0; - - /* - * We're aiming to spread the allocated storage as evenly - * as possible, but we also want to get all the storage we - * can off this priority. - */ - for (i = 0; i < num_group; i++) { - struct toi_bio_allocator_ops *ops = - chain->allocator->bio_allocator_ops; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Asking for %lu pages from chain %p.", - portion, chain); - got = ops->allocate_storage(chain, portion); - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Got %lu pages from allocator %p.", - got, chain); - if (!got) - no_free++; - got_this_round += got; - chain = chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " - "total of %lu pages from %d allocators.", - got_this_round, divisor - no_free); - - raw_pages_allocd += got_this_round; - to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : - 0; - - /* - * If we got anything from chains of this priority and we - * still have storage to allocate, go over this priority - * again. - */ - if (got_this_round && to_get) - chain = top; - else - no_free = 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " - "get_main_pool_phys_params"); - /* Now let swap allocator bmap the pages */ - get_main_pool_phys_params(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. 
Reserving header."); - return apply_header_reservation(); -} - -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - cur_chain->pages_used = bkd->pages_used[i]; - cur_chain = cur_chain->next; - i++; - } -} - -int toi_bio_chains_debug_info(char *buffer, int size) -{ - /* Show what we actually used */ - struct toi_bdev_info *cur_chain = prio_chain_head; - int len = 0; - - while (cur_chain) { - len += scnprintf(buffer + len, size - len, " Used %lu pages " - "from %s.\n", cur_chain->pages_used, - cur_chain->name); - cur_chain = cur_chain->next; - } - - return len; -} - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain, - *cmp = prio_chain_head; - - ptr->save.chain = 1; - while (this != cmp) { - ptr->save.chain++; - cmp = cmp->next; - } - ptr->save.block = this->blocks.current_offset; - - /* Save the raw info internally for quicker access when updating pointers */ - ptr->bdev = this->bdev; - ptr->block = this->blocks.current_offset << this->bmap_shift; -} - -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - int i = ptr->save.chain - 1; - struct toi_bdev_info *this; - struct hibernate_extent *hib; - - /* Find chain by stored index */ - this = prio_chain_head; - while (i) { - this = this->next; - i--; - } - toi_writer_posn.current_chain = this; - - /* Restore block */ - this->blocks.current_offset = ptr->save.block; - - /* Find current offset from block number */ - hib = this->blocks.first; - - while (hib->start > ptr->save.block) { - hib = hib->next; - } - - this->blocks.last_touched = this->blocks.current_extent = hib; -} diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c deleted file mode 100644 index bc27662d7..000000000 --- a/kernel/power/tuxonice_bio_core.c +++ /dev/null @@ -1,1937 +0,0 @@ -/* - * kernel/power/tuxonice_bio.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains block io functions for TuxOnIce. These are - * used by the swapwriter and it is planned that they will also - * be used by the NFSwriter. 
- * - */ - -#include -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -#define MEMORY_ONLY 1 -#define THROTTLE_WAIT 2 - -/* #define MEASURE_MUTEX_CONTENTION */ -#ifndef MEASURE_MUTEX_CONTENTION -#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) -#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) -#else -unsigned long mutex_times[2][2][NR_CPUS]; -#define my_mutex_lock(index, the_lock) do { \ - int have_mutex; \ - have_mutex = mutex_trylock(the_lock); \ - if (!have_mutex) { \ - mutex_lock(the_lock); \ - mutex_times[index][0][smp_processor_id()]++; \ - } else { \ - mutex_times[index][1][smp_processor_id()]++; \ - } - -#define my_mutex_unlock(index, the_lock) \ - mutex_unlock(the_lock); \ -} while (0) -#endif - -static int page_idx, reset_idx; - -static int target_outstanding_io = 1024; -static int max_outstanding_writes, max_outstanding_reads; - -static struct page *bio_queue_head, *bio_queue_tail; -static atomic_t toi_bio_queue_size; -static DEFINE_SPINLOCK(bio_queue_lock); - -static int free_mem_throttle, throughput_throttle; -int more_readahead = 1; -static struct page *readahead_list_head, *readahead_list_tail; - -static struct page *waiting_on; - -static atomic_t toi_io_in_progress, toi_io_done; -static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); - -int current_stream; -/* Not static, so that the allocators can setup and complete - * writing the header */ -char *toi_writer_buffer; -int toi_writer_buffer_posn; - -static DEFINE_MUTEX(toi_bio_mutex); -static DEFINE_MUTEX(toi_bio_readahead_mutex); - -static struct task_struct *toi_queue_flusher; -static int toi_bio_queue_flush_pages(int dedicated_thread); - -struct toi_module_ops toi_blockwriter_ops; - -struct toi_incremental_image_pointer toi_inc_ptr[2][2]; - -#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ - atomic_read(&toi_bio_queue_size)) - -unsigned long raw_pages_allocd, header_pages_reserved; - -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead); - -/** - * set_free_mem_throttle - set the point where we pause to avoid oom. - * - * Initially, this value is zero, but when we first fail to allocate memory, - * we set it (plus a buffer) and thereafter throttle i/o once that limit is - * reached. - **/ -static void set_free_mem_throttle(void) -{ - int new_throttle = nr_free_buffer_pages() + 256; - - if (new_throttle > free_mem_throttle) - free_mem_throttle = new_throttle; -} - -#define NUM_REASONS 7 -static atomic_t reasons[NUM_REASONS]; -static char *reason_name[NUM_REASONS] = { - "readahead not ready", - "bio allocation", - "synchronous I/O", - "toi_bio_get_new_page", - "memory low", - "readahead buffer allocation", - "throughput_throttle", -}; - -/* User Specified Parameters. */ -unsigned long resume_firstblock; -dev_t resume_dev_t; -struct block_device *resume_block_device; -static atomic_t resume_bdev_open_count; - -struct block_device *header_block_device; - -/** - * toi_open_bdev: Open a bdev at resume time. - * - * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t - * (the user can have resume= pointing at a swap partition/file that isn't - * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the - * header. 
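/* Annotation (not part of the patch): a userspace analogue of the
 * MEASURE_MUTEX_CONTENTION macros above. The fast path is a trylock;
 * only when it fails do we block, and each outcome is counted so the
 * contended/free split can be reported later. pthread names stand in
 * for the kernel's mutex API. */
#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_ulong contended, uncontended;

static void counted_lock(void)
{
	if (pthread_mutex_trylock(&lock) == 0) {
		atomic_fetch_add(&uncontended, 1);	/* got it first try */
	} else {
		pthread_mutex_lock(&lock);		/* had to wait */
		atomic_fetch_add(&contended, 1);
	}
}

static void counted_unlock(void)
{
	pthread_mutex_unlock(&lock);
}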
It will be from a swap partition that was enabled when we hibernated, - * but we don't know it's real index until we read that first page. - * dev_t: The device major/minor. - * display_errs: Whether to try to do this quietly. - * - * We stored a dev_t in the image header. Open the matching device without - * requiring /dev/ in most cases and record the details needed - * to close it later and avoid duplicating work. - */ -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs) -{ - struct block_device *bdev; - dev_t device = default_device; - char buf[32]; - int retried = 0; - -retry: - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = 0; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (!device) { - device = default_device; - printk(KERN_DEBUG "Unable to resolve uuid. Falling back" - " to dev_t.\n"); - } else - printk(KERN_DEBUG "Resolved uuid to device %s.\n", - format_dev_t(buf, device)); - } - - if (!device) { - printk(KERN_ERR "TuxOnIce attempting to open a " - "blank dev_t!\n"); - dump_stack(); - return NULL; - } - bdev = toi_open_by_devnum(device); - - if (IS_ERR(bdev) || !bdev) { - if (!retried) { - retried = 1; - wait_for_device_probe(); - goto retry; - } - if (display_errs) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to get access to block device " - "\"%x\" (error %d).\n Maybe you need " - "to run mknod and/or lvmsetup in an " - "initrd/ramfs?", device, bdev); - return ERR_PTR(-EINVAL); - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "TuxOnIce got bdev %p for dev_t %x.", - bdev, device); - - return bdev; -} - -static void toi_bio_reserve_header_space(unsigned long request) -{ - header_pages_reserved = request; -} - -/** - * do_bio_wait - wait for some TuxOnIce I/O to complete - * @reason: The array index of the reason we're waiting. - * - * Wait for a particular page of I/O if we're after a particular page. - * If we're not after a particular page, wait instead for all in flight - * I/O to be completed or for us to have enough free memory to be able - * to submit more I/O. - * - * If we wait, we also update our statistics regarding why we waited. - **/ -static void do_bio_wait(int reason) -{ - struct page *was_waiting_on = waiting_on; - - /* On SMP, waiting_on can be reset, so we make a copy */ - if (was_waiting_on) { - wait_on_page_locked(was_waiting_on); - atomic_inc(&reasons[reason]); - } else { - atomic_inc(&reasons[reason]); - - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - nr_free_buffer_pages() > free_mem_throttle); - } -} - -/** - * throttle_if_needed - wait for I/O completion if throttle points are reached - * @flags: What to check and how to act. - * - * Check whether we need to wait for some I/O to complete. We always check - * whether we have enough memory available, but may also (depending upon - * @reason) check if the throughput throttle limit has been reached. - **/ -static int throttle_if_needed(int flags) -{ - int free_pages = nr_free_buffer_pages(); - - /* Getting low on memory and I/O is in progress? 
*/ - while (unlikely(free_pages < free_mem_throttle) && - atomic_read(&toi_io_in_progress) && - !test_result_state(TOI_ABORTED)) { - if (!(flags & THROTTLE_WAIT)) - return -ENOMEM; - do_bio_wait(4); - free_pages = nr_free_buffer_pages(); - } - - while (!(flags & MEMORY_ONLY) && throughput_throttle && - TOTAL_OUTSTANDING_IO >= throughput_throttle && - !test_result_state(TOI_ABORTED)) { - int result = toi_bio_queue_flush_pages(0); - if (result) - return result; - atomic_inc(&reasons[6]); - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - TOTAL_OUTSTANDING_IO < throughput_throttle); - } - - return 0; -} - -/** - * update_throughput_throttle - update the raw throughput throttle - * @jif_index: The number of times this function has been called. - * - * This function is called four times per second by the core, and used to limit - * the amount of I/O we submit at once, spreading out our waiting through the - * whole job and letting userui get an opportunity to do its work. - * - * We don't start limiting I/O until 1/4s has gone so that we get a - * decent sample for our initial limit, and keep updating it because - * throughput may vary (on rotating media, eg) with our block number. - * - * We throttle to 1/10s worth of I/O. - **/ -static void update_throughput_throttle(int jif_index) -{ - int done = atomic_read(&toi_io_done); - throughput_throttle = done * 2 / 5 / jif_index; -} - -/** - * toi_finish_all_io - wait for all outstanding i/o to complete - * - * Flush any queued but unsubmitted I/O and wait for it all to complete. - **/ -static int toi_finish_all_io(void) -{ - int result = toi_bio_queue_flush_pages(0); - toi_bio_queue_flusher_should_finish = 1; - wake_up(&toi_io_queue_flusher); - wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO); - return result; -} - -/** - * toi_end_bio - bio completion function. - * @bio: bio that has completed. - * - * Function called by the block driver from interrupt context when I/O is - * completed. If we were writing the page, we want to free it and will have - * set bio->bi_private to the parameter we should use in telling the page - * allocation accounting code what the page was allocated for. If we're - * reading the page, it will be in the singly linked list made from - * page->private pointers. - **/ -static void toi_end_bio(struct bio *bio) -{ - struct page *page = bio->bi_io_vec[0].bv_page; - - BUG_ON(bio->bi_error); - - unlock_page(page); - bio_put(bio); - - if (waiting_on == page) - waiting_on = NULL; - - put_page(page); - - if (bio->bi_private) - toi__free_page((int) ((unsigned long) bio->bi_private) , page); - - bio_put(bio); - - atomic_dec(&toi_io_in_progress); - atomic_inc(&toi_io_done); - - wake_up(&num_in_progress_wait); -} - -/** - * submit - submit BIO request - * @writing: READ or WRITE. - * @dev: The block device we're using. - * @first_block: The first sector we're using. - * @page: The page being used for I/O. - * @free_group: If writing, the group that was used in allocating the page - * and which will be used in freeing the page from the completion - * routine. - * - * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the - * textbook - allocate and initialize the bio. If we're writing, make sure - * the page is marked as dirty. Then submit it and carry on." - * - * If we're just testing the speed of our own code, we fake having done all - * the hard work and all toi_end_bio immediately. 
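/* Annotation (not part of the patch): why update_throughput_throttle()
 * above uses done * 2 / 5 / jif_index for "1/10s worth of I/O". The
 * function runs four times a second, so after jif_index calls the
 * measured rate is done / jif_index pages per quarter second, i.e.
 * 4 * done / jif_index pages per second. A tenth of a second at that
 * rate is (4 / 10) * done / jif_index = done * 2 / 5 / jif_index,
 * which is the cap applied to TOTAL_OUTSTANDING_IO. */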
- **/ -static int submit(int writing, struct block_device *dev, sector_t first_block, - struct page *page, int free_group) -{ - struct bio *bio = NULL; - int cur_outstanding_io, result; - - /* - * Shouldn't throttle if reading - can deadlock in the single - * threaded case as pages are only freed when we use the - * readahead. - */ - if (writing) { - result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); - if (result) - return result; - } - - while (!bio) { - bio = bio_alloc(TOI_ATOMIC_GFP, 1); - if (!bio) { - set_free_mem_throttle(); - do_bio_wait(1); - } - } - - bio->bi_bdev = dev; - bio->bi_iter.bi_sector = first_block; - bio->bi_private = (void *) ((unsigned long) free_group); - bio->bi_end_io = toi_end_bio; - bio_set_flag(bio, BIO_TOI); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", - (unsigned long long) first_block); - bio_put(bio); - return -EFAULT; - } - - bio_get(bio); - - cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); - if (writing) { - if (cur_outstanding_io > max_outstanding_writes) - max_outstanding_writes = cur_outstanding_io; - } else { - if (cur_outstanding_io > max_outstanding_reads) - max_outstanding_reads = cur_outstanding_io; - } - - /* Still read the header! */ - if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { - /* Fake having done the hard work */ - bio->bi_error = 0; - toi_end_bio(bio); - } else - submit_bio(writing | REQ_SYNC, bio); - - return 0; -} - -/** - * toi_do_io: Prepare to do some i/o on a page and submit or batch it. - * - * @writing: Whether reading or writing. - * @bdev: The block device which we're using. - * @block0: The first sector we're reading or writing. - * @page: The page on which I/O is being done. - * @readahead_index: If doing readahead, the index (reset this flag when done). - * @syncio: Whether the i/o is being done synchronously. - * - * Prepare and start a read or write operation. - * - * Note that we always work with our own page. If writing, we might be given a - * compression buffer that will immediately be used to start compressing the - * next page. For reading, we do readahead and therefore don't know the final - * address where the data needs to go. - **/ -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group) -{ - page->private = 0; - - /* Do here so we don't race against toi_bio_get_next_page_read */ - lock_page(page); - - if (is_readahead) { - if (readahead_list_head) - readahead_list_tail->private = (unsigned long) page; - else - readahead_list_head = page; - - readahead_list_tail = page; - } - - /* Done before submitting to avoid races. */ - if (syncio) - waiting_on = page; - - /* Submit the page */ - get_page(page); - - if (submit(writing, bdev, block0, page, free_group)) - return -EFAULT; - - if (syncio) - do_bio_wait(2); - - return 0; -} - -/** - * toi_bdev_page_io - simpler interface to do directly i/o on a single page - * @writing: Whether reading or writing. - * @bdev: Block device on which we're operating. - * @pos: Sector at which page to read or write starts. - * @page: Page to be read/written. - * - * A simple interface to submit a page of I/O and wait for its completion. - * The caller must free the page used. 
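/* Annotation (not part of the patch): a sketch of the high-watermark
 * bookkeeping in submit() above. The in-flight counter is bumped
 * atomically and the post-increment value compared with the recorded
 * maximum. C11 atomics stand in for the kernel's atomic_add_return();
 * as in the original, the maximum itself is debug-only and updated
 * without a lock. */
#include <stdatomic.h>

static atomic_int in_flight;
static int max_in_flight;	/* stats only, racy by design */

static void account_submit(void)
{
	int cur = atomic_fetch_add(&in_flight, 1) + 1;	/* value after add */

	if (cur > max_in_flight)
		max_in_flight = cur;
}

static void account_complete(void)
{
	atomic_fetch_sub(&in_flight, 1);
}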
- **/ -static int toi_bdev_page_io(int writing, struct block_device *bdev, - long pos, struct page *page) -{ - return toi_do_io(writing, bdev, pos, page, 0, 1, 0); -} - -/** - * toi_bio_memory_needed - report the amount of memory needed for block i/o - * - * We want to have at least enough memory so as to have target_outstanding_io - * or more transactions on the fly at once. If we can do more, fine. - **/ -static int toi_bio_memory_needed(void) -{ - return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + - sizeof(struct bio)); -} - -/** - * toi_bio_print_debug_stats - put out debugging info in the buffer provided - * @buffer: A buffer of size @size into which text should be placed. - * @size: The size of @buffer. - * - * Fill a buffer with debugging info. This is used for both our debug_info sysfs - * entry and for recording the same info in dmesg. - **/ -static int toi_bio_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - if (toiActiveAllocator != &toi_blockwriter_ops) { - len = scnprintf(buffer, size, - "- Block I/O inactive.\n"); - return len; - } - - len = scnprintf(buffer, size, "- Block I/O active.\n"); - - len += toi_bio_chains_debug_info(buffer + len, size - len); - - len += scnprintf(buffer + len, size - len, - "- Max outstanding reads %d. Max writes %d.\n", - max_outstanding_reads, max_outstanding_writes); - - len += scnprintf(buffer + len, size - len, - " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", - target_outstanding_io, - PAGE_SIZE, (unsigned int) sizeof(struct request), - (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); - -#ifdef MEASURE_MUTEX_CONTENTION - { - int i; - - len += scnprintf(buffer + len, size - len, - " Mutex contention while reading:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[0][0][i], mutex_times[0][1][i]); - - len += scnprintf(buffer + len, size - len, - " Mutex contention while writing:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[1][0][i], mutex_times[1][1][i]); - - } -#endif - - return len + scnprintf(buffer + len, size - len, - " Free mem throttle point reached %d.\n", free_mem_throttle); -} - -static int total_header_bytes; -static int unowned; - -void debug_broken_header(void) -{ - printk(KERN_DEBUG "Image header too big for size allocated!\n"); - print_toi_header_storage_for_modules(); - printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); - printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); - printk(KERN_DEBUG "Total unowned : %d.\n", unowned); - printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, - DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); - printk(KERN_DEBUG "Space needed now : %ld.\n", - get_header_storage_needed(0)); - dump_block_chains(); - abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); -} - -static int toi_bio_update_previous_inc_img_ptr(int stream) -{ - int result; - char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP); - struct page *page; - struct toi_incremental_image_pointer *prev, *this; - - prev = &toi_inc_ptr[stream][0]; - this = &toi_inc_ptr[stream][1]; - - if (!buffer) { - // We're at the start of writing a pageset. Memory should not be that scarce. 
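/* Annotation (not part of the patch): a runnable sketch of the
 * back-patching this function performs. Each pageset begins with a
 * pointer block; when the next pageset is written, the previous pointer
 * block is read back, its forward link patched, and the block rewritten
 * in place, building a linked list on disk. The in-memory "disk" and
 * all names are illustrative assumptions. */
#include <stdint.h>
#include <string.h>

#define BLK_SIZE 4096
static uint8_t disk[64][BLK_SIZE];	/* toy stand-in for the device */

struct ptr_block {
	uint64_t next_block;		/* 0 = end of chain */
};

static int link_new_pageset(uint64_t prev_blk, uint64_t new_blk)
{
	struct ptr_block pb;

	memcpy(&pb, disk[prev_blk], sizeof(pb));	/* read old block */
	pb.next_block = new_blk;			/* patch the link */
	memcpy(disk[prev_blk], &pb, sizeof(pb));	/* write it back */
	return 0;
}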
- return -ENOMEM; - } - - page = virt_to_page(buffer); - result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0); - - if (result) - goto out; - - memcpy(buffer, (char *) this, sizeof(this->save)); - - result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12); - - // If the IO is successfully submitted (!result), the page will be freed - // asynchronously on completion. -out: - if (result) - toi__free_page(12, virt_to_page(buffer)); - return result; -} - -/** - * toi_rw_init_incremental - incremental image part of setting up to write new section - */ -static int toi_write_init_incremental(int stream) -{ - int result = 0; - - // Remember the location of this block so we can link to it. - toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Update the pointer at the start of the last pageset with the same stream number. - result = toi_bio_update_previous_inc_img_ptr(stream); - if (result) - return result; - - // Move the current to the previous slot. - memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1])); - - // Store a blank pointer at the start of this incremental pageset - memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1])); - result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - if (result) - return result; - - // Serialise extent chains if this is an incremental pageset - return toi_serialise_extent_chains(); -} - -/** - * toi_read_init_incremental - incremental image part of setting up to read new section - */ -static int toi_read_init_incremental(int stream) -{ - int result; - - // Set our position to the start of the next pageset - toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Read the start of the next incremental pageset (if any) - result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - - if (!result) - result = toi_load_extent_chains(); - - return result; -} - -/** - * toi_rw_init - prepare to read or write a stream in the image - * @writing: Whether reading or writing. - * @stream number: Section of the image being processed. - * - * Prepare to read or write a section ('stream') in the image. - **/ -static int toi_rw_init(int writing, int stream_number) -{ - if (stream_number) - toi_extent_state_restore(stream_number); - else - toi_extent_state_goto_start(); - - if (writing) { - reset_idx = 0; - if (!current_stream) - page_idx = 0; - } else { - reset_idx = 1; - } - - atomic_set(&toi_io_done, 0); - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE; - - current_stream = stream_number; - - more_readahead = 1; - - if (test_result_state(TOI_KEPT_IMAGE)) { - int result; - - if (writing) { - result = toi_write_init_incremental(stream_number); - } else { - result = toi_read_init_incremental(stream_number); - } - - if (result) - return result; - } - - return toi_writer_buffer ? 0 : -ENOMEM; -} - -/** - * toi_bio_queue_write - queue a page for writing - * @full_buffer: Pointer to a page to be queued - * - * Add a page to the queue to be submitted. If we're the queue flusher, - * we'll do this once we've dropped toi_bio_mutex, so other threads can - * continue to submit I/O while we're on the slow path doing the actual - * submission. 
- **/
-static void toi_bio_queue_write(char **full_buffer)
-{
-	struct page *page = virt_to_page(*full_buffer);
-	unsigned long flags;
-
-	*full_buffer = NULL;
-	page->private = 0;
-
-	spin_lock_irqsave(&bio_queue_lock, flags);
-	if (!bio_queue_head)
-		bio_queue_head = page;
-	else
-		bio_queue_tail->private = (unsigned long) page;
-
-	bio_queue_tail = page;
-	atomic_inc(&toi_bio_queue_size);
-
-	spin_unlock_irqrestore(&bio_queue_lock, flags);
-	wake_up(&toi_io_queue_flusher);
-}
-
-/**
- * toi_rw_cleanup - Cleanup after i/o.
- * @writing: Whether we were reading or writing.
- *
- * Flush all I/O and clean everything up after reading or writing a
- * section of the image.
- **/
-static int toi_rw_cleanup(int writing)
-{
-	int i, result = 0;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup.");
-	if (writing) {
-		if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED))
-			toi_bio_queue_write(&toi_writer_buffer);
-
-		while (bio_queue_head && !result)
-			result = toi_bio_queue_flush_pages(0);
-
-		if (result)
-			return result;
-
-		if (current_stream == 2)
-			toi_extent_state_save(1);
-		else if (current_stream == 1)
-			toi_extent_state_save(3);
-	}
-
-	result = toi_finish_all_io();
-
-	while (readahead_list_head) {
-		void *next = (void *) readahead_list_head->private;
-		toi__free_page(12, readahead_list_head);
-		readahead_list_head = next;
-	}
-
-	readahead_list_tail = NULL;
-
-	if (!current_stream)
-		return result;
-
-	for (i = 0; i < NUM_REASONS; i++) {
-		if (!atomic_read(&reasons[i]))
-			continue;
-		printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n",
-				reason_name[i], atomic_read(&reasons[i]));
-		atomic_set(&reasons[i], 0);
-	}
-
-	current_stream = 0;
-	return result;
-}
-
-/**
- * toi_start_one_readahead - start one page of readahead
- * @dedicated_thread: Is this a thread dedicated to doing readahead?
- *
- * Start one new page of readahead. If this is being called by a thread
- * whose only job is to submit readahead, don't quit because we failed
- * to allocate a page.
- **/
-static int toi_start_one_readahead(int dedicated_thread)
-{
-	char *buffer = NULL;
-	int oom = 0, result;
-
-	result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0);
-	if (result) {
-		printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result);
-		return result;
-	}
-
-	mutex_lock(&toi_bio_readahead_mutex);
-
-	while (!buffer) {
-		buffer = (char *) toi_get_zeroed_page(12,
-				TOI_ATOMIC_GFP);
-		if (!buffer) {
-			if (oom && !dedicated_thread) {
-				mutex_unlock(&toi_bio_readahead_mutex);
-				printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result);
-				return -ENOMEM;
-			}
-
-			oom = 1;
-			set_free_mem_throttle();
-			do_bio_wait(5);
-		}
-	}
-
-	result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
-	if (result) {
-		printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result);
-	}
-	if (result == -ENOSPC)
-		toi__free_page(12, virt_to_page(buffer));
-	mutex_unlock(&toi_bio_readahead_mutex);
-	if (result) {
-		if (result == -ENOSPC)
-			toi_message(TOI_BIO, TOI_VERBOSE, 0,
-					"Last readahead page submitted.");
-		else
-			printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n",
-					result);
-	}
-	return result;
-}
-
-/**
- * toi_start_new_readahead - start new readahead
- * @dedicated_thread: Are we dedicated to this task?
- *
- * Start readahead of image pages.
- *
- * We can be called as a thread dedicated to this task (may be helpful on
- * systems with lots of CPUs), in which case we don't exit until there's no
- * more readahead.
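/* Annotation (not part of the patch): a userspace sketch of the
 * intrusive FIFO maintained by toi_bio_queue_write() above. Each queued
 * item carries the "next" link itself (the kernel reuses page->private
 * for this), so queueing allocates nothing - which matters when memory
 * is scarce mid-hibernate. pthread locking stands in for the spinlock. */
#include <pthread.h>
#include <stddef.h>

struct qitem {
	struct qitem *next;
};

static struct qitem *q_head, *q_tail;
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;

static void queue_push(struct qitem *it)
{
	it->next = NULL;
	pthread_mutex_lock(&q_lock);
	if (!q_head)
		q_head = it;		/* first entry */
	else
		q_tail->next = it;	/* append after current tail */
	q_tail = it;
	pthread_mutex_unlock(&q_lock);
}

static struct qitem *queue_pop(void)
{
	struct qitem *it;

	pthread_mutex_lock(&q_lock);
	it = q_head;
	if (it) {
		q_head = it->next;
		if (q_tail == it)
			q_tail = NULL;
	}
	pthread_mutex_unlock(&q_lock);
	return it;
}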
- * - * If this is not called by a dedicated thread, we top up our queue until - * there's no more readahead to submit, we've submitted the number given - * in target_outstanding_io or the number in progress exceeds the target - * outstanding I/O value. - * - * No mutex needed because this is only ever called by the first cpu. - **/ -static int toi_start_new_readahead(int dedicated_thread) -{ - int last_result, num_submitted = 0; - - /* Start a new readahead? */ - if (!more_readahead) - return 0; - - do { - last_result = toi_start_one_readahead(dedicated_thread); - - if (last_result) { - if (last_result == -ENOMEM || last_result == -ENOSPC) - return 0; - - printk(KERN_DEBUG - "Begin read chunk returned %d.\n", - last_result); - } else - num_submitted++; - - } while (more_readahead && !last_result && - (dedicated_thread || - (num_submitted < target_outstanding_io && - atomic_read(&toi_io_in_progress) < target_outstanding_io))); - - return last_result; -} - -/** - * bio_io_flusher - start the dedicated I/O flushing routine - * @writing: Whether we're writing the image. - **/ -static int bio_io_flusher(int writing) -{ - - if (writing) - return toi_bio_queue_flush_pages(1); - else - return toi_start_new_readahead(1); -} - -/** - * toi_bio_get_next_page_read - read a disk page, perhaps with readahead - * @no_readahead: Whether we can use readahead - * - * Read a page from disk, submitting readahead and cleaning up finished i/o - * while we wait for the page we're after. - **/ -static int toi_bio_get_next_page_read(int no_readahead) -{ - char *virt; - struct page *old_readahead_list_head; - - /* - * When reading the second page of the header, we have to - * delay submitting the read until after we've gotten the - * extents out of the first page. - */ - if (unlikely(no_readahead)) { - int result = toi_start_one_readahead(0); - if (result) { - printk(KERN_EMERG "No readahead and toi_start_one_readahead " - "returned non-zero.\n"); - return -EIO; - } - } - - if (unlikely(!readahead_list_head)) { - /* - * If the last page finishes exactly on the page - * boundary, we will be called one extra time and - * have no data to return. In this case, we should - * not BUG(), like we used to! - */ - if (!more_readahead) { - printk(KERN_EMERG "No more readahead.\n"); - return -ENOSPC; - } - if (unlikely(toi_start_one_readahead(0))) { - printk(KERN_EMERG "No readahead and " - "toi_start_one_readahead returned non-zero.\n"); - return -EIO; - } - } - - if (PageLocked(readahead_list_head)) { - waiting_on = readahead_list_head; - do_bio_wait(0); - } - - virt = page_address(readahead_list_head); - memcpy(toi_writer_buffer, virt, PAGE_SIZE); - - mutex_lock(&toi_bio_readahead_mutex); - old_readahead_list_head = readahead_list_head; - readahead_list_head = (struct page *) readahead_list_head->private; - mutex_unlock(&toi_bio_readahead_mutex); - toi__free_page(12, old_readahead_list_head); - return 0; -} - -/** - * toi_bio_queue_flush_pages - flush the queue of pages queued for writing - * @dedicated_thread: Whether we're a dedicated thread - * - * Flush the queue of pages ready to be written to disk. - * - * If we're a dedicated thread, stay in here until told to leave, - * sleeping in wait_event. - * - * The first thread is normally the only one to come in here. Another - * thread can enter this routine too, though, via throttle_if_needed. - * Since that's the case, we must be careful to only have one thread - * doing this work at a time. Otherwise we have a race and could save - * pages out of order. 
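/* Annotation (not part of the patch): the "only one flusher at a time"
 * rule described above, reduced to its core. A static mutex taken with
 * trylock means a second caller returns immediately instead of queueing
 * behind the first, so pages are only ever submitted by one thread, in
 * order. Userspace pthread analogue; the drain loop is a stand-in. */
#include <pthread.h>

static pthread_mutex_t busy = PTHREAD_MUTEX_INITIALIZER;
static int queued;		/* stand-in for the real page queue */

static int flush_queue(void)
{
	if (pthread_mutex_trylock(&busy) != 0)
		return 0;	/* someone else is already flushing */

	while (queued > 0)
		queued--;	/* submit one queued page */

	pthread_mutex_unlock(&busy);
	return 0;
}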
- * - * If an error occurs, free all remaining pages without submitting them - * for I/O. - **/ - -int toi_bio_queue_flush_pages(int dedicated_thread) -{ - unsigned long flags; - int result = 0; - static DEFINE_MUTEX(busy); - - if (!mutex_trylock(&busy)) - return 0; - -top: - spin_lock_irqsave(&bio_queue_lock, flags); - while (bio_queue_head) { - struct page *page = bio_queue_head; - bio_queue_head = (struct page *) page->private; - if (bio_queue_tail == page) - bio_queue_tail = NULL; - atomic_dec(&toi_bio_queue_size); - spin_unlock_irqrestore(&bio_queue_lock, flags); - - /* Don't generate more error messages if already had one */ - if (!result) - result = toi_bio_rw_page(WRITE, page, 0, 11); - /* - * If writing the page failed, don't drop out. - * Flush the rest of the queue too. - */ - if (result) - toi__free_page(11 , page); - spin_lock_irqsave(&bio_queue_lock, flags); - } - spin_unlock_irqrestore(&bio_queue_lock, flags); - - if (dedicated_thread) { - wait_event(toi_io_queue_flusher, bio_queue_head || - toi_bio_queue_flusher_should_finish); - if (likely(!toi_bio_queue_flusher_should_finish)) - goto top; - toi_bio_queue_flusher_should_finish = 0; - } - - mutex_unlock(&busy); - return result; -} - -/** - * toi_bio_get_new_page - get a new page for I/O - * @full_buffer: Pointer to a page to allocate. - **/ -static int toi_bio_get_new_page(char **full_buffer) -{ - int result = throttle_if_needed(THROTTLE_WAIT); - if (result) - return result; - - while (!*full_buffer) { - *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); - if (!*full_buffer) { - set_free_mem_throttle(); - do_bio_wait(3); - } - } - - return 0; -} - -/** - * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O - * @writing: Bool - whether writing (or reading). - * @buffer: The start of the buffer to write or fill. - * @buffer_size: The size of the buffer to write or fill. - * @no_readahead: Don't try to start readhead (when getting extents). - **/ -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead) -{ - int bytes_left = buffer_size, result = 0; - - while (bytes_left) { - char *source_start = buffer + buffer_size - bytes_left; - char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; - int capacity = PAGE_SIZE - toi_writer_buffer_posn; - char *to = writing ? dest_start : source_start; - char *from = writing ? source_start : dest_start; - - if (bytes_left <= capacity) { - memcpy(to, from, bytes_left); - toi_writer_buffer_posn += bytes_left; - return 0; - } - - /* Complete this page and start a new one */ - memcpy(to, from, capacity); - bytes_left -= capacity; - - if (!writing) { - /* - * Perform actual I/O: - * read readahead_list_head into toi_writer_buffer - */ - int result = toi_bio_get_next_page_read(no_readahead); - if (result && bytes_left) { - printk("toi_bio_get_next_page_read " - "returned %d. Expecting to read %d bytes.\n", result, bytes_left); - return result; - } - } else { - toi_bio_queue_write(&toi_writer_buffer); - result = toi_bio_get_new_page(&toi_writer_buffer); - if (result) { - printk(KERN_ERR "toi_bio_get_new_page returned " - "%d.\n", result); - return result; - } - } - - toi_writer_buffer_posn = 0; - toi_cond_pause(0, NULL); - } - - return 0; -} - -/** - * toi_bio_read_page - read a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). 
- * - * Read a (possibly compressed) page from the image, into buffer_page, - * returning its pfn and the buffer size. - **/ -static int toi_bio_read_page(unsigned long *pfn, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int result = 0; - int this_idx; - char *buffer_virt = TOI_MAP(buf_type, buffer_page); - - /* - * Only call start_new_readahead if we don't have a dedicated thread - * and we're the queue flusher. - */ - if (current == toi_queue_flusher && more_readahead && - !test_action_state(TOI_NO_READAHEAD)) { - int result2 = toi_start_new_readahead(0); - if (result2) { - printk(KERN_DEBUG "Queue flusher and " - "toi_start_one_readahead returned non-zero.\n"); - result = -EIO; - goto out; - } - } - - my_mutex_lock(0, &toi_bio_mutex); - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - * We can validly find there's nothing to read in a multithreaded - * situation. - */ - if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) || - toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) || - toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) { - result = -ENODATA; - goto out_unlock; - } - - if (reset_idx) { - page_idx = this_idx; - reset_idx = 0; - } else { - page_idx++; - if (!this_idx) - result = -ENODATA; - else if (page_idx != this_idx) - printk(KERN_ERR "Got page index %d, expected %d.\n", - this_idx, page_idx); - } - -out_unlock: - my_mutex_unlock(0, &toi_bio_mutex); -out: - TOI_UNMAP(buf_type, buffer_page); - return result; -} - -/** - * toi_bio_write_page - write a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used. - * - * Write a (possibly compressed) page to the image from the buffer, together - * with it's index and buffer size. - **/ -static int toi_bio_write_page(unsigned long pfn, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - char *buffer_virt; - int result = 0, result2 = 0; - - if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) - return 0; - - my_mutex_lock(1, &toi_bio_mutex); - - if (test_result_state(TOI_ABORTED)) { - my_mutex_unlock(1, &toi_bio_mutex); - return 0; - } - - buffer_virt = TOI_MAP(buf_type, buffer_page); - page_idx++; - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - */ - if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) || - toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) || - toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) { - printk(KERN_DEBUG "toi_rw_buffer returned non-zero to " - "toi_bio_write_page.\n"); - result = -EIO; - } - - TOI_UNMAP(buf_type, buffer_page); - my_mutex_unlock(1, &toi_bio_mutex); - - if (current == toi_queue_flusher) - result2 = toi_bio_queue_flush_pages(0); - - return result ? result : result2; -} - -/** - * _toi_rw_header_chunk - read or write a portion of the image header - * @writing: Whether reading or writing. - * @owner: The module for which we're writing. - * Used for confirming that modules - * don't use more header space than they asked for. - * @buffer: Address of the data to write. - * @buffer_size: Size of the data buffer. - * @no_readahead: Don't try to start readhead (when getting extents). - * - * Perform PAGE_SIZE I/O. Start readahead if needed. 
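/* Annotation (not part of the patch): a runnable sketch of the
 * coalescing done by toi_rw_buffer() above. Writes of arbitrary size
 * are packed into a page-sized staging buffer that is flushed whenever
 * it fills, so the device only ever sees whole pages. flush_page() is
 * an assumed helper standing in for the bio queue. */
#include <string.h>

#define PG 4096

static char page_buf[PG];
static int page_pos;
static int pages_flushed;

static int flush_page(void)
{
	pages_flushed++;	/* stand-in for queueing page_buf */
	return 0;
}

static int buffered_write(const char *data, int len)
{
	while (len) {
		int space = PG - page_pos;
		int chunk = len < space ? len : space;

		memcpy(page_buf + page_pos, data, chunk);
		page_pos += chunk;
		data += chunk;
		len -= chunk;

		if (page_pos == PG) {	/* page full: push it out */
			int ret = flush_page();

			if (ret)
				return ret;
			page_pos = 0;
		}
	}
	return 0;
}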
- **/ -static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int buffer_size, int no_readahead) -{ - int result = 0; - - if (owner) { - owner->header_used += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: %s : %d bytes (%d/%d) from offset %d.", - owner->name, - buffer_size, owner->header_used, - owner->header_requested, - toi_writer_buffer_posn); - if (owner->header_used > owner->header_requested && writing) { - printk(KERN_EMERG "TuxOnIce module %s is using more " - "header space (%u) than it requested (%u).\n", - owner->name, - owner->header_used, - owner->header_requested); - return buffer_size; - } - } else { - unowned += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: (No owner): %d bytes (%d total so far) from " - "offset %d.", buffer_size, unowned, - toi_writer_buffer_posn); - } - - if (!writing && !no_readahead && more_readahead) { - result = toi_start_new_readahead(0); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " - "returned %d.", result); - } - - if (!result) { - result = toi_rw_buffer(writing, buffer, buffer_size, - no_readahead); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " - "%d.", result); - } - - total_header_bytes += buffer_size; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " - "%d.", result); - return result; -} - -static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -static int toi_rw_header_chunk_noreadahead(int writing, - struct toi_module_ops *owner, char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -/** - * toi_bio_storage_needed - get the amount of storage needed for my fns - **/ -static int toi_bio_storage_needed(void) -{ - return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); -} - -/** - * toi_bio_save_config_info - save block I/O config to image header - * @buf: PAGE_SIZE'd buffer into which data should be saved. - **/ -static int toi_bio_save_config_info(char *buf) -{ - int *ints = (int *) buf; - ints[0] = target_outstanding_io; - return sizeof(int); -} - -/** - * toi_bio_load_config_info - restore block I/O config - * @buf: Data to be reloaded. - * @size: Size of the buffer saved. - **/ -static void toi_bio_load_config_info(char *buf, int size) -{ - int *ints = (int *) buf; - target_outstanding_io = ints[0]; -} - -void close_resume_dev_t(int force) -{ - if (!resume_block_device) - return; - - if (force) - atomic_set(&resume_bdev_open_count, 0); - else - atomic_dec(&resume_bdev_open_count); - - if (!atomic_read(&resume_bdev_open_count)) { - toi_close_bdev(resume_block_device); - resume_block_device = NULL; - } -} - -int open_resume_dev_t(int force, int quiet) -{ - if (force) { - close_resume_dev_t(1); - atomic_set(&resume_bdev_open_count, 1); - } else - atomic_inc(&resume_bdev_open_count); - - if (resume_block_device) - return 0; - - resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); - if (IS_ERR(resume_block_device)) { - if (!quiet) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to open device %x, where" - " the header should be found.", - resume_dev_t); - resume_block_device = NULL; - atomic_set(&resume_bdev_open_count, 0); - return 1; - } - - return 0; -} - -/** - * toi_bio_initialise - initialise bio code at start of some action - * @starting_cycle: Whether starting a hibernation cycle, or just reading or - * writing a sysfs value. 
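/* Annotation (not part of the patch): a sketch of the refcounted
 * open/close pattern used by open_resume_dev_t()/close_resume_dev_t()
 * above. The device is opened on the 0->1 transition and closed on
 * 1->0, so nested users share one handle. C11 atomics stand in for
 * atomic_t; like the original, this assumes open and close calls are
 * otherwise serialised. All names are illustrative. */
#include <stdatomic.h>
#include <stddef.h>

static atomic_int open_count;
static void *handle;
static int dummy;

static void *do_open(void) { return &dummy; }	/* stand-in for the real open */
static void do_close(void *h) { (void) h; }

static int shared_open(void)
{
	if (atomic_fetch_add(&open_count, 1) > 0)
		return 0;	/* already open, just took a reference */
	handle = do_open();
	return handle ? 0 : -1;
}

static void shared_close(void)
{
	if (atomic_fetch_sub(&open_count, 1) == 1 && handle) {
		do_close(handle);	/* last user closes for real */
		handle = NULL;
	}
}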
- **/ -static int toi_bio_initialise(int starting_cycle) -{ - int result; - - if (!starting_cycle || !resume_dev_t) - return 0; - - max_outstanding_writes = 0; - max_outstanding_reads = 0; - current_stream = 0; - toi_queue_flusher = current; -#ifdef MEASURE_MUTEX_CONTENTION - { - int i, j, k; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - for_each_online_cpu(k) - mutex_times[i][j][k] = 0; - } -#endif - result = open_resume_dev_t(0, 1); - - if (result) - return result; - - result = toi_bio_register_storage(); - - if (result) - return result; - - return get_signature_page(); -} - -static unsigned long raw_to_real(unsigned long raw) -{ - unsigned long extra; - - extra = (raw * (sizeof(unsigned long) + sizeof(int)) + - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); - - return raw > extra ? raw - extra : 0; -} - -static unsigned long toi_bio_storage_available(void) -{ - unsigned long sum = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " - "available from %s.", this_module->name); - sum += this_module->bio_allocator_ops->storage_available(); - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " - "pages (%d header pages).", sum, header_pages_reserved); - - return sum > header_pages_reserved ? - raw_to_real(sum - header_pages_reserved) : 0; - -} - -static unsigned long toi_bio_storage_allocated(void) -{ - return raw_pages_allocd > header_pages_reserved ? - raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; -} - -/* - * If we have read part of the image, we might have filled memory with - * data that should be zeroed out. - */ -static void toi_bio_noresume_reset(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); - toi_rw_cleanup(READ); - free_all_bdev_info(); -} - -/** - * toi_bio_cleanup - cleanup after some action - * @finishing_cycle: Whether completing a cycle. - **/ -static void toi_bio_cleanup(int finishing_cycle) -{ - if (!finishing_cycle) - return; - - if (toi_writer_buffer) { - toi_free_page(11, (unsigned long) toi_writer_buffer); - toi_writer_buffer = NULL; - } - - forget_signature_page(); - - if (header_block_device && toi_sig_data && - toi_sig_data->header_dev_t != resume_dev_t) - toi_close_bdev(header_block_device); - - header_block_device = NULL; - - close_resume_dev_t(0); -} - -static int toi_bio_write_header_init(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); - toi_rw_init(WRITE, 0); - toi_writer_buffer_posn = 0; - - /* Info needed to bootstrap goes at the start of the header. - * First we save the positions and devinfo, including the number - * of header pages. Then we save the structs containing data needed - * for reading the header pages back. - * Note that even if header pages take more than one page, when we - * read back the info, we will have restored the location of the - * next header page by the time we go to use it. - */ - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); - result = toi_serialise_extent_chains(); - - if (result) - return result; - - /* - * Signature page hasn't been modified at this point. Write it in - * the header so we can restore it later. 
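/* Annotation (not part of the patch): a worked instance of raw_to_real()
 * above, assuming a 64-bit build with 4 KiB pages (sizeof(unsigned long)
 * == 8, sizeof(int) == 4). Every stored page costs 12 bytes of pfn+size
 * metadata, so the overhead is roughly raw * 12 / 4108 pages. For
 * raw = 100000: extra = (100000 * 12 + 4109) / 4108 = 293 metadata
 * pages, leaving 99707 pages of real payload. */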
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page."); - return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops, - (char *) toi_cur_sig_page, - PAGE_SIZE); -} - -static int toi_bio_write_header_cleanup(void) -{ - int result = 0; - - if (toi_writer_buffer_posn) - toi_bio_queue_write(&toi_writer_buffer); - - result = toi_finish_all_io(); - - unowned = 0; - total_header_bytes = 0; - - /* Set signature to save we have an image */ - if (!result) - result = toi_bio_mark_have_image(); - - return result; -} - -/* - * toi_bio_read_header_init() - * - * Description: - * 1. Attempt to read the device specified with resume=. - * 2. Check the contents of the swap header for our signature. - * 3. Warn, ignore, reset and/or continue as appropriate. - * 4. If continuing, read the toi_swap configuration section - * of the header and set up block device info so we can read - * the rest of the header & image. - * - * Returns: - * May not return if user choose to reboot at a warning. - * -EINVAL if cannot resume at this time. Booting should continue - * normally. - */ - -static int toi_bio_read_header_init(void) -{ - int result = 0; - char buf[32]; - - toi_writer_buffer_posn = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init"); - - if (!toi_sig_data) { - printk(KERN_INFO "toi_bio_read_header_init called when we " - "haven't verified there is an image!\n"); - return -EINVAL; - } - - /* - * If the header is not on the resume_swap_dev_t, get the resume device - * first. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.", - toi_sig_data->header_dev_t); - if (toi_sig_data->have_uuid) { - struct fs_info seek; - dev_t device; - - strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16); - seek.dev_t = toi_sig_data->header_dev_t; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (device) { - printk("Using dev_t %s, returned by blk_lookup_fs_info.\n", - format_dev_t(buf, device)); - toi_sig_data->header_dev_t = device; - } - } - if (toi_sig_data->header_dev_t != resume_dev_t) { - header_block_device = toi_open_bdev(NULL, - toi_sig_data->header_dev_t, 1); - - if (IS_ERR(header_block_device)) - return PTR_ERR(header_block_device); - } else - header_block_device = resume_block_device; - - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - more_readahead = 1; - - /* - * Read toi_swap configuration. - * Headerblock size taken into account already. - */ - result = toi_bio_ops.bdev_page_io(READ, header_block_device, - toi_sig_data->first_header_block, - virt_to_page((unsigned long) toi_writer_buffer)); - if (result) - return result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains."); - result = toi_load_extent_chains(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page."); - toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - if (!toi_orig_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the current" - " image signature.\n"); - return -ENOMEM; - } - - return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops, - (char *) toi_orig_sig_page, - PAGE_SIZE); -} - -static int toi_bio_read_header_cleanup(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup."); - return toi_rw_cleanup(READ); -} - -/* Works only for digits and letters, but small and fast */ -#define TOLOWER(x) ((x) | 0x20) - -/* - * UUID must be 32 chars long. It may have dashes, but nothing - * else. 
- */ -char *uuid_from_commandline(char *commandline) -{ - int low = 0; - char *result = NULL, *output, *ptr; - - if (strncmp(commandline, "UUID=", 5)) - return NULL; - - result = kzalloc(17, GFP_KERNEL); - if (!result) { - printk("Failed to kzalloc UUID text memory.\n"); - return NULL; - } - - ptr = commandline + 5; - output = result; - - while (*ptr && (output - result) < 16) { - if (isxdigit(*ptr)) { - int value = isdigit(*ptr) ? *ptr - '0' : - TOLOWER(*ptr) - 'a' + 10; - if (low) { - *output += value; - output++; - } else { - *output = value << 4; - } - low = !low; - } else if (*ptr != '-') - break; - ptr++; - } - - if ((output - result) < 16 || *ptr) { - printk(KERN_DEBUG "Found resume=UUID=, but the value looks " - "invalid.\n"); - kfree(result); - result = NULL; - } - - return result; -} - -#define retry_if_fails(command) \ -do { \ - command; \ - if (!resume_dev_t && !waited_for_device_probe) { \ - wait_for_device_probe(); \ - command; \ - waited_for_device_probe = 1; \ - } \ -} while(0) - -/** - * try_to_open_resume_device: Try to parse and open resume= - * - * Any "swap:" has been stripped away and we just have the path to deal with. - * We attempt to do name_to_dev_t, open and stat the file. Having opened the - * file, get the struct block_device * to match. - */ -static int try_to_open_resume_device(char *commandline, int quiet) -{ - struct kstat stat; - int error = 0; - char *uuid = uuid_from_commandline(commandline); - int waited_for_device_probe = 0; - - resume_dev_t = MKDEV(0, 0); - - if (!strlen(commandline)) - retry_if_fails(toi_bio_scan_for_image(quiet)); - - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = resume_dev_t; - seek.last_mount_size = 0; - retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); - kfree(uuid); - } - - if (!resume_dev_t) - retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); - - if (!resume_dev_t) { - struct file *file = filp_open(commandline, - O_RDONLY|O_LARGEFILE, 0); - - if (!IS_ERR(file) && file) { - vfs_getattr(&file->f_path, &stat); - filp_close(file, NULL); - } else - error = vfs_stat(commandline, &stat); - if (!error) - resume_dev_t = stat.rdev; - } - - if (!resume_dev_t) { - if (quiet) - return 1; - - if (test_toi_state(TOI_TRYING_TO_RESUME)) - toi_early_boot_message(1, toi_translate_err_default, - "Failed to translate \"%s\" into a device id.\n", - commandline); - else - printk("TuxOnIce: Can't translate \"%s\" into a device " - "id yet.\n", commandline); - return 1; - } - - return open_resume_dev_t(1, quiet); -} - -/* - * Parse Image Location - * - * Attempt to parse a resume= parameter. - * Swap Writer accepts: - * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] - * - * Where: - * DEVNAME is convertable to a dev_t by name_to_dev_t - * FIRSTBLOCK is the location of the first block in the swap file - * (specifying for a swap partition is nonsensical but not prohibited). - * Data is validated by attempting to read a swap header from the - * location given. Failure will result in toi_swap refusing to - * save an image, and a reboot with correct parameters will be - * necessary. 
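/* Annotation (not part of the patch): a compact, slightly stricter
 * standalone variant of the nibble packing in uuid_from_commandline()
 * above. Hex digits are folded into bytes two at a time, dashes are
 * skipped, and anything else fails the parse (the original breaks out
 * instead). Not kernel code; names are illustrative. */
#include <ctype.h>

/* Returns 16 on success (bytes written to out), -1 on bad input. */
static int parse_uuid(const char *s, unsigned char *out)
{
	int nibbles = 0;

	for (; *s && nibbles < 32; s++) {
		int v;

		if (*s == '-')
			continue;		/* dashes are cosmetic */
		if (!isxdigit((unsigned char) *s))
			return -1;
		v = isdigit((unsigned char) *s) ? *s - '0' :
			tolower((unsigned char) *s) - 'a' + 10;
		if (nibbles & 1)
			out[nibbles / 2] |= v;		/* low nibble */
		else
			out[nibbles / 2] = v << 4;	/* high nibble */
		nibbles++;
	}
	return nibbles == 32 ? 16 : -1;		/* must be a full UUID */
}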
- */ -static int toi_bio_parse_sig_location(char *commandline, - int only_allocator, int quiet) -{ - char *thischar, *devstart, *colon = NULL; - int signature_found, result = -EINVAL, temp_result = 0; - - if (strncmp(commandline, "swap:", 5) && - strncmp(commandline, "file:", 5)) { - /* - * Failing swap:, we'll take a simple resume=/dev/hda2, or a - * blank value (scan) but fall through to other allocators - * if /dev/ or UUID= isn't matched. - */ - if (strncmp(commandline, "/dev/", 5) && - strncmp(commandline, "UUID=", 5) && - strlen(commandline)) - return 1; - } else - commandline += 5; - - devstart = commandline; - thischar = commandline; - while ((*thischar != ':') && (*thischar != '@') && - ((thischar - commandline) < 250) && (*thischar)) - thischar++; - - if (*thischar == ':') { - colon = thischar; - *colon = 0; - thischar++; - } - - while ((thischar - commandline) < 250 && *thischar) - thischar++; - - if (colon) { - unsigned long block; - temp_result = kstrtoul(colon + 1, 0, &block); - if (!temp_result) - resume_firstblock = (int) block; - } else - resume_firstblock = 0; - - clear_toi_state(TOI_CAN_HIBERNATE); - clear_toi_state(TOI_CAN_RESUME); - - if (!temp_result) - temp_result = try_to_open_resume_device(devstart, quiet); - - if (colon) - *colon = ':'; - - /* No error if we only scanned */ - if (temp_result) - return strlen(commandline) ? -EINVAL : 1; - - signature_found = toi_bio_image_exists(quiet); - - if (signature_found != -1) { - result = 0; - /* - * TODO: If only file storage, CAN_HIBERNATE should only be - * set if file allocator's target is valid. - */ - set_toi_state(TOI_CAN_HIBERNATE); - set_toi_state(TOI_CAN_RESUME); - } else - if (!quiet) - printk(KERN_ERR "TuxOnIce: Block I/O: No " - "signature found at %s.\n", devstart); - - return result; -} - -static void toi_bio_release_storage(void) -{ - header_pages_reserved = 0; - raw_pages_allocd = 0; - - free_all_bdev_info(); -} - -/* toi_bio_remove_image - * - */ -static int toi_bio_remove_image(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); - - result = toi_bio_restore_original_signature(); - - /* - * We don't do a sanity check here: we want to restore the swap - * whichever version of the kernel made the hibernate image. - * - * We need to write swap, but swap may not be enabled so - * we write the device directly - * - * If we don't have a current_signature_page, we didn't - * read an image header, so don't change anything.
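A user-space sketch of the same split-at-the-colon technique the parser uses (the string "/dev/sda2:682" is hypothetical): temporarily NUL-terminate at the colon, convert the suffix, then put the colon back so the caller's string is untouched.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        char cmdline[] = "/dev/sda2:682";
        char *colon = strchr(cmdline, ':');
        unsigned long firstblock = 0;

        if (colon) {
            *colon = '\0';                        /* split DEVNAME from FIRSTBLOCK */
            firstblock = strtoul(colon + 1, NULL, 0); /* base 0, as kstrtoul above */
        }
        printf("dev=%s firstblock=%lu\n", cmdline, firstblock);
        if (colon)
            *colon = ':';                         /* restore, as the parser does */
        return 0;
    }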
- */ - - toi_bio_release_storage(); - - return result; -} - -struct toi_bio_ops toi_bio_ops = { - .bdev_page_io = toi_bdev_page_io, - .register_storage = toi_register_storage_chain, - .free_storage = toi_bio_release_storage, -}; - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, - 0, 16384, 0, NULL), -}; - -struct toi_module_ops toi_blockwriter_ops = { - .type = WRITER_MODULE, - .name = "block i/o", - .directory = "block_io", - .module = THIS_MODULE, - .memory_needed = toi_bio_memory_needed, - .print_debug_info = toi_bio_print_debug_stats, - .storage_needed = toi_bio_storage_needed, - .save_config_info = toi_bio_save_config_info, - .load_config_info = toi_bio_load_config_info, - .initialise = toi_bio_initialise, - .cleanup = toi_bio_cleanup, - .post_atomic_restore = toi_bio_chains_post_atomic, - - .rw_init = toi_rw_init, - .rw_cleanup = toi_rw_cleanup, - .read_page = toi_bio_read_page, - .write_page = toi_bio_write_page, - .rw_header_chunk = toi_rw_header_chunk, - .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, - .io_flusher = bio_io_flusher, - .update_throughput_throttle = update_throughput_throttle, - .finish_all_io = toi_finish_all_io, - - .noresume_reset = toi_bio_noresume_reset, - .storage_available = toi_bio_storage_available, - .storage_allocated = toi_bio_storage_allocated, - .reserve_header_space = toi_bio_reserve_header_space, - .allocate_storage = toi_bio_allocate_storage, - .free_unused_storage = toi_bio_free_unused_storage, - .image_exists = toi_bio_image_exists, - .mark_resume_attempted = toi_bio_mark_resume_attempted, - .write_header_init = toi_bio_write_header_init, - .write_header_cleanup = toi_bio_write_header_cleanup, - .read_header_init = toi_bio_read_header_init, - .read_header_cleanup = toi_bio_read_header_cleanup, - .get_header_version = toi_bio_get_header_version, - .remove_image = toi_bio_remove_image, - .parse_sig_location = toi_bio_parse_sig_location, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/** - * toi_block_io_load - load time routine for block I/O module - * - * Register block i/o ops and sysfs entries. - **/ -static __init int toi_block_io_load(void) -{ - return toi_register_module(&toi_blockwriter_ops); -} - -late_initcall(toi_block_io_load); diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h deleted file mode 100644 index 5e1964a61..000000000 --- a/kernel/power/tuxonice_bio_internal.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * kernel/power/tuxonice_bio_internal.h - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
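The registration above follows a common pattern: a statically initialised ops table handed to the core once, at late_initcall time. A compilable skeleton of that pattern, with hypothetical names (demo_module_ops, demo_register_module) standing in for the TuxOnIce types:

    /* Skeleton of an ops-table module registration; names are illustrative. */
    struct demo_module_ops {
        const char *name;
        int (*initialise)(int starting_cycle);
        void (*cleanup)(int ending_cycle);
    };

    static int demo_initialise(int starting_cycle) { return 0; }
    static void demo_cleanup(int ending_cycle) { }

    static struct demo_module_ops demo_ops = {
        .name = "demo writer",
        .initialise = demo_initialise,
        .cleanup = demo_cleanup,
    };

    static struct demo_module_ops *registered;

    /* Stands in for toi_register_module(); the real core keeps a list. */
    static int demo_register_module(struct demo_module_ops *ops)
    {
        registered = ops;
        return 0;
    }

    int main(void) { return demo_register_module(&demo_ops); }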
- */ - -/* Extent chains */ -void toi_extent_state_goto_start(void); -void toi_extent_state_save(int slot); -int go_next_page(int writing, int section_barrier); -void toi_extent_state_restore(int slot); -void free_all_bdev_info(void); -int devices_of_same_priority(struct toi_bdev_info *this); -int toi_register_storage_chain(struct toi_bdev_info *new); -int toi_serialise_extent_chains(void); -int toi_load_extent_chains(void); -int toi_bio_rw_page(int writing, struct page *page, int is_readahead, - int free_group); -int toi_bio_restore_original_signature(void); -int toi_bio_devinfo_storage_needed(void); -unsigned long get_headerblock(void); -dev_t get_header_dev_t(void); -struct block_device *get_header_bdev(void); -int toi_bio_allocate_storage(unsigned long request); -void toi_bio_free_unused_storage(void); - -/* Signature functions */ -#define HaveImage "HaveImage" -#define NoImage "TuxOnIce" -#define sig_size (sizeof(HaveImage)) - -struct sig_data { - char sig[sig_size]; - int have_image; - int resumed_before; - - char have_uuid; - char header_uuid[17]; - dev_t header_dev_t; - unsigned long first_header_block; - - /* Repeat the signature to be sure we have a header version */ - char sig2[sig_size]; - int header_version; -}; - -void forget_signature_page(void); -int toi_check_for_signature(void); -int toi_bio_image_exists(int quiet); -int get_signature_page(void); -int toi_bio_mark_resume_attempted(int); -extern char *toi_cur_sig_page; -extern char *toi_orig_sig_page; -int toi_bio_mark_have_image(void); -extern struct sig_data *toi_sig_data; -extern dev_t resume_dev_t; -extern struct block_device *resume_block_device; -extern struct block_device *header_block_device; -extern unsigned long resume_firstblock; - -struct block_device *open_bdev(dev_t device, int display_errs); -extern int current_stream; -extern int more_readahead; -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group); -int get_main_pool_phys_params(void); - -void toi_close_bdev(struct block_device *bdev); -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs); - -extern struct toi_module_ops toi_blockwriter_ops; -void dump_block_chains(void); -void debug_broken_header(void); -extern unsigned long raw_pages_allocd, header_pages_reserved; -int toi_bio_chains_debug_info(char *buffer, int size); -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); -int toi_bio_scan_for_image(int quiet); -int toi_bio_get_header_version(void); - -void close_resume_dev_t(int force); -int open_resume_dev_t(int force, int quiet); - -struct toi_incremental_image_pointer_saved_data { - unsigned long block; - int chain; -}; - -struct toi_incremental_image_pointer { - struct toi_incremental_image_pointer_saved_data save; - struct block_device *bdev; - unsigned long block; -}; - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr); -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr); diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c deleted file mode 100644 index f5418f092..000000000 --- a/kernel/power/tuxonice_bio_signature.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * kernel/power/tuxonice_bio_signature.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. 
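To see how the on-disk signature block is laid out, a user-space replica of struct sig_data can print its field offsets. dev_t is stood in by unsigned int here, so the numbers are illustrative, not authoritative:

    #include <stdio.h>
    #include <stddef.h>

    #define SIG_SIZE sizeof("HaveImage")   /* matches sig_size above */

    /* User-space replica of struct sig_data for layout inspection only. */
    struct sig_data_demo {
        char sig[SIG_SIZE];
        int have_image;
        int resumed_before;
        char have_uuid;
        char header_uuid[17];
        unsigned int header_dev_t;         /* stand-in for dev_t */
        unsigned long first_header_block;
        char sig2[SIG_SIZE];               /* repeated signature */
        int header_version;
    };

    int main(void)
    {
        printf("sig at %zu, sig2 at %zu, total %zu bytes\n",
               offsetof(struct sig_data_demo, sig),
               offsetof(struct sig_data_demo, sig2),
               sizeof(struct sig_data_demo));
        return 0;
    }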
- * - */ - -#include - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -struct sig_data *toi_sig_data; - -/* Structs for examining the swap header page */ - -struct old_sig_data { - dev_t device; - unsigned long sector; - int resume_attempted; - int orig_sig_type; -}; - -union diskpage { - union swap_header swh; /* swh.magic is the only member used */ - struct sig_data sig_data; - struct old_sig_data old_sig_data; -}; - -union p_diskpage { - union diskpage *pointer; - char *ptr; - unsigned long address; -}; - -char *toi_cur_sig_page; -char *toi_orig_sig_page; -int have_image; -int have_old_image; - -int get_signature_page(void) -{ - if (!toi_cur_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Allocating current signature page."); - toi_cur_sig_page = (char *) toi_get_zeroed_page(38, - TOI_ATOMIC_GFP); - if (!toi_cur_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the " - "current image signature.\n"); - return -ENOMEM; - } - - toi_sig_data = (struct sig_data *) toi_cur_sig_page; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," - " sector %d.", - resume_block_device->bd_dev, resume_firstblock); - - return toi_bio_ops.bdev_page_io(READ, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -void forget_signature_page(void) -{ - if (toi_cur_sig_page) { - toi_sig_data = NULL; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" - " (%p).", toi_cur_sig_page); - toi_free_page(38, (unsigned long) toi_cur_sig_page); - toi_cur_sig_page = NULL; - } - - if (toi_orig_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" - " (%p).", toi_orig_sig_page); - toi_free_page(38, (unsigned long) toi_orig_sig_page); - toi_orig_sig_page = NULL; - } -} - -/* - * We need to ensure we use the signature page that's currently on disk, - * so as not to remove the image header. Post-atomic-restore, the orig sig - * page will be empty, so we can use that as our method of knowing that we - * need to load the on-disk signature and not use the non-image sig in - * memory. (We're going to power down after writing the change, so it's safe.)
- */ -int toi_bio_mark_resume_attempted(int flag) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", - flag); - if (!toi_orig_sig_page) { - forget_signature_page(); - get_signature_page(); - } - toi_sig_data->resumed_before = flag; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int toi_bio_mark_have_image(void) -{ - int result = 0; - char buf[32]; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); - memcpy(toi_sig_data->sig, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->have_image = 1; - toi_sig_data->resumed_before = 0; - toi_sig_data->header_dev_t = get_header_dev_t(); - toi_sig_data->have_uuid = 0; - - fs_info = fs_info_from_block_dev(get_header_bdev()); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!result) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - toi_sig_data->have_uuid = 1; - } else - toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " - "dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - - toi_sig_data->first_header_block = get_headerblock(); - have_image = 1; - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block " - "is %d.", toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - - memcpy(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->header_version = TOI_HEADER_VERSION; - - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int remove_old_signature(void) -{ - union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; - char *orig_sig; - char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - int result; - struct block_device *header_bdev; - struct old_sig_data *old_sig_data = - &swap_header_page.pointer->old_sig_data; - - header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); - result = toi_bio_ops.bdev_page_io(READ, header_bdev, - old_sig_data->sector, virt_to_page(header_start)); - - if (result) - goto out; - - /* - * TODO: Get the original contents of the first bytes of the swap - * header page. - */ - if (!old_sig_data->orig_sig_type) - orig_sig = "SWAP-SPACE"; - else - orig_sig = "SWAPSPACE2"; - - memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); - memcpy(swap_header_page.ptr, header_start, 10); - - result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(swap_header_page.ptr)); - -out: - toi_close_bdev(header_bdev); - have_old_image = 0; - toi_free_page(38, (unsigned long) header_start); - return result; -} - -/* - * toi_bio_restore_original_signature - restore the original signature - * - * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. - * It will have the original signature page contents, stored in the image - * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain - * the contents that were loaded when we started the cycle. - */ -int toi_bio_restore_original_signature(void) -{ - char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; - - if (have_old_image) - return remove_old_signature(); - - if (!use) { - printk("toi_bio_restore_original_signature: No signature " - "page loaded.\n"); - return 0; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); - have_image = 0; - toi_sig_data->have_image = 0; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(use)); -} - -/* - * check_for_signature - See whether we have an image. - * - * Returns 0 if no image, 1 if there is one, -1 if indeterminate. - */ -int toi_check_for_signature(void) -{ - union p_diskpage swap_header_page; - int type; - const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; - const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; - char *swap_header; - - if (!toi_cur_sig_page) { - int result = get_signature_page(); - - if (result) - return result; - } - - /* - * Start by looking for the binary header. - */ - if (!memcmp(tuxonice_signature, toi_cur_sig_page, - sizeof(tuxonice_signature))) { - have_image = toi_sig_data->have_image; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " - "Have image is %d.", have_image); - if (have_image) - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " - "%x. First block is %d.", - toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - return toi_sig_data->have_image; - } - - /* - * Failing that, try old file allocator headers. - */ - - if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { - have_image = 1; - return 1; - } - - have_image = 0; - - if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) - return 0; - - /* - * Nope? How about swap? - */ - swap_header_page = (union p_diskpage) toi_cur_sig_page; - swap_header = swap_header_page.pointer->swh.magic.magic; - - /* Normal swapspace? */ - for (type = 0; type < 2; type++) - if (!memcmp(normal_sigs[type], swap_header, - strlen(normal_sigs[type]))) - return 0; - - /* Swsusp or uswsusp? */ - for (type = 0; type < 3; type++) - if (!memcmp(swsusp_sigs[type], swap_header, - strlen(swsusp_sigs[type]))) - return 2; - - /* Old TuxOnIce version? */ - if (!memcmp(tuxonice_signature, swap_header, - sizeof(tuxonice_signature) - 1)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " - "signature."); - have_old_image = 1; - return 3; - } - - return -1; -} - -/* - * Image_exists - * - * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). - */ -int toi_bio_image_exists(int quiet) -{ - int result; - char *msg = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); - - if (!resume_dev_t) { - if (!quiet) - printk(KERN_INFO "Not even trying to read header " - "because resume_dev_t is not set.\n"); - return -1; - } - - if (open_resume_dev_t(0, quiet)) - return -1; - - result = toi_check_for_signature(); - - clear_toi_state(TOI_RESUMED_BEFORE); - if (toi_sig_data->resumed_before) - set_toi_state(TOI_RESUMED_BEFORE); - - if (quiet || result == -ENOMEM) - return result; - - if (result == -1) - msg = "TuxOnIce: Unable to find a signature." 
- " Could you have moved a swap file?\n"; - else if (!result) - msg = "TuxOnIce: No image found.\n"; - else if (result == 1) - msg = "TuxOnIce: Image found.\n"; - else if (result == 2) - msg = "TuxOnIce: uswsusp or swsusp image found.\n"; - else if (result == 3) - msg = "TuxOnIce: Old implementation's signature found.\n"; - - printk(KERN_INFO "%s", msg); - - return result; -} - -int toi_bio_scan_for_image(int quiet) -{ - struct block_device *bdev; - char default_name[255] = ""; - - if (!quiet) - printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " - "signature...\n"); - for (bdev = next_bdev_of_type(NULL, "swap"); bdev; - bdev = next_bdev_of_type(bdev, "swap")) { - int result; - char name[255] = ""; - sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - if (!quiet) - printk(KERN_DEBUG "- Trying %s.\n", name); - resume_block_device = bdev; - resume_dev_t = bdev->bd_dev; - - result = toi_check_for_signature(); - - resume_block_device = NULL; - resume_dev_t = MKDEV(0, 0); - - if (!default_name[0]) - strcpy(default_name, name); - - if (result == 1) { - /* Got one! */ - strcpy(resume_file, name); - next_bdev_of_type(bdev, NULL); - if (!quiet) - printk(KERN_DEBUG " ==> Image found on %s.\n", - resume_file); - return 1; - } - forget_signature_page(); - } - - if (!quiet) - printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); - strcpy(resume_file, default_name); - return 0; -} - -int toi_bio_get_header_version(void) -{ - return (memcmp(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature))) ? - 0 : toi_sig_data->header_version; - -} diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c deleted file mode 100644 index 22bf07a43..000000000 --- a/kernel/power/tuxonice_builtin.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tuxonice_io.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_netlink.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_ui.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_power_off.h" -#include "tuxonice_alloc.h" - -unsigned long toi_bootflags_mask; - -/* - * Highmem related functions (x86 only). - */ - -#ifdef CONFIG_HIGHMEM - -/** - * copyback_high: Restore highmem pages. - * - * Highmem data and pbe lists are/can be stored in highmem. - * The format is slightly different to the lowmem pbe lists - * used for the assembly code: the last pbe in each page is - * a struct page * instead of struct pbe *, pointing to the - * next page where pbes are stored (or NULL if happens to be - * the end of the list). Since we don't want to generate - * unnecessary deltas against swsusp code, we use a cast - * instead of a union. 
- **/ - -static void copyback_high(void) -{ - struct page *pbe_page = (struct page *) restore_highmem_pblist; - struct pbe *this_pbe, *first_pbe; - unsigned long *origpage, *copypage; - int pbe_index = 1; - - if (!pbe_page) - return; - - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - - while (this_pbe) { - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; - - origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); - copypage = kmap_atomic((struct page *) this_pbe->address); - - while (loop >= 0) { - *(origpage + loop) = *(copypage + loop); - loop--; - } - - kunmap_atomic(origpage); - kunmap_atomic(copypage); - - if (!this_pbe->next) - break; - - if (pbe_index < PBES_PER_PAGE) { - this_pbe++; - pbe_index++; - } else { - pbe_page = (struct page *) this_pbe->next; - kunmap_atomic(first_pbe); - if (!pbe_page) - return; - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - pbe_index = 1; - } - } - kunmap_atomic(first_pbe); -} - -#else /* CONFIG_HIGHMEM */ -static void copyback_high(void) { } -#endif - -char toi_wait_for_keypress_dev_console(int timeout) -{ - int fd, this_timeout = 255, orig_kthread = 0; - char key = '\0'; - struct termios t, t_backup; - - /* We should be guaranteed /dev/console exists after populate_rootfs() - * in init/main.c. - */ - fd = sys_open("/dev/console", O_RDONLY, 0); - if (fd < 0) { - printk(KERN_INFO "Couldn't open /dev/console.\n"); - return key; - } - - if (sys_ioctl(fd, TCGETS, (long)&t) < 0) - goto out_close; - - memcpy(&t_backup, &t, sizeof(t)); - - t.c_lflag &= ~(ISIG|ICANON|ECHO); - t.c_cc[VMIN] = 0; - -new_timeout: - if (timeout > 0) { - this_timeout = timeout < 26 ? timeout : 25; - timeout -= this_timeout; - this_timeout *= 10; - } - - t.c_cc[VTIME] = this_timeout; - - if (sys_ioctl(fd, TCSETS, (long)&t) < 0) - goto out_restore; - - if (current->flags & PF_KTHREAD) { - orig_kthread = (current->flags & PF_KTHREAD); - current->flags &= ~PF_KTHREAD; - } - - while (1) { - if (sys_read(fd, &key, 1) <= 0) { - if (timeout) - goto new_timeout; - key = '\0'; - break; - } - key = tolower(key); - if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { - if (key == 'c') { - set_toi_state(TOI_CONTINUE_REQ); - break; - } else if (key == ' ') - break; - } else - break; - } - if (orig_kthread) { - current->flags |= PF_KTHREAD; - } - -out_restore: - sys_ioctl(fd, TCSETS, (long)&t_backup); -out_close: - sys_close(fd); - - return key; -} - -struct toi_boot_kernel_data toi_bkd __nosavedata - __attribute__((aligned(PAGE_SIZE))) = { - MY_BOOT_KERNEL_DATA_VERSION, - 0, -#ifdef CONFIG_TOI_REPLACE_SWSUSP - (1 << TOI_REPLACE_SWSUSP) | -#endif - (1 << TOI_NO_FLUSHER_THREAD) | - (1 << TOI_PAGESET2_FULL), -}; - -struct block_device *toi_open_by_devnum(dev_t dev) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); - return err ? ERR_PTR(err) : bdev; -} - -/** - * toi_close_bdev: Close a swap bdev. - * - * int: The swap entry number to close. 
- */ -void toi_close_bdev(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); -} - -int toi_wait = CONFIG_TOI_DEFAULT_WAIT; -struct toi_core_fns *toi_core_fns; -unsigned long toi_result; -struct pagedir pagedir1 = {1}; -struct toi_cbw **toi_first_cbw; -int toi_next_cbw; - -unsigned long toi_get_nonconflicting_page(void) -{ - return toi_core_fns->get_nonconflicting_page(); -} - -int toi_post_context_save(void) -{ - return toi_core_fns->post_context_save(); -} - -int try_tuxonice_hibernate(void) -{ - if (!toi_core_fns) - return -ENODEV; - - return toi_core_fns->try_hibernate(); -} - -static int num_resume_calls; -#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL -static int ignore_late_initcall = 1; -#else -static int ignore_late_initcall; -#endif - -int toi_translate_err_default = TOI_CONTINUE_REQ; - -void try_tuxonice_resume(void) -{ - if (!hibernation_available()) - return; - - /* Don't let it wrap around eventually */ - if (num_resume_calls < 2) - num_resume_calls++; - - if (num_resume_calls == 1 && ignore_late_initcall) { - printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); - return; - } - - if (toi_core_fns) - toi_core_fns->try_resume(); - else - printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); -} - -int toi_lowlevel_builtin(void) -{ - int error = 0; - - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "Error %d hibernating\n", error); - - /* Restore control flow appears here */ - if (!toi_in_hibernate) { - copyback_high(); - set_toi_state(TOI_NOW_RESUMING); - } - - restore_processor_state(); - return error; -} - -unsigned long toi_compress_bytes_in; -unsigned long toi_compress_bytes_out; - -int toi_in_suspend(void) -{ - return in_suspend; -} - -unsigned long toi_state = ((1 << TOI_BOOT_TIME) | - (1 << TOI_IGNORE_LOGLEVEL) | - (1 << TOI_IO_STOPPED)); - -/* The number of hibernates we have started (some may have been cancelled) */ -unsigned int nr_hibernates; -int toi_running; -__nosavedata int toi_in_hibernate; -__nosavedata struct pbe *restore_highmem_pblist; - -int toi_trace_allocs; - -void toi_read_lock_tasklist(void) -{ - read_lock(&tasklist_lock); -} - -void toi_read_unlock_tasklist(void) -{ - read_unlock(&tasklist_lock); -} - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -int (*toi_flag_zram_disks) (void); - -int toi_do_flag_zram_disks(void) -{ - return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0; -} - -#endif - -/* toi_generate_free_page_map - * - * Description: This routine generates a bitmap of free pages from the - * lists used by the memory manager. We then use the bitmap - * to quickly calculate which pages to save and in which - * pagesets. 
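A minimal user-space sketch of the clear-then-mark pass described above, assuming a toy bitmap rather than the kernel's memory bitmaps: every page starts "not free", then each order-sized block found on a free list marks 1UL << order consecutive pages.

    #include <stdio.h>
    #include <string.h>

    #define NPAGES 64
    static unsigned char free_map[NPAGES / 8];

    static void set_free(unsigned long pfn)
    {
        free_map[pfn / 8] |= 1 << (pfn % 8);
    }

    /* Mark 1UL << order pages starting at pfn, as the loop over
     * free_area[order].free_list[] does above. */
    static void mark_block(unsigned long pfn, int order)
    {
        for (unsigned long j = 0; j < (1UL << order); j++)
            set_free(pfn + j);
    }

    int main(void)
    {
        memset(free_map, 0, sizeof(free_map)); /* the ClearPageNosaveFree pass */
        mark_block(8, 2);                      /* e.g. an order-2 block at pfn 8 */
        mark_block(32, 0);
        for (int i = 0; i < NPAGES; i++)
            if (free_map[i / 8] & (1 << (i % 8)))
                printf("pfn %d free\n", i);
        return 0;
    }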
- */ -void toi_generate_free_page_map(void) -{ - int order, cpu, t; - unsigned long flags, i; - struct zone *zone; - struct list_head *curr; - unsigned long pfn; - struct page *page; - - for_each_populated_zone(zone) { - - if (!zone->spanned_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - - for (i = 0; i < zone->spanned_pages; i++) { - pfn = zone->zone_start_pfn + i; - - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - - ClearPageNosaveFree(page); - } - - for_each_migratetype_order(order, t) { - list_for_each(curr, - &zone->free_area[order].free_list[t]) { - unsigned long j; - - pfn = page_to_pfn(list_entry(curr, struct page, - lru)); - for (j = 0; j < (1UL << order); j++) - SetPageNosaveFree(pfn_to_page(pfn + j)); - } - } - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - struct page *page; - int t; - - for (t = 0; t < MIGRATE_PCPTYPES; t++) - list_for_each_entry(page, &pcp->lists[t], lru) - SetPageNosaveFree(page); - } - - spin_unlock_irqrestore(&zone->lock, flags); - } -} - -/* toi_size_of_free_region - * - * Description: Return the number of pages that are free, beginning with and - * including this one. - */ -int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn) -{ - unsigned long this_pfn = start_pfn, - end_pfn = zone_end_pfn(zone); - - while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) - this_pfn++; - - return this_pfn - start_pfn; -} - -static int __init toi_wait_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) { - if (value < -1 || value > 255) - printk(KERN_INFO "TuxOnIce_wait outside range -1 to " - "255.\n"); - else - toi_wait = value; - } - - return 1; -} -__setup("toi_wait", toi_wait_setup); - -static int __init toi_translate_retry_setup(char *str) -{ - toi_translate_err_default = 0; - return 1; -} -__setup("toi_translate_retry", toi_translate_retry_setup); - -static int __init toi_debug_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_LOGALL); - toi_bootflags_mask |= (1 << TOI_LOGALL); - toi_bkd.toi_debug_state = 255; - toi_bkd.toi_default_console_level = 7; - return 1; -} -__setup("toi_debug_setup", toi_debug_setup); - -static int __init toi_pause_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_PAUSE); - toi_bootflags_mask |= (1 << TOI_PAUSE); - return 1; -} -__setup("toi_pause", toi_pause_setup); - -#ifdef CONFIG_PM_DEBUG -static int __init toi_trace_allocs_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - toi_trace_allocs = value; - - return 1; -} -__setup("toi_trace_allocs", toi_trace_allocs_setup); -#endif - -static int __init toi_ignore_late_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - ignore_late_initcall = value; - - return 1; -} -__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); - -static int __init toi_force_no_multithreaded_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); - toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); - - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); - - return 1; -} -__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); - -#ifdef CONFIG_KGDB -static int __init toi_post_resume_breakpoint_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); - toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); - if 
(sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); - - return 1; -} -__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); -#endif - -static int __init toi_disable_readahead_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); - toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); - - return 1; -} -__setup("toi_no_readahead", toi_disable_readahead_setup); diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h deleted file mode 100644 index 9539818e0..000000000 --- a/kernel/power/tuxonice_builtin.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include - -extern struct toi_core_fns *toi_core_fns; -extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; -extern unsigned int nr_hibernates; -extern int toi_in_hibernate; - -extern __nosavedata struct pbe *restore_highmem_pblist; - -int toi_lowlevel_builtin(void); - -#ifdef CONFIG_HIGHMEM -extern __nosavedata struct zone_data *toi_nosave_zone_list; -extern __nosavedata unsigned long toi_nosave_max_pfn; -#endif - -extern unsigned long toi_get_nonconflicting_page(void); -extern int toi_post_context_save(void); - -extern char toi_wait_for_keypress_dev_console(int timeout); -extern struct block_device *toi_open_by_devnum(dev_t dev); -extern void toi_close_bdev(struct block_device *bdev); -extern int toi_wait; -extern int toi_translate_err_default; -extern int toi_force_no_multithreaded; -extern void toi_read_lock_tasklist(void); -extern void toi_read_unlock_tasklist(void); -extern int toi_in_suspend(void); -extern void toi_generate_free_page_map(void); -extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn); - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -extern int toi_do_flag_zram_disks(void); -#else -#define toi_do_flag_zram_disks() (0) -#endif diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c deleted file mode 100644 index 1c4e10c72..000000000 --- a/kernel/power/tuxonice_checksum.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
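The idea is simple: checksum every pageset-2 page before the atomic copy, re-checksum afterwards, and resave any page whose digest changed. A minimal user-space sketch of that compare-and-resave step, using an FNV-1a stand-in rather than the MD4 this module actually configures:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SZ 4096

    /* Stand-in digest; the module below uses cryptoapi ("md4") instead. */
    static uint32_t page_csum(const unsigned char *p)
    {
        uint32_t h = 2166136261u;                /* FNV-1a */
        for (size_t i = 0; i < PAGE_SZ; i++)
            h = (h ^ p[i]) * 16777619u;
        return h;
    }

    int main(void)
    {
        static unsigned char page[PAGE_SZ];
        uint32_t before = page_csum(page);       /* like tuxonice_calc_checksum() */
        page[100] = 0xff;                        /* page modified while "saving" */
        if (before != page_csum(page))
            printf("resave needed\n");           /* like the SetPageResave() path */
        return 0;
    }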
- */ - -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" - -static struct toi_module_ops toi_checksum_ops; - -/* Constant at the mo, but I might allow tuning later */ -static char toi_checksum_name[32] = "md4"; -/* Bytes per checksum */ -#define CHECKSUM_SIZE (16) - -#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) - -struct cpu_context { - struct crypto_hash *transform; - struct hash_desc desc; - struct scatterlist sg[2]; - char *buf; -}; - -static DEFINE_PER_CPU(struct cpu_context, contexts); -static int pages_allocated; -static unsigned long page_list; - -static int toi_num_resaved; - -static unsigned long this_checksum, next_page; -static int checksum_count; - -static inline int checksum_pages_needed(void) -{ - return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); -} - -/* ---- Local buffer management ---- */ - -/* - * toi_checksum_cleanup - * - * Frees memory allocated for our labours. - */ -static void toi_checksum_cleanup(int ending_cycle) -{ - int cpu; - - if (ending_cycle) { - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_hash(this->transform); - this->transform = NULL; - this->desc.tfm = NULL; - } - - if (this->buf) { - toi_free_page(27, (unsigned long) this->buf); - this->buf = NULL; - } - } - } -} - -/* - * toi_crypto_initialise - * - * Prepare to do some work by allocating buffers and transforms. - * Returns: Int: Zero. Even if we can't set up checksum, we still - * seek to hibernate. - */ -static int toi_checksum_initialise(int starting_cycle) -{ - int cpu; - - if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) - return 0; - - if (!*toi_checksum_name) { - printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - struct page *page; - - this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s checksum algorithm: %ld.\n", - toi_checksum_name, (long) this->transform); - this->transform = NULL; - return 1; - } - - this->desc.tfm = this->transform; - this->desc.flags = 0; - - page = toi_alloc_page(27, GFP_KERNEL); - if (!page) - return 1; - this->buf = page_address(page); - sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); - } - return 0; -} - -/* - * toi_checksum_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_checksum_print_debug_stats(char *buffer, int size) -{ - int len; - - if (!toi_checksum_ops.enabled) - return scnprintf(buffer, size, - "- Checksumming disabled.\n"); - - len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", - toi_checksum_name); - len += scnprintf(buffer + len, size - len, - " %d pages resaved in atomic copy.\n", toi_num_resaved); - return len; -} - -static int toi_checksum_memory_needed(void) -{ - return toi_checksum_ops.enabled ? 
- checksum_pages_needed() << PAGE_SHIFT : 0; -} - -static int toi_checksum_storage_needed(void) -{ - if (toi_checksum_ops.enabled) - return strlen(toi_checksum_name) + sizeof(int) + 1; - else - return 0; -} - -/* - * toi_checksum_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save information needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_checksum_save_config_info(char *buffer) -{ - int namelen = strlen(toi_checksum_name) + 1; - int total_len; - - *((unsigned int *) buffer) = namelen; - strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); - total_len = sizeof(unsigned int) + namelen; - return total_len; -} - -/* toi_checksum_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for dechecksumming the image at - * resume time. - */ -static void toi_checksum_load_config_info(char *buffer, int size) -{ - int namelen; - - namelen = *((unsigned int *) (buffer)); - strncpy(toi_checksum_name, buffer + sizeof(unsigned int), - namelen); - return; -} - -/* - * Free Checksum Memory - */ - -void free_checksum_pages(void) -{ - while (pages_allocated) { - unsigned long next = *((unsigned long *) page_list); - ClearPageNosave(virt_to_page(page_list)); - toi_free_page(15, (unsigned long) page_list); - page_list = next; - pages_allocated--; - } -} - -/* - * Allocate Checksum Memory - */ - -int allocate_checksum_pages(void) -{ - int pages_needed = checksum_pages_needed(); - - if (!toi_checksum_ops.enabled) - return 0; - - while (pages_allocated < pages_needed) { - unsigned long *new_page = - (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); - if (!new_page) { - printk(KERN_ERR "Unable to allocate checksum pages.\n"); - return -ENOMEM; - } - SetPageNosave(virt_to_page(new_page)); - (*new_page) = page_list; - page_list = (unsigned long) new_page; - pages_allocated++; - } - - next_page = (unsigned long) page_list; - checksum_count = 0; - - return 0; -} - -char *tuxonice_get_next_checksum(void) -{ - if (!toi_checksum_ops.enabled) - return NULL; - - if (checksum_count % CHECKSUMS_PER_PAGE) - this_checksum += CHECKSUM_SIZE; - else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - checksum_count++; - return (char *) this_checksum; -} - -int tuxonice_calc_checksum(struct page *page, char *checksum_locn) -{ - char *pa; - int result, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!toi_checksum_ops.enabled) - return 0; - - pa = kmap(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap(page); - result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - checksum_locn); - if (result) - printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " - "returned %d.\n", result); - return result; -} -/* - * Verify checksums - */ - -void check_checksums(void) -{ - int index = 0, cpu = smp_processor_id(); - char current_checksum[CHECKSUM_SIZE]; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - unsigned long pfn; - - if (!toi_checksum_ops.enabled) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); - return; - } - - next_page = (unsigned long) page_list; - - toi_num_resaved = 0; - this_checksum = 0; - - toi_trace_index++; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn !=
BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) { - int ret, resave_needed = false; - char *pa; - struct page *page = pfn_to_page(pfn); - - if (index < checksum_count) { - if (index % CHECKSUMS_PER_PAGE) { - this_checksum += CHECKSUM_SIZE; - } else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - /* Done when IRQs disabled so must be atomic */ - pa = kmap_atomic(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap_atomic(pa); - ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - current_checksum); - - if (ret) { - printk(KERN_INFO "Digest failed. Returned %d.\n", ret); - return; - } - - resave_needed = memcmp(current_checksum, (char *) this_checksum, - CHECKSUM_SIZE); - } else { - resave_needed = true; - } - - if (resave_needed) { - TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed); - SetPageResave(pfn_to_page(pfn)); - toi_num_resaved++; - if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) - set_abort_result(TOI_RESAVE_NEEDED); - } - - index++; - } - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, - NULL), - SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, - TOI_ABORT_ON_RESAVE_NEEDED, 0) -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_checksum_ops = { - .type = MISC_MODULE, - .name = "checksumming", - .directory = "checksum", - .module = THIS_MODULE, - .initialise = toi_checksum_initialise, - .cleanup = toi_checksum_cleanup, - .print_debug_info = toi_checksum_print_debug_stats, - .save_config_info = toi_checksum_save_config_info, - .load_config_info = toi_checksum_load_config_info, - .memory_needed = toi_checksum_memory_needed, - .storage_needed = toi_checksum_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -int toi_checksum_init(void) -{ - int result = toi_register_module(&toi_checksum_ops); - return result; -} - -void toi_checksum_exit(void) -{ - toi_unregister_module(&toi_checksum_ops); -} diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h deleted file mode 100644 index c8196fbb0..000000000 --- a/kernel/power/tuxonice_checksum.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
- */ - -#if defined(CONFIG_TOI_CHECKSUM) -extern int toi_checksum_init(void); -extern void toi_checksum_exit(void); -void check_checksums(void); -int allocate_checksum_pages(void); -void free_checksum_pages(void); -char *tuxonice_get_next_checksum(void); -int tuxonice_calc_checksum(struct page *page, char *checksum_locn); -#else -static inline int toi_checksum_init(void) { return 0; } -static inline void toi_checksum_exit(void) { } -static inline void check_checksums(void) { }; -static inline int allocate_checksum_pages(void) { return 0; }; -static inline void free_checksum_pages(void) { }; -static inline char *tuxonice_get_next_checksum(void) { return NULL; }; -static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn) - { return 0; } -#endif - diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c deleted file mode 100644 index 2873f93c6..000000000 --- a/kernel/power/tuxonice_cluster.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines for cluster hibernation support. - * - * Based on ip autoconfiguration code in net/ipv4/ipconfig.c. - * - * How does it work? - * - * There is no 'master' node that tells everyone else what to do. All nodes - * send messages to the broadcast address/port, maintain a list of peers - * and figure out when to progress to the next step in hibernating or resuming. - * This makes us more fault tolerant when it comes to nodes coming and going - * (which may be more of an issue if we're hibernating when power supplies - * are unreliable). - * - * At boot time, we start a ktuxonice thread that handles communication with - * other nodes. This node maintains a state machine that controls our progress - * through hibernating and resuming, keeping us in step with other nodes. Nodes - * are identified by their hw address. - * - * On startup, the node sends CLUSTER_PING on the configured interface's - * broadcast address, port $toi_cluster_port (see below) and begins to listen - * for other broadcast messages. CLUSTER_PING messages are repeated at - * intervals of 5 minutes, with a random offset to spread traffic out. - * - * A hibernation cycle is initiated from any node via - * - * echo > /sys/power/tuxonice/do_hibernate - * - * and (possibly) the hibernate script. At each step of the process, the node - * completes its work, and waits for all other nodes to signal completion of - * their work (or timeout) before progressing to the next step. - * - * Request/state Action before reply Possible reply Next state - * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP - * HIBERNATE|NACK INIT_0 - * - * PREP prepare_image PREP|ACK IMAGE_WRITE - * PREP|NACK INIT_0 - * ABORT RUNNING - * - * IO write image IO|ACK power off - * ABORT POST_RESUME - * - * (Boot time) check for image IMAGE|ACK RESUME_PREP - * (Note 1) - * IMAGE|NACK (Note 2) - * - * PREP prepare read image PREP|ACK IMAGE_READ - * PREP|NACK (As NACK_IMAGE) - * - * IO read image IO|ACK POST_RESUME - * - * POST_RESUME thaw, post-script RUNNING - * - * INIT_0 init 0 - * - * Other messages: - * - * - PING: Request for all other live nodes to send a PONG. Used at startup to - * announce presence, when a node is suspected dead and periodically, in case - * segments of the network are [un]plugged. - * - * - PONG: Response to a PING. - * - * - ABORT: Request to cancel writing an image.
- * - * - BYE: Notification that this node is shutting down. - * - * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that - * nodes which are slower to start up can get state synchronised. If a node - * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send - * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it - * must invalidate its image (if any) and boot normally. - * - * Note 2: May occur when one node lost power or powered off while others - * hibernated. This node waits for others to complete resuming (ACK_READ) - * before completing its boot, so that it appears as a failed node restarting. - * - * If any node has an image, then it also has a list of nodes that hibernated - * in synchronisation with it. The node will wait for other nodes to appear - * or time out before beginning its restoration. - * - * If a node has no image, it needs to wait, in case other nodes which do have - * an image are going to resume, but are taking longer to announce their - * presence. For this reason, the user can specify a timeout value and a number - * of nodes detected before we just continue. (We might want to assume in a - * cluster of, say, 15 nodes, if 8 others have booted without finding an image, - * the remaining nodes will too. This might help in situations where some nodes - * are much slower to boot, or more subject to hardware failures or the like). - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" - -#if 1 -#define PRINTK(a, b...) do { printk(a, ##b); } while (0) -#else -#define PRINTK(a, b...) do { } while (0) -#endif - -static int loopback_mode; -static int num_local_nodes = 1; -#define MAX_LOCAL_NODES 8 -#define SADDR (loopback_mode ?
b->sid : h->saddr) - -#define MYNAME "TuxOnIce Clustering" - -enum cluster_message { - MSG_ACK = 1, - MSG_NACK = 2, - MSG_PING = 4, - MSG_ABORT = 8, - MSG_BYE = 16, - MSG_HIBERNATE = 32, - MSG_IMAGE = 64, - MSG_IO = 128, - MSG_RUNNING = 256 -}; - -static char *str_message(int message) -{ - switch (message) { - case 4: - return "Ping"; - case 8: - return "Abort"; - case 9: - return "Abort acked"; - case 10: - return "Abort nacked"; - case 16: - return "Bye"; - case 17: - return "Bye acked"; - case 18: - return "Bye nacked"; - case 32: - return "Hibernate request"; - case 33: - return "Hibernate ack"; - case 34: - return "Hibernate nack"; - case 64: - return "Image exists?"; - case 65: - return "Image does exist"; - case 66: - return "No image here"; - case 128: - return "I/O"; - case 129: - return "I/O okay"; - case 130: - return "I/O failed"; - case 256: - return "Running"; - default: - printk(KERN_ERR "Unrecognised message %d.\n", message); - return "Unrecognised message (see dmesg)"; - } -} - -#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) -#define MSG_STATE_MASK (~MSG_ACK_MASK) - -struct node_info { - struct list_head member_list; - wait_queue_head_t member_events; - spinlock_t member_list_lock; - spinlock_t receive_lock; - int peer_count, ignored_peer_count; - struct toi_sysfs_data sysfs_data; - enum cluster_message current_message; -}; - -struct node_info node_array[MAX_LOCAL_NODES]; - -struct cluster_member { - __be32 addr; - enum cluster_message message; - struct list_head list; - int ignore; -}; - -#define toi_cluster_port_send 3501 -#define toi_cluster_port_recv 3502 - -static struct net_device *net_dev; -static struct toi_module_ops toi_cluster_ops; - -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -static struct packet_type toi_cluster_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = toi_recv, -}; - -struct toi_pkt { /* BOOTP packet format */ - struct iphdr iph; /* IP header */ - struct udphdr udph; /* UDP header */ - u8 htype; /* HW address type */ - u8 hlen; /* HW address length */ - __be32 xid; /* Transaction ID */ - __be16 secs; /* Seconds since we started */ - __be16 flags; /* Just what it says */ - u8 hw_addr[16]; /* Sender's HW address */ - u16 message; /* Message */ - unsigned long sid; /* Source ID for loopback testing */ -}; - -static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; - -static int added_pack; - -static int others_have_image; - -/* Key used to allow multiple clusters on the same lan */ -static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY; -static char pre_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; -static char post_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; - -/* List of cluster members */ -static unsigned long continue_delay = 5 * HZ; -static unsigned long cluster_message_timeout = 3 * HZ; - -/* === Membership list === */ - -static void print_member_info(int index) -{ - struct cluster_member *this; - - printk(KERN_INFO "==> Dumping node %d.\n", index); - - list_for_each_entry(this, &node_array[index].member_list, list) - printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", - NIPQUAD(this->addr), - str_message(this->message), - this->ignore ? 
"(Ignored)" : ""); - printk(KERN_INFO "== Done ==\n"); -} - -static struct cluster_member *__find_member(int index, __be32 addr) -{ - struct cluster_member *this; - - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->addr != addr) - continue; - - return this; - } - - return NULL; -} - -static void set_ignore(int index, __be32 addr, struct cluster_member *this) -{ - if (this->ignore) { - PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", - index, NIPQUAD(addr)); - return; - } - - PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", - index, NIPQUAD(addr)); - this->ignore = 1; - node_array[index].ignored_peer_count++; -} - -static int __add_update_member(int index, __be32 addr, int message) -{ - struct cluster_member *this; - - this = __find_member(index, addr); - if (this) { - if (this->message != message) { - this->message = message; - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - PRINTK("Node %d sees node %d.%d.%d.%d now sending " - "%s.\n", index, NIPQUAD(addr), - str_message(message)); - wake_up(&node_array[index].member_events); - } - return 0; - } - - this = (struct cluster_member *) toi_kzalloc(36, - sizeof(struct cluster_member), GFP_KERNEL); - - if (!this) - return -1; - - this->addr = addr; - this->message = message; - this->ignore = 0; - INIT_LIST_HEAD(&this->list); - - node_array[index].peer_count++; - - PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, - NIPQUAD(addr), str_message(message)); - - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - list_add_tail(&this->list, &node_array[index].member_list); - return 1; -} - -static int add_update_member(int index, __be32 addr, int message) -{ - int result; - unsigned long flags; - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - result = __add_update_member(index, addr, message); - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - - print_member_info(index); - - wake_up(&node_array[index].member_events); - - return result; -} - -static void del_member(int index, __be32 addr) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - this = __find_member(index, addr); - - if (this) { - list_del_init(&this->list); - toi_kfree(36, this, sizeof(*this)); - node_array[index].peer_count--; - } - - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -/* === Message transmission === */ - -static void toi_send_if(int message, unsigned long my_id); - -/* - * Process received TOI packet. - */ -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct toi_pkt *b; - struct iphdr *h; - int len, result, index; - unsigned long addr, message, ack; - - /* Perform verifications before taking the lock. 
*/ - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop; - - if (dev != net_dev) - goto drop; - - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - return NET_RX_DROP; - - if (!pskb_may_pull(skb, - sizeof(struct iphdr) + - sizeof(struct udphdr))) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) - goto drop; - - /* Fragments are not supported */ - if (h->frag_off & htons(IP_OFFSET | IP_MF)) { - if (net_ratelimit()) - printk(KERN_ERR "TuxOnIce: Ignoring fragmented " - "cluster message.\n"); - goto drop; - } - - if (skb->len < ntohs(h->tot_len)) - goto drop; - - if (ip_fast_csum((char *) h, h->ihl)) - goto drop; - - if (b->udph.source != htons(toi_cluster_port_send) || - b->udph.dest != htons(toi_cluster_port_recv)) - goto drop; - - if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) - goto drop; - - len = ntohs(b->udph.len) - sizeof(struct udphdr); - - /* Ok the front looks good, make sure we can get at the rest. */ - if (!pskb_may_pull(skb, skb->len)) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - addr = SADDR; - PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", - str_message(b->message), NIPQUAD(addr)); - - message = b->message & MSG_STATE_MASK; - ack = b->message & MSG_ACK_MASK; - - for (index = 0; index < num_local_nodes; index++) { - int new_message = node_array[index].current_message, - old_message = new_message; - - if (index == SADDR || !old_message) { - PRINTK("Ignoring node %d (offline or self).\n", index); - continue; - } - - /* One message at a time, please. */ - spin_lock(&node_array[index].receive_lock); - - result = add_update_member(index, SADDR, b->message); - if (result == -1) { - printk(KERN_INFO "Failed to add new cluster member " - NIPQUAD_FMT ".\n", - NIPQUAD(addr)); - goto drop_unlock; - } - - switch (b->message & MSG_STATE_MASK) { - case MSG_PING: - break; - case MSG_ABORT: - break; - case MSG_BYE: - break; - case MSG_HIBERNATE: - /* Can I hibernate? */ - new_message = MSG_HIBERNATE | - ((index & 1) ? MSG_NACK : MSG_ACK); - break; - case MSG_IMAGE: - /* Can I resume? */ - new_message = MSG_IMAGE | - ((index & 1) ? MSG_NACK : MSG_ACK); - if (new_message != old_message) - printk(KERN_ERR "Setting whether I can resume " - "to %d.\n", new_message); - break; - case MSG_IO: - new_message = MSG_IO | MSG_ACK; - break; - case MSG_RUNNING: - break; - default: - if (net_ratelimit()) - printk(KERN_ERR "Unrecognised TuxOnIce cluster" - " message %d from " NIPQUAD_FMT ".\n", - b->message, NIPQUAD(addr)); - }; - - if (old_message != new_message) { - node_array[index].current_message = new_message; - printk(KERN_INFO ">>> Sending new message for node " - "%d.\n", index); - toi_send_if(new_message, index); - } else if (!ack) { - printk(KERN_INFO ">>> Resending message for node %d.\n", - index); - toi_send_if(new_message, index); - } -drop_unlock: - spin_unlock(&node_array[index].receive_lock); - }; - -drop: - /* Throw the packet out. */ - kfree_skb(skb); - - return 0; -} - -/* - * Send cluster message to single interface. 
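The message word combines one state bit with optional MSG_ACK/MSG_NACK bits, which is why str_message() knows 33 as "Hibernate ack". A small stand-alone decode, mirroring the masking toi_recv() performs (values copied from the enum above):

    #include <stdio.h>

    enum { MSG_ACK = 1, MSG_NACK = 2, MSG_PING = 4, MSG_ABORT = 8,
           MSG_BYE = 16, MSG_HIBERNATE = 32, MSG_IMAGE = 64,
           MSG_IO = 128, MSG_RUNNING = 256 };
    #define MSG_ACK_MASK   (MSG_ACK | MSG_NACK)
    #define MSG_STATE_MASK (~MSG_ACK_MASK)

    int main(void)
    {
        int msg = MSG_HIBERNATE | MSG_ACK;   /* 33, "Hibernate ack" */

        /* Same split as toi_recv(): state bits vs acknowledgement bits. */
        printf("state=%d ack=%d nack=%d\n",
               msg & MSG_STATE_MASK,
               !!(msg & MSG_ACK), !!(msg & MSG_NACK));
        return 0;
    }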
- */ -static void toi_send_if(int message, unsigned long my_id) -{ - struct sk_buff *skb; - struct toi_pkt *b; - int hh_len = LL_RESERVED_SPACE(net_dev); - struct iphdr *h; - - /* Allocate packet */ - skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); - if (!skb) - return; - skb_reserve(skb, hh_len); - b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); - memset(b, 0, sizeof(struct toi_pkt)); - - /* Construct IP header */ - skb_reset_network_header(skb); - h = ip_hdr(skb); - h->version = 4; - h->ihl = 5; - h->tot_len = htons(sizeof(struct toi_pkt)); - h->frag_off = htons(IP_DF); - h->ttl = 64; - h->protocol = IPPROTO_UDP; - h->daddr = htonl(INADDR_BROADCAST); - h->check = ip_fast_csum((unsigned char *) h, h->ihl); - - /* Construct UDP header */ - b->udph.source = htons(toi_cluster_port_send); - b->udph.dest = htons(toi_cluster_port_recv); - b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); - /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ - - /* Construct message */ - b->message = message; - b->sid = my_id; - b->htype = net_dev->type; /* can cause undefined behavior */ - b->hlen = net_dev->addr_len; - memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); - b->secs = htons(3); /* 3 seconds */ - - /* Chain packet down the line... */ - skb->dev = net_dev; - skb->protocol = htons(ETH_P_IP); - if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), - net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || - dev_queue_xmit(skb) < 0) - printk(KERN_INFO "E"); -} - -/* ========================================= */ - -/* kTOICluster */ - -static atomic_t num_cluster_threads; -static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); - -static int kTOICluster(void *data) -{ - unsigned long my_id; - - my_id = atomic_add_return(1, &num_cluster_threads) - 1; - node_array[my_id].current_message = (unsigned long) data; - - PRINTK("kTOICluster daemon %lu starting.\n", my_id); - - current->flags |= PF_NOFREEZE; - - while (node_array[my_id].current_message) { - toi_send_if(node_array[my_id].current_message, my_id); - sleep_on_timeout(&clusterd_events, - cluster_message_timeout); - PRINTK("Link state %lu is %d.\n", my_id, - node_array[my_id].current_message); - } - - toi_send_if(MSG_BYE, my_id); - atomic_dec(&num_cluster_threads); - wake_up(&clusterd_events); - - PRINTK("kTOICluster daemon %lu exiting.\n", my_id); - __set_current_state(TASK_RUNNING); - return 0; -} - -static void kill_clusterd(void) -{ - int i; - - for (i = 0; i < num_local_nodes; i++) { - if (node_array[i].current_message) { - PRINTK("Seeking to kill clusterd %d.\n", i); - node_array[i].current_message = 0; - } - } - wait_event(clusterd_events, - !atomic_read(&num_cluster_threads)); - PRINTK("All cluster daemons have exited.\n"); -} - -static int peers_not_in_message(int index, int message, int precise) -{ - struct cluster_member *this; - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->ignore) - continue; - - PRINTK("Peer %d.%d.%d.%d sending %s. " - "Seeking %s.\n", - NIPQUAD(this->addr), - str_message(this->message), str_message(message)); - if ((precise ? 
this->message : - this->message & MSG_STATE_MASK) != - message) - result++; - } - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - PRINTK("%d peers in sought message.\n", result); - return result; -} - -static void reset_ignored(int index) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) - this->ignore = 0; - node_array[index].ignored_peer_count = 0; - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -static int peers_in_message(int index, int message, int precise) -{ - return node_array[index].peer_count - - node_array[index].ignored_peer_count - - peers_not_in_message(index, message, precise); -} - -static int time_to_continue(int index, unsigned long start, int message) -{ - int first = peers_not_in_message(index, message, 0); - int second = peers_in_message(index, message, 1); - - PRINTK("First part returns %d, second returns %d.\n", first, second); - - if (!first && !second) { - PRINTK("All peers answered message %d.\n", - message); - return 1; - } - - if (time_after(jiffies, start + continue_delay)) { - PRINTK("Timeout reached.\n"); - return 1; - } - - PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, - start + continue_delay); - return 0; -} - -void toi_initiate_cluster_hibernate(void) -{ - int result; - unsigned long start; - - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - if (result) - return; - - toi_send_if(MSG_HIBERNATE, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_HIBERNATE)); - - if (test_action_state(TOI_FREEZER_TEST)) { - toi_send_if(MSG_ABORT, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_RUNNING)); - - do_toi_step(STEP_QUIET_CLEANUP); - return; - } - - toi_send_if(MSG_IO, 0); - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - if (result) - return; - - /* This code runs at resume time too! */ - if (toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); -} - -/* toi_cluster_print_debug_stats - * - * Description: Print information to be recorded for debugging purposes into a - * buffer. - * Arguments: buffer: Pointer to a buffer into which the debug info will be - * printed. - * size: Size of the buffer. - * Returns: Number of characters written to the buffer. - */ -static int toi_cluster_print_debug_stats(char *buffer, int size) -{ - int len; - - if (strlen(toi_cluster_iface)) - len = scnprintf(buffer, size, - "- Cluster interface is '%s'.\n", - toi_cluster_iface); - else - len = scnprintf(buffer, size, - "- Cluster support is disabled.\n"); - return len; -} - -/* cluster_memory_needed - * - * Description: Tell the caller how much memory we need to operate during - * hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_cluster_memory_needed(void) -{ - return 0; -} - -static int toi_cluster_storage_needed(void) -{ - return 1 + strlen(toi_cluster_iface); -} - -/* toi_cluster_save_config_info - * - * Description: Save informaton needed when reloading the image at resume time. - * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. - * Returns: Number of bytes used for saving our data. 
- */
-static int toi_cluster_save_config_info(char *buffer)
-{
-	strcpy(buffer, toi_cluster_iface);
-	return strlen(toi_cluster_iface) + 1;
-}
-
-/* toi_cluster_load_config_info
- *
- * Description: Reload information needed for declustering the image at
- * resume time.
- * Arguments: Buffer: Pointer to the start of the data.
- * Size: Number of bytes that were saved.
- */
-static void toi_cluster_load_config_info(char *buffer, int size)
-{
-	strncpy(toi_cluster_iface, buffer, size);
-	return;
-}
-
-static void cluster_startup(void)
-{
-	int have_image = do_check_can_resume(), i;
-	unsigned long start = jiffies, initial_message;
-	struct task_struct *p;
-
-	initial_message = MSG_IMAGE;
-
-	have_image = 1;
-
-	for (i = 0; i < num_local_nodes; i++) {
-		PRINTK("Starting ktoiclusterd %d.\n", i);
-		p = kthread_create(kTOICluster, (void *) initial_message,
-				"ktoiclusterd/%d", i);
-		if (IS_ERR(p)) {
-			printk(KERN_ERR "Failed to start ktoiclusterd.\n");
-			return;
-		}
-
-		wake_up_process(p);
-	}
-
-	/* Wait for delay or someone else sending first message */
-	wait_event(node_array[0].member_events, time_to_continue(0, start,
-				MSG_IMAGE));
-
-	others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
-
-	printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
-		" %d.\n", have_image ? "" : "don't ", others_have_image);
-
-	if (have_image) {
-		int result;
-
-		/* Start to resume */
-		printk(KERN_INFO " === Starting to resume === \n");
-		node_array[0].current_message = MSG_IO;
-		toi_send_if(MSG_IO, 0);
-
-		/* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
-		result = 0;
-
-		if (!result) {
-			/*
-			 * Atomic restore - we'll come back in the hibernation
-			 * path.
-			 */
-
-			/* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
-			result = 0;
-
-			/* do_toi_step(STEP_QUIET_CLEANUP); */
-		}
-
-		node_array[0].current_message |= MSG_NACK;
-
-		/* For debugging - disable for real life? */
-		wait_event(node_array[0].member_events,
-				time_to_continue(0, start, MSG_IO));
-	}
-
-	if (others_have_image) {
-		/* Wait for them to resume */
-		printk(KERN_INFO "Waiting for other nodes to resume.\n");
-		start = jiffies;
-		wait_event(node_array[0].member_events,
-				time_to_continue(0, start, MSG_RUNNING));
-		if (peers_not_in_message(0, MSG_RUNNING, 0))
-			printk(KERN_INFO "Timed out while waiting for other "
-				"nodes to resume.\n");
-	}
-
-	/* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
-	 * as appropriate.
-	 *
-	 * If we don't have an image:
-	 * - Wait until someone else says they have one, or conditions are met
-	 *   for continuing to boot (n machines or t seconds).
-	 * - If anyone has an image, wait for them to resume before continuing
-	 *   to boot.
-	 *
-	 * If we have an image:
-	 * - Wait until conditions are met before continuing to resume (n
-	 *   machines or t seconds). Send RESUME_PREP and freeze processes.
-	 *   NACK_PREP if freezing fails (shouldn't) and follow logic for
-	 *   us having no image above. On success, wait for [N]ACK_PREP from
-	 *   other machines. Read image (including atomic restore) until done.
-	 *   Wait for ACK_READ from others (should never fail). Thaw processes
-	 *   and do post-resume. (The section after the atomic restore is done
-	 *   via the code for hibernating).
-	 */
-
-	node_array[0].current_message = MSG_RUNNING;
-}
-
-/* toi_cluster_open_iface
- *
- * Description: Prepare to use an interface.
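
cluster_startup() and toi_initiate_cluster_hibernate() both lean on time_to_continue() to implement the "n machines or t seconds" rule sketched in the comment above. A condensed userspace rendering of that rule; the names and the seconds-based clock are stand-ins for the jiffies arithmetic in the original:

#include <stdbool.h>
#include <time.h>

/* Proceed once no peer is still outside the sought state, or once the
 * continue delay has elapsed, whichever comes first. */
static bool ready_to_continue(time_t start, int peers_still_waiting,
                              int continue_delay_secs)
{
        if (!peers_still_waiting)
                return true;                            /* every peer answered */
        return time(NULL) >= start + continue_delay_secs;   /* timed out */
}

int main(void)
{
        time_t start = time(NULL);

        return ready_to_continue(start, 0, 30) ? 0 : 1;  /* all answered */
}
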
- */ - -static int toi_cluster_open_iface(void) -{ - struct net_device *dev; - - rtnl_lock(); - - for_each_netdev(&init_net, dev) { - if (/* dev == &init_net.loopback_dev || */ - strcmp(dev->name, toi_cluster_iface)) - continue; - - net_dev = dev; - break; - } - - rtnl_unlock(); - - if (!net_dev) { - printk(KERN_ERR MYNAME ": Device %s not found.\n", - toi_cluster_iface); - return -ENODEV; - } - - dev_add_pack(&toi_cluster_packet_type); - added_pack = 1; - - loopback_mode = (net_dev == init_net.loopback_dev); - num_local_nodes = loopback_mode ? 8 : 1; - - PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", - loopback_mode ? "on" : "off", num_local_nodes); - - cluster_startup(); - return 0; -} - -/* toi_cluster_close_iface - * - * Description: Stop using an interface. - */ - -static int toi_cluster_close_iface(void) -{ - kill_clusterd(); - if (added_pack) { - dev_remove_pack(&toi_cluster_packet_type); - added_pack = 0; - } - return 0; -} - -static void write_side_effect(void) -{ - if (toi_cluster_ops.enabled) { - toi_cluster_open_iface(); - set_toi_state(TOI_CLUSTER_MODE); - } else { - toi_cluster_close_iface(); - clear_toi_state(TOI_CLUSTER_MODE); - } -} - -static void node_write_side_effect(void) -{ -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, - NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, - write_side_effect), - SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), - SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, - 256, 0, NULL), - SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, - 256, 0, STRING), - SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, - 0) -}; - -/* - * Ops structure. 
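
The sysfs table above wires the "enabled" attribute to write_side_effect(), so storing 1 or 0 immediately brings the cluster interface up or down. A self-contained sketch of that pattern, with stub routines standing in for toi_cluster_open_iface()/toi_cluster_close_iface():

#include <stdio.h>

static int enabled;

static void open_iface(void)  { puts("iface up"); }
static void close_iface(void) { puts("iface down"); }

/* Run after every write to "enabled": make reality match the flag. */
static void write_side_effect(void)
{
        if (enabled)
                open_iface();
        else
                close_iface();
}

int main(void)
{
        enabled = 1;
        write_side_effect();    /* a sysfs store would trigger this */
        return 0;
}
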
- */ - -static struct toi_module_ops toi_cluster_ops = { - .type = FILTER_MODULE, - .name = "Cluster", - .directory = "cluster", - .module = THIS_MODULE, - .memory_needed = toi_cluster_memory_needed, - .print_debug_info = toi_cluster_print_debug_stats, - .save_config_info = toi_cluster_save_config_info, - .load_config_info = toi_cluster_load_config_info, - .storage_needed = toi_cluster_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -#ifdef MODULE -#define INIT static __init -#define EXIT static __exit -#else -#define INIT -#define EXIT -#endif - -INIT int toi_cluster_init(void) -{ - int temp = toi_register_module(&toi_cluster_ops), i; - struct kobject *kobj = toi_cluster_ops.dir_kobj; - - for (i = 0; i < MAX_LOCAL_NODES; i++) { - node_array[i].current_message = 0; - INIT_LIST_HEAD(&node_array[i].member_list); - init_waitqueue_head(&node_array[i].member_events); - spin_lock_init(&node_array[i].member_list_lock); - spin_lock_init(&node_array[i].receive_lock); - - /* Set up sysfs entry */ - node_array[i].sysfs_data.attr.name = toi_kzalloc(8, - sizeof(node_array[i].sysfs_data.attr.name), - GFP_KERNEL); - sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d", - i); - node_array[i].sysfs_data.attr.mode = SYSFS_RW; - node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER; - node_array[i].sysfs_data.flags = 0; - node_array[i].sysfs_data.data.integer.variable = - (int *) &node_array[i].current_message; - node_array[i].sysfs_data.data.integer.minimum = 0; - node_array[i].sysfs_data.data.integer.maximum = INT_MAX; - node_array[i].sysfs_data.write_side_effect = - node_write_side_effect; - toi_register_sysfs_file(kobj, &node_array[i].sysfs_data); - } - - toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0); - - if (toi_cluster_ops.enabled) - toi_cluster_open_iface(); - - return temp; -} - -EXIT void toi_cluster_exit(void) -{ - int i; - toi_cluster_close_iface(); - - for (i = 0; i < MAX_LOCAL_NODES; i++) - toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, - &node_array[i].sysfs_data); - toi_unregister_module(&toi_cluster_ops); -} - -static int __init toi_cluster_iface_setup(char *iface) -{ - toi_cluster_ops.enabled = (*iface && - strcmp(iface, "off")); - - if (toi_cluster_ops.enabled) - strncpy(toi_cluster_iface, iface, strlen(iface)); -} - -__setup("toi_cluster=", toi_cluster_iface_setup); diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h deleted file mode 100644 index 84356b304..000000000 --- a/kernel/power/tuxonice_cluster.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#ifdef CONFIG_TOI_CLUSTER -extern int toi_cluster_init(void); -extern void toi_cluster_exit(void); -extern void toi_initiate_cluster_hibernate(void); -#else -static inline int toi_cluster_init(void) { return 0; } -static inline void toi_cluster_exit(void) { } -static inline void toi_initiate_cluster_hibernate(void) { } -#endif - diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c deleted file mode 100644 index 84b85226d..000000000 --- a/kernel/power/tuxonice_compress.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * kernel/power/compression.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - * This file contains data compression routines for TuxOnIce, - * using cryptoapi. - */ - -#include -#include -#include -#include - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -static int toi_expected_compression; - -static struct toi_module_ops toi_compression_ops; -static struct toi_module_ops *next_driver; - -static char toi_compressor_name[32] = "lzo"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - u8 *page_buffer; - struct crypto_comp *transform; - unsigned int len; - u8 *buffer_start; - u8 *output_buffer; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. - */ -static int toi_compress_crypto_prepare(void) -{ - int cpu; - - if (!*toi_compressor_name) { - printk(KERN_INFO "TuxOnIce: Compression enabled but no " - "compressor name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s compression transform.\n", - toi_compressor_name); - this->transform = NULL; - return 1; - } - - this->page_buffer = - (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); - - if (!this->page_buffer) { - printk(KERN_ERR - "Failed to allocate a page buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - - this->output_buffer = - (char *) vmalloc_32(OUT_BUF_SIZE); - - if (!this->output_buffer) { - printk(KERN_ERR - "Failed to allocate a output buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - } - - return 0; -} - -static int toi_compress_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_comp(this->transform); - this->transform = NULL; - } - - if (this->page_buffer) - toi_free_page(16, (unsigned long) this->page_buffer); - - this->page_buffer = NULL; - - if (this->output_buffer) - vfree(this->output_buffer); - - this->output_buffer = NULL; - } - - return 0; -} - -/* - * toi_compress_init - */ - -static int toi_compress_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_compress_bytes_in = 0; - toi_compress_bytes_out = 0; - - next_driver = toi_get_next_filter(&toi_compression_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_compress_rw_init() - */ - -static int toi_compress_rw_init(int rw, int stream_number) -{ - if (toi_compress_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise compression " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "compressing the image.\n"); - toi_compression_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_compress_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be compressed. - * - * Returns: 0 on success. Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. 
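
This driver drives the kernel's legacy cryptoapi "comp" interface: allocate a transform once per CPU, then push page-sized buffers through it. A kernel-context sketch of the core calls, with the per-CPU caching and most error handling trimmed; the one-shot allocate/free here is purely for illustration, since the real code keeps the transform in a per-CPU context:

#include <linux/crypto.h>
#include <linux/err.h>

static int compress_buffer(const u8 *in, unsigned int in_len,
                           u8 *out, unsigned int *out_len)
{
        struct crypto_comp *tfm = crypto_alloc_comp("lzo", 0, 0);
        int ret;

        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* *out_len is in/out: capacity on entry, bytes produced on exit. */
        ret = crypto_comp_compress(tfm, in, in_len, out, out_len);
        crypto_free_comp(tfm);
        return ret;
}
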
- */ -static int toi_compress_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - - if (ctx->transform) { - - ctx->buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_comp_compress(ctx->transform, - ctx->buffer_start, buf_size, - ctx->output_buffer, &ctx->len); - - TOI_UNMAP(buf_type, buffer_page); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d bytes", - cpu, index, ctx->len); - - if (!ret && ctx->len < buf_size) { /* some compression */ - output_buffer = ctx->output_buffer; - output_len = ctx->len; - out_buf_type = TOI_VIRT; - } - - } - - mutex_lock(&stats_lock); - - toi_compress_bytes_in += buf_size; - toi_compress_bytes_out += output_len; - - mutex_unlock(&stats_lock); - - if (!ret) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_compress_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules and decompress it until the input buffer - * is filled. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_compress_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - unsigned int outlen = PAGE_SIZE; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->transform) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't decompress - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); - - buffer_start = kmap(buffer_page); - - /* Error or uncompressed data */ - if (ret || len == PAGE_SIZE) { - memcpy(buffer_start, ctx->page_buffer, len); - goto out; - } - - ret = crypto_comp_decompress( - ctx->transform, - ctx->page_buffer, - len, buffer_start, &outlen); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d=>%d (%d).", - cpu, *index, len, outlen, ret); - - if (ret) - abort_hibernate(TOI_FAILED_IO, - "Compress_read returned %d.\n", ret); - else if (outlen != PAGE_SIZE) { - abort_hibernate(TOI_FAILED_IO, - "Decompression yielded %d bytes instead of %ld.\n", - outlen, PAGE_SIZE); - printk(KERN_ERR "Decompression yielded %d bytes instead of " - "%ld.\n", outlen, PAGE_SIZE); - ret = -EIO; - *buf_size = outlen; - } -out: - TOI_UNMAP(buf_type, buffer_page); - return ret; -} - -/* - * toi_compress_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_compress_print_debug_stats(char *buffer, int size) -{ - unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, - pages_out = toi_compress_bytes_out >> PAGE_SHIFT; - int len; - - /* Output the compression ratio achieved. 
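
toi_compress_write_page() above only substitutes the compressed buffer when compression actually won, and toi_compress_read_page() correspondingly treats a full-sized record as uncompressed data. A userspace sketch of that decision:

#include <stdio.h>

static const unsigned char *pick_output(const unsigned char *raw,
                                        size_t raw_len,
                                        const unsigned char *packed,
                                        size_t packed_len, size_t *out_len)
{
        if (packed_len < raw_len) {
                *out_len = packed_len;
                return packed;          /* some compression achieved */
        }
        *out_len = raw_len;
        return raw;                     /* store the page uncompressed */
}

int main(void)
{
        unsigned char raw[4096], packed[1200];
        size_t out_len;

        pick_output(raw, sizeof(raw), packed, sizeof(packed), &out_len);
        printf("stored %zu bytes\n", out_len);  /* 1200 */
        return 0;
}
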
*/ - if (*toi_compressor_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_compressor_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (pages_in) - len += scnprintf(buffer+len, size - len, " Compressed " - "%lu bytes into %lu (%ld percent compression).\n", - toi_compress_bytes_in, - toi_compress_bytes_out, - (pages_in - pages_out) * 100 / pages_in); - return len; -} - -/* - * toi_compress_compression_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_compress_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_compress_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_compressor_name) + 1; -} - -/* - * toi_compress_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_compress_save_config_info(char *buffer) -{ - int len = strlen(toi_compressor_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_compress_bytes_in; - offset += sizeof(unsigned long); - *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = toi_expected_compression; - offset += sizeof(int); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_compressor_name, len); - return offset + len; -} - -/* toi_compress_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for decompressing the image at - * resume time. - */ -static void toi_compress_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_compress_bytes_in = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); - offset += sizeof(unsigned long); - toi_expected_compression = *((int *) (buffer + offset)); - offset += sizeof(int); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_compressor_name, buffer + offset, len); -} - -static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->compress_bytes_in = toi_compress_bytes_in; - bkd->compress_bytes_out = toi_compress_bytes_out; -} - -static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_compress_bytes_in = bkd->compress_bytes_in; - toi_compress_bytes_out = bkd->compress_bytes_out; -} - -/* - * toi_expected_compression_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 if the module is disabled. Otherwise the value set by the - * user via our sysfs entry. - */ - -static int toi_compress_expected_ratio(void) -{ - if (!toi_compression_ops.enabled) - return 100; - else - return 100 - toi_expected_compression; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, - 0, 99, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), -}; - -/* - * Ops structure. 
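
toi_compress_save_config_info() and toi_compress_load_config_info() above serialise a small fixed record: two unsigned longs, two ints, then the NUL-terminated compressor name. A userspace re-expression of the same layout using memcpy in place of the original's pointer casts; the function name is illustrative:

#include <stdio.h>
#include <string.h>

static int save_config(char *buf, unsigned long in, unsigned long out,
                       int expected, const char *name)
{
        int off = 0, len = strlen(name) + 1;

        memcpy(buf + off, &in, sizeof(in));             off += sizeof(in);
        memcpy(buf + off, &out, sizeof(out));           off += sizeof(out);
        memcpy(buf + off, &expected, sizeof(expected)); off += sizeof(expected);
        memcpy(buf + off, &len, sizeof(len));           off += sizeof(len);
        memcpy(buf + off, name, len);                   /* name + NUL */
        return off + len;
}

int main(void)
{
        char buf[64];

        printf("%d bytes used\n", save_config(buf, 100, 50, 30, "lzo"));
        return 0;
}
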
- */ -static struct toi_module_ops toi_compression_ops = { - .type = FILTER_MODULE, - .name = "compression", - .directory = "compression", - .module = THIS_MODULE, - .initialise = toi_compress_init, - .memory_needed = toi_compress_memory_needed, - .print_debug_info = toi_compress_print_debug_stats, - .save_config_info = toi_compress_save_config_info, - .load_config_info = toi_compress_load_config_info, - .storage_needed = toi_compress_storage_needed, - .expected_compression = toi_compress_expected_ratio, - - .pre_atomic_restore = toi_compress_pre_atomic_restore, - .post_atomic_restore = toi_compress_post_atomic_restore, - - .rw_init = toi_compress_rw_init, - .rw_cleanup = toi_compress_rw_cleanup, - - .write_page = toi_compress_write_page, - .read_page = toi_compress_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_compress_load(void) -{ - return toi_register_module(&toi_compression_ops); -} - -late_initcall(toi_compress_load); diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c deleted file mode 100644 index eb627915e..000000000 --- a/kernel/power/tuxonice_copy_before_write.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * kernel/power/tuxonice_copy_before_write.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines (apart from the fault handling code) to deal with allocating memory - * for copying pages before they are modified, restoring the contents and getting - * the contents written to disk. - */ - -#include -#include -#include -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states); -#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw)) -#define toi_cbw_pool_size 100 - -static void _toi_free_cbw_data(struct toi_cbw_state *state) -{ - struct toi_cbw *page_ptr, *ptr, *next; - - page_ptr = ptr = state->first; - - while(ptr) { - next = ptr->next; - - if (ptr->virt) { - toi__free_page(40, virt_to_page(ptr->virt)); - } - if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) { - /* Must be on a new page - free the previous one. 
*/ - toi__free_page(40, virt_to_page(page_ptr)); - page_ptr = ptr; - } - ptr = next; - } - - if (page_ptr) { - toi__free_page(40, virt_to_page(page_ptr)); - } - - state->first = state->next = state->last = NULL; - state->size = 0; -} - -void toi_free_cbw_data(void) -{ - int i; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - if (!state->first) - continue; - - state->enabled = 0; - - while (state->active) { - schedule(); - } - - _toi_free_cbw_data(state); - } -} - -static int _toi_allocate_cbw_data(struct toi_cbw_state *state) -{ - while(state->size < toi_cbw_pool_size) { - int i; - struct toi_cbw *ptr; - - ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL); - - if (!ptr) { - return -ENOMEM; - } - - if (!state->first) { - state->first = state->next = state->last = ptr; - } - - for (i = 0; i < CBWS_PER_PAGE; i++) { - struct toi_cbw *cbw = &ptr[i]; - - cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL); - if (!cbw->virt) { - state->size += i; - printk("Out of memory allocating CBW pages.\n"); - return -ENOMEM; - } - - if (cbw == state->first) - continue; - - state->last->next = cbw; - state->last = cbw; - } - - state->size += CBWS_PER_PAGE; - } - - state->enabled = 1; - - return 0; -} - - -int toi_allocate_cbw_data(void) -{ - int i, result; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - result = _toi_allocate_cbw_data(state); - - if (result) - return result; - } - - return 0; -} - -void toi_cbw_restore(void) -{ - if (!toi_keeping_image) - return; - -} - -void toi_cbw_write(void) -{ - if (!toi_keeping_image) - return; - -} - -/** - * toi_cbw_test_read - Test copy before write on one page - * - * Allocate copy before write buffers, then make one page only copy-before-write - * and attempt to write to it. We should then be able to retrieve the original - * version from the cbw buffer and the modified version from the page itself. - */ -static int toi_cbw_test_read(const char *buffer, int count) -{ - unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL); - char *original = "Original contents"; - char *modified = "Modified material"; - struct page *page = virt_to_page(virt); - int i, len = 0, found = 0, pfn = page_to_pfn(page); - - if (!page) { - printk("toi_cbw_test_read: Unable to allocate a page for testing.\n"); - return -ENOMEM; - } - - memcpy((char *) virt, original, strlen(original)); - - if (toi_allocate_cbw_data()) { - printk("toi_cbw_test_read: Unable to allocate cbw data.\n"); - return -ENOMEM; - } - - toi_reset_dirtiness_one(pfn, 0); - - SetPageTOI_CBW(page); - - memcpy((char *) virt, modified, strlen(modified)); - - if (strncmp((char *) virt, modified, strlen(modified))) { - len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n"); - } - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - struct toi_cbw *ptr = state->first, *last_ptr = ptr; - - if (!found) { - while (ptr) { - if (ptr->pfn == pfn) { - found = 1; - if (strncmp(ptr->virt, original, strlen(original))) { - len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n"); - } else { - len += sprintf((char *) buffer + len, "Test passed. 
Buffer changed and original contents preserved.\n"); - } - break; - } - - last_ptr = ptr; - ptr = ptr->next; - } - } - - if (!last_ptr) - len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i); - } - - if (!found) - len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n"); - - toi_free_cbw_data(); - - return len; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read, - NULL, SYSFS_NEEDS_SM_FOR_READ, NULL), -}; - -static struct toi_module_ops toi_cbw_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "copy_before_write debugging", - .directory = "cbw", - .module = THIS_MODULE, - .early = 1, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_cbw_init(void) -{ - int result = toi_register_module(&toi_cbw_ops); - return result; -} diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c deleted file mode 100644 index 522c836ad..000000000 --- a/kernel/power/tuxonice_extent.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * kernel/power/tuxonice_extent.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * These functions encapsulate the manipulation of storage metadata. - */ - -#include -#include "tuxonice_modules.h" -#include "tuxonice_extent.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" - -/** - * toi_get_extent - return a free extent - * - * May fail, returning NULL instead. - **/ -static struct hibernate_extent *toi_get_extent(void) -{ - return (struct hibernate_extent *) toi_kzalloc(2, - sizeof(struct hibernate_extent), TOI_ATOMIC_GFP); -} - -/** - * toi_put_extent_chain - free a chain of extents starting from value 'from' - * @chain: Chain to free. - * - * Note that 'from' is an extent value, and may be part way through an extent. - * In this case, the extent should be truncated (if necessary) and following - * extents freed. - **/ -void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from) -{ - struct hibernate_extent *this; - - this = chain->first; - - while (this) { - struct hibernate_extent *next = this->next; - - // Delete the whole extent? - if (this->start >= from) { - chain->size -= (this->end - this->start + 1); - if (chain->first == this) - chain->first = next; - if (chain->last_touched == this) - chain->last_touched = NULL; - if (chain->current_extent == this) - chain->current_extent = NULL; - toi_kfree(2, this, sizeof(*this)); - chain->num_extents--; - } else if (this->end >= from) { - // Delete part of the extent - chain->size -= (this->end - from + 1); - this->start = from; - } - this = next; - } -} - -/** - * toi_put_extent_chain - free a whole chain of extents - * @chain: Chain to free. - **/ -void toi_put_extent_chain(struct hibernate_extent_chain *chain) -{ - toi_put_extent_chain_from(chain, 0); -} - -/** - * toi_add_to_extent_chain - add an extent to an existing chain - * @chain: Chain to which the extend should be added - * @start: Start of the extent (first physical block) - * @end: End of the extent (last physical block) - * - * The chain information is updated if the insertion is successful. 
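
The interesting case in toi_add_to_extent_chain() is the in-place merge: when the new range begins right after an existing extent, that extent is grown, and a successor that now abuts is folded in rather than left as a separate node. A minimal userspace sketch of just that case; the real code also handles insertion ordering and the last_touched cache:

#include <stdio.h>
#include <stdlib.h>

struct extent {
        unsigned long start, end;
        struct extent *next;
};

static int try_extend(struct extent *cur, unsigned long start,
                      unsigned long end)
{
        if (!cur || cur->end != start - 1)
                return -1;      /* caller allocates a new extent instead */

        cur->end = end;
        if (cur->next && cur->next->start == end + 1) {
                struct extent *n = cur->next;

                cur->end = n->end;      /* merge with the successor */
                cur->next = n->next;
                free(n);
        }
        return 0;
}

int main(void)
{
        struct extent *b = malloc(sizeof(*b));
        struct extent a = { 0, 9, b };

        if (!b)
                return 1;
        b->start = 20; b->end = 29; b->next = NULL;

        try_extend(&a, 10, 19);         /* a becomes 0-29, *b is freed */
        printf("%lu-%lu\n", a.start, a.end);
        return 0;
}
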
- **/ -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end) -{ - struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Adding extent %lu-%lu to chain %p.\n", start, end, chain); - - /* Find the right place in the chain */ - if (chain->last_touched && chain->last_touched->start < start) - cur_ext = chain->last_touched; - else if (chain->first && chain->first->start < start) - cur_ext = chain->first; - - if (cur_ext) { - while (cur_ext->next && cur_ext->next->start < start) - cur_ext = cur_ext->next; - - if (cur_ext->end == (start - 1)) { - struct hibernate_extent *next_ext = cur_ext->next; - cur_ext->end = end; - - /* Merge with the following one? */ - if (next_ext && cur_ext->end + 1 == next_ext->start) { - cur_ext->end = next_ext->end; - cur_ext->next = next_ext->next; - toi_kfree(2, next_ext, sizeof(*next_ext)); - chain->num_extents--; - } - - chain->last_touched = cur_ext; - chain->size += (end - start + 1); - - return 0; - } - } - - new_ext = toi_get_extent(); - if (!new_ext) { - printk(KERN_INFO "Error unable to append a new extent to the " - "chain.\n"); - return -ENOMEM; - } - - chain->num_extents++; - chain->size += (end - start + 1); - new_ext->start = start; - new_ext->end = end; - - chain->last_touched = new_ext; - - if (cur_ext) { - new_ext->next = cur_ext->next; - cur_ext->next = new_ext; - } else { - if (chain->first) - new_ext->next = chain->first; - chain->first = new_ext; - } - - return 0; -} diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h deleted file mode 100644 index aeccf1f5e..000000000 --- a/kernel/power/tuxonice_extent.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_extent.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations related to extents. Extents are - * TuxOnIce's method of storing some of the metadata for the image. - * See tuxonice_extent.c for more info. - * - */ - -#include "tuxonice_modules.h" - -#ifndef EXTENT_H -#define EXTENT_H - -struct hibernate_extent { - unsigned long start, end; - struct hibernate_extent *next; -}; - -struct hibernate_extent_chain { - unsigned long size; /* size of the chain ie sum (max-min+1) */ - int num_extents; - struct hibernate_extent *first, *last_touched; - struct hibernate_extent *current_extent; - unsigned long current_offset; -}; - -/* Simplify iterating through all the values in an extent chain */ -#define toi_extent_for_each(extent_chain, extentpointer, value) \ -if ((extent_chain)->first) \ - for ((extentpointer) = (extent_chain)->first, (value) = \ - (extentpointer)->start; \ - ((extentpointer) && ((extentpointer)->next || (value) <= \ - (extentpointer)->end)); \ - (((value) == (extentpointer)->end) ? \ - ((extentpointer) = (extentpointer)->next, (value) = \ - ((extentpointer) ? (extentpointer)->start : 0)) : \ - (value)++)) - -extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from); -#endif diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c deleted file mode 100644 index baf191211..000000000 --- a/kernel/power/tuxonice_file.c +++ /dev/null @@ -1,484 +0,0 @@ -/* - * kernel/power/tuxonice_file.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of a simple file as a - * backing store. 
It is based upon the swapallocator, and shares the - * same basic working. Here, though, we have nothing to do with - * swapspace, and only one device to worry about. - * - * The user can just - * - * echo TuxOnIce > /path/to/my_file - * - * dd if=/dev/zero bs=1M count= >> /path/to/my_file - * - * and - * - * echo /path/to/my_file > /sys/power/tuxonice/file/target - * - * then put what they find in /sys/power/tuxonice/resume - * as their resume= parameter in lilo.conf (and rerun lilo if using it). - * - * Having done this, they're ready to hibernate and resume. - * - * TODO: - * - File resizing. - */ - -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" - -#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) - -static struct toi_module_ops toi_fileops; - -static struct file *target_file; -static struct block_device *toi_file_target_bdev; -static unsigned long pages_available, pages_allocated; -static char toi_file_target[256]; -static struct inode *target_inode; -static int file_target_priority; -static int used_devt; -static int target_claim; -static dev_t toi_file_dev_t; -static int sig_page_index; - -/* For test_toi_file_target */ -static struct toi_bdev_info *file_chain; - -static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) -{ - int j; - sector_t last = 0; - - for (j = 0; j < dev_info->blocks_per_page; j++) { - sector_t this = bmap(target_inode, - page_num * dev_info->blocks_per_page + j); - - if (!this || (last && (last + 1) != this)) - break; - - last = this; - } - - return j == dev_info->blocks_per_page; -} - -static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) -{ - unsigned long result = 0; - struct block_device *bdev = dev_info->bdev; - int i; - - switch (target_inode->i_mode & S_IFMT) { - case S_IFSOCK: - case S_IFCHR: - case S_IFIFO: /* Socket, Char, Fifo */ - return -1; - case S_IFREG: /* Regular file: current size - holes + free - space on part */ - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { - if (has_contiguous_blocks(dev_info, i)) - result++; - } - break; - case S_IFBLK: /* Block device */ - if (!bdev->bd_disk) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "bdev->bd_disk null."); - return 0; - } - - result = (bdev->bd_part ? - bdev->bd_part->nr_sects : - get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); - } - - - return result; -} - -static int toi_file_register_storage(void) -{ - struct toi_bdev_info *devinfo; - int result = 0; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); - if (!strlen(toi_file_target)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " - "No target filename set."); - return 0; - } - - target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); - toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", - toi_file_target, target_file); - - if (IS_ERR(target_file) || !target_file) { - target_file = NULL; - toi_file_dev_t = name_to_dev_t(toi_file_target); - if (!toi_file_dev_t) { - struct kstat stat; - int error = vfs_stat(toi_file_target, &stat); - printk(KERN_INFO "Open file %s returned %p and " - "name_to_devt failed.\n", - toi_file_target, target_file); - if (error) { - printk(KERN_INFO "Stating the file also failed." 
- " Nothing more we can do.\n"); - return 0; - } else - toi_file_dev_t = stat.rdev; - } - - toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); - if (IS_ERR(toi_file_target_bdev)) { - printk(KERN_INFO "Got a dev_num (%lx) but failed to " - "open it.\n", - (unsigned long) toi_file_dev_t); - toi_file_target_bdev = NULL; - return 0; - } - used_devt = 1; - target_inode = toi_file_target_bdev->bd_inode; - } else - target_inode = target_file->f_mapping->host; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); - if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || - S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { - printk(KERN_INFO "File support works with regular files," - " character files and block devices.\n"); - /* Cleanup routine will undo the above */ - return 0; - } - - if (!used_devt) { - if (S_ISBLK(target_inode->i_mode)) { - toi_file_target_bdev = I_BDEV(target_inode); - if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | - FMODE_READ, NULL)) - target_claim = 1; - } else - toi_file_target_bdev = target_inode->i_sb->s_bdev; - if (!toi_file_target_bdev) { - printk(KERN_INFO "%s is not a valid file allocator " - "target.\n", toi_file_target); - return 0; - } - toi_file_dev_t = toi_file_target_bdev->bd_dev; - } - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); - return -ENOMEM; - } - - devinfo->bdev = toi_file_target_bdev; - devinfo->allocator = &toi_fileops; - devinfo->allocator_index = 0; - - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - /* Unlike swap code, only complain if fs_info_from_block_dev returned - * -ENOMEM. The 'file' might be a full partition, so might validly not - * have an identifiable type, UUID etc. - */ - if (result) - printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", - result); - devinfo->dev_t = toi_file_dev_t; - devinfo->prio = file_target_priority; - devinfo->bmap_shift = target_inode->i_blkbits - 9; - devinfo->blocks_per_page = - (1 << (PAGE_SHIFT - target_inode->i_blkbits)); - sprintf(devinfo->name, "file %s", toi_file_target); - file_chain = devinfo; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " - "shift is %d. Blocks per page %d.", - devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, - devinfo->blocks_per_page); - - /* Keep one aside for the signature */ - pages_available = get_usable_pages(devinfo) - 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " - "pages.", pages_available); - - toi_bio_ops.register_storage(devinfo); - return 0; -} - -static unsigned long toi_file_storage_available(void) -{ - return pages_available; -} - -static int toi_file_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long available = pages_available - pages_allocated; - unsigned long to_add = min(available, request); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " - "is %lu. 
Allocating %lu pages from file.",
-			pages_available, pages_allocated, to_add);
-	pages_allocated += to_add;
-
-	return to_add;
-}
-
-/**
- * __populate_block_list - add an extent to the chain
- * @min: Start of the extent (first physical block = sector)
- * @max: End of the extent (last physical block = sector)
- *
- * If TOI_TEST_BIO is set, print a debug message, outputting the min and max
- * fs block numbers.
- **/
-static int __populate_block_list(struct toi_bdev_info *chain, int min, int max)
-{
-	if (test_action_state(TOI_TEST_BIO))
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.",
-			min << chain->bmap_shift,
-			((max + 1) << chain->bmap_shift) - 1);
-
-	return toi_add_to_extent_chain(&chain->blocks, min, max);
-}
-
-static int get_main_pool_phys_params(struct toi_bdev_info *chain)
-{
-	int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0;
-	unsigned long pages_mapped = 0;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks.");
-
-	if (chain->blocks.first)
-		toi_put_extent_chain(&chain->blocks);
-
-	if (!target_is_normal_file()) {
-		result = (pages_available > 0) ?
-			__populate_block_list(chain, chain->blocks_per_page,
-				(pages_allocated + 1) *
-				chain->blocks_per_page - 1) : 0;
-		return result;
-	}
-
-	/*
-	 * FIXME: We are assuming the first page is contiguous. Is that
-	 * assumption always right?
-	 */
-
-	for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
-		sector_t new_sector;
-
-		if (!has_contiguous_blocks(chain, i))
-			continue;
-
-		if (!have_sig_page) {
-			have_sig_page = 1;
-			sig_page_index = i;
-			continue;
-		}
-
-		pages_mapped++;
-
-		/* Ignore first page - it has the header */
-		if (pages_mapped == 1)
-			continue;
-
-		new_sector = bmap(target_inode, (i * chain->blocks_per_page));
-
-		/*
-		 * I'd love to be able to fill in holes and resize
-		 * files, but not yet...
-		 */
-
-		if (new_sector == extent_max + 1)
-			extent_max += chain->blocks_per_page;
-		else {
-			if (extent_min > -1) {
-				result = __populate_block_list(chain,
-						extent_min, extent_max);
-				if (result)
-					return result;
-			}
-
-			extent_min = new_sector;
-			extent_max = extent_min +
-				chain->blocks_per_page - 1;
-		}
-
-		if (pages_mapped == pages_allocated)
-			break;
-	}
-
-	if (extent_min > -1) {
-		result = __populate_block_list(chain, extent_min, extent_max);
-		if (result)
-			return result;
-	}
-
-	return 0;
-}
-
-static void toi_file_free_storage(struct toi_bdev_info *chain)
-{
-	pages_allocated = 0;
-	file_chain = NULL;
-}
-
-/**
- * toi_file_print_debug_stats - print debug info
- * @buffer: Buffer to populate
- * @size: Size of the buffer
- **/
-static int toi_file_print_debug_stats(char *buffer, int size)
-{
-	int len = scnprintf(buffer, size, "- File Allocator active.\n");
-
-	len += scnprintf(buffer+len, size-len, " Storage available for "
-			"image: %lu pages.\n", pages_available);
-
-	return len;
-}
-
-static void toi_file_cleanup(int finishing_cycle)
-{
-	if (toi_file_target_bdev) {
-		if (target_claim) {
-			blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
-			target_claim = 0;
-		}
-
-		if (used_devt) {
-			blkdev_put(toi_file_target_bdev,
-					FMODE_READ | FMODE_NDELAY);
-			used_devt = 0;
-		}
-		toi_file_target_bdev = NULL;
-		target_inode = NULL;
-	}
-
-	if (target_file) {
-		filp_close(target_file, NULL);
-		target_file = NULL;
-	}
-
-	pages_available = 0;
-}
-
-/**
- * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target
- *
- * Test whether the target file is valid for hibernating.
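
get_main_pool_phys_params() above walks the file page by page, maps each page to its first sector via bmap(), and grows the current extent while the sectors remain contiguous. A userspace sketch of that walk, with a toy page-to-sector mapping standing in for bmap() (eight 512-byte blocks per 4K page):

#include <stdio.h>

typedef unsigned long sector_t;

static sector_t page_to_sector(unsigned long i)
{
        return 8 * i + (i > 2 ? 1000 : 0);      /* toy: a gap after page 2 */
}

int main(void)
{
        unsigned long i;
        sector_t min = page_to_sector(0), max = min + 7;

        for (i = 1; i < 6; i++) {
                sector_t s = page_to_sector(i);

                if (s == max + 1) {
                        max += 8;               /* still contiguous */
                        continue;
                }
                printf("extent %lu-%lu\n", min, max);   /* flush and restart */
                min = s;
                max = s + 7;
        }
        printf("extent %lu-%lu\n", min, max);
        return 0;
}
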
- **/ -static void test_toi_file_target(void) -{ - int result = toi_file_register_storage(); - sector_t sector; - char buf[50]; - struct fs_info *fs_info; - - if (result || !file_chain) - return; - - /* This doesn't mean we're in business. Is any storage available? */ - if (!pages_available) - goto out; - - toi_file_allocate_storage(file_chain, 1); - result = get_main_pool_phys_params(file_chain); - if (result) - goto out; - - - sector = bmap(target_inode, sig_page_index * - file_chain->blocks_per_page) << file_chain->bmap_shift; - - /* Use the uuid, or the dev_t if that fails */ - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (!fs_info || IS_ERR(fs_info)) { - bdevname(toi_file_target_bdev, buf); - sprintf(resume_file, "/dev/%s:%llu", buf, - (unsigned long long) sector); - } else { - int i; - hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); - - /* Remove the spaces */ - for (i = 1; i < 16; i++) { - buf[2 * i] = buf[3 * i]; - buf[2 * i + 1] = buf[3 * i + 1]; - } - buf[32] = 0; - sprintf(resume_file, "UUID=%s:0x%llx", buf, - (unsigned long long) sector); - free_fs_info(fs_info); - } - - toi_attempt_to_parse_resume_device(0); -out: - toi_file_free_storage(file_chain); - toi_bio_ops.free_storage(); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, - SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), - SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), - SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, - 4096, 0, NULL), -}; - -static struct toi_bio_allocator_ops toi_bio_fileops = { - .register_storage = toi_file_register_storage, - .storage_available = toi_file_storage_available, - .allocate_storage = toi_file_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_file_free_storage, -}; - -static struct toi_module_ops toi_fileops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "file storage", - .directory = "file", - .module = THIS_MODULE, - .print_debug_info = toi_file_print_debug_stats, - .cleanup = toi_file_cleanup, - .bio_allocator_ops = &toi_bio_fileops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_file_load(void) -{ - return toi_register_module(&toi_fileops); -} - -late_initcall(toi_file_load); diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c deleted file mode 100644 index 13bb93811..000000000 --- a/kernel/power/tuxonice_highlevel.c +++ /dev/null @@ -1,1414 +0,0 @@ -/* - * kernel/power/tuxonice_highlevel.c - */ -/** \mainpage TuxOnIce. - * - * TuxOnIce provides support for saving and restoring an image of - * system memory to an arbitrary storage device, either on the local computer, - * or across some network. The support is entirely OS based, so TuxOnIce - * works without requiring BIOS, APM or ACPI support. The vast majority of the - * code is also architecture independant, so it should be very easy to port - * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem - * and preemption. Initramfses and initrds are also supported. - * - * TuxOnIce uses a modular design, in which the method of storing the image is - * completely abstracted from the core code, as are transformations on the data - * such as compression and/or encryption (multiple 'modules' can be used to - * provide arbitrary combinations of functionality). 
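
A loose userspace sketch of the filter-module chaining described here: each stage transforms a page and hands it to the next, so compression, encryption and the final writer compose freely. The real interface is the much richer struct toi_module_ops in tuxonice_modules.h; this stand-in keeps only the chaining idea:

#include <stdio.h>

struct filter {
        const char *name;
        int (*write_page)(struct filter *self, char *page);
        struct filter *next;
};

static int passthrough(struct filter *self, char *page)
{
        printf("%s handled page\n", self->name);
        /* Hand the (possibly transformed) page to the next stage. */
        return self->next ? self->next->write_page(self->next, page) : 0;
}

int main(void)
{
        struct filter writer   = { "writer", passthrough, NULL };
        struct filter compress = { "compression", passthrough, &writer };
        char page[4096] = { 0 };

        return compress.write_page(&compress, page);
}
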
The user interface is also - * modular, so that arbitrarily simple or complex interfaces can be used to - * provide anything from debugging information through to eye candy. - * - * \section Copyright - * - * TuxOnIce is released under the GPLv2. - * - * Copyright (C) 1998-2001 Gabor Kuti
- * Copyright (C) 1998,2001,2002 Pavel Machek
- * Copyright (C) 2002-2003 Florent Chabaud
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- * - * \section Credits - * - * Nigel would like to thank the following people for their work: - * - * Bernard Blackham
- * Web page & Wiki administration, some coding. A person without whom - * TuxOnIce would not be where it is. - * - * Michael Frank
- * Extensive testing and help with improving stability. I was constantly - * amazed by the quality and quantity of Michael's help. - * - * Pavel Machek
- * Modifications, defectiveness pointing, being with Gabor at the very - * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and - * 2.5.17. Even though Pavel and I disagree on the direction suspend to - * disk should take, I appreciate the valuable work he did in helping Gabor - * get the concept working. - * - * ..and of course the myriads of TuxOnIce users who have helped diagnose - * and fix bugs, made suggestions on how to improve the code, proofread - * documentation, and donated time and money. - * - * Thanks also to corporate sponsors: - * - * Redhat.Sometime employer from May 2006 (my fault, not Redhat's!). - * - * Cyclades.com. Nigel's employers from Dec 2004 until May 2006, who - * allowed him to work on TuxOnIce and PM related issues on company time. - * - * LinuxFund.org. Sponsored Nigel's work on TuxOnIce for four months Oct - * 2003 to Jan 2004. - * - * LAC Linux. Donated P4 hardware that enabled development and ongoing - * maintenance of SMP and Highmem support. - * - * OSDL. Provided access to various hardware configurations, make - * occasional small donations to the project. - */ - -#include -#include -#include -#include -#include -#include -#include -#include /* for get/set_fs & KERNEL_DS on i386 */ -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_storage.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_cluster.h" - -/*! Pageset metadata. */ -struct pagedir pagedir2 = {2}; - -static mm_segment_t oldfs; -static DEFINE_MUTEX(tuxonice_in_use); -static int block_dump_save; - -int toi_trace_index; - -/* Binary signature if an image is present */ -char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; - -unsigned long boot_kernel_data_buffer; - -static char *result_strings[] = { - "Hibernation was aborted", - "The user requested that we cancel the hibernation", - "No storage was available", - "Insufficient storage was available", - "Freezing filesystems and/or tasks failed", - "A pre-existing image was used", - "We would free memory, but image size limit doesn't allow this", - "Unable to free enough memory to hibernate", - "Unable to obtain the Power Management Semaphore", - "A device suspend/resume returned an error", - "A system device suspend/resume returned an error", - "The extra pages allowance is too small", - "We were unable to successfully prepare an image", - "TuxOnIce module initialisation failed", - "TuxOnIce module cleanup failed", - "I/O errors were encountered", - "Ran out of memory", - "An error was encountered while reading the image", - "Platform preparation failed", - "CPU Hotplugging failed", - "Architecture specific preparation failed", - "Pages needed resaving, but we were told to abort if this happens", - "We can't hibernate at the moment (invalid resume= or filewriter " - "target?)", - "A hibernation preparation notifier chain member cancelled the " - "hibernation", - "Pre-snapshot preparation failed", - "Pre-restore preparation failed", - "Failed to disable usermode helpers", - "Can't resume from alternate image", - "Header reservation too small", - "Device Power Management Preparation failed", -}; - -/** - * toi_finish_anything - cleanup after doing anything - * @hibernate_or_resume: Whether finishing a cycle or attempt at - * 
resuming. - * - * This is our basic clean-up routine, matching start_anything below. We - * call cleanup routines, drop module references and restore process fs and - * cpus allowed masks, together with the global block_dump variable's value. - **/ -void toi_finish_anything(int hibernate_or_resume) -{ - toi_running = 0; - toi_cleanup_modules(hibernate_or_resume); - toi_put_modules(); - if (hibernate_or_resume) { - block_dump = block_dump_save; - set_cpus_allowed_ptr(current, cpu_all_mask); - toi_alloc_print_debug_stats(); - atomic_inc(&snapshot_device_available); - unlock_system_sleep(); - } - - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); -} - -/** - * toi_start_anything - basic initialisation for TuxOnIce - * @toi_or_resume: Whether starting a cycle or attempt at resuming. - * - * Our basic initialisation routine. Take references on modules, use the - * kernel segment, recheck resume= if no active allocator is set, initialise - * modules, save and reset block_dump and ensure we're running on CPU0. - **/ -int toi_start_anything(int hibernate_or_resume) -{ - mutex_lock(&tuxonice_in_use); - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - toi_trace_index = 0; - - if (hibernate_or_resume) { - lock_system_sleep(); - - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - goto snapshotdevice_unavailable; - } - - if (hibernate_or_resume == SYSFS_HIBERNATE) - toi_print_modules(); - - if (toi_get_modules()) { - printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); - goto prehibernate_err; - } - - if (hibernate_or_resume) { - block_dump_save = block_dump; - block_dump = 0; - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - } - - if (toi_initialise_modules_early(hibernate_or_resume)) - goto early_init_err; - - if (!toiActiveAllocator) - toi_attempt_to_parse_resume_device(!hibernate_or_resume); - - if (!toi_initialise_modules_late(hibernate_or_resume)) { - toi_running = 1; /* For the swsusp code we use :< */ - return 0; - } - - toi_cleanup_modules(hibernate_or_resume); -early_init_err: - if (hibernate_or_resume) { - block_dump_save = block_dump; - set_cpus_allowed_ptr(current, cpu_all_mask); - } - toi_put_modules(); -prehibernate_err: - if (hibernate_or_resume) - atomic_inc(&snapshot_device_available); -snapshotdevice_unavailable: - if (hibernate_or_resume) - mutex_unlock(&pm_mutex); - release_super_lock(); - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); - return -EBUSY; -} - -/* - * Nosave page tracking. - * - * Here rather than in prepare_image because we want to do it once only at the - * start of a cycle. - */ - -/** - * mark_nosave_pages - set up our Nosave bitmap - * - * Build a bitmap of Nosave pages from the list. The bitmap allows faster - * use when preparing the image. - **/ -static void mark_nosave_pages(void) -{ - struct nosave_region *region; - - list_for_each_entry(region, &nosave_regions, list) { - unsigned long pfn; - - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - SetPageNosave(pfn_to_page(pfn)); - } - } -} - -/** - * allocate_bitmaps - allocate bitmaps used to record page states - * - * Allocate the bitmaps we use to record the various TuxOnIce related - * page states. 
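
The SNPRINTF() macro defined above appends formatted output to a shared buffer, accumulating the running length in the enclosing scope. A userspace analogue follows; note that the kernel's scnprintf() returns the number of characters actually written, so the original can never overrun, while this snprintf()-based stand-in assumes the buffer is large enough:

#include <stdio.h>

/* Like the kernel macro, this deliberately captures `buffer` and `len`
 * from the enclosing scope. */
#define SNPRINTF(...) do {                                      \
        len += snprintf(buffer + len, sizeof(buffer) - len,     \
                        __VA_ARGS__);                           \
} while (0)

int main(void)
{
        char buffer[128];
        int len = 0;

        SNPRINTF("TuxOnIce debugging info:\n");
        SNPRINTF("- Attempt number : %d\n", 3);
        fputs(buffer, stdout);
        return 0;
}
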
- **/ -static int allocate_bitmaps(void) -{ - if (toi_alloc_bitmap(&pageset1_map) || - toi_alloc_bitmap(&pageset1_copy_map) || - toi_alloc_bitmap(&pageset2_map) || - toi_alloc_bitmap(&io_map) || - toi_alloc_bitmap(&nosave_map) || - toi_alloc_bitmap(&free_map) || - toi_alloc_bitmap(&compare_map) || - toi_alloc_bitmap(&page_resave_map)) - return 1; - - return 0; -} - -/** - * free_bitmaps - free the bitmaps used to record page states - * - * Free the bitmaps allocated above. It is not an error to call - * memory_bm_free on a bitmap that isn't currently allocated. - **/ -static void free_bitmaps(void) -{ - toi_free_bitmap(&pageset1_map); - toi_free_bitmap(&pageset1_copy_map); - toi_free_bitmap(&pageset2_map); - toi_free_bitmap(&io_map); - toi_free_bitmap(&nosave_map); - toi_free_bitmap(&free_map); - toi_free_bitmap(&compare_map); - toi_free_bitmap(&page_resave_map); -} - -/** - * io_MB_per_second - return the number of MB/s read or written - * @write: Whether to return the speed at which we wrote. - * - * Calculate the number of megabytes per second that were read or written. - **/ -static int io_MB_per_second(int write) -{ - return (toi_bkd.toi_io_time[write][1]) ? - MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / - toi_bkd.toi_io_time[write][1] : 0; -} - -#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \ - count - len - 1, ## a); } while (0) - -/** - * get_debug_info - fill a buffer with debugging information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_debug_info(const char *buffer, int count) -{ - int len = 0, i, first_result = 1; - - SNPRINTF("TuxOnIce debugging info:\n"); - SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); - SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); - SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__); - SNPRINTF("- Attempt number : %d\n", nr_hibernates); - SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", - toi_result, - toi_bkd.toi_action, - toi_bkd.toi_debug_state, - toi_bkd.toi_default_console_level, - image_size_limit, - toi_poweroff_method); - SNPRINTF("- Overall expected compression percentage: %d.\n", - 100 - toi_expected_compression_ratio()); - len += toi_print_module_debug_info(((char *) buffer) + len, - count - len - 1); - if (toi_bkd.toi_io_time[0][1]) { - if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { - SNPRINTF("- I/O speed: Write %ld KB/s", - (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld KB/s", - (KB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } else { - SNPRINTF("- I/O speed: Write %ld MB/s", - (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld MB/s", - (MB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } - SNPRINTF(".\n"); - } else - SNPRINTF("- No I/O speed stats available.\n"); - SNPRINTF("- Extra pages : %lu used/%lu.\n", - extra_pd1_pages_used, extra_pd1_pages_allowance); - - for (i = 0; i < TOI_NUM_RESULT_STATES; i++) - if (test_result_state(i)) { - SNPRINTF("%s: %s.\n", first_result ? - "- Result " : - " ", - result_strings[i]); - first_result = 0; - } - if (first_result) - SNPRINTF("- Result : %s.\n", nr_hibernates ? 
- "Succeeded" : - "No hibernation attempts so far"); - return len; -} - -#ifdef CONFIG_TOI_INCREMENTAL -/** - * get_toi_page_state - fill a buffer with page state information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_page_state(const char *buffer, int count) -{ - int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0; - int len = 0; - struct zone *zone; - int allocated_bitmaps = 0; - - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - - if (!free_map) { - BUG_ON(toi_alloc_bitmap(&free_map)); - allocated_bitmaps = 1; - } - - toi_generate_free_page_map(); - - for_each_populated_zone(zone) { - unsigned long loop; - - total += zone->spanned_pages; - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - /* - * If the page gets allocated, it will be need - * saving in an image. - * Don't bother with explicitly removing any - * RO protection applied below. - * We'll SetPageTOI_Dirty(page) if/when it - * gets allocated. - */ - free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageTOI_Untracked(page)) { - untracked++; - } else if (PageTOI_RO(page)) { - ro++; - } else if (PageTOI_Dirty(page)) { - dirty++; - } else { - printk("Page %ld state 'other'.\n", pfn); - other++; - } - } - } - - if (allocated_bitmaps) { - toi_free_bitmap(&free_map); - } - - set_cpus_allowed_ptr(current, cpu_all_mask); - - SNPRINTF("TuxOnIce page breakdown:\n"); - SNPRINTF("- Free : %d\n", free); - SNPRINTF("- Untracked : %d\n", untracked); - SNPRINTF("- Read only : %d\n", ro); - SNPRINTF("- Dirty : %d\n", dirty); - SNPRINTF("- Other : %d\n", other); - SNPRINTF("- Invalid : %d\n", invalid); - SNPRINTF("- Total : %d\n", total); - return len; -} -#endif - -/** - * do_cleanup - cleanup after attempting to hibernate or resume - * @get_debug_info: Whether to allocate and return debugging info. - * - * Cleanup after attempting to hibernate or resume, possibly getting - * debugging info as we do so. 
- **/ -static void do_cleanup(int get_debug_info, int restarting) -{ - int i = 0; - char *buffer = NULL; - - trap_non_toi_io = 0; - - if (get_debug_info) - toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); - - free_checksum_pages(); - - toi_cbw_restore(); - toi_free_cbw_data(); - - if (get_debug_info) - buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); - - if (buffer) - i = get_toi_debug_info(buffer, PAGE_SIZE); - - toi_free_extra_pagedir_memory(); - - pagedir1.size = 0; - pagedir2.size = 0; - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - if (boot_kernel_data_buffer) { - if (!test_toi_state(TOI_BOOT_KERNEL)) - toi_free_page(37, boot_kernel_data_buffer); - boot_kernel_data_buffer = 0; - } - - if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) { - unlock_device_hotplug(); - clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - } - - clear_toi_state(TOI_BOOT_KERNEL); - if (current->flags & PF_SUSPEND_TASK) - thaw_processes(); - - if (!restarting) - toi_stop_other_threads(); - - if (toi_keeping_image && - !test_result_state(TOI_ABORTED)) { - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - "TuxOnIce: Not invalidating the image due " - "to Keep Image or Incremental Image being enabled."); - set_result_state(TOI_KEPT_IMAGE); - - /* - * For an incremental image, free unused storage so - * swap (if any) can be used for normal system operation, - * if so desired. - */ - - toiActiveAllocator->free_unused_storage(); - } else - if (toiActiveAllocator) - toiActiveAllocator->remove_image(); - - free_bitmaps(); - usermodehelper_enable(); - - if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { - pm_notifier_call_chain(PM_POST_HIBERNATION); - clear_toi_state(TOI_NOTIFIERS_PREPARE); - } - - if (buffer && i) { - /* Printk can only handle 1023 bytes, including - * its level mangling. */ - for (i = 0; i < 3; i++) - printk(KERN_ERR "%s", buffer + (1023 * i)); - toi_free_page(20, (unsigned long) buffer); - } - - if (!restarting) - toi_cleanup_console(); - - free_attention_list(); - - if (!restarting) - toi_deactivate_storage(0); - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * check_still_keeping_image - we kept an image; check whether to reuse it. - * - * We enter this routine when we have kept an image. If the user has said they - * want to still keep it, all we need to do is powerdown. If powering down - * means hibernating to ram and the power doesn't run out, we'll return 1. - * If we do power off properly or the battery runs out, we'll resume via the - * normal paths. - * - * If the user has said they want to remove the previously kept image, we - * remove it, and return 0. We'll then store a new image. - **/ -static int check_still_keeping_image(void) -{ - if (toi_keeping_image) { - if (!test_action_state(TOI_INCREMENTAL_IMAGE)) { - printk(KERN_INFO "Image already stored: powering down " - "immediately."); - do_toi_step(STEP_HIBERNATE_POWERDOWN); - return 1; - } - /** - * Incremental image - need to write new part. - * We detect that we're writing an incremental image by looking - * at test_result_state(TOI_KEPT_IMAGE) - **/ - return 0; - } - - printk(KERN_INFO "Invalidating previous image.\n"); - toiActiveAllocator->remove_image(); - - return 0; -} - -/** - * toi_init - prepare to hibernate to disk - * - * Initialise variables & data structures, in preparation for - * hibernating to disk. 
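/*
 * Illustration (editorial sketch, not from the TuxOnIce sources):
 * toi_init() below clears toi_result, a bit mask in which each bit set
 * during a cycle selects one entry of the result_strings[] table defined
 * earlier when the debug info is printed. The pattern in miniature, with
 * hypothetical names:
 */
#include <stdio.h>

enum { RES_ABORTED, RES_NO_STORAGE, RES_OOM, RES_MAX };

static const char *res_strings[RES_MAX] = {
	"Hibernation was aborted",
	"No storage was available",
	"Ran out of memory",
};

static unsigned long result;

#define set_result(bit)  (result |= 1UL << (bit))
#define test_result(bit) (result & (1UL << (bit)))

static void print_results(void)
{
	int i;

	for (i = 0; i < RES_MAX; i++)
		if (test_result(i))
			printf("- Result: %s.\n", res_strings[i]);
}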
- **/ -static int toi_init(int restarting) -{ - int result, i, j; - - toi_result = 0; - - printk(KERN_INFO "Initiating a hibernation cycle.\n"); - - nr_hibernates++; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - toi_bkd.toi_io_time[i][j] = 0; - - if (!test_toi_state(TOI_CAN_HIBERNATE) || - allocate_bitmaps()) - return 1; - - mark_nosave_pages(); - - if (!restarting) - toi_prepare_console(); - - result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (result) { - set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); - return 1; - } - set_toi_state(TOI_NOTIFIERS_PREPARE); - - if (!restarting) { - printk(KERN_ERR "Starting other threads."); - toi_start_other_threads(); - } - - result = usermodehelper_disable(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to disable usermode " - "helpers\n"); - set_result_state(TOI_USERMODE_HELPERS_ERR); - return 1; - } - - boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); - if (!boot_kernel_data_buffer) { - printk(KERN_ERR "TuxOnIce: Failed to allocate " - "boot_kernel_data_buffer.\n"); - set_result_state(TOI_OUT_OF_MEMORY); - return 1; - } - - toi_allocate_cbw_data(); - - return 0; -} - -/** - * can_hibernate - perform basic 'Can we hibernate?' tests - * - * Perform basic tests that must pass if we're going to be able to hibernate: - * Can we get the pm_mutex? Is resume= valid (we need to know where to write - * the image header). - **/ -static int can_hibernate(void) -{ - if (!test_toi_state(TOI_CAN_HIBERNATE)) - toi_attempt_to_parse_resume_device(0); - - if (!test_toi_state(TOI_CAN_HIBERNATE)) { - printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" - "This may be because you haven't put something along " - "the lines of\n\nresume=swap:/dev/hda1\n\n" - "in lilo.conf or equivalent. (Where /dev/hda1 is your " - "swap partition).\n"); - set_abort_result(TOI_CANT_SUSPEND); - return 0; - } - - if (strlen(alt_resume_param)) { - attempt_to_parse_alt_resume_param(); - - if (!strlen(alt_resume_param)) { - printk(KERN_INFO "Alternate resume parameter now " - "invalid. Aborting.\n"); - set_abort_result(TOI_CANT_USE_ALT_RESUME); - return 0; - } - } - - return 1; -} - -/** - * do_post_image_write - having written an image, figure out what to do next - * - * After writing an image, we might load an alternate image or power down. - * Powering down might involve hibernating to ram, in which case we also - * need to handle reloading pageset2. - **/ -static int do_post_image_write(void) -{ - /* If switching images fails, do normal powerdown */ - if (alt_resume_param[0]) - do_toi_step(STEP_RESUME_ALT_IMAGE); - - toi_power_down(); - - barrier(); - mb(); - return 0; -} - -/** - * __save_image - do the hard work of saving the image - * - * High level routine for getting the image saved. The key assumptions made - * are that processes have been frozen and sufficient memory is available. - * - * We also exit through here at resume time, coming back from toi_hibernate - * after the atomic restore. This is the reason for the toi_in_hibernate - * test. 
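/*
 * Illustration (editorial sketch, not from the TuxOnIce sources): the
 * control flow described above, where __save_image() is also the exit
 * path at resume time, is setjmp-like. The snapshot call returns once in
 * the hibernating kernel and a second time, in the same stack frame, in
 * the restored kernel; the two returns are told apart by a flag, as
 * toi_in_hibernate does here. A user-space analogue:
 */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf snapshot;

int main(void)
{
	volatile int in_hibernate = 1;	/* volatile: survives longjmp */

	if (setjmp(snapshot) == 0) {
		/* First return: still "hibernating"; the image is written here. */
		printf("saving image\n");
		in_hibernate = 0;
		longjmp(snapshot, 1);	/* stands in for the atomic restore */
	}

	/* Second return, same frame: the "resumed" path. */
	printf("resumed, in_hibernate=%d\n", in_hibernate);
	return 0;
}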
- **/ -static int __save_image(void) -{ - int temp_result, did_copy = 0; - - toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - " - Final values: %d and %d.", - pagedir1.size, pagedir2.size); - - toi_cond_pause(1, "About to write pagedir2."); - - temp_result = write_pageset(&pagedir2); - - if (temp_result == -1 || test_result_state(TOI_ABORTED)) - return 1; - - toi_cond_pause(1, "About to copy pageset 1."); - - if (test_result_state(TOI_ABORTED)) - return 1; - - toi_deactivate_storage(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - toi_in_hibernate = 1; - - if (toi_go_atomic(PMSG_FREEZE, 1)) - goto Failed; - - temp_result = toi_hibernate(); - -#ifdef CONFIG_KGDB - if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) - kgdb_breakpoint(); -#endif - - if (!temp_result) - did_copy = 1; - - /* We return here at resume time too! */ - toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); - -Failed: - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - /* Resume time? */ - if (!toi_in_hibernate) { - copyback_post(); - return 0; - } - - /* Nope. Hibernating. So, see if we can save the image... */ - - if (temp_result || test_result_state(TOI_ABORTED)) { - if (did_copy) - goto abort_reloading_pagedir_two; - else - return 1; - } - - toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, - NULL); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write pageset1."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); - - temp_result = write_pageset(&pagedir1); - - /* We didn't overwrite any memory, so no reread needs to be done. */ - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - return 1; - - if (temp_result == 1 || test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write header."); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - temp_result = write_image_header(); - - if (!temp_result && !test_result_state(TOI_ABORTED)) - return 0; - -abort_reloading_pagedir_two: - temp_result = read_pageset2(1); - - /* If that failed, we're sunk. Panic! */ - if (temp_result) - panic("Attempt to reload pagedir 2 while aborting " - "a hibernate failed."); - - return 1; -} - -static void map_ps2_pages(int enable) -{ - unsigned long pfn = 0; - - memory_bm_position_reset(pageset2_map); - pfn = memory_bm_next_pfn(pageset2_map, 0); - - while (pfn != BM_END_OF_MAP) { - struct page *page = pfn_to_page(pfn); - kernel_map_pages(page, 1, enable); - pfn = memory_bm_next_pfn(pageset2_map, 0); - } -} - -/** - * do_save_image - save the image and handle the result - * - * Save the prepared image. If we fail or we're in the path returning - * from the atomic restore, cleanup. - **/ -static int do_save_image(void) -{ - int result; - map_ps2_pages(0); - result = __save_image(); - map_ps2_pages(1); - return result; -} - -/** - * do_prepare_image - try to prepare an image - * - * Seek to initialise and prepare an image to be saved. On failure, - * cleanup. 
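/*
 * Illustration (editorial sketch, not from the TuxOnIce sources):
 * map_ps2_pages() above shows the memory-bitmap cursor idiom used all
 * through this file: reset the position, then repeatedly take the next
 * set bit until the end marker comes back. Self-contained toy version:
 */
#include <stdio.h>

#define BITS 256
#define BPL (8 * sizeof(unsigned long))
#define END_OF_MAP ((unsigned long)-1)

static unsigned long map[BITS / BPL];
static unsigned long cursor;

static void bm_position_reset(void) { cursor = 0; }

static unsigned long bm_next_bit(void)
{
	for (; cursor < BITS; cursor++)
		if (map[cursor / BPL] & (1UL << (cursor % BPL)))
			return cursor++;	/* return, then advance past it */
	return END_OF_MAP;
}

int main(void)
{
	unsigned long bit;

	map[0] = 0x11;				/* bits 0 and 4 */
	bm_position_reset();
	while ((bit = bm_next_bit()) != END_OF_MAP)
		printf("bit %lu set\n", bit);
	return 0;
}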
- **/ -static int do_prepare_image(void) -{ - int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - - if (!restarting && toi_activate_storage(0)) - return 1; - - /* - * If kept image and still keeping image and hibernating to RAM, (non - * incremental image case) we will return 1 after hibernating and - * resuming (provided the power doesn't run out. In that case, we skip - * directly to cleaning up and exiting. - */ - - if (!can_hibernate() || - (test_result_state(TOI_KEPT_IMAGE) && - check_still_keeping_image())) - return 1; - - if (toi_init(restarting) || toi_prepare_image() || - test_result_state(TOI_ABORTED)) - return 1; - - trap_non_toi_io = 1; - - return 0; -} - -/** - * do_check_can_resume - find out whether an image has been stored - * - * Read whether an image exists. We use the same routine as the - * image_exists sysfs entry, and just look to see whether the - * first character in the resulting buffer is a '1'. - **/ -int do_check_can_resume(void) -{ - int result = -1; - - if (toi_activate_storage(0)) - return -1; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(1); - - if (toiActiveAllocator) - result = toiActiveAllocator->image_exists(1); - - toi_deactivate_storage(0); - return result; -} - -/** - * do_load_atomic_copy - load the first part of an image, if it exists - * - * Check whether we have an image. If one exists, do sanity checking - * (possibly invalidating the image or even rebooting if the user - * requests that) before loading it into memory in preparation for the - * atomic restore. - * - * If and only if we have an image loaded and ready to restore, we return 1. - **/ -static int do_load_atomic_copy(void) -{ - int read_image_result = 0; - - if (sizeof(swp_entry_t) != sizeof(long)) { - printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size" - " of long. Please report this!\n"); - return 1; - } - - if (!resume_file[0]) - printk(KERN_WARNING "TuxOnIce: " - "You need to use a resume= command line parameter to " - "tell TuxOnIce where to look for an image.\n"); - - toi_activate_storage(0); - - if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) && - !toi_attempt_to_parse_resume_device(0)) { - /* - * Without a usable storage device we can do nothing - - * even if noresume is given - */ - - if (!toiNumAllocators) - printk(KERN_ALERT "TuxOnIce: " - "No storage allocators have been registered.\n"); - else - printk(KERN_ALERT "TuxOnIce: " - "Missing or invalid storage location " - "(resume= parameter). Please correct and " - "rerun lilo (or equivalent) before " - "hibernating.\n"); - toi_deactivate_storage(0); - return 1; - } - - if (allocate_bitmaps()) - return 1; - - read_image_result = read_pageset1(); /* non fatal error ignored */ - - if (test_toi_state(TOI_NORESUME_SPECIFIED)) - clear_toi_state(TOI_NORESUME_SPECIFIED); - - toi_deactivate_storage(0); - - if (read_image_result) - return 1; - - return 0; -} - -/** - * prepare_restore_load_alt_image - save & restore alt image variables - * - * Save and restore the pageset1 maps, when loading an alternate image. 
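/*
 * Editorial aside: do_load_atomic_copy() above checks
 * sizeof(swp_entry_t) != sizeof(long) at run time and warns. An invariant
 * like this can instead be enforced at build time, with C11's
 * static_assert (or the kernel's BUILD_BUG_ON). Sketch only; the typedef
 * here is a stand-in for the real kernel type:
 */
#include <assert.h>

typedef struct { unsigned long val; } swp_entry_t;	/* stand-in */

static_assert(sizeof(swp_entry_t) == sizeof(long),
	      "image format assumes swp_entry_t fits in a long");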
- **/ -static void prepare_restore_load_alt_image(int prepare) -{ - static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; - - if (prepare) { - pageset1_map_save = pageset1_map; - pageset1_map = NULL; - pageset1_copy_map_save = pageset1_copy_map; - pageset1_copy_map = NULL; - set_toi_state(TOI_LOADING_ALT_IMAGE); - toi_reset_alt_image_pageset2_pfn(); - } else { - toi_free_bitmap(&pageset1_map); - pageset1_map = pageset1_map_save; - toi_free_bitmap(&pageset1_copy_map); - pageset1_copy_map = pageset1_copy_map_save; - clear_toi_state(TOI_NOW_RESUMING); - clear_toi_state(TOI_LOADING_ALT_IMAGE); - } -} - -/** - * do_toi_step - perform a step in hibernating or resuming - * - * Perform a step in hibernating or resuming an image. This abstraction - * is in preparation for implementing cluster support, and perhaps replacing - * uswsusp too (haven't looked whether that's possible yet). - **/ -int do_toi_step(int step) -{ - switch (step) { - case STEP_HIBERNATE_PREPARE_IMAGE: - return do_prepare_image(); - case STEP_HIBERNATE_SAVE_IMAGE: - return do_save_image(); - case STEP_HIBERNATE_POWERDOWN: - return do_post_image_write(); - case STEP_RESUME_CAN_RESUME: - return do_check_can_resume(); - case STEP_RESUME_LOAD_PS1: - return do_load_atomic_copy(); - case STEP_RESUME_DO_RESTORE: - /* - * If we succeed, this doesn't return. - * Instead, we return from do_save_image() in the - * hibernated kernel. - */ - return toi_atomic_restore(); - case STEP_RESUME_ALT_IMAGE: - printk(KERN_INFO "Trying to resume alternate image.\n"); - toi_in_hibernate = 0; - save_restore_alt_param(SAVE, NOQUIET); - prepare_restore_load_alt_image(1); - if (!do_check_can_resume()) { - printk(KERN_INFO "Nothing to resume from.\n"); - goto out; - } - if (!do_load_atomic_copy()) - toi_atomic_restore(); - - printk(KERN_INFO "Failed to load image.\n"); -out: - prepare_restore_load_alt_image(0); - save_restore_alt_param(RESTORE, NOQUIET); - break; - case STEP_CLEANUP: - do_cleanup(1, 0); - break; - case STEP_QUIET_CLEANUP: - do_cleanup(0, 0); - break; - } - - return 0; -} - -/* -- Functions for kickstarting a hibernate or resume --- */ - -/** - * toi_try_resume - try to do the steps in resuming - * - * Check if we have an image and if so try to resume. Clear the status - * flags too. - **/ -void toi_try_resume(void) -{ - set_toi_state(TOI_TRYING_TO_RESUME); - resume_attempted = 1; - - current->flags |= PF_MEMALLOC; - toi_start_other_threads(); - - if (do_toi_step(STEP_RESUME_CAN_RESUME) && - !do_toi_step(STEP_RESUME_LOAD_PS1)) - do_toi_step(STEP_RESUME_DO_RESTORE); - - toi_stop_other_threads(); - do_cleanup(0, 0); - - current->flags &= ~PF_MEMALLOC; - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume - * - * Wrapper for when __toi_try_resume is called from swsusp resume path, - * rather than from echo > /sys/power/tuxonice/do_resume. - **/ -static void toi_sys_power_disk_try_resume(void) -{ - resume_attempted = 1; - - /* - * There's a comment in kernel/power/disk.c that indicates - * we should be able to use mutex_lock_nested below. That - * doesn't seem to cut it, though, so let's just turn lockdep - * off for now. 
- */ - lockdep_off(); - - if (toi_start_anything(SYSFS_RESUMING)) - goto out; - - toi_try_resume(); - - /* - * For initramfs, we have to clear the boot time - * flag after trying to resume - */ - clear_toi_state(TOI_BOOT_TIME); - - toi_finish_anything(SYSFS_RESUMING); -out: - lockdep_on(); -} - -/** - * toi_try_hibernate - try to start a hibernation cycle - * - * Start a hibernation cycle, coming in from either - * echo > /sys/power/tuxonice/do_suspend - * - * or - * - * echo disk > /sys/power/state - * - * In the later case, we come in without pm_sem taken; in the - * former, it has been taken. - **/ -int toi_try_hibernate(void) -{ - int result = 0, sys_power_disk = 0, retries = 0; - - if (!mutex_is_locked(&tuxonice_in_use)) { - /* Came in via /sys/power/disk */ - if (toi_start_anything(SYSFS_HIBERNATING)) - return -EBUSY; - sys_power_disk = 1; - } - - current->flags |= PF_MEMALLOC; - - if (test_toi_state(TOI_CLUSTER_MODE)) { - toi_initiate_cluster_hibernate(); - goto out; - } - -prepare: - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - - if (result) - goto out; - - if (test_action_state(TOI_FREEZER_TEST)) - goto out_restore_gfp_mask; - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - - if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) { - if (retries < 2) { - do_cleanup(0, 1); - retries++; - clear_result_state(TOI_ABORTED); - extra_pd1_pages_allowance = extra_pd1_pages_used + 500; - printk(KERN_INFO "Automatically adjusting the extra" - " pages allowance to %ld and restarting.\n", - extra_pd1_pages_allowance); - pm_restore_gfp_mask(); - goto prepare; - } - - printk(KERN_INFO "Adjusted extra pages allowance twice and " - "still couldn't hibernate successfully. Giving up."); - } - - /* This code runs at resume time too! */ - if (!result && toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); - -out_restore_gfp_mask: - pm_restore_gfp_mask(); -out: - do_cleanup(1, 0); - current->flags &= ~PF_MEMALLOC; - - if (sys_power_disk) - toi_finish_anything(SYSFS_HIBERNATING); - - return result; -} - -/* - * channel_no: If !0, -c is added to args (userui). - */ -int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug) -{ - int retval; - static char *envp[] = { - "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; - static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL - }; - char *channel = NULL; - int arg = 0, size; - char test_read[255]; - char *orig_posn = command; - - if (!strlen(orig_posn)) - return 1; - - if (channel_no) { - channel = toi_kzalloc(4, 6, GFP_KERNEL); - if (!channel) { - printk(KERN_INFO "Failed to allocate memory in " - "preparing to launch userspace program.\n"); - return 1; - } - } - - /* Up to 6 args supported */ - while (arg < 6) { - sscanf(orig_posn, "%s", test_read); - size = strlen(test_read); - if (!(size)) - break; - argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP); - strcpy(argv[arg], test_read); - orig_posn += size + 1; - *test_read = 0; - arg++; - } - - if (channel_no) { - sprintf(channel, "-c%d", channel_no); - argv[arg] = channel; - } else - arg--; - - if (debug) { - argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP); - strcpy(argv[arg], "--debug"); - } - - retval = call_usermodehelper(argv[0], argv, envp, wait); - - /* - * If the program reports an error, retval = 256. Don't complain - * about that here. 
- */ - if (retval && retval != 256) - printk(KERN_ERR "Failed to launch userspace program '%s': " - "Error %d\n", command, retval); - - { - int i; - for (i = 0; i < arg; i++) - if (argv[i] && argv[i] != channel) - toi_kfree(5, argv[i], sizeof(*argv[i])); - } - - toi_kfree(4, channel, sizeof(*channel)); - - return retval; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_LONG("extra_pages_allowance", SYSFS_RW, - &extra_pd1_pages_allowance, 0, LONG_MAX, 0), - SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, - image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), - SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_resume_device2), - SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_alt_resume_param), - SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, - NULL), - SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, - TOI_IGNORE_ROOTFS, 0), - SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, - INT_MAX, 0), - SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), - SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_MULTITHREADED_IO, 0), - SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_FLUSHER_THREAD, 0), - SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAGESET2_FULL, 0), - SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), - SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, - TOI_REPLACE_SWSUSP, 0), - SYSFS_STRING("resume_commandline", SYSFS_RW, - toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, - NULL), - SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), - SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, - TOI_FREEZER_TEST, 0), - SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), - SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, - TOI_TEST_FILTER_SPEED, 0), - SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PAGESET2, 0), - SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PS2_IF_UNNEEDED, 0), - SYSFS_STRING("binary_signature", SYSFS_READONLY, - tuxonice_signature, 9, 0, NULL), - SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0, - NULL), -#ifdef CONFIG_KGDB - SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, - TOI_POST_RESUME_BREAKPOINT, 0), -#endif - SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_READAHEAD, 0), - SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action, - TOI_TRACE_DEBUG_ON, 0), -#ifdef CONFIG_TOI_KEEP_IMAGE - SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, - 0), -#endif -#ifdef CONFIG_TOI_INCREMENTAL - SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0, - NULL), - SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action, - TOI_INCREMENTAL_IMAGE, 1), -#endif -}; - -static struct toi_core_fns my_fns = { - .get_nonconflicting_page = __toi_get_nonconflicting_page, - .post_context_save = __toi_post_context_save, - .try_hibernate = toi_try_hibernate, - .try_resume = toi_sys_power_disk_try_resume, -}; - -/** - * core_load - initialisation of TuxOnIce core - * - * Initialise the core, beginning with sysfs. 
Checksum and so on are part of - * the core, but have their own initialisation routines because they either - * aren't compiled in all the time or have their own subdirectories. - **/ -static __init int core_load(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION - " (http://tuxonice.net)\n"); - - if (!hibernation_available()) { - printk(KERN_INFO "TuxOnIce disabled due to request for hibernation" - " to be disabled in this kernel.\n"); - return 1; - } - - if (toi_sysfs_init()) - return 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - toi_core_fns = &my_fns; - - if (toi_alloc_init()) - return 1; - if (toi_checksum_init()) - return 1; - if (toi_usm_init()) - return 1; - if (toi_ui_init()) - return 1; - if (toi_poweroff_init()) - return 1; - if (toi_cluster_init()) - return 1; - if (toi_cbw_init()) - return 1; - - return 0; -} - -late_initcall(core_load); diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c deleted file mode 100644 index a8c5f3660..000000000 --- a/kernel/power/tuxonice_incremental.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * kernel/power/tuxonice_incremental.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines related to storing incremental images - that - * is, retaining an image after an initial cycle and then storing incremental - * changes on subsequent hibernations. - * - * Based in part on on... - * - * Debug helper to dump the current kernel pagetables of the system - * so that we can see what the various memory ranges are set to. - * - * (C) Copyright 2008 Intel Corporation - * - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "tuxonice_pageflags.h" -#include "tuxonice_builtin.h" -#include "power.h" - -int toi_do_incremental_initcall; - -extern void kdb_init(int level); -extern noinline void kgdb_breakpoint(void); - -#undef pr_debug -#if 0 -#define pr_debug(a, b...) do { printk(a, ##b); } while(0) -#else -#define pr_debug(a, b...) do { } while(0) -#endif - -/* Multipliers for offsets within the PTEs */ -#define PTE_LEVEL_MULT (PAGE_SIZE) -#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) -#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) - -/* - * This function gets called on a break in a continuous series - * of PTE entries; the next one is different so we need to - * print what we collected so far. 
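/*
 * Illustration (editorial sketch, not from the TuxOnIce sources): the
 * walkers below descend the four radix levels (pgd -> pud -> pmd -> pte),
 * visiting every page-table page so it can be flagged untracked. The
 * shape of such a walk, reduced to a two-level toy radix tree:
 */
#include <stdio.h>

#define FANOUT 4

struct pte_table { unsigned long pte[FANOUT]; };
struct pgd_table { struct pte_table *entry[FANOUT]; };

/* Stand-in for marking the page that holds a table (SetPageTOI_Untracked). */
static void note_table(const void *table)
{
	printf("table at %p holds pagetable data\n", table);
}

static void walk(const struct pgd_table *pgd)
{
	int i;

	note_table(pgd);			/* the top-level table itself */
	for (i = 0; i < FANOUT; i++)
		if (pgd->entry[i])		/* like the !pgd_none() test */
			note_table(pgd->entry[i]);	/* each leaf table page */
}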
- */ -static void note_page(void *addr) -{ - static struct page *lastpage; - struct page *page; - - page = virt_to_page(addr); - - if (page != lastpage) { - unsigned int level; - pte_t *pte = lookup_address((unsigned long) addr, &level); - struct page *pt_page2 = pte_page(*pte); - //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2)); - SetPageTOI_Untracked(pt_page2); - lastpage = page; - } -} - -static void walk_pte_level(pmd_t addr) -{ - int i; - pte_t *start; - - start = (pte_t *) pmd_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PTE; i++) { - note_page(start); - start++; - } -} - -#if PTRS_PER_PMD > 1 - -static void walk_pmd_level(pud_t addr) -{ - int i; - pmd_t *start; - - start = (pmd_t *) pud_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PMD; i++) { - if (!pmd_none(*start)) { - if (pmd_large(*start) || !pmd_present(*start)) - note_page(start); - else - walk_pte_level(*start); - } else - note_page(start); - start++; - } -} - -#else -#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a))) -#define pud_large(a) pmd_large(__pmd(pud_val(a))) -#define pud_none(a) pmd_none(__pmd(pud_val(a))) -#endif - -#if PTRS_PER_PUD > 1 - -static void walk_pud_level(pgd_t addr) -{ - int i; - pud_t *start; - - start = (pud_t *) pgd_page_vaddr(addr); - - for (i = 0; i < PTRS_PER_PUD; i++) { - if (!pud_none(*start)) { - if (pud_large(*start) || !pud_present(*start)) - note_page(start); - else - walk_pmd_level(*start); - } else - note_page(start); - - start++; - } -} - -#else -#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a))) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) -#endif - -/* - * Not static in the original at the time of writing, so needs renaming here. - */ -static void toi_ptdump_walk_pgd_level(pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_level4_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif - int i; - if (pgd) { - start = pgd; - } - - for (i = 0; i < PTRS_PER_PGD; i++) { - if (!pgd_none(*start)) { - if (pgd_large(*start) || !pgd_present(*start)) - note_page(start); - else - walk_pud_level(*start); - } else - note_page(start); - - start++; - } - - /* Flush out the last page */ - note_page(start); -} - -#ifdef CONFIG_PARAVIRT -extern struct pv_info pv_info; - -static void toi_set_paravirt_ops_untracked(void) { - int i; - - unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)), - pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end)); - //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end); - for (i = pvpfn; i <= pvpfn_end; i++) { - SetPageTOI_Untracked(pfn_to_page(i)); - } -} -#else -#define toi_set_paravirt_ops_untracked() { do { } while(0) } -#endif - -extern void toi_mark_per_cpus_pages_untracked(void); - -void toi_untrack_stack(unsigned long *stack) -{ - int i; - struct page *stack_page = virt_to_page(stack); - - for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) { - pr_debug("Untrack stack page %p.\n", page_address(stack_page + i)); - SetPageTOI_Untracked(stack_page + i); - } -} -void toi_untrack_process(struct task_struct *p) -{ - SetPageTOI_Untracked(virt_to_page(p)); - pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p))); - - toi_untrack_stack(p->stack); -} - -void toi_generate_untracked_map(void) -{ - struct task_struct *p, *t; - struct page *page; - pte_t *pte; - int i; - unsigned int level; - static int been_here = 0; - - if (been_here) - return; - - been_here = 1; - - /* Pagetable 
pages */ - toi_ptdump_walk_pgd_level(NULL); - - /* Printk buffer - not normally needed but can be helpful for debugging. */ - //toi_set_logbuf_untracked(); - - /* Paravirt ops */ - toi_set_paravirt_ops_untracked(); - - /* Task structs and stacks */ - for_each_process_thread(p, t) { - toi_untrack_process(p); - //toi_untrack_stack((unsigned long *) t->thread.sp); - } - - for (i = 0; i < NR_CPUS; i++) { - struct task_struct *idle = idle_task(i); - - if (idle) { - pr_debug("Untrack idle process for CPU %d.\n", i); - toi_untrack_process(idle); - } - - /* IRQ stack */ - pr_debug("Untrack IRQ stack for CPU %d.\n", i); - toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i)); - } - - /* Per CPU data */ - //pr_debug("Untracking per CPU variable pages.\n"); - toi_mark_per_cpus_pages_untracked(); - - /* Init stack - for bringing up secondary CPUs */ - page = virt_to_page(init_stack); - for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) { - SetPageTOI_Untracked(page + i); - } - - pte = lookup_address((unsigned long) &mmu_cr4_features, &level); - SetPageTOI_Untracked(pte_page(*pte)); - SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features)); -} - -/** - * toi_reset_dirtiness_one - mark a page clean and reapply write protection - * @pfn: Page frame number of the page to reset. - * @verbose: Whether to log the change. - */ - -void toi_reset_dirtiness_one(unsigned long pfn, int verbose) -{ - struct page *page = pfn_to_page(pfn); - - /** - * Don't worry about whether the Dirty flag is - * already set. If this is our first call, it - * won't be. - */ - - preempt_disable(); - - ClearPageTOI_Dirty(page); - SetPageTOI_RO(page); - if (verbose) - printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page)); - - set_memory_ro((unsigned long) page_address(page), 1); - - preempt_enable(); -} - -/** - * TuxOnIce's incremental image support works by marking all memory apart from - * the page tables read-only, then in the page-faults that result enabling - * writing if appropriate and flagging the page as dirty. Free pages are also - * marked as dirty and not protected so that if allocated, they will be included - * in the image without further processing. - * - * toi_reset_dirtiness is called when an image exists and incremental images are - * enabled, and each time we resume thereafter. It is not invoked on a fresh boot. - * - * This routine should be called from a single-cpu-running context to avoid races in setting - * page dirty/read only flags. - * - * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it! - * - * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c. - **/ - -int toi_reset_dirtiness(int verbose) -{ - struct zone *zone; - unsigned long loop; - int allocated_map = 0; - - toi_generate_untracked_map(); - - if (!free_map) { - if (toi_alloc_bitmap(&free_map)) - return -ENOMEM; - allocated_map = 1; - } - - toi_generate_free_page_map(); - - pr_debug(KERN_EMERG "Reset dirtiness.\n"); - for_each_populated_zone(zone) { - // 64 bit only. No need to worry about highmem. - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page) || !saveable_page(zone, pfn)) { - continue; - } - - if (PageTOI_Untracked(page)) { - continue; - } -
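/*
 * Illustration (editorial sketch, not from the TuxOnIce sources): the
 * scheme described above, write-protect nearly everything and let each
 * write fault mark its page dirty before lifting the protection, has a
 * classic user-space analogue built on mprotect() and a SIGSEGV handler.
 * Linux-specific in practice; error checks and the dirty bitmap itself
 * are omitted:
 */
#include <signal.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

static void on_write_fault(int sig, siginfo_t *si, void *ctx)
{
	uintptr_t mask = ~(uintptr_t)(getpagesize() - 1);
	void *page = (void *)((uintptr_t)si->si_addr & mask);

	(void)sig; (void)ctx;
	/* Record "page is dirty" here, then let the write retry. */
	mprotect(page, getpagesize(), PROT_READ | PROT_WRITE);
}

static void track_writes(void *buf, size_t len)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = on_write_fault;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);
	mprotect(buf, len, PROT_READ);	/* like SetPageTOI_RO() per page */
}
- /** - * Do we need to (re)protect the page?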
- * If it is already protected (PageTOI_RO), there is - * nothing to do - skip the following. - * If it is marked as dirty (PageTOI_Dirty), it was - * either free and has been allocated or has been - * written to and marked dirty. Reset the dirty flag - * and (re)apply the protection. - */ - if (!PageTOI_RO(page)) { - toi_reset_dirtiness_one(pfn, verbose); - } - } - } - - pr_debug(KERN_EMERG "Done resetting dirtiness.\n"); - - if (allocated_map) { - toi_free_bitmap(&free_map); - } - return 0; -} - -static int toi_reset_dirtiness_initcall(void) -{ - if (toi_do_incremental_initcall) { - pr_info("TuxOnIce: Enabling dirty page tracking.\n"); - toi_reset_dirtiness(0); - } - return 1; -} -extern void toi_generate_untracked_map(void); - -// Leave early_initcall for pages to register untracked sections. -early_initcall(toi_reset_dirtiness_initcall); - -static int __init toi_incremental_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value) && value) - toi_do_incremental_initcall = value; - - return 1; -} -__setup("toi_incremental_initcall", toi_incremental_initcall_setup); diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c deleted file mode 100644 index 2db934350..000000000 --- a/kernel/power/tuxonice_io.c +++ /dev/null @@ -1,1936 +0,0 @@ -/* - * kernel/power/tuxonice_io.c - * - * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001,2002 Pavel Machek - * Copyright (C) 2002-2003 Florent Chabaud - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_storage.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_extent.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_builtin.h" -#include "tuxonice_checksum.h" -#include "tuxonice_alloc.h" -char alt_resume_param[256]; - -/* Version read from image header at resume */ -static int toi_image_header_version; - -#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ - if (likely(toi_image_header_version >= VERS)) \ - if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ - (char *) &VAR, sizeof(VAR))) { \ - abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ - ERR_ACT; \ - } \ -} while(0) \ - -/* Variables shared between threads and updated under the mutex */ -static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; -static int io_index, io_nextupdate, io_pc, io_pc_step; -static DEFINE_MUTEX(io_mutex); -static DEFINE_PER_CPU(struct page *, last_sought); -static DEFINE_PER_CPU(struct page *, last_high_page); -static DEFINE_PER_CPU(char *, checksum_locn); -static DEFINE_PER_CPU(struct pbe *, last_low_page); -static atomic_t io_count; -atomic_t toi_io_workers; - -static int using_flusher; - -DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); - -int toi_bio_queue_flusher_should_finish; - -int toi_max_workers; - -static char *image_version_error = "The image header version is newer than " \ - "this kernel supports."; - -struct toi_module_ops *first_filter; - -static atomic_t toi_num_other_threads; -static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); -enum toi_worker_commands { - TOI_IO_WORKER_STOP, - TOI_IO_WORKER_RUN, - TOI_IO_WORKER_EXIT -}; -static enum 
toi_worker_commands toi_worker_command; - -/** - * toi_attempt_to_parse_resume_device - determine if we can hibernate - * - * Can we hibernate, using the current resume= parameter? - **/ -int toi_attempt_to_parse_resume_device(int quiet) -{ - struct list_head *Allocator; - struct toi_module_ops *thisAllocator; - int result, returning = 0; - - if (toi_activate_storage(0)) - return 0; - - toiActiveAllocator = NULL; - clear_toi_state(TOI_RESUME_DEVICE_OK); - clear_toi_state(TOI_CAN_RESUME); - clear_result_state(TOI_ABORTED); - - if (!toiNumAllocators) { - if (!quiet) - printk(KERN_INFO "TuxOnIce: No storage allocators have " - "been registered. Hibernating will be " - "disabled.\n"); - goto cleanup; - } - - list_for_each(Allocator, &toiAllocators) { - thisAllocator = list_entry(Allocator, struct toi_module_ops, - type_list); - - /* - * Not sure why you'd want to disable an allocator, but - * we should honour the flag if we're providing it - */ - if (!thisAllocator->enabled) - continue; - - result = thisAllocator->parse_sig_location( - resume_file, (toiNumAllocators == 1), - quiet); - - switch (result) { - case -EINVAL: - /* For this allocator, but not a valid - * configuration. Error already printed. */ - goto cleanup; - - case 0: - /* For this allocator and valid. */ - toiActiveAllocator = thisAllocator; - - set_toi_state(TOI_RESUME_DEVICE_OK); - set_toi_state(TOI_CAN_RESUME); - returning = 1; - goto cleanup; - } - } - if (!quiet) - printk(KERN_INFO "TuxOnIce: No matching enabled allocator " - "found. Resuming disabled.\n"); -cleanup: - toi_deactivate_storage(0); - return returning; -} - -void attempt_to_parse_resume_device2(void) -{ - toi_prepare_usm(); - toi_attempt_to_parse_resume_device(0); - toi_cleanup_usm(); -} - -void save_restore_alt_param(int replace, int quiet) -{ - static char resume_param_save[255]; - static unsigned long toi_state_save; - - if (replace) { - toi_state_save = toi_state; - strcpy(resume_param_save, resume_file); - strcpy(resume_file, alt_resume_param); - } else { - strcpy(resume_file, resume_param_save); - toi_state = toi_state_save; - } - toi_attempt_to_parse_resume_device(quiet); -} - -void attempt_to_parse_alt_resume_param(void) -{ - int ok = 0; - - /* Temporarily set resume_param to the poweroff value */ - if (!strlen(alt_resume_param)) - return; - - printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); - save_restore_alt_param(SAVE, NOQUIET); - if (test_toi_state(TOI_CAN_RESUME)) - ok = 1; - - printk(KERN_INFO "=== Done ===\n"); - save_restore_alt_param(RESTORE, QUIET); - - /* If not ok, clear the string */ - if (ok) - return; - - printk(KERN_INFO "Can't resume from that location; clearing " - "alt_resume_param.\n"); - alt_resume_param[0] = '\0'; -} - -/** - * noresume_reset_modules - reset data structures in case of non resuming - * - * When we read the start of an image, modules (and especially the - * active allocator) might need to reset data structures if we - * decide to remove the image rather than resuming from it. - **/ -static void noresume_reset_modules(void) -{ - struct toi_module_ops *this_filter; - - list_for_each_entry(this_filter, &toi_filters, type_list) - if (this_filter->noresume_reset) - this_filter->noresume_reset(); - - if (toiActiveAllocator && toiActiveAllocator->noresume_reset) - toiActiveAllocator->noresume_reset(); -} - -/** - * fill_toi_header - fill the hibernate header structure - * @struct toi_header: Header data structure to be filled. 
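/*
 * Illustration (editorial sketch, not from the TuxOnIce sources):
 * toi_attempt_to_parse_resume_device() above probes each registered
 * allocator until one accepts the resume= string, and treats -EINVAL as
 * "right backend, bad configuration". The shape of that first-match
 * probe, reduced to function pointers with hypothetical names:
 */
#include <errno.h>
#include <stddef.h>

struct backend {
	const char *name;
	int (*parse)(const char *cfg);	/* 0 = mine, -EINVAL = mine but bad */
};

static const struct backend *pick_backend(const struct backend *b, size_t n,
					  const char *cfg)
{
	size_t i;

	for (i = 0; i < n; i++) {
		int ret = b[i].parse(cfg);

		if (ret == 0)
			return &b[i];	/* first match wins */
		if (ret == -EINVAL)
			return NULL;	/* right backend, bad config: stop */
	}
	return NULL;			/* nobody claimed the config */
}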
- **/ -static int fill_toi_header(struct toi_header *sh) -{ - int i, error; - - error = init_header((struct swsusp_info *) sh); - if (error) - return error; - - sh->pagedir = pagedir1; - sh->pageset_2_size = pagedir2.size; - sh->param0 = toi_result; - sh->param1 = toi_bkd.toi_action; - sh->param2 = toi_bkd.toi_debug_state; - sh->param3 = toi_bkd.toi_default_console_level; - sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev; - for (i = 0; i < 4; i++) - sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2]; - sh->bkd = boot_kernel_data_buffer; - return 0; -} - -/** - * rw_init_modules - initialise modules - * @rw: Whether we are reading or writing an image. - * @which: Section of the image being processed. - * - * Iterate over modules, preparing the ones that will be used to read or write - * data. - **/ -static int rw_init_modules(int rw, int which) -{ - struct toi_module_ops *this_module; - /* Initialise page transformers */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_init && this_module->rw_init(rw, which)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialize the %s filter.", - this_module->name); - return 1; - } - } - - /* Initialise allocator */ - if (toiActiveAllocator->rw_init(rw, which)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise the allocator."); - return 1; - } - - /* Initialise other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_init && this_module->rw_init(rw, which)) { - set_abort_result(TOI_FAILED_MODULE_INIT); - printk(KERN_INFO "Setting aborted flag due to module " - "init failure.\n"); - return 1; - } - } - - return 0; -} - -/** - * rw_cleanup_modules - cleanup modules - * @rw: Whether we are reading or writing an image. - * - * Cleanup components after reading or writing a set of pages. - * Only the allocator may fail.
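/*
 * Illustration (editorial sketch, not from the TuxOnIce sources):
 * rw_init_modules() above and rw_cleanup_modules() below treat each
 * module as an ops table with optional hooks. Init stops at the first
 * failure; cleanup visits every module and ORs the results so one bad
 * module cannot hide another's error. In miniature:
 */
struct mod_ops {
	int enabled;
	int (*rw_init)(int rw, int which);	/* optional: may be NULL */
	int (*rw_cleanup)(int rw);		/* optional: may be NULL */
};

static int init_all(struct mod_ops *m, int n, int rw, int which)
{
	int i;

	for (i = 0; i < n; i++)
		if (m[i].enabled && m[i].rw_init && m[i].rw_init(rw, which))
			return 1;		/* first failure aborts */
	return 0;
}

static int cleanup_all(struct mod_ops *m, int n, int rw)
{
	int i, result = 0;

	for (i = 0; i < n; i++)
		if (m[i].enabled && m[i].rw_cleanup)
			result |= m[i].rw_cleanup(rw);	/* keep going */
	return result;
}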
- **/ -static int rw_cleanup_modules(int rw) -{ - struct toi_module_ops *this_module; - int result = 0; - - /* Cleanup other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - /* Flush data and cleanup */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - result |= toiActiveAllocator->rw_cleanup(rw); - - return result; -} - -static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) -{ - int index, min, max; - struct page *high_page = NULL, - **my_last_high_page = raw_cpu_ptr(&last_high_page), - **my_last_sought = raw_cpu_ptr(&last_sought); - struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page); - void *compare; - - if (is_high) { - if (*my_last_sought && *my_last_high_page && - *my_last_sought < orig_page) - high_page = *my_last_high_page; - else - high_page = (struct page *) restore_highmem_pblist; - this = (struct pbe *) kmap(high_page); - compare = orig_page; - } else { - if (*my_last_sought && *my_last_low_page && - *my_last_sought < orig_page) - this = *my_last_low_page; - else - this = restore_pblist; - compare = page_address(orig_page); - } - - *my_last_sought = orig_page; - - /* Locate page containing pbe */ - while (this[PBES_PER_PAGE - 1].next && - this[PBES_PER_PAGE - 1].orig_address < compare) { - if (is_high) { - struct page *next_high_page = (struct page *) - this[PBES_PER_PAGE - 1].next; - kunmap(high_page); - this = kmap(next_high_page); - high_page = next_high_page; - } else - this = this[PBES_PER_PAGE - 1].next; - } - - /* Do a binary search within the page */ - min = 0; - max = PBES_PER_PAGE; - index = PBES_PER_PAGE / 2; - while (max - min) { - if (!this[index].orig_address || - this[index].orig_address > compare) - max = index; - else if (this[index].orig_address == compare) { - if (is_high) { - struct page *page = this[index].address; - *my_last_high_page = high_page; - kunmap(high_page); - return page; - } - *my_last_low_page = this; - return virt_to_page(this[index].address); - } else - min = index; - index = ((max + min) / 2); - }; - - if (is_high) - kunmap(high_page); - - abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" - " orig page %p. This[min].orig_address=%p.\n", orig_page, - this[index].orig_address); - return NULL; -} - -/** - * write_next_page - write the next page in a pageset - * @data_pfn: The pfn where the next data to write is located. - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn number to write in the image (where the data belongs). - * - * Get the pfn of the next page to write, map the page if necessary and do the - * write. - **/ -static int write_next_page(unsigned long *data_pfn, int *my_io_index, - unsigned long *write_pfn) -{ - struct page *page; - char **my_checksum_locn = raw_cpu_ptr(&checksum_locn); - int result = 0, was_present; - - *data_pfn = memory_bm_next_pfn(io_map, 0); - - /* Another thread could have beaten us to it. 
*/ - if (*data_pfn == BM_END_OF_MAP) { - if (atomic_read(&io_count)) { - printk(KERN_INFO "Ran out of pfns but io_count is " - "still %d.\n", atomic_read(&io_count)); - BUG(); - } - mutex_unlock(&io_mutex); - return -ENODATA; - } - - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - memory_bm_clear_bit(io_map, 0, *data_pfn); - page = pfn_to_page(*data_pfn); - - was_present = kernel_page_present(page); - if (!was_present) - kernel_map_pages(page, 1, 1); - - if (io_pageset == 1) - *write_pfn = memory_bm_next_pfn(pageset1_map, 0); - else { - *write_pfn = *data_pfn; - *my_checksum_locn = tuxonice_get_next_checksum(); - } - - TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index); - - mutex_unlock(&io_mutex); - - if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) - return 1; - - result = first_filter->write_page(*write_pfn, TOI_PAGE, page, - PAGE_SIZE); - - if (!was_present) - kernel_map_pages(page, 1, 0); - - return result; -} - -/** - * read_next_page - read the next page in a pageset - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn in which the data belongs. - * - * Read a page of the image into our buffer. It can happen (here and in the - * write routine) that threads don't get run until after other CPUs have done - * all the work. This was the cause of the long standing issue with - * occasionally getting -ENODATA errors at the end of reading the image. We - * therefore need to check there's actually a page to read before trying to - * retrieve one. - **/ - -static int read_next_page(int *my_io_index, unsigned long *write_pfn, - struct page *buffer) -{ - unsigned int buf_size = PAGE_SIZE; - unsigned long left = atomic_read(&io_count); - - if (!left) - return -ENODATA; - - /* Start off assuming the page we read isn't resaved */ - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - mutex_unlock(&io_mutex); - - /* - * Are we aborting? If so, don't submit any more I/O as - * resetting the resume_attempted flag (from ui.c) will - * clear the bdev flags, making this thread oops. - */ - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - atomic_dec(&toi_io_workers); - if (!atomic_read(&toi_io_workers)) { - /* - * So we can be sure we'll have memory for - * marking that we haven't resumed. - */ - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - - /* - * See toi_bio_read_page in tuxonice_bio.c: - * read the next page in the image. - */ - return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); -} - -static void use_read_page(unsigned long write_pfn, struct page *buffer) -{ - struct page *final_page = pfn_to_page(write_pfn), - *copy_page = final_page; - char *virt, *buffer_virt; - int was_present, cpu = smp_processor_id(); - unsigned long idx = 0; - - if (io_pageset == 1 && (!pageset1_copy_map || - !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) { - int is_high = PageHighMem(final_page); - copy_page = copy_page_from_orig_page(is_high ? 
(void *) write_pfn : final_page, is_high); - } - - if (!memory_bm_test_bit(io_map, cpu, write_pfn)) { - int test = !memory_bm_test_bit(io_map, cpu, write_pfn); - toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test); - mutex_lock(&io_mutex); - idx = atomic_add_return(1, &io_count); - mutex_unlock(&io_mutex); - return; - } - - virt = kmap(copy_page); - buffer_virt = kmap(buffer); - was_present = kernel_page_present(copy_page); - if (!was_present) - kernel_map_pages(copy_page, 1, 1); - memcpy(virt, buffer_virt, PAGE_SIZE); - if (!was_present) - kernel_map_pages(copy_page, 1, 0); - kunmap(copy_page); - kunmap(buffer); - memory_bm_clear_bit(io_map, cpu, write_pfn); - TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset); -} - -static unsigned long status_update(int writing, unsigned long done, - unsigned long ticks) -{ - int cs_index = writing ? 0 : 1; - unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; - unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); - unsigned long pgs_per_s, estimate = 0, pages_left; - - if (msec) { - pages_left = io_barmax - done; - pgs_per_s = 1000 * done / msec; - if (pgs_per_s) - estimate = DIV_ROUND_UP(pages_left, pgs_per_s); - } - - if (estimate && ticks > HZ / 2) - return toi_update_status(done, io_barmax, - " %d/%d MB (%lu sec left)", - MB(done+1), MB(io_barmax), estimate); - - return toi_update_status(done, io_barmax, " %d/%d MB", - MB(done+1), MB(io_barmax)); -} - -/** - * worker_rw_loop - main loop to read/write pages - * - * The main I/O loop for reading or writing pages. The io_map bitmap is used to - * track the pages to read/write. - * If we are reading, the pages are loaded to their final (mapped) pfn. - * Data is non zero iff this is a thread started via start_other_threads. - * In that case, we stay in here until told to quit. - **/ -static int worker_rw_loop(void *data) -{ - unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, - jif_index = 1, start_time = jiffies, thread_num; - int result = 0, my_io_index = 0, last_worker; - struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); - cpumask_var_t orig_mask; - - if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { - printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); - result = -ENOMEM; - goto out; - } - - cpumask_copy(orig_mask, tsk_cpus_allowed(current)); - - current->flags |= PF_NOFREEZE; - -top: - mutex_lock(&io_mutex); - thread_num = atomic_read(&toi_io_workers); - - cpumask_copy(tsk_cpus_allowed(current), orig_mask); - schedule(); - - atomic_inc(&toi_io_workers); - - while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && - !(io_write && test_result_state(TOI_ABORTED)) && - toi_worker_command == TOI_IO_WORKER_RUN) { - if (!thread_num && jiffies > next_jiffies) { - next_jiffies += HZ / 4; - if (toiActiveAllocator->update_throughput_throttle) - toiActiveAllocator->update_throughput_throttle( - jif_index); - jif_index++; - } - - /* - * What page to use? If reading, don't know yet which page's - * data will be read, so always use the buffer. If writing, - * use the copy (Pageset1) or original page (Pageset2), but - * always write the pfn of the original page. - */ - if (io_write) - result = write_next_page(&data_pfn, &my_io_index, - &write_pfn); - else /* Reading */ - result = read_next_page(&my_io_index, &write_pfn, - buffer); - - if (result) { - mutex_lock(&io_mutex); - /* Nothing to do? 
*/ - if (result == -ENODATA) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Thread %d has no more work.", - smp_processor_id()); - break; - } - - io_result = result; - - if (io_write) { - printk(KERN_INFO "Write chunk returned %d.\n", - result); - abort_hibernate(TOI_FAILED_IO, - "Failed to write a chunk of the " - "image."); - break; - } - - if (io_pageset == 1) { - printk(KERN_ERR "\nBreaking out of I/O loop " - "because of result code %d.\n", result); - break; - } - panic("Read chunk returned (%d)", result); - } - - /* - * Discard reads of resaved pages while reading ps2 - * and unwanted pages while rereading ps2 when aborting. - */ - if (!io_write) { - if (!PageResave(pfn_to_page(write_pfn))) - use_read_page(write_pfn, buffer); - else { - mutex_lock(&io_mutex); - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Resaved %ld.", write_pfn); - atomic_inc(&io_count); - mutex_unlock(&io_mutex); - } - } - - if (!thread_num) { - if(my_io_index + io_base > io_nextupdate) - io_nextupdate = status_update(io_write, - my_io_index + io_base, - jiffies - start_time); - - if (my_io_index > io_pc) { - printk(KERN_CONT "...%d%%", 20 * io_pc_step); - io_pc_step++; - io_pc = io_finish_at * io_pc_step / 5; - } - } - - toi_cond_pause(0, NULL); - - /* - * Subtle: If there's less I/O still to be done than threads - * running, quit. This stops us doing I/O beyond the end of - * the image when reading. - * - * Possible race condition. Two threads could do the test at - * the same time; one should exit and one should continue. - * Therefore we take the mutex before comparing and exiting. - */ - - mutex_lock(&io_mutex); - } - - last_worker = atomic_dec_and_test(&toi_io_workers); - toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); - mutex_unlock(&io_mutex); - - if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { - /* Were we the last thread and we're using a flusher thread? */ - if (last_worker && using_flusher) { - toiActiveAllocator->finish_all_io(); - } - /* First, if we're doing I/O, wait for it to finish */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); - /* Then wait to be told what to do next */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); - if (toi_worker_command == TOI_IO_WORKER_RUN) - goto top; - } - - if (thread_num) - atomic_dec(&toi_num_other_threads); - -out: - toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); - toi__free_page(28, buffer); - free_cpumask_var(orig_mask); - - return result; -} - -int toi_start_other_threads(void) -{ - int cpu; - struct task_struct *p; - int to_start = (toi_max_workers ? 
toi_max_workers : num_online_cpus()) - 1; - unsigned long num_started = 0; - - if (test_action_state(TOI_NO_MULTITHREADED_IO)) - return 0; - - toi_worker_command = TOI_IO_WORKER_STOP; - - for_each_online_cpu(cpu) { - if (num_started == to_start) - break; - - if (cpu == smp_processor_id()) - continue; - - p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, - cpu_to_node(cpu), "ktoi_io/%d", cpu); - if (IS_ERR(p)) { - printk(KERN_ERR "ktoi_io for %i failed\n", cpu); - continue; - } - kthread_bind(p, cpu); - p->flags |= PF_MEMALLOC; - wake_up_process(p); - num_started++; - atomic_inc(&toi_num_other_threads); - } - - toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); - return num_started; -} - -void toi_stop_other_threads(void) -{ - toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); - toi_worker_command = TOI_IO_WORKER_EXIT; - wake_up(&toi_worker_wait_queue); -} - -/** - * do_rw_loop - main highlevel function for reading or writing pages - * - * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. - **/ -static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, - int base, int barmax, int pageset) -{ - int index = 0, cpu, result = 0, workers_started; - unsigned long pfn, next; - - first_filter = toi_get_next_filter(NULL); - - if (!finish_at) - return 0; - - io_write = write; - io_finish_at = finish_at; - io_base = base; - io_barmax = barmax; - io_pageset = pageset; - io_index = 0; - io_pc = io_finish_at / 5; - io_pc_step = 1; - io_result = 0; - io_nextupdate = base + 1; - toi_bio_queue_flusher_should_finish = 0; - - for_each_online_cpu(cpu) { - per_cpu(last_sought, cpu) = NULL; - per_cpu(last_low_page, cpu) = NULL; - per_cpu(last_high_page, cpu) = NULL; - } - - /* Ensure all bits clear */ - memory_bm_clear(io_map); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - - BUG_ON(next != BM_END_OF_MAP); - - /* Set the bits for the pages to write */ - memory_bm_position_reset(pageflags); - - pfn = memory_bm_next_pfn(pageflags, 0); - toi_trace_index++; - - while (pfn != BM_END_OF_MAP && index < finish_at) { - TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at); - memory_bm_set_bit(io_map, 0, pfn); - pfn = memory_bm_next_pfn(pageflags, 0); - index++; - } - - BUG_ON(next != BM_END_OF_MAP || index < finish_at); - - memory_bm_position_reset(io_map); - toi_trace_index++; - - atomic_set(&io_count, finish_at); - - memory_bm_position_reset(pageset1_map); - - mutex_lock(&io_mutex); - - clear_toi_state(TOI_IO_STOPPED); - - using_flusher = (atomic_read(&toi_num_other_threads) && - toiActiveAllocator->io_flusher && - !test_action_state(TOI_NO_FLUSHER_THREAD)); - - workers_started = atomic_read(&toi_num_other_threads); - - memory_bm_position_reset(io_map); - memory_bm_position_reset(pageset1_copy_map); - - toi_worker_command = TOI_IO_WORKER_RUN; - wake_up(&toi_worker_wait_queue); - - mutex_unlock(&io_mutex); - - if (using_flusher) - result = toiActiveAllocator->io_flusher(write); - else - worker_rw_loop(NULL); - - while (atomic_read(&toi_io_workers)) - schedule(); - - printk(KERN_CONT "\n"); - - toi_worker_command = TOI_IO_WORKER_STOP; - wake_up(&toi_worker_wait_queue); - - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - if (!atomic_read(&toi_io_workers)) { - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - set_toi_state(TOI_IO_STOPPED); - - if (!io_result && !result && !test_result_state(TOI_ABORTED)) { - unsigned long 
next; - - toi_update_status(io_base + io_finish_at, io_barmax, - " %d/%d MB ", - MB(io_base + io_finish_at), MB(io_barmax)); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - if (next != BM_END_OF_MAP) { - printk(KERN_INFO "Finished I/O loop but still work to " - "do?\nFinish at = %d. io_count = %d.\n", - finish_at, atomic_read(&io_count)); - printk(KERN_INFO "I/O bitmap still records work to do." - "%ld.\n", next); - BUG(); - do { - cpu_relax(); - } while (0); - } - } - - return io_result ? io_result : result; -} - -/** - * write_pageset - write a pageset to disk. - * @pagedir: Which pagedir to write. - * - * Returns: - * Zero on success or -1 on failure. - **/ -int write_pageset(struct pagedir *pagedir) -{ - int finish_at, base = 0; - int barmax = pagedir1.size + pagedir2.size; - long error = 0; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - /* - * Even if there is nothing to read or write, the allocator - * may need the init/cleanup for it's housekeeping. (eg: - * Pageset1 may start where pageset2 ends when writing). - */ - finish_at = pagedir->size; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Writing kernel & process data..."); - base = pagedir2.size; - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - pageflags = pageset1_map; - else - pageflags = pageset1_copy_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(WRITE, pagedir->id)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise modules for writing."); - error = 1; - } - - if (!error) - error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(WRITE) && !error) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after writing."); - error = 1; - } - - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[0][0] += finish_at, - toi_bkd.toi_io_time[0][1] += (end_time - start_time); - } - - return error; -} - -/** - * read_pageset - highlevel function to read a pageset from disk - * @pagedir: pageset to read - * @overwrittenpagesonly: Whether to read the whole pageset or - * only part of it. - * - * Returns: - * Zero on success or -1 on failure. 
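The heart of worker_rw_loop() above is the lock-free claiming of work: each thread takes a page index by atomically decrementing io_count, with no lock held around the claim itself. A standalone userspace sketch of that arithmetic, using pthreads and C11 atomics as stand-ins for the kernel primitives (FINISH_AT and the two-thread setup are invented):

#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>

#define FINISH_AT 10

static atomic_int io_count = FINISH_AT;

static void *worker(void *arg)
{
        (void)arg;
        for (;;) {
                /* atomic_fetch_sub returns the old value; the kernel's
                 * atomic_sub_return returns the new one, hence the -1 */
                int left = atomic_fetch_sub(&io_count, 1);
                int my_io_index;

                if (left <= 0)
                        break;          /* overshot the end: no work left */
                my_io_index = FINISH_AT - (left - 1);
                printf("claimed index %d\n", my_io_index);
        }
        return NULL;
}

int main(void)
{
        pthread_t t[2];
        int i;

        for (i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        return 0;
}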
- **/ -static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) -{ - int result = 0, base = 0; - int finish_at = pagedir->size; - int barmax = pagedir1.size + pagedir2.size; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Reading kernel & process data..."); - pageflags = pageset1_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); - if (overwrittenpagesonly) { - barmax = min(pagedir1.size, pagedir2.size); - finish_at = min(pagedir1.size, pagedir2.size); - } else - base = pagedir1.size; - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(READ, pagedir->id)) { - toiActiveAllocator->remove_image(); - result = 1; - } else - result = do_rw_loop(READ, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(READ) && !result) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after reading."); - result = 1; - } - - /* Statistics */ - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[1][0] += finish_at, - toi_bkd.toi_io_time[1][1] += (end_time - start_time); - } - - return result; -} - -/** - * write_module_configs - store the modules configuration - * - * The configuration for each module is stored in the image header. - * Returns: Int - * Zero on success, Error value otherwise. - **/ -static int write_module_configs(void) -{ - struct toi_module_ops *this_module; - char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); - int len, index = 1; - struct toi_module_header toi_module_header; - - if (!buffer) { - printk(KERN_INFO "Failed to allocate a buffer for saving " - "module configuration info.\n"); - return -ENOMEM; - } - - /* - * We have to know which data goes with which module, so we at - * least write a length of zero for a module. Note that we are - * also assuming every module's config data takes <= PAGE_SIZE. - */ - - /* For each module (in registration order) */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->storage_needed || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - - /* Get the data from the module */ - len = 0; - if (this_module->save_config_info) - len = this_module->save_config_info(buffer); - - /* Save the details of the module */ - toi_module_header.enabled = this_module->enabled; - toi_module_header.type = this_module->type; - toi_module_header.index = index++; - strncpy(toi_module_header.name, this_module->name, - sizeof(toi_module_header.name)); - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - /* Save the size of the data and any data returned */ - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &len, sizeof(int)); - if (len) - toiActiveAllocator->rw_header_chunk( - WRITE, this_module, buffer, len); - } - - /* Write a blank header to terminate the list */ - toi_module_header.name[0] = '\0'; - toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_module_header, sizeof(toi_module_header)); - - toi_free_page(22, (unsigned long) buffer); - return 0; -} - -/** - * read_one_module_config - read and configure one module - * - * Read the configuration for one module, and configure the module - * to match if it is loaded. - * - * Returns: Int - * Zero on success, Error value otherwise. 
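write_module_configs() above serialises one [module header][length][payload] record per module and terminates the stream with a blank header, and read_module_configs() walks the same stream back. A minimal userspace sketch of that record format, with a shortened stand-in for struct toi_module_header and an invented "compressor" module:

#include <stdio.h>
#include <string.h>

struct rec_header { char name[30]; int data_length; };

/* Append one [header][len][payload] record, as write_module_configs() does */
static char *put_record(char *p, const char *name, const void *data, int len)
{
        struct rec_header h = { .data_length = len };

        strncpy(h.name, name, sizeof(h.name) - 1);
        memcpy(p, &h, sizeof(h)); p += sizeof(h);
        memcpy(p, &len, sizeof(len)); p += sizeof(len);
        memcpy(p, data, len); p += len;
        return p;
}

int main(void)
{
        char buf[256], *p = buf;
        struct rec_header end = { .name = "" };

        p = put_record(p, "compressor", "lzo", 4);
        memcpy(p, &end, sizeof(end));   /* blank header terminates the list */

        /* Walk the stream back, as the read side does */
        for (p = buf;;) {
                struct rec_header h; int len;

                memcpy(&h, p, sizeof(h)); p += sizeof(h);
                if (!h.name[0])
                        break;
                memcpy(&len, p, sizeof(len)); p += sizeof(len);
                printf("%s: %d byte(s): %s\n", h.name, len, p);
                p += len;
        }
        return 0;
}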
- **/ -static int read_one_module_config(struct toi_module_header *header) -{ - struct toi_module_ops *this_module; - int result, len; - char *buffer; - - /* Find the module */ - this_module = toi_find_module_given_name(header->name); - - if (!this_module) { - if (header->enabled) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "It looks like we need module %s for reading " - "the image but it hasn't been registered.\n", - header->name); - if (!(test_toi_state(TOI_CONTINUE_REQ))) - return -EINVAL; - } else - printk(KERN_INFO "Module %s configuration data found, " - "but the module hasn't registered. Looks like " - "it was disabled, so we're ignoring its data.", - header->name); - } - - /* Get the length of the data (if any) */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, - sizeof(int)); - if (result) { - printk(KERN_ERR "Failed to read the length of the module %s's" - " configuration data.\n", - header->name); - return -EINVAL; - } - - /* Read any data and pass to the module (if we found one) */ - if (!len) - return 0; - - buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); - - if (!buffer) { - printk(KERN_ERR "Failed to allocate a buffer for reloading " - "module configuration info.\n"); - return -ENOMEM; - } - - toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); - - if (!this_module) - goto out; - - if (!this_module->save_config_info) - printk(KERN_ERR "Huh? Module %s appears to have a " - "save_config_info, but not a load_config_info " - "function!\n", this_module->name); - else - this_module->load_config_info(buffer, len); - - /* - * Now move this module to the tail of its lists. This will put it in - * order. Any new modules will end up at the top of the lists. They - * should have been set to disabled when loaded (people will - * normally not edit an initrd to load a new module and then hibernate - * without using it!). - */ - - toi_move_module_tail(this_module); - - this_module->enabled = header->enabled; - -out: - toi_free_page(23, (unsigned long) buffer); - return 0; -} - -/** - * read_module_configs - reload module configurations from the image header. - * - * Returns: Int - * Zero on success or an error code. - **/ -static int read_module_configs(void) -{ - int result = 0; - struct toi_module_header toi_module_header; - struct toi_module_ops *this_module; - - /* All modules are initially disabled. That way, if we have a module - * loaded now that wasn't loaded when we hibernated, it won't be used - * in trying to read the data. - */ - list_for_each_entry(this_module, &toi_modules, module_list) - this_module->enabled = 0; - - /* Get the first module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - if (result) { - printk(KERN_ERR "Failed to read the next module header.\n"); - return -EINVAL; - } - - /* For each module (in registration order) */ - while (toi_module_header.name[0]) { - result = read_one_module_config(&toi_module_header); - - if (result) - return -EINVAL; - - /* Get the next module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - if (result) { - printk(KERN_ERR "Failed to read the next module " - "header.\n"); - return -EINVAL; - } - } - - return 0; -} - -static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) -{ - return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 
0 : 1; -} - -int fs_info_space_needed(int reset) -{ - static int last_result = 0; - const struct super_block *sb; - int result = sizeof(int); - - if (!last_result || reset) { - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - result += 16 + sizeof(dev_t) + sizeof(int) + - fs->last_mount_size; - free_fs_info(fs); - } - last_result = result; - } - return result; -} - -static int fs_info_num_to_save(void) -{ - const struct super_block *sb; - int to_save = 0; - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - to_save++; - free_fs_info(fs); - } - - return to_save; -} - -static int fs_info_save(void) -{ - const struct super_block *sb; - int to_save = fs_info_num_to_save(); - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" - " to save."); - return -EIO; - } - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) { - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - &fs->uuid[0], 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->dev_t, sizeof(dev_t))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write dev_t."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->last_mount_size, sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write last mount length."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - fs->last_mount, fs->last_mount_size)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - } - free_fs_info(fs); - } - return 0; -} - -static int fs_info_load_and_check_one(void) -{ - char uuid[16], *last_mount; - int result = 0, ln; - dev_t dev_t; - struct block_device *dev; - struct fs_info *fs_info, seek; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); - return -EIO; - } - - read_if_version(3, dev_t, "uuid dev_t field", return -EIO); - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount size."); - return -EIO; - } - - last_mount = kzalloc(ln, GFP_KERNEL); - - if (!last_mount) - return -ENOMEM; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount timestamp."); - result = -EIO; - goto out_lmt; - } - - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = dev_t; - seek.last_mount_size = ln; - seek.last_mount = last_mount; - dev_t = blk_lookup_fs_info(&seek); - if (!dev_t) - goto out_lmt; - - dev = toi_open_by_devnum(dev_t); - - fs_info = fs_info_from_block_dev(dev); - if (fs_info && !IS_ERR(fs_info)) { - if (ln != fs_info->last_mount_size) { - printk(KERN_EMERG "Found matching uuid but last mount " - "time lengths differ?! 
" - "(%d vs %d).\n", ln, - fs_info->last_mount_size); - result = -EINVAL; - } else { - char buf[BDEVNAME_SIZE]; - result = !!memcmp(fs_info->last_mount, last_mount, ln); - if (result) - printk(KERN_EMERG "Last mount time for %s has " - "changed!\n", bdevname(dev, buf)); - } - } - toi_close_bdev(dev); - free_fs_info(fs_info); -out_lmt: - kfree(last_mount); - return result; -} - -static int fs_info_load_and_check(void) -{ - int to_do, result = 0; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " - "to load."); - return -EIO; - } - - while(to_do--) - result |= fs_info_load_and_check_one(); - - return result; -} - -/** - * write_image_header - write the image header after write the image proper - * - * Returns: Int - * Zero on success, error value otherwise. - **/ -int write_image_header(void) -{ - int ret; - int total = pagedir1.size + pagedir2.size+2; - char *header_buffer = NULL; - - /* Now prepare to write the header */ - ret = toiActiveAllocator->write_header_init(); - if (ret) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Active allocator's write_header_init" - " function failed."); - goto write_image_header_abort; - } - - /* Get a buffer */ - header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP); - if (!header_buffer) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Out of memory when trying to get page for header!"); - goto write_image_header_abort; - } - - /* Write hibernate header */ - if (fill_toi_header((struct toi_header *) header_buffer)) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to fill header information!"); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - header_buffer, sizeof(struct toi_header))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to write header info."); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_max_workers, sizeof(toi_max_workers))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to number of workers to use."); - goto write_image_header_abort; - } - - /* Write filesystem info */ - if (fs_info_save()) - goto write_image_header_abort; - - /* Write module configurations */ - ret = write_module_configs(); - if (ret) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write module configs."); - goto write_image_header_abort; - } - - if (memory_bm_write(pageset1_map, - toiActiveAllocator->rw_header_chunk)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write bitmaps."); - goto write_image_header_abort; - } - - /* Flush data and let allocator cleanup */ - if (toiActiveAllocator->write_header_cleanup()) { - abort_hibernate(TOI_FAILED_IO, - "Failed to cleanup writing header."); - goto write_image_header_abort_no_cleanup; - } - - if (test_result_state(TOI_ABORTED)) - goto write_image_header_abort_no_cleanup; - - toi_update_status(total, total, NULL); - -out: - if (header_buffer) - toi_free_page(24, (unsigned long) header_buffer); - return ret; - -write_image_header_abort: - toiActiveAllocator->write_header_cleanup(); -write_image_header_abort_no_cleanup: - ret = -1; - goto out; -} - -/** - * sanity_check - check the header - * @sh: the header which was saved at hibernate time. - * - * Perform a few checks, seeking to ensure that the kernel being - * booted matches the one hibernated. They need to match so we can - * be _sure_ things will work. It is not absolutely impossible for - * resuming from a different kernel to work, just not assured. 
- **/ -static char *sanity_check(struct toi_header *sh) -{ - char *reason = check_image_kernel((struct swsusp_info *) sh); - - if (reason) - return reason; - - if (!test_action_state(TOI_IGNORE_ROOTFS)) { - const struct super_block *sb; - list_for_each_entry(sb, &super_blocks, s_list) { - if ((!(sb->s_flags & MS_RDONLY)) && - (sb->s_type->fs_flags & FS_REQUIRES_DEV)) - return "Device backed fs has been mounted " - "rw prior to resume or initrd/ramfs " - "is mounted rw."; - } - } - - return NULL; -} - -static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); - -#define FREEZE_IN_PROGRESS (~0) - -static int freeze_result; - -static void do_freeze(struct work_struct *dummy) -{ - freeze_result = freeze_processes(); - wake_up(&freeze_wait); - trap_non_toi_io = 1; -} - -static DECLARE_WORK(freeze_work, do_freeze); - -/** - * __read_pageset1 - test for the existence of an image and attempt to load it - * - * Returns: Int - * Zero if image found and pageset1 successfully loaded. - * Error if no image found or loaded. - **/ -static int __read_pageset1(void) -{ - int i, result = 0; - char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), - *sanity_error = NULL; - struct toi_header *toi_header; - - if (!header_buffer) { - printk(KERN_INFO "Unable to allocate a page for reading the " - "signature.\n"); - return -ENOMEM; - } - - /* Check for an image */ - result = toiActiveAllocator->image_exists(1); - if (result == 3) { - result = -ENODATA; - toi_early_boot_message(1, 0, "The signature from an older " - "version of TuxOnIce has been detected."); - goto out_remove_image; - } - - if (result != 1) { - result = -ENODATA; - noresume_reset_modules(); - printk(KERN_INFO "TuxOnIce: No image found.\n"); - goto out; - } - - /* - * Prepare the active allocator for reading the image header. The - * activate allocator might read its own configuration. - * - * NB: This call may never return because there might be a signature - * for a different image such that we warn the user and they choose - * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the - * location of the image might be unavailable if it was stored on a - * network connection). - */ - - result = toiActiveAllocator->read_header_init(); - if (result) { - printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " - "image header.\n"); - goto out_remove_image; - } - - /* Check for noresume command line option */ - if (test_toi_state(TOI_NORESUME_SPECIFIED)) { - printk(KERN_INFO "TuxOnIce: Noresume on command line. 
Removed " - "image.\n"); - goto out_remove_image; - } - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) { - toi_early_boot_message(1, 0, NULL); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Tried to resume before: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - clear_toi_state(TOI_CONTINUE_REQ); - - toi_image_header_version = toiActiveAllocator->get_header_version(); - - if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { - toi_early_boot_message(1, 0, image_version_error); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Header version too new: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - /* Read hibernate header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - header_buffer, sizeof(struct toi_header)); - if (result < 0) { - printk(KERN_ERR "TuxOnIce: Failed to read the image " - "signature.\n"); - goto out_remove_image; - } - - toi_header = (struct toi_header *) header_buffer; - - /* - * NB: This call may also result in a reboot rather than returning. - */ - - sanity_error = sanity_check(toi_header); - if (sanity_error) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - sanity_error); - printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); - goto out_remove_image; - } - - /* - * We have an image and it looks like it will load okay. - * - * Get metadata from header. Don't override commandline parameters. - * - * We don't need to save the image size limit because it's not used - * during resume and will be restored with the image anyway. - */ - - memcpy((char *) &pagedir1, - (char *) &toi_header->pagedir, sizeof(pagedir1)); - toi_result = toi_header->param0; - if (!toi_bkd.toi_debug_state) { - toi_bkd.toi_action = - (toi_header->param1 & ~toi_bootflags_mask) | - (toi_bkd.toi_action & toi_bootflags_mask); - toi_bkd.toi_debug_state = toi_header->param2; - toi_bkd.toi_default_console_level = toi_header->param3; - } - clear_toi_state(TOI_IGNORE_LOGLEVEL); - pagedir2.size = toi_header->pageset_2_size; - for (i = 0; i < 4; i++) - toi_bkd.toi_io_time[i/2][i%2] = - toi_header->io_time[i/2][i%2]; - - set_toi_state(TOI_BOOT_KERNEL); - boot_kernel_data_buffer = toi_header->bkd; - - read_if_version(1, toi_max_workers, "TuxOnIce max workers", - goto out_remove_image); - - /* Read filesystem info */ - if (fs_info_load_and_check()) { - printk(KERN_EMERG "TuxOnIce: File system mount time checks " - "failed. Refusing to corrupt your filesystems!\n"); - goto out_remove_image; - } - - /* Read module configurations */ - result = read_module_configs(); - if (result) { - pagedir1.size = 0; - pagedir2.size = 0; - printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " - "configurations.\n"); - clear_action_state(TOI_KEEP_IMAGE); - goto out_remove_image; - } - - toi_prepare_console(); - - set_toi_state(TOI_NOW_RESUMING); - - result = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (result) - goto out_notifier_call_chain;; - - if (usermodehelper_disable()) - goto out_enable_usermodehelper; - - current->flags |= PF_NOFREEZE; - freeze_result = FREEZE_IN_PROGRESS; - - schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); - - toi_cond_pause(1, "About to read original pageset1 locations."); - - /* - * See _toi_rw_header_chunk in tuxonice_bio.c: - * Initialize pageset1_map by reading the map from the image. 
- */ - if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk)) - goto out_thaw; - - /* - * See toi_rw_cleanup in tuxonice_bio.c: - * Clean up after reading the header. - */ - result = toiActiveAllocator->read_header_cleanup(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the " - "image header.\n"); - goto out_thaw; - } - - toi_cond_pause(1, "About to read pagedir."); - - /* - * Get the addresses of pages into which we will load the kernel to - * be copied back and check if they conflict with the ones we are using. - */ - if (toi_get_pageset1_load_addresses()) { - printk(KERN_INFO "TuxOnIce: Failed to get load addresses for " - "pageset1.\n"); - goto out_thaw; - } - - /* Read the original kernel back */ - toi_cond_pause(1, "About to read pageset 1."); - - /* Given the pagemap, read back the data from disk */ - if (read_pageset(&pagedir1, 0)) { - toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1."); - result = -EIO; - goto out_thaw; - } - - toi_cond_pause(1, "About to restore original kernel."); - result = 0; - - if (!toi_keeping_image && - toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(1); - - wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); -out: - current->flags &= ~PF_NOFREEZE; - toi_free_page(25, (unsigned long) header_buffer); - return result; - -out_thaw: - wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); - trap_non_toi_io = 0; - thaw_processes(); -out_enable_usermodehelper: - usermodehelper_enable(); -out_notifier_call_chain: - pm_notifier_call_chain(PM_POST_RESTORE); - toi_cleanup_console(); -out_remove_image: - result = -EINVAL; - if (!toi_keeping_image) - toiActiveAllocator->remove_image(); - toiActiveAllocator->read_header_cleanup(); - noresume_reset_modules(); - goto out; -} - -/** - * read_pageset1 - highlevel function to read the saved pages - * - * Attempt to read the header and pageset1 of a hibernate image. - * Handle the outcome, complaining where appropriate. - **/ -int read_pageset1(void) -{ - int error; - - error = __read_pageset1(); - - if (error && error != -ENODATA && error != -EINVAL && - !test_result_state(TOI_ABORTED)) - abort_hibernate(TOI_IMAGE_ERROR, - "TuxOnIce: Error %d resuming\n", error); - - return error; -} - -/** - * get_have_image_data - check the image header - **/ -static char *get_have_image_data(void) -{ - char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP); - struct toi_header *toi_header; - - if (!output_buffer) { - printk(KERN_INFO "Output buffer null.\n"); - return NULL; - } - - /* Check for an image */ - if (!toiActiveAllocator->image_exists(1) || - toiActiveAllocator->read_header_init() || - toiActiveAllocator->rw_header_chunk(READ, NULL, - output_buffer, sizeof(struct toi_header))) { - sprintf(output_buffer, "0\n"); - /* - * From an initrd/ramfs, catting have_image and - * getting a result of 0 is sufficient. - */ - clear_toi_state(TOI_BOOT_TIME); - goto out; - } - - toi_header = (struct toi_header *) output_buffer; - - sprintf(output_buffer, "1\n%s\n%s\n", - toi_header->uts.machine, - toi_header->uts.version); - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) - strcat(output_buffer, "Resumed before.\n"); - -out: - noresume_reset_modules(); - return output_buffer; -} - -/** - * read_pageset2 - read second part of the image - * @overwrittenpagesonly: Read only pages which would have been - * verwritten by pageset1? 
- * - * Read in part or all of pageset2 of an image, depending upon - * whether we are hibernating and have only overwritten a portion - * with pageset1 pages, or are resuming and need to read them - * all. - * - * Returns: Int - * Zero if no error, otherwise the error value. - **/ -int read_pageset2(int overwrittenpagesonly) -{ - int result = 0; - - if (!pagedir2.size) - return 0; - - result = read_pageset(&pagedir2, overwrittenpagesonly); - - toi_cond_pause(1, "Pagedir 2 read."); - - return result; -} - -/** - * image_exists_read - has an image been found? - * @page: Output buffer - * - * Store 0 or 1 in page, depending on whether an image is found. - * Incoming buffer is PAGE_SIZE and result is guaranteed - * to be far less than that, so we don't worry about - * overflow. - **/ -int image_exists_read(const char *page, int count) -{ - int len = 0; - char *result; - - if (toi_activate_storage(0)) - return count; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(0); - - if (!toiActiveAllocator) { - len = sprintf((char *) page, "-1\n"); - } else { - result = get_have_image_data(); - if (result) { - len = sprintf((char *) page, "%s", result); - toi_free_page(26, (unsigned long) result); - } - } - - toi_deactivate_storage(0); - - return len; -} - -/** - * image_exists_write - invalidate an image if one exists - **/ -int image_exists_write(const char *buffer, int count) -{ - if (toi_activate_storage(0)) - return count; - - if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) - toiActiveAllocator->remove_image(); - - toi_deactivate_storage(0); - - clear_result_state(TOI_KEPT_IMAGE); - - return count; -} diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h deleted file mode 100644 index 7d4d83f40..000000000 --- a/kernel/power/tuxonice_io.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * kernel/power/tuxonice_io.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. 
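image_exists_read() above follows the usual shape of these handlers: format a small, bounded reply into the caller's page buffer and return its length, with the magnitude guaranteed to stay far below PAGE_SIZE. A userspace sketch of the same shape (have_image_read() and its arguments are invented):

#include <stdio.h>

#define PAGE_SIZE 4096

/* Format a bounded answer into the caller's page, return its length */
static int have_image_read(char *page, int have_image,
                           const char *machine, const char *version)
{
        if (have_image < 0)
                return sprintf(page, "-1\n");   /* no active allocator */
        if (!have_image)
                return sprintf(page, "0\n");
        return snprintf(page, PAGE_SIZE, "1\n%s\n%s\n", machine, version);
}

int main(void)
{
        char page[PAGE_SIZE];
        int len = have_image_read(page, 1, "x86_64", "#1 SMP");

        fwrite(page, 1, len, stdout);
        return 0;
}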
- *
- */
-
-#include <linux/utsname.h>
-#include "tuxonice_pagedir.h"
-
-/* Non-module data saved in our image header */
-struct toi_header {
-	/*
-	 * Mirror struct swsusp_info, but without
-	 * the page aligned attribute
-	 */
-	struct new_utsname uts;
-	u32 version_code;
-	unsigned long num_physpages;
-	int cpus;
-	unsigned long image_pages;
-	unsigned long pages;
-	unsigned long size;
-
-	/* Our own data */
-	unsigned long orig_mem_free;
-	int page_size;
-	int pageset_2_size;
-	int param0;
-	int param1;
-	int param2;
-	int param3;
-	int progress0;
-	int progress1;
-	int progress2;
-	int progress3;
-	int io_time[2][2];
-	struct pagedir pagedir;
-	dev_t root_fs;
-	unsigned long bkd; /* Boot kernel data locn */
-};
-
-extern int write_pageset(struct pagedir *pagedir);
-extern int write_image_header(void);
-extern int read_pageset1(void);
-extern int read_pageset2(int overwrittenpagesonly);
-
-extern int toi_attempt_to_parse_resume_device(int quiet);
-extern void attempt_to_parse_resume_device2(void);
-extern void attempt_to_parse_alt_resume_param(void);
-int image_exists_read(const char *page, int count);
-int image_exists_write(const char *buffer, int count);
-extern void save_restore_alt_param(int replace, int quiet);
-extern atomic_t toi_io_workers;
-
-/* Args to save_restore_alt_param */
-#define RESTORE 0
-#define SAVE 1
-
-#define NOQUIET 0
-#define QUIET 1
-
-extern wait_queue_head_t toi_io_queue_flusher;
-extern int toi_bio_queue_flusher_should_finish;
-
-int fs_info_space_needed(int reset);
-
-extern int toi_max_workers;
diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
deleted file mode 100644
index a203c8fb9..000000000
--- a/kernel/power/tuxonice_modules.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- * kernel/power/tuxonice_modules.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_ui.h"
-
-LIST_HEAD(toi_filters);
-LIST_HEAD(toiAllocators);
-
-LIST_HEAD(toi_modules);
-
-struct toi_module_ops *toiActiveAllocator;
-
-static int toi_num_filters;
-int toiNumAllocators, toi_num_modules;
-
-/*
- * toi_header_storage_for_modules
- *
- * Returns the amount of space needed to store configuration
- * data needed by the modules prior to copying back the original
- * kernel. We can exclude data for pageset2 because it will be
- * available anyway once the kernel is copied back.
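The comment in struct toi_header above notes that it mirrors struct swsusp_info minus the page-aligned attribute, so the header packs tightly into the image stream. The effect of that attribute is easy to demonstrate in isolation (the struct names here are invented):

#include <stdio.h>

struct payload { long a; int b; };

/* Page-aligned variant, in the style of the kernel's swsusp_info */
struct aligned_hdr { struct payload p; } __attribute__((aligned(4096)));

/* Unaligned mirror, the approach struct toi_header takes */
struct mirror_hdr { struct payload p; };

int main(void)
{
        printf("aligned: %zu bytes, mirror: %zu bytes\n",
               sizeof(struct aligned_hdr), sizeof(struct mirror_hdr));
        return 0;
}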
- */ -long toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - } - } - - /* One more for the empty terminator */ - return bytes + sizeof(struct toi_module_header); -} - -void print_toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - printk(KERN_DEBUG "Header storage:\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", - this_module->name, - this_module->header_used, this); - } - } - - printk(KERN_DEBUG "+ empty terminator : %zu.\n", - sizeof(struct toi_module_header)); - printk(KERN_DEBUG " ====\n"); - printk(KERN_DEBUG " %zu\n", - bytes + sizeof(struct toi_module_header)); -} - -/* - * toi_memory_for_modules - * - * Returns the amount of memory requested by modules for - * doing their work during the cycle. - */ - -long toi_memory_for_modules(int print_parts) -{ - long bytes = 0, result; - struct toi_module_ops *this_module; - - if (print_parts) - printk(KERN_INFO "Memory for modules:\n===================\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - int this; - if (!this_module->enabled) - continue; - if (this_module->memory_needed) { - this = this_module->memory_needed(); - if (print_parts) - printk(KERN_INFO "%10d bytes (%5ld pages) for " - "module '%s'.\n", this, - DIV_ROUND_UP(this, PAGE_SIZE), - this_module->name); - bytes += this; - } - } - - result = DIV_ROUND_UP(bytes, PAGE_SIZE); - if (print_parts) - printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); - - return result; -} - -/* - * toi_expected_compression_ratio - * - * Returns the compression ratio expected when saving the image. 
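toi_memory_for_modules() above sums per-module byte requests and converts the total to whole pages with DIV_ROUND_UP(). The same arithmetic in a standalone sketch (the request figures are invented):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define PAGE_SIZE 4096

int main(void)
{
        long requests[] = { 6000, 123, 40960 };  /* bytes per module */
        long bytes = 0;
        unsigned i;

        for (i = 0; i < sizeof(requests) / sizeof(requests[0]); i++)
                bytes += requests[i];

        printf(" => %ld bytes, %ld pages.\n", bytes,
               DIV_ROUND_UP(bytes, PAGE_SIZE));
        return 0;
}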
- */ - -int toi_expected_compression_ratio(void) -{ - int ratio = 100; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->expected_compression) - ratio = ratio * this_module->expected_compression() - / 100; - } - - return ratio; -} - -/* toi_find_module_given_dir - * Functionality : Return a module (if found), given a pointer - * to its directory name - */ - -static struct toi_module_ops *toi_find_module_given_dir(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->directory)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* toi_find_module_given_name - * Functionality : Return a module (if found), given a pointer - * to its name - */ - -struct toi_module_ops *toi_find_module_given_name(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->name)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* - * toi_print_module_debug_info - * Functionality : Get debugging info from modules into a buffer. - */ -int toi_print_module_debug_info(char *buffer, int buffer_size) -{ - struct toi_module_ops *this_module; - int len = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->print_debug_info) { - int result; - result = this_module->print_debug_info(buffer + len, - buffer_size - len); - len += result; - } - } - - /* Ensure null terminated */ - buffer[buffer_size] = 0; - - return len; -} - -/* - * toi_register_module - * - * Register a module. - */ -int toi_register_module(struct toi_module_ops *module) -{ - int i; - struct kobject *kobj; - - if (!hibernation_available()) - return -ENODEV; - - module->enabled = 1; - - if (toi_find_module_given_name(module->name)) { - printk(KERN_INFO "TuxOnIce: Trying to load module %s," - " which is already registered.\n", - module->name); - return -EBUSY; - } - - switch (module->type) { - case FILTER_MODULE: - list_add_tail(&module->type_list, &toi_filters); - toi_num_filters++; - break; - case WRITER_MODULE: - list_add_tail(&module->type_list, &toiAllocators); - toiNumAllocators++; - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return -EINVAL; - } - list_add_tail(&module->module_list, &toi_modules); - toi_num_modules++; - - if ((!module->directory && !module->shared_directory) || - !module->sysfs_data || !module->num_sysfs_entries) - return 0; - - /* - * Modules may share a directory, but those with shared_dir - * set must be loaded (via symbol dependencies) after parents - * and unloaded beforehand. 
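toi_expected_compression_ratio() above composes per-module estimates multiplicatively: each enabled filter scales the running percentage. A sketch with invented stage figures:

#include <stdio.h>

int main(void)
{
        /* e.g. compressor expects 50%, another filter expects 90% */
        int stages[] = { 50, 90 };
        int ratio = 100, i;

        for (i = 0; i < 2; i++)
                ratio = ratio * stages[i] / 100;

        printf("expected image size: %d%% of input\n", ratio); /* 45 */
        return 0;
}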
- */ - if (module->shared_directory) { - struct toi_module_ops *shared = - toi_find_module_given_dir(module->shared_directory); - if (!shared) { - printk(KERN_ERR "TuxOnIce: Module %s wants to share " - "%s's directory but %s isn't loaded.\n", - module->name, module->shared_directory, - module->shared_directory); - toi_unregister_module(module); - return -ENODEV; - } - kobj = shared->dir_kobj; - } else { - if (!strncmp(module->directory, "[ROOT]", 6)) - kobj = tuxonice_kobj; - else - kobj = make_toi_sysdir(module->directory); - } - module->dir_kobj = kobj; - for (i = 0; i < module->num_sysfs_entries; i++) { - int result = toi_register_sysfs_file(kobj, - &module->sysfs_data[i]); - if (result) - return result; - } - return 0; -} - -/* - * toi_unregister_module - * - * Remove a module. - */ -void toi_unregister_module(struct toi_module_ops *module) -{ - int i; - - if (module->dir_kobj) - for (i = 0; i < module->num_sysfs_entries; i++) - toi_unregister_sysfs_file(module->dir_kobj, - &module->sysfs_data[i]); - - if (!module->shared_directory && module->directory && - strncmp(module->directory, "[ROOT]", 6)) - remove_toi_sysdir(module->dir_kobj); - - switch (module->type) { - case FILTER_MODULE: - list_del(&module->type_list); - toi_num_filters--; - break; - case WRITER_MODULE: - list_del(&module->type_list); - toiNumAllocators--; - if (toiActiveAllocator == module) { - toiActiveAllocator = NULL; - clear_toi_state(TOI_CAN_RESUME); - clear_toi_state(TOI_CAN_HIBERNATE); - } - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - list_del(&module->module_list); - toi_num_modules--; -} - -/* - * toi_move_module_tail - * - * Rearrange modules when reloading the config. - */ -void toi_move_module_tail(struct toi_module_ops *module) -{ - switch (module->type) { - case FILTER_MODULE: - if (toi_num_filters > 1) - list_move_tail(&module->type_list, &toi_filters); - break; - case WRITER_MODULE: - if (toiNumAllocators > 1) - list_move_tail(&module->type_list, &toiAllocators); - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - if ((toi_num_filters + toiNumAllocators) > 1) - list_move_tail(&module->module_list, &toi_modules); -} - -/* - * toi_initialise_modules - * - * Get ready to do some work! - */ -int toi_initialise_modules(int starting_cycle, int early) -{ - struct toi_module_ops *this_module; - int result; - - list_for_each_entry(this_module, &toi_modules, module_list) { - this_module->header_requested = 0; - this_module->header_used = 0; - if (!this_module->enabled) - continue; - if (this_module->early != early) - continue; - if (this_module->initialise) { - result = this_module->initialise(starting_cycle); - if (result) { - toi_cleanup_modules(starting_cycle); - return result; - } - this_module->initialised = 1; - } - } - - return 0; -} - -/* - * toi_cleanup_modules - * - * Tell modules the work is done. 
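toi_initialise_modules() above unwinds by calling toi_cleanup_modules() as soon as any module's initialise() fails, so earlier modules are never left half set up. The shape of that pattern in a standalone sketch (the module names and init functions are invented):

#include <stdio.h>

struct mod { const char *name; int initialised; int (*init)(void); };

static int ok_init(void)  { return 0; }
static int bad_init(void) { return -1; }

/* Undo every module that made it through init, as the cleanup pass does */
static void cleanup_all(struct mod *m, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (m[i].initialised) {
                        printf("cleanup %s\n", m[i].name);
                        m[i].initialised = 0;
                }
}

static int init_all(struct mod *m, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (m[i].init()) {
                        cleanup_all(m, n);   /* unwind on first failure */
                        return -1;
                }
                m[i].initialised = 1;
        }
        return 0;
}

int main(void)
{
        struct mod mods[] = { {"ui", 0, ok_init}, {"swap", 0, bad_init} };

        printf("init_all: %d\n", init_all(mods, 2));
        return 0;
}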
- */ -void toi_cleanup_modules(int finishing_cycle) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->initialised) - continue; - if (this_module->cleanup) - this_module->cleanup(finishing_cycle); - this_module->initialised = 0; - } -} - -/* - * toi_pre_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->pre_atomic_restore) - this_module->pre_atomic_restore(bkd); - } -} - -/* - * toi_post_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->post_atomic_restore) - this_module->post_atomic_restore(bkd); - } -} - -/* - * toi_get_next_filter - * - * Get the next filter in the pipeline. - */ -struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) -{ - struct toi_module_ops *last_filter = NULL, *this_filter = NULL; - - list_for_each_entry(this_filter, &toi_filters, type_list) { - if (!this_filter->enabled) - continue; - if ((last_filter == filter_sought) || (!filter_sought)) - return this_filter; - last_filter = this_filter; - } - - return toiActiveAllocator; -} - -/** - * toi_show_modules: Printk what support is loaded. - */ -void toi_print_modules(void) -{ - struct toi_module_ops *this_module; - int prev = 0; - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->type == MISC_HIDDEN_MODULE) - continue; - printk("%s %s%s%s", prev ? "," : "", - this_module->enabled ? "" : "[", - this_module->name, - this_module->enabled ? "" : "]"); - prev = 1; - } - - printk(".\n"); -} - -/* toi_get_modules - * - * Take a reference to modules so they can't go away under us. - */ - -int toi_get_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - struct toi_module_ops *this_module2; - - if (try_module_get(this_module->module)) - continue; - - /* Failed! Reverse gets and return error */ - list_for_each_entry(this_module2, &toi_modules, - module_list) { - if (this_module == this_module2) - return -EINVAL; - module_put(this_module2->module); - } - } - return 0; -} - -/* toi_put_modules - * - * Release our references to modules we used. - */ - -void toi_put_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) - module_put(this_module->module); -} diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h deleted file mode 100644 index 44f10abb9..000000000 --- a/kernel/power/tuxonice_modules.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * kernel/power/tuxonice_modules.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations for modules. Modules are additions to - * TuxOnIce that provide facilities such as image compression or - * encryption, backends for storage of the image and user interfaces. 
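toi_get_next_filter() above walks the filter list and returns the first enabled entry after the one passed in (or the first enabled entry when given NULL), falling back to the active allocator at the end of the chain. A sketch of that lookup over an array instead of a kernel list, without the allocator fallback (filter names are invented):

#include <stdio.h>

struct filter { const char *name; int enabled; };

static struct filter *next_filter(struct filter *f, int n,
                                  struct filter *after)
{
        struct filter *last = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (!f[i].enabled)
                        continue;
                if (last == after || !after)
                        return &f[i];
                last = &f[i];
        }
        return NULL;
}

int main(void)
{
        struct filter chain[] = {
                { "checksum", 1 }, { "compress", 0 }, { "crypto", 1 },
        };
        struct filter *first = next_filter(chain, 3, NULL);
        struct filter *second = next_filter(chain, 3, first);

        printf("%s -> %s\n", first->name, second->name);
        return 0;
}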
- * - */ - -#ifndef TOI_MODULES_H -#define TOI_MODULES_H - -/* This is the maximum size we store in the image header for a module name */ -#define TOI_MAX_MODULE_NAME_LENGTH 30 - -struct toi_boot_kernel_data; - -/* Per-module metadata */ -struct toi_module_header { - char name[TOI_MAX_MODULE_NAME_LENGTH]; - int enabled; - int type; - int index; - int data_length; - unsigned long signature; -}; - -enum { - FILTER_MODULE, - WRITER_MODULE, - BIO_ALLOCATOR_MODULE, - MISC_MODULE, - MISC_HIDDEN_MODULE, -}; - -enum { - TOI_ASYNC, - TOI_SYNC -}; - -enum { - TOI_VIRT, - TOI_PAGE, -}; - -#define TOI_MAP(type, addr) \ - (type == TOI_PAGE ? kmap(addr) : addr) - -#define TOI_UNMAP(type, addr) \ - do { \ - if (type == TOI_PAGE) \ - kunmap(addr); \ - } while(0) - -struct toi_module_ops { - /* Functions common to all modules */ - int type; - char *name; - char *directory; - char *shared_directory; - struct kobject *dir_kobj; - struct module *module; - int enabled, early, initialised; - struct list_head module_list; - - /* List of filters or allocators */ - struct list_head list, type_list; - - /* - * Requirements for memory and storage in - * the image header.. - */ - int (*memory_needed) (void); - int (*storage_needed) (void); - - int header_requested, header_used; - - int (*expected_compression) (void); - - /* - * Debug info - */ - int (*print_debug_info) (char *buffer, int size); - int (*save_config_info) (char *buffer); - void (*load_config_info) (char *buffer, int len); - - /* - * Initialise & cleanup - general routines called - * at the start and end of a cycle. - */ - int (*initialise) (int starting_cycle); - void (*cleanup) (int finishing_cycle); - - void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); - void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); - - /* - * Calls for allocating storage (allocators only). - * - * Header space is requested separately and cannot fail, but the - * reservation is only applied when main storage is allocated. - * The header space reservation is thus always set prior to - * requesting the allocation of storage - and prior to querying - * how much storage is available. - */ - - unsigned long (*storage_available) (void); - void (*reserve_header_space) (unsigned long space_requested); - int (*register_storage) (void); - int (*allocate_storage) (unsigned long space_requested); - unsigned long (*storage_allocated) (void); - void (*free_unused_storage) (void); - - /* - * Routines used in image I/O. 
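struct toi_module_ops above is a classic ops table: every module is a struct instance whose behaviour lives in function pointers, and the core only ever calls through the table. A miniature version of the same shape (mini_ops and the null-writer module are invented):

#include <stdio.h>

struct mini_ops {
        const char *name;
        int (*storage_needed)(void);
        int (*write_page)(unsigned long index, void *buf, unsigned int size);
};

static int null_storage_needed(void) { return 0; }

static int null_write_page(unsigned long index, void *buf, unsigned int size)
{
        (void)buf;      /* payload ignored in this sketch */
        printf("pretending to write page %lu (%u bytes)\n", index, size);
        return 0;
}

static struct mini_ops null_writer = {
        .name           = "null-writer",
        .storage_needed = null_storage_needed,
        .write_page     = null_write_page,
};

int main(void)
{
        char page[16] = "hello";

        printf("%s needs %d header bytes\n", null_writer.name,
               null_writer.storage_needed());
        return null_writer.write_page(1, page, sizeof(page));
}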
- */ - int (*rw_init) (int rw, int stream_number); - int (*rw_cleanup) (int rw); - int (*write_page) (unsigned long index, int buf_type, void *buf, - unsigned int buf_size); - int (*read_page) (unsigned long *index, int buf_type, void *buf, - unsigned int *buf_size); - int (*io_flusher) (int rw); - - /* Reset module if image exists but reading aborted */ - void (*noresume_reset) (void); - - /* Read and write the metadata */ - int (*write_header_init) (void); - int (*write_header_cleanup) (void); - - int (*read_header_init) (void); - int (*read_header_cleanup) (void); - - /* To be called after read_header_init */ - int (*get_header_version) (void); - - int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, - char *buffer_start, int buffer_size); - - int (*rw_header_chunk_noreadahead) (int rw, - struct toi_module_ops *owner, char *buffer_start, - int buffer_size); - - /* Attempt to parse an image location */ - int (*parse_sig_location) (char *buffer, int only_writer, int quiet); - - /* Throttle I/O according to throughput */ - void (*update_throughput_throttle) (int jif_index); - - /* Flush outstanding I/O */ - int (*finish_all_io) (void); - - /* Determine whether image exists that we can restore */ - int (*image_exists) (int quiet); - - /* Mark the image as having tried to resume */ - int (*mark_resume_attempted) (int); - - /* Destroy image if one exists */ - int (*remove_image) (void); - - /* Sysfs Data */ - struct toi_sysfs_data *sysfs_data; - int num_sysfs_entries; - - /* Block I/O allocator */ - struct toi_bio_allocator_ops *bio_allocator_ops; -}; - -extern int toi_num_modules, toiNumAllocators; - -extern struct toi_module_ops *toiActiveAllocator; -extern struct list_head toi_filters, toiAllocators, toi_modules; - -extern void toi_prepare_console_modules(void); -extern void toi_cleanup_console_modules(void); - -extern struct toi_module_ops *toi_find_module_given_name(char *name); -extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); - -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_move_module_tail(struct toi_module_ops *module); - -extern long toi_header_storage_for_modules(void); -extern long toi_memory_for_modules(int print_parts); -extern void print_toi_header_storage_for_modules(void); -extern int toi_expected_compression_ratio(void); - -extern int toi_print_module_debug_info(char *buffer, int buffer_size); -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_unregister_module(struct toi_module_ops *module); - -extern int toi_initialise_modules(int starting_cycle, int early); -#define toi_initialise_modules_early(starting) \ - toi_initialise_modules(starting, 1) -#define toi_initialise_modules_late(starting) \ - toi_initialise_modules(starting, 0) -extern void toi_cleanup_modules(int finishing_cycle); - -extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); -extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); - -extern void toi_print_modules(void); - -int toi_get_modules(void); -void toi_put_modules(void); -#endif diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c deleted file mode 100644 index 78bd31b05..000000000 --- a/kernel/power/tuxonice_netlink.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
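tuxonice_netlink.c, which begins above, keeps an emergency pool of preallocated skbs so messages can still be sent when alloc_skb() fails under memory pressure (see toi_fill_skb_pool() and toi_get_skb(), which follow). The pattern in a standalone sketch with plain malloc (struct buf and the pool sizes are invented):

#include <stdio.h>
#include <stdlib.h>

struct buf { struct buf *next; char data[128]; };

static struct buf *pool;
static int pool_level;
static const int pool_limit = 4;

/* Top the pool up while memory is plentiful */
static void fill_pool(void)
{
        while (pool_level < pool_limit) {
                struct buf *b = malloc(sizeof(*b));

                if (!b)
                        break;
                b->next = pool;
                pool = b;
                pool_level++;
        }
}

/* Try a fresh allocation first; fall back to the stash on failure */
static struct buf *get_buf(int pretend_oom)
{
        struct buf *b = pretend_oom ? NULL : malloc(sizeof(*b));

        if (b)
                return b;
        b = pool;
        if (b) {
                pool = b->next;
                b->next = NULL;
                pool_level--;
        }
        return b;
}

int main(void)
{
        fill_pool();
        printf("under OOM, got %s buffer (pool now %d)\n",
               get_buf(1) ? "an emergency" : "no", pool_level);
        return 0;
}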
- * - * Functions for communicating with a userspace helper via netlink. - */ - -#include -#include -#include -#include "tuxonice_netlink.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct user_helper_data *uhd_list; - -/* - * Refill our pool of SKBs for use in emergencies (eg, when eating memory and - * none can be allocated). - */ -static void toi_fill_skb_pool(struct user_helper_data *uhd) -{ - while (uhd->pool_level < uhd->pool_limit) { - struct sk_buff *new_skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (!new_skb) - break; - - new_skb->next = uhd->emerg_skbs; - uhd->emerg_skbs = new_skb; - uhd->pool_level++; - } -} - -/* - * Try to allocate a single skb. If we can't get one, try to use one from - * our pool. - */ -static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) -{ - struct sk_buff *skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (skb) - return skb; - - skb = uhd->emerg_skbs; - if (skb) { - uhd->pool_level--; - uhd->emerg_skbs = skb->next; - skb->next = NULL; - } - - return skb; -} - -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *dest; - struct task_struct *t; - - if (uhd->pid == -1) - return; - - if (uhd->debug) - printk(KERN_ERR "toi_send_netlink_message: Send " - "message type %d.\n", type); - - skb = toi_get_skb(uhd); - if (!skb) { - printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); - return; - } - - nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); - uhd->sock_seq++; - - dest = NLMSG_DATA(nlh); - if (params && len > 0) - memcpy(dest, params, len); - - netlink_unicast(uhd->nl, skb, uhd->pid, 0); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - if (uhd->pid > -1) - printk(KERN_INFO "Hmm. Can't find the userspace task" - " %d.\n", uhd->pid); - return; - } - wake_up_process(t); - toi_read_unlock_tasklist(); - - yield(); -} - -static void send_whether_debugging(struct user_helper_data *uhd) -{ - static u8 is_debugging = 1; - - toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, - &is_debugging, sizeof(u8)); -} - -/* - * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we - * are hibernating. - */ -static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) -{ - struct task_struct *t; - - if (uhd->debug) - printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", - pid); - return -EINVAL; - } - - t->flags |= PF_NOFREEZE; - - toi_read_unlock_tasklist(); - uhd->pid = pid; - - toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); - - return 0; -} - -/* - * Called when the userspace process has informed us that it's ready to roll. - */ -static int nl_ready(struct user_helper_data *uhd, u32 version) -{ - if (version != uhd->interface_version) { - printk(KERN_INFO "%s userspace process using invalid interface" - " version (%d - kernel wants %d). 
Trying to " - "continue without it.\n", - uhd->name, version, uhd->interface_version); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - - complete(&uhd->wait_for_process); - - return 0; -} - -void toi_netlink_close_complete(struct user_helper_data *uhd) -{ - if (uhd->nl) { - netlink_kernel_release(uhd->nl); - uhd->nl = NULL; - } - - while (uhd->emerg_skbs) { - struct sk_buff *next = uhd->emerg_skbs->next; - kfree_skb(uhd->emerg_skbs); - uhd->emerg_skbs = next; - } - - uhd->pid = -1; -} - -static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, - struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type = nlh->nlmsg_type; - int *data; - int err; - - if (uhd->debug) - printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", - type); - - /* Let the more specific handler go first. It returns - * 1 for valid messages that it doesn't know. */ - err = uhd->rcv_msg(skb, nlh); - if (err != 1) - return err; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { - printk(KERN_INFO "Received extra nofreeze me requests.\n"); - return -EBUSY; - } - - data = NLMSG_DATA(nlh); - - switch (type) { - case NETLINK_MSG_NOFREEZE_ME: - return nl_set_nofreeze(uhd, nlh->nlmsg_pid); - case NETLINK_MSG_GET_DEBUGGING: - send_whether_debugging(uhd); - return 0; - case NETLINK_MSG_READY: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { - printk(KERN_INFO "Invalid ready mesage.\n"); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - return nl_ready(uhd, (u32) *data); - case NETLINK_MSG_CLEANUP: - toi_netlink_close_complete(uhd); - return 0; - } - - return -EINVAL; -} - -static void toi_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - struct user_helper_data *uhd = uhd_list; - - while (uhd && uhd->netlink_id != skb->sk->sk_protocol) - uhd = uhd->next; - - if (!uhd) - return; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - - err = toi_nl_gen_rcv_msg(uhd, skb, nlh); - if (err) - netlink_ack(skb, nlh, err); - else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } -} - -static int netlink_prepare(struct user_helper_data *uhd) -{ - struct netlink_kernel_cfg cfg = { - .groups = 0, - .input = toi_user_rcv_skb, - }; - - uhd->next = uhd_list; - uhd_list = uhd; - - uhd->sock_seq = 0x42c0ffee; - uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); - if (!uhd->nl) { - printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", - uhd->name); - return -ENOMEM; - } - - toi_fill_skb_pool(uhd); - - return 0; -} - -void toi_netlink_close(struct user_helper_data *uhd) -{ - struct task_struct *t; - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (t) - t->flags &= ~PF_NOFREEZE; - toi_read_unlock_tasklist(); - - toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); -} -int toi_netlink_setup(struct user_helper_data *uhd) -{ - /* In case userui didn't cleanup properly on us */ - toi_netlink_close_complete(uhd); - - if (netlink_prepare(uhd) < 0) { - printk(KERN_INFO "Netlink prepare failed.\n"); - return 1; - } - - if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, - UMH_WAIT_EXEC, uhd->debug) < 0) { - printk(KERN_INFO "Launch userspace program failed.\n"); - toi_netlink_close_complete(uhd); - return 1; 
- } - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); - - if (uhd->pid == -1) { - printk(KERN_INFO "%s: Failed to contact userspace process.\n", - uhd->name); - toi_netlink_close_complete(uhd); - return 1; - } - - return 0; -} diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h deleted file mode 100644 index 6613c8eaa..000000000 --- a/kernel/power/tuxonice_netlink.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for functions for communicating with a userspace helper - * via netlink. - */ - -#include -#include - -#define NETLINK_MSG_BASE 0x10 - -#define NETLINK_MSG_READY 0x10 -#define NETLINK_MSG_NOFREEZE_ME 0x16 -#define NETLINK_MSG_GET_DEBUGGING 0x19 -#define NETLINK_MSG_CLEANUP 0x24 -#define NETLINK_MSG_NOFREEZE_ACK 0x27 -#define NETLINK_MSG_IS_DEBUGGING 0x28 - -struct user_helper_data { - int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); - void (*not_ready) (void); - struct sock *nl; - u32 sock_seq; - pid_t pid; - char *comm; - char program[256]; - int pool_level; - int pool_limit; - struct sk_buff *emerg_skbs; - int skb_size; - int netlink_id; - char *name; - struct user_helper_data *next; - struct completion wait_for_process; - u32 interface_version; - int must_init; - int debug; -}; - -#ifdef CONFIG_NET -int toi_netlink_setup(struct user_helper_data *uhd); -void toi_netlink_close(struct user_helper_data *uhd); -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len); -void toi_netlink_close_complete(struct user_helper_data *uhd); -#else -static inline int toi_netlink_setup(struct user_helper_data *uhd) -{ - return 0; -} - -static inline void toi_netlink_close(struct user_helper_data *uhd) { }; -static inline void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) { }; -static inline void toi_netlink_close_complete(struct user_helper_data *uhd) - { }; -#endif diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c deleted file mode 100644 index d469f3d2d..000000000 --- a/kernel/power/tuxonice_pagedir.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.c - * - * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001,2002 Pavel Machek - * Copyright (C) 2002-2003 Florent Chabaud - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for handling pagesets. - * Note that pbes aren't actually stored as such. They're stored as - * bitmaps and extents. 
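The emergency skb pool above (toi_fill_skb_pool()/toi_get_skb()) is a pattern worth seeing in isolation: keep a small reserve of preallocated buffers, chained through their own storage, and fall back to it only when the normal allocator fails. The following is a minimal userspace sketch of that idea, not TuxOnIce code; every name in it is illustrative.

#include <stdio.h>
#include <stdlib.h>

/* Reserve buffers are chained through their own first bytes, much as
 * the skb pool chains sk_buffs via skb->next. */
struct reserve {
        struct reserve *next;
        char payload[4096];
};

static struct reserve *pool;
static int pool_level;
static const int pool_limit = 4;

/* Top up the reserve while memory is still plentiful. */
static void fill_pool(void)
{
        while (pool_level < pool_limit) {
                struct reserve *r = malloc(sizeof(*r));

                if (!r)
                        break;          /* can't refill now; try later */
                r->next = pool;
                pool = r;
                pool_level++;
        }
}

/* Normal allocation first; dip into the reserve only on failure. */
static struct reserve *get_buffer(void)
{
        struct reserve *r = malloc(sizeof(*r));

        if (r)
                return r;
        r = pool;
        if (r) {
                pool = r->next;
                r->next = NULL;
                pool_level--;
        }
        return r;       /* may still be NULL if the reserve is empty */
}

int main(void)
{
        struct reserve *r;

        fill_pool();
        r = get_buffer();
        printf("got %p, %d reserve buffers left\n", (void *)r, pool_level);
        free(r);
        return 0;
}

The refill happens opportunistically, at exactly the moments when plain allocation would also have succeeded; the reserve only pays off under the memory pressure the hibernation code expects.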
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include "tuxonice_pageflags.h" -#include "tuxonice_ui.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" - -static int ptoi_pfn; -static struct pbe *this_low_pbe; -static struct pbe **last_low_pbe_ptr; - -void toi_reset_alt_image_pageset2_pfn(void) -{ - memory_bm_position_reset(pageset2_map); -} - -static struct page *first_conflicting_page; - -/* - * free_conflicting_pages - */ - -static void free_conflicting_pages(void) -{ - while (first_conflicting_page) { - struct page *next = - *((struct page **) kmap(first_conflicting_page)); - kunmap(first_conflicting_page); - toi__free_page(29, first_conflicting_page); - first_conflicting_page = next; - } -} - -/* __toi_get_nonconflicting_page - * - * Description: Gets order zero pages that won't be overwritten - * while copying the original pages. - */ - -struct page *___toi_get_nonconflicting_page(int can_be_highmem) -{ - struct page *page; - gfp_t flags = TOI_ATOMIC_GFP; - if (can_be_highmem) - flags |= __GFP_HIGHMEM; - - - if (test_toi_state(TOI_LOADING_ALT_IMAGE) && - pageset2_map && ptoi_pfn) { - do { - ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0); - if (ptoi_pfn != BM_END_OF_MAP) { - page = pfn_to_page(ptoi_pfn); - if (!PagePageset1(page) && - (can_be_highmem || !PageHighMem(page))) - return page; - } - } while (ptoi_pfn); - } - - do { - page = toi_alloc_page(29, flags | __GFP_ZERO); - if (!page) { - printk(KERN_INFO "Failed to get nonconflicting " - "page.\n"); - return NULL; - } - if (PagePageset1(page)) { - struct page **next = (struct page **) kmap(page); - *next = first_conflicting_page; - first_conflicting_page = page; - kunmap(page); - } - } while (PagePageset1(page)); - - return page; -} - -unsigned long __toi_get_nonconflicting_page(void) -{ - struct page *page = ___toi_get_nonconflicting_page(0); - return page ? (unsigned long) page_address(page) : 0; -} - -static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, - int highmem) -{ - if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) - + 2 * sizeof(struct pbe)) > PAGE_SIZE) { - struct page *new_page = - ___toi_get_nonconflicting_page(highmem); - if (!new_page) - return ERR_PTR(-ENOMEM); - this_pbe = (struct pbe *) kmap(new_page); - memset(this_pbe, 0, PAGE_SIZE); - *page_ptr = new_page; - } else - this_pbe++; - - return this_pbe; -} - -/** - * get_pageset1_load_addresses - generate pbes for conflicting pages - * - * We check here that pagedir & pages it points to won't collide - * with pages where we're going to restore from the loaded pages - * later. - * - * Returns: - * Zero on success, one if couldn't find enough pages (shouldn't - * happen). 
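___toi_get_nonconflicting_page() above relies on a neat trick: a conflicting page must not be freed straight away (the very next allocation would likely hand it back), so rejects are parked on a list threaded through the pages' own contents and released by free_conflicting_pages() only once a safe page has been found. A hedged userspace analogue follows; conflicts() is merely a stand-in for the PagePageset1() test.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for PagePageset1(): pretend the first three allocations
 * land on "conflicting" pages. */
static int conflicts(void *p)
{
        static int n;

        (void)p;
        return n++ < 3;
}

static void *rejected;          /* list threaded through the blocks */

static void *get_nonconflicting(size_t size)
{
        void *p;

        for (;;) {
                p = malloc(size);
                if (!p)
                        return NULL;
                if (!conflicts(p))
                        return p;
                /* Park the reject: its own first bytes hold the next
                 * pointer, so no extra bookkeeping memory is needed. */
                *(void **)p = rejected;
                rejected = p;
        }
}

static void free_rejected(void)
{
        while (rejected) {
                void *next = *(void **)rejected;

                free(rejected);
                rejected = next;
        }
}

int main(void)
{
        void *p = get_nonconflicting(4096);

        printf("usable block at %p\n", p);
        free_rejected();
        free(p);
        return 0;
}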
- **/ -int toi_get_pageset1_load_addresses(void) -{ - int pfn, highallocd = 0, lowallocd = 0; - int low_needed = pagedir1.size - get_highmem_size(pagedir1); - int high_needed = get_highmem_size(pagedir1); - int low_pages_for_highmem = 0; - gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; - struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, - *low_pbe_page, *last_low_pbe_page = NULL; - struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, - *this_high_pbe = NULL; - unsigned long orig_low_pfn, orig_high_pfn; - int high_pbes_done = 0, low_pbes_done = 0; - int low_direct = 0, high_direct = 0, result = 0, i; - int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; - - toi_trace_index++; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - last_low_pbe_ptr = &restore_pblist; - - /* First, allocate pages for the start of our pbe lists. */ - if (high_needed) { - high_pbe_page = ___toi_get_nonconflicting_page(1); - if (!high_pbe_page) { - result = -ENOMEM; - goto out; - } - this_high_pbe = (struct pbe *) kmap(high_pbe_page); - memset(this_high_pbe, 0, PAGE_SIZE); - } - - low_pbe_page = ___toi_get_nonconflicting_page(0); - if (!low_pbe_page) { - result = -ENOMEM; - goto out; - } - this_low_pbe = (struct pbe *) page_address(low_pbe_page); - - /* - * Next, allocate the number of pages we need. - */ - - i = low_needed + high_needed; - - do { - int is_high; - - if (i == low_needed) - flags &= ~__GFP_HIGHMEM; - - page = toi_alloc_page(30, flags); - BUG_ON(!page); - - SetPagePageset1Copy(page); - is_high = PageHighMem(page); - - if (PagePageset1(page)) { - if (is_high) - high_direct++; - else - low_direct++; - } else { - if (is_high) - highallocd++; - else - lowallocd++; - } - } while (--i); - - high_needed -= high_direct; - low_needed -= low_direct; - - /* - * Do we need to use some lowmem pages for the copies of highmem - * pages? - */ - if (high_needed > highallocd) { - low_pages_for_highmem = high_needed - highallocd; - high_needed -= low_pages_for_highmem; - low_needed += low_pages_for_highmem; - } - - /* - * Now generate our pbes (which will be used for the atomic restore), - * and free unneeded pages. - */ - memory_bm_position_reset(pageset1_copy_map); - for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) { - int is_high; - page = pfn_to_page(pfn); - is_high = PageHighMem(page); - - if (PagePageset1(page)) - continue; - - /* Nope. We're going to use this page. Add a pbe. 
*/ - if (is_high || low_pages_for_highmem) { - struct page *orig_page; - high_pbes_done++; - if (!is_high) - low_pages_for_highmem--; - do { - orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_high_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_high_pfn); - } while (!PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_high_pbe->orig_address = (void *) orig_high_pfn; - this_high_pbe->address = page; - this_high_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", - high_page, high_offset, page, orig_high_pfn, orig_page); - if (last_high_pbe_page != high_pbe_page) { - *last_high_pbe_ptr = - (struct pbe *) high_pbe_page; - if (last_high_pbe_page) { - kunmap(last_high_pbe_page); - high_page++; - high_offset = 0; - } else - high_offset++; - last_high_pbe_page = high_pbe_page; - } else { - *last_high_pbe_ptr = this_high_pbe; - high_offset++; - } - last_high_pbe_ptr = &this_high_pbe->next; - this_high_pbe = get_next_pbe(&high_pbe_page, - this_high_pbe, 1); - if (IS_ERR(this_high_pbe)) { - printk(KERN_INFO - "This high pbe is an error.\n"); - return -ENOMEM; - } - } else { - struct page *orig_page; - low_pbes_done++; - do { - orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_low_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_low_pfn); - } while (PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_low_pbe->orig_address = page_address(orig_page); - this_low_pbe->address = page_address(page); - this_low_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", - low_page, low_offset, this_low_pbe->orig_address, - orig_low_pfn, this_low_pbe->address); - TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address); - *last_low_pbe_ptr = this_low_pbe; - last_low_pbe_ptr = &this_low_pbe->next; - this_low_pbe = get_next_pbe(&low_pbe_page, - this_low_pbe, 0); - if (low_pbe_page != last_low_pbe_page) { - if (last_low_pbe_page) { - low_page++; - low_offset = 0; - } else { - low_offset++; - } - last_low_pbe_page = low_pbe_page; - } else - low_offset++; - if (IS_ERR(this_low_pbe)) { - printk(KERN_INFO "this_low_pbe is an error.\n"); - return -ENOMEM; - } - } - } - - if (high_pbe_page) - kunmap(high_pbe_page); - - if (last_high_pbe_page != high_pbe_page) { - if (last_high_pbe_page) - kunmap(last_high_pbe_page); - toi__free_page(29, high_pbe_page); - } - - free_conflicting_pages(); - -out: - return result; -} - -int add_boot_kernel_data_pbe(void) -{ - this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); - if (!this_low_pbe->address) { - printk(KERN_INFO "Failed to get bkd atomic restore buffer."); - return -ENOMEM; - } - - toi_bkd.size = sizeof(toi_bkd); - memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); - - *last_low_pbe_ptr = this_low_pbe; - this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; - this_low_pbe->next = NULL; - return 0; -} diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h deleted file mode 100644 index 046535918..000000000 --- a/kernel/power/tuxonice_pagedir.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for routines for handling pagesets. 
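A pbe pairs a loaded copy of a page with the address the data must eventually return to, and the point of all the careful placement above is that the final restore can walk the chain and copy every page home without overwriting anything it still needs. Here is a simplified sketch of that copy-back walk, under the simplifying assumption of plain pointers rather than pfns and kmap():

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096

struct pbe {
        void *orig_address;     /* where the data lived before the image */
        void *address;          /* where the loaded copy currently sits */
        struct pbe *next;
};

/* Copy every saved page back over its original location. This is the
 * step that must not overwrite any page it still has to read, which is
 * why the pbe pages themselves had to be "nonconflicting". */
static void copy_back(struct pbe *restore_pblist)
{
        struct pbe *p;

        for (p = restore_pblist; p; p = p->next)
                memcpy(p->orig_address, p->address, PAGE_SZ);
}

int main(void)
{
        static char orig[PAGE_SZ], copy[PAGE_SZ] = "saved contents";
        struct pbe one = { orig, copy, NULL };

        copy_back(&one);
        printf("%s\n", orig);
        return 0;
}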
- */ - -#ifndef KERNEL_POWER_PAGEDIR_H -#define KERNEL_POWER_PAGEDIR_H - -/* Pagedir - * - * Contains the metadata for a set of pages saved in the image. - */ - -struct pagedir { - int id; - unsigned long size; -#ifdef CONFIG_HIGHMEM - unsigned long size_high; -#endif -}; - -#ifdef CONFIG_HIGHMEM -#define get_highmem_size(pagedir) (pagedir.size_high) -#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) -#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) -#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) -#else -#define get_highmem_size(pagedir) (0) -#define set_highmem_size(pagedir, sz) do { } while (0) -#define inc_highmem_size(pagedir) do { } while (0) -#define get_lowmem_size(pagedir) (pagedir.size) -#endif - -extern struct pagedir pagedir1, pagedir2; - -extern void toi_copy_pageset1(void); - -extern int toi_get_pageset1_load_addresses(void); - -extern unsigned long __toi_get_nonconflicting_page(void); -struct page *___toi_get_nonconflicting_page(int can_be_highmem); - -extern void toi_reset_alt_image_pageset2_pfn(void); -extern int add_boot_kernel_data_pbe(void); -#endif diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c deleted file mode 100644 index 0fe92edd7..000000000 --- a/kernel/power/tuxonice_pageflags.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for serialising and relocating pageflags in which we - * store our image metadata. - */ - -#include "tuxonice_pageflags.h" -#include "power.h" - -int toi_pageflags_space_needed(void) -{ - return memory_bm_space_needed(pageset1_map); -} diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h deleted file mode 100644 index ddeeaf1e7..000000000 --- a/kernel/power/tuxonice_pageflags.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- */ - -#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H -#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H - -struct memory_bitmap; -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -void memory_bm_clear(struct memory_bitmap *bm); - -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index); -void memory_bm_position_reset(struct memory_bitmap *bm); -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -int toi_alloc_bitmap(struct memory_bitmap **bm); -void toi_free_bitmap(struct memory_bitmap **bm); -void memory_bm_clear(struct memory_bitmap *bm); -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); - -struct toi_module_ops; -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_space_needed(struct memory_bitmap *bm); - -extern struct memory_bitmap *pageset1_map; -extern struct memory_bitmap *pageset1_copy_map; -extern struct memory_bitmap *pageset2_map; -extern struct memory_bitmap *page_resave_map; -extern struct memory_bitmap *io_map; -extern struct memory_bitmap *nosave_map; -extern struct memory_bitmap *free_map; -extern struct memory_bitmap *compare_map; - -#define PagePageset1(page) \ - (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1(page) \ - (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1(page) \ - (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset1Copy(page) \ - (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1Copy(page) \ - (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1Copy(page) \ - (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset2(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset2(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset2(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageWasRW(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPageWasRW(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageWasRW(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageResave(page) (page_resave_map ? 
\ - memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageResave(page) \ - (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageResave(page) \ - (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosave(page) (nosave_map ? \ - memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosave(page) \ - (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosave(page) \ - (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosaveFree(page) (free_map ? \ - memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosaveFree(page) \ - (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosaveFree(page) \ - (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page))) - -#define PageCompareChanged(page) (compare_map ? \ - memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageCompareChanged(page) \ - (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageCompareChanged(page) \ - (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page))) - -extern void save_pageflags(struct memory_bitmap *pagemap); -extern int load_pageflags(struct memory_bitmap *pagemap); -extern int toi_pageflags_space_needed(void); -#endif diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c deleted file mode 100644 index 7c78773cf..000000000 --- a/kernel/power/tuxonice_power_off.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for powering down. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" - -unsigned long toi_poweroff_method; /* 0 - Kernel power off */ - -static int wake_delay; -static char lid_state_file[256], wake_alarm_dir[256]; -static struct file *lid_file, *alarm_file, *epoch_file; -static int post_wake_state = -1; - -static int did_suspend_to_both; - -/* - * __toi_power_down - * Functionality : Powers down or reboots the computer once the image - * has been written to disk. - * Key Assumptions : Able to reboot/power down via code called or that - * the warning emitted if the calls fail will be visible - * to the user (ie printk resumes devices). - */ - -static void __toi_power_down(int method) -{ - int error; - - toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : - "Powering down."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (test_action_state(TOI_REBOOT)) - kernel_restart(NULL); - - switch (method) { - case 0: - break; - case 3: - /* - * Re-read the overwritten part of pageset2 to make post-resume - * faster. - */ - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. 
" - "Try rebooting."); - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (!error) { - pm_restore_gfp_mask(); - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - pm_restrict_gfp_mask(); - if (!error) - did_suspend_to_both = 1; - } - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - - /* Success - we're now post-resume-from-ram */ - if (did_suspend_to_both) - return; - - /* Failed to suspend to ram - do normal power off */ - break; - case 4: - /* - * If succeeds, doesn't return. If fails, do a simple - * powerdown. - */ - hibernation_platform_enter(); - break; - case 5: - /* Historic entry only now */ - break; - } - - if (method && method != 5) - toi_cond_pause(1, - "Falling back to alternate power off method."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (pm_power_off) - kernel_power_off(); - kernel_halt(); - toi_cond_pause(1, "Powerdown failed."); - while (1) - cpu_relax(); - -out: - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. Try rebooting."); - return; -} - -#define CLOSE_FILE(file) \ - if (file) { \ - filp_close(file, NULL); file = NULL; \ - } - -static void powerdown_cleanup(int toi_or_resume) -{ - if (!toi_or_resume) - return; - - CLOSE_FILE(lid_file); - CLOSE_FILE(alarm_file); - CLOSE_FILE(epoch_file); -} - -static void open_file(char *format, char *arg, struct file **var, int mode, - char *desc) -{ - char buf[256]; - - if (strlen(arg)) { - sprintf(buf, format, arg); - *var = filp_open(buf, mode, 0); - if (IS_ERR(*var) || !*var) { - printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", - desc, buf, *var); - *var = NULL; - } - } -} - -static int powerdown_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - did_suspend_to_both = 0; - - open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, - O_RDONLY, "lid"); - - if (strlen(wake_alarm_dir)) { - open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, - &alarm_file, O_WRONLY, "alarm"); - - open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, - &epoch_file, O_RDONLY, "epoch"); - } - - return 0; -} - -static int lid_closed(void) -{ - char array[25]; - ssize_t size; - loff_t pos = 0; - - if (!lid_file) - return 0; - - size = vfs_read(lid_file, (char __user *) array, 25, &pos); - if ((int) size < 1) { - printk(KERN_INFO "Failed to read lid state file (%d).\n", - (int) size); - return 0; - } - - if (!strcmp(array, "state: closed\n")) - return 1; - - return 0; -} - -static void write_alarm_file(int value) -{ - ssize_t size; - char buf[40]; - loff_t pos = 0; - - if (!alarm_file) - return; - - sprintf(buf, "%d\n", value); - - size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); - - if (size < 0) - printk(KERN_INFO "Error %d writing alarm value %s.\n", - (int) size, buf); -} - -/** - * toi_check_resleep: See whether to powerdown again after waking. - * - * After waking, check whether we should powerdown again in a (usually - * different) way. We only do this if the lid switch is still closed. - */ -void toi_check_resleep(void) -{ - /* We only return if we suspended to ram and woke. 
*/ - if (lid_closed() && post_wake_state >= 0) - __toi_power_down(post_wake_state); -} - -void toi_power_down(void) -{ - if (alarm_file && wake_delay) { - char array[25]; - loff_t pos = 0; - size_t size = vfs_read(epoch_file, (char __user *) array, 25, - &pos); - - if (((int) size) < 1) - printk(KERN_INFO "Failed to read epoch file (%d).\n", - (int) size); - else { - unsigned long since_epoch; - if (!kstrtoul(array, 0, &since_epoch)) { - /* Clear any wakeup time. */ - write_alarm_file(0); - - /* Set new wakeup time. */ - write_alarm_file(since_epoch + wake_delay); - } - } - } - - __toi_power_down(toi_poweroff_method); - - toi_check_resleep(); -} - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_ACPI) - SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), - SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), - SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), - SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, - NULL), - SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), - SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, - 0, 0, 0, NULL) -#endif -}; - -static struct toi_module_ops powerdown_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "poweroff", - .initialise = powerdown_init, - .cleanup = powerdown_cleanup, - .directory = "[ROOT]", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_poweroff_init(void) -{ - return toi_register_module(&powerdown_ops); -} - -void toi_poweroff_exit(void) -{ - toi_unregister_module(&powerdown_ops); -} diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h deleted file mode 100644 index 6e1d8bb39..000000000 --- a/kernel/power/tuxonice_power_off.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for the powering down. - */ - -int toi_pm_state_finish(void); -void toi_power_down(void); -extern unsigned long toi_poweroff_method; -int toi_poweroff_init(void); -void toi_poweroff_exit(void); -void toi_check_resleep(void); - -extern int platform_begin(int platform_mode); -extern int platform_pre_snapshot(int platform_mode); -extern void platform_leave(int platform_mode); -extern void platform_end(int platform_mode); -extern void platform_finish(int platform_mode); -extern int platform_pre_restore(int platform_mode); -extern void platform_restore_cleanup(int platform_mode); diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c deleted file mode 100644 index df37cc805..000000000 --- a/kernel/power/tuxonice_prepare_image.c +++ /dev/null @@ -1,1089 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * We need to eat memory until we can: - * 1. Perform the save without changing anything (RAM_NEEDED < #pages) - * 2. Fit it all in available space (toiActiveAllocator->available_space() >= - * main_storage_needed()) - * 3. Reload the pagedir and pageset1 to places that don't collide with their - * final destinations, not knowing to what extent the resumed kernel will - * overlap with the one loaded at boot time. 
I think the resumed kernel - * should overlap completely, but I don't want to rely on this as it is - * an unproven assumption. We therefore assume there will be no overlap at - * all (worse case). - * 4. Meet the user's requested limit (if any) on the size of the image. - * The limit is in MB, so pages/256 (assuming 4K pages). - * - */ - -#include -#include -#include -#include -#include -#include - -#include "tuxonice_pageflags.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_checksum.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_builtin.h" - -static unsigned long num_nosave, main_storage_allocated, storage_limit, - header_storage_needed; -unsigned long extra_pd1_pages_allowance = - CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; -long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; -static int no_ps2_needed; - -struct attention_list { - struct task_struct *task; - struct attention_list *next; -}; - -static struct attention_list *attention_list; - -#define PAGESET1 0 -#define PAGESET2 1 - -void free_attention_list(void) -{ - struct attention_list *last = NULL; - - while (attention_list) { - last = attention_list; - attention_list = attention_list->next; - toi_kfree(6, last, sizeof(*last)); - } -} - -static int build_attention_list(void) -{ - int i, task_count = 0; - struct task_struct *p; - struct attention_list *next; - - /* - * Count all userspace process (with task->mm) marked PF_NOFREEZE. - */ - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) - task_count++; - toi_read_unlock_tasklist(); - - /* - * Allocate attention list structs. - */ - for (i = 0; i < task_count; i++) { - struct attention_list *this = - toi_kzalloc(6, sizeof(struct attention_list), - TOI_WAIT_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate slab for " - "attention list.\n"); - free_attention_list(); - return 1; - } - this->next = NULL; - if (attention_list) - this->next = attention_list; - attention_list = this; - } - - next = attention_list; - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) { - next->task = p; - next = next->next; - } - toi_read_unlock_tasklist(); - return 0; -} - -static void pageset2_full(void) -{ - struct zone *zone; - struct page *page; - unsigned long flags; - int i; - - toi_trace_index++; - - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lru_lock, flags); - for_each_lru(i) { - if (!zone_page_state(zone, NR_LRU_BASE + i)) - continue; - - list_for_each_entry(page, &zone->lruvec.lists[i], lru) { - struct address_space *mapping; - - mapping = page_mapping(page); - if (!mapping || !mapping->host || - !(mapping->host->i_flags & S_ATOMIC_COPY)) { - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified."); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full."); - SetPagePageset2(page); - } - } - } - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - } -} - -/* - * toi_mark_task_as_pageset - * Functionality : Marks all the saveable pages belonging to a given process - * as belonging to a particular pageset. 
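build_attention_list() above uses a common kernel idiom: allocation may sleep, so it must not happen under the tasklist lock. The list is therefore taken in two passes, counting under the lock, allocating with the lock dropped, then walking again to record the pointers. A userspace sketch of the same two-pass snapshot, with a mutex standing in for the tasklist lock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static const int items[] = { 1, 2, 3, 4, 5 };   /* the "task list" */
#define N_ITEMS 5

static int wanted(int v)        /* stand-in for the PF_NOFREEZE test */
{
        return v & 1;
}

int main(void)
{
        int i, count = 0, *snap;

        /* Pass 1: count matches under the lock; no allocation here. */
        pthread_mutex_lock(&list_lock);
        for (i = 0; i < N_ITEMS; i++)
                if (wanted(items[i]))
                        count++;
        pthread_mutex_unlock(&list_lock);

        /* Allocate with the lock dropped: allocation may block. */
        snap = malloc(count * sizeof(*snap));
        if (!snap)
                return 1;

        /* Pass 2: retake the lock and record the matches. */
        pthread_mutex_lock(&list_lock);
        for (i = 0, count = 0; i < N_ITEMS; i++)
                if (wanted(items[i]))
                        snap[count++] = items[i];
        pthread_mutex_unlock(&list_lock);

        for (i = 0; i < count; i++)
                printf("%d ", snap[i]);
        printf("\n");
        free(snap);
        return 0;
}

The window between the two passes is tolerable here for the reason the code itself gives: the PF_NOFREEZE tasks of interest are tied to hibernation and will not go away mid-cycle.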
- */ - -static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) -{ - struct vm_area_struct *vma; - struct mm_struct *mm; - - mm = t->active_mm; - - if (!mm || !mm->mmap) - return; - - toi_trace_index++; - - if (!irqs_disabled()) - down_read(&mm->mmap_sem); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long posn; - - if (!vma->vm_start || - vma->vm_flags & VM_PFNMAP) - continue; - - for (posn = vma->vm_start; posn < vma->vm_end; - posn += PAGE_SIZE) { - struct page *page = follow_page(vma, posn, 0); - struct address_space *mapping; - - if (!page || !pfn_valid(page_to_pfn(page))) - continue; - - mapping = page_mapping(page); - if (mapping && mapping->host && - mapping->host->i_flags & S_ATOMIC_COPY && pageset2) - continue; - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2); - continue; - } - - if (pageset2) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1"); - SetPagePageset2(page); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2"); - ClearPagePageset2(page); - SetPagePageset1(page); - } - } - } - - if (!irqs_disabled()) - up_read(&mm->mmap_sem); -} - -static void mark_tasks(int pageset) -{ - struct task_struct *p; - - toi_read_lock_tasklist(); - for_each_process(p) { - if (!p->mm) - continue; - - if (p->flags & PF_KTHREAD) - continue; - - toi_mark_task_as_pageset(p, pageset); - } - toi_read_unlock_tasklist(); - -} - -/* mark_pages_for_pageset2 - * - * Description: Mark unshared pages in processes not needed for hibernate as - * being able to be written out in a separate pagedir. - * HighMem pages are simply marked as pageset2. They won't be - * needed during hibernate. - */ - -static void toi_mark_pages_for_pageset2(void) -{ - struct attention_list *this = attention_list; - - memory_bm_clear(pageset2_map); - - if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) - return; - - if (test_action_state(TOI_PAGESET2_FULL)) - pageset2_full(); - else - mark_tasks(PAGESET2); - - /* - * Because the tasks in attention_list are ones related to hibernating, - * we know that they won't go away under us. - */ - - while (this) { - if (!test_result_state(TOI_ABORTED)) - toi_mark_task_as_pageset(this->task, PAGESET1); - this = this->next; - } -} - -/* - * The atomic copy of pageset1 is stored in pageset2 pages. - * But if pageset1 is larger (normally only just after boot), - * we need to allocate extra pages to store the atomic copy. - * The following data struct and functions are used to handle - * the allocation and freeing of that memory. - */ - -static unsigned long extra_pages_allocated; - -struct extras { - struct page *page; - int order; - struct extras *next; -}; - -static struct extras *extras_list; - -/* toi_free_extra_pagedir_memory - * - * Description: Free previously allocated extra pagedir memory. - */ -void toi_free_extra_pagedir_memory(void) -{ - /* Free allocated pages */ - while (extras_list) { - struct extras *this = extras_list; - int i; - - extras_list = this->next; - - for (i = 0; i < (1 << this->order); i++) - ClearPageNosave(this->page + i); - - toi_free_pages(9, this->page, this->order); - toi_kfree(7, this, sizeof(*this)); - } - - extra_pages_allocated = 0; -} - -/* toi_allocate_extra_pagedir_memory - * - * Description: Allocate memory for making the atomic copy of pagedir1 in the - * case where it is bigger than pagedir2. - * Arguments: int num_to_alloc: Number of extra pages needed. - * Result: int. 
Number of extra pages we now have allocated. - */ -static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) -{ - int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; - gfp_t flags = TOI_ATOMIC_GFP; - - if (num_to_alloc < 1) - return 0; - - order = fls(num_to_alloc); - if (order >= MAX_ORDER) - order = MAX_ORDER - 1; - - while (num_to_alloc) { - struct page *newpage; - unsigned long virt; - struct extras *extras_entry; - - while ((1 << order) > num_to_alloc) - order--; - - extras_entry = (struct extras *) toi_kzalloc(7, - sizeof(struct extras), TOI_ATOMIC_GFP); - - if (!extras_entry) - return extra_pages_allocated; - - virt = toi_get_free_pages(9, flags, order); - while (!virt && order) { - order--; - virt = toi_get_free_pages(9, flags, order); - } - - if (!virt) { - toi_kfree(7, extras_entry, sizeof(*extras_entry)); - return extra_pages_allocated; - } - - newpage = virt_to_page(virt); - - extras_entry->page = newpage; - extras_entry->order = order; - extras_entry->next = extras_list; - - extras_list = extras_entry; - - for (j = 0; j < (1 << order); j++) { - SetPageNosave(newpage + j); - SetPagePageset1Copy(newpage + j); - } - - extra_pages_allocated += (1 << order); - num_to_alloc -= (1 << order); - } - - return extra_pages_allocated; -} - -/* - * real_nr_free_pages: Count pcp pages for a zone type or all zones - * (-1 for all, otherwise zone_idx() result desired). - */ -unsigned long real_nr_free_pages(unsigned long zone_idx_mask) -{ - struct zone *zone; - int result = 0, cpu; - - /* PCP lists */ - for_each_populated_zone(zone) { - if (!(zone_idx_mask & (1 << zone_idx(zone)))) - continue; - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - result += pcp->count; - } - - result += zone_page_state(zone, NR_FREE_PAGES); - } - return result; -} - -/* - * Discover how much extra memory will be required by the drivers - * when they're asked to hibernate. We can then ensure that amount - * of memory is available when we really want it. - */ -static void get_extra_pd1_allowance(void) -{ - unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; - - toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); - - if (toi_go_atomic(PMSG_FREEZE, 1)) - return; - - final = real_nr_free_pages(all_zones_mask); - toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); - - extra_pd1_pages_allowance = (orig_num_free > final) ? - orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : - MIN_EXTRA_PAGES_ALLOWANCE; -} - -/* - * Amount of storage needed, possibly taking into account the - * expected compression ratio and possibly also ignoring our - * allowance for extra pages. - */ -static unsigned long main_storage_needed(int use_ecr, - int ignore_extra_pd1_allow) -{ - return (pagedir1.size + pagedir2.size + - (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * - (use_ecr ? toi_expected_compression_ratio() : 100) / 100; -} - -/* - * Storage needed for the image header, in bytes until the return. - * - * fs_info_space_needed is saved in a static variable unless we - * explicitly want to reset the value (done at the start of a cycle) - * as it requires memory allocation that may result in a hang if we're - * also trying to free memory. 
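toi_allocate_extra_pagedir_memory() above asks for the largest power-of-two block that still fits the remaining need and steps the order down on failure, so a fragmented system degrades into many small allocations rather than failing outright. The same descending-order loop, rendered as a standalone userspace sketch:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SZ   4096
#define MAX_ORDER 11    /* mirrors the kernel's traditional default */

/* Allocate `pages` pages in power-of-two chunks, largest first,
 * stepping the order down whenever an allocation fails. Returns how
 * many pages were obtained. */
static int alloc_in_chunks(int pages)
{
        int order = 0, allocated = 0;

        while ((1 << (order + 1)) <= pages && order + 1 < MAX_ORDER)
                order++;        /* biggest order that fits the request */

        while (pages > 0) {
                void *block;

                while ((1 << order) > pages && order > 0)
                        order--;        /* don't over-allocate */

                block = malloc((size_t)PAGE_SZ << order);
                if (!block) {
                        if (!order)
                                return allocated;  /* truly out of memory */
                        order--;        /* retry with a smaller chunk */
                        continue;
                }
                /* A real version records `block` so it can be freed
                 * later; the sketch releases it immediately. */
                allocated += 1 << order;
                pages -= 1 << order;
                free(block);
        }
        return allocated;
}

int main(void)
{
        printf("got %d pages\n", alloc_in_chunks(1000));
        return 0;
}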
- */ -unsigned long get_header_storage_needed(int reset) -{ - unsigned long bytes = sizeof(struct toi_header) + - toi_header_storage_for_modules() + - toi_pageflags_space_needed() + - fs_info_space_needed(0); - - return DIV_ROUND_UP(bytes, PAGE_SIZE); -} - -/* - * When freeing memory, pages from either pageset might be freed. - * - * When seeking to free memory to be able to hibernate, for every ps1 page - * freed, we need 2 less pages for the atomic copy because there is one less - * page to copy and one more page into which data can be copied. - * - * Freeing ps2 pages saves us nothing directly. No more memory is available - * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but - * that's too much work to figure out. - * - * => ps1_to_free functions - * - * Of course if we just want to reduce the image size, because of storage - * limitations or an image size limit either ps will do. - * - * => any_to_free function - */ - -static unsigned long lowpages_usable_for_highmem_copy(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return available > needed ? available - needed : 0; -} - -static unsigned long highpages_ps1_to_free(void) -{ - unsigned long need = get_highmem_size(pagedir1), - available = get_highmem_size(pagedir2) + - real_nr_free_high_pages() + - lowpages_usable_for_highmem_copy(); - - return need > available ? DIV_ROUND_UP(need - available, 2) : 0; -} - -static unsigned long lowpages_ps1_to_free(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0; -} - -static unsigned long current_image_size(void) -{ - return pagedir1.size + pagedir2.size + header_storage_needed; -} - -static unsigned long storage_still_required(void) -{ - unsigned long needed = main_storage_needed(1, 1); - return needed > storage_limit ? needed - storage_limit : 0; -} - -static unsigned long ram_still_required(void) -{ - unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + - 2 * extra_pd1_pages_allowance, - available = real_nr_free_low_pages() + extra_pages_allocated; - return needed > available ? needed - available : 0; -} - -unsigned long any_to_free(int use_image_size_limit) -{ - int use_soft_limit = use_image_size_limit && image_size_limit > 0; - unsigned long current_size = current_image_size(), - soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, - to_free = use_soft_limit ? (current_size > soft_limit ? - current_size - soft_limit : 0) : 0, - storage_limit = storage_still_required(), - ram_limit = ram_still_required(), - first_max = max(to_free, storage_limit); - - return max(first_max, ram_limit); -} - -static int need_pageset2(void) -{ - return (real_nr_free_low_pages() + extra_pages_allocated - - 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - - toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; -} - -/* amount_needed - * - * Calculates the amount by which the image size needs to be reduced to meet - * our constraints. 
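The DIV_ROUND_UP(need - available, 2) in the ps1_to_free helpers above encodes the observation from the earlier comment: freeing one pageset1 page helps twice, since there is one page fewer to copy and one more page into which data can be copied. A quick numerical check of that arithmetic:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long needed = 1000, available = 993;
        /* The shortfall is 7 pages, but each freed pageset1 page both
         * shrinks `needed` and grows `available`, so 4 frees suffice. */
        unsigned long to_free = DIV_ROUND_UP(needed - available, 2);

        printf("free %lu: needed %lu vs available %lu\n",
               to_free, needed - to_free, available + to_free);
        return 0;
}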
- */ -static unsigned long amount_needed(int use_image_size_limit) -{ - return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), - any_to_free(use_image_size_limit)); -} - -static int image_not_ready(int use_image_size_limit) -{ - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Amount still needed (%lu) > 0:%u," - " Storage allocd: %lu < %lu: %u.\n", - amount_needed(use_image_size_limit), - (amount_needed(use_image_size_limit) > 0), - main_storage_allocated, - main_storage_needed(1, 1), - main_storage_allocated < main_storage_needed(1, 1)); - - toi_cond_pause(0, NULL); - - return (amount_needed(use_image_size_limit) > 0) || - main_storage_allocated < main_storage_needed(1, 1); -} - -static void display_failure_reason(int tries_exceeded) -{ - unsigned long storage_required = storage_still_required(), - ram_required = ram_still_required(), - high_ps1 = highpages_ps1_to_free(), - low_ps1 = lowpages_ps1_to_free(); - - printk(KERN_INFO "Failed to prepare the image because...\n"); - - if (!storage_limit) { - printk(KERN_INFO "- You need some storage available to be " - "able to hibernate.\n"); - return; - } - - if (tries_exceeded) - printk(KERN_INFO "- The maximum number of iterations was " - "reached without successfully preparing the " - "image.\n"); - - if (storage_required) { - printk(KERN_INFO " - We need at least %lu pages of storage " - "(ignoring the header), but only have %lu.\n", - main_storage_needed(1, 1), - main_storage_allocated); - set_abort_result(TOI_INSUFFICIENT_STORAGE); - } - - if (ram_required) { - printk(KERN_INFO " - We need %lu more free pages of low " - "memory.\n", ram_required); - printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); - printk(KERN_INFO " + Reqd. by modules : %8lu\n", - toi_memory_for_modules(0)); - printk(KERN_INFO " + 2 * extra allow : %8lu\n", - 2 * extra_pd1_pages_allowance); - printk(KERN_INFO " - Currently free : %8lu\n", - real_nr_free_low_pages()); - printk(KERN_INFO " - Pages allocd : %8lu\n", - extra_pages_allocated); - printk(KERN_INFO " : ========\n"); - printk(KERN_INFO " Still needed : %8lu\n", - ram_required); - - /* Print breakdown of memory needed for modules */ - toi_memory_for_modules(1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (high_ps1) { - printk(KERN_INFO "- We need to free %lu highmem pageset 1 " - "pages.\n", high_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (low_ps1) { - printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " - "pages.\n", low_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } -} - -static void display_stats(int always, int sub_extra_pd1_allow) -{ - char buffer[255]; - snprintf(buffer, 254, - "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " - "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). " - "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", - - /* Free */ - real_nr_free_pages(all_zones_mask), - real_nr_free_low_pages(), - - /* Sets */ - pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), - pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), - - /* Nosave */ - num_nosave, extra_pages_allocated, - num_nosave - extra_pages_allocated, - - /* Storage */ - main_storage_allocated, - storage_limit, - main_storage_needed(1, sub_extra_pd1_allow), - main_storage_needed(1, 1), - - /* Needed */ - lowpages_ps1_to_free(), highpages_ps1_to_free(), - any_to_free(1), - MIN_FREE_RAM, toi_memory_for_modules(0), - extra_pd1_pages_allowance, - image_size_limit, - - need_pageset2() ? 
"yes" : "no"); - - if (always) - printk("%s", buffer); - else - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); -} - -/* flag_image_pages - * - * This routine generates our lists of pages to be stored in each - * pageset. Since we store the data using extents, and adding new - * extents might allocate a new extent page, this routine may well - * be called more than once. - */ -static void flag_image_pages(int atomic_copy) -{ - int num_free = 0, num_unmodified = 0; - unsigned long loop; - struct zone *zone; - - pagedir1.size = 0; - pagedir2.size = 0; - - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - num_nosave = 0; - toi_trace_index++; - - memory_bm_clear(pageset1_map); - - toi_generate_free_page_map(); - - /* - * Pages not to be saved are marked Nosave irrespective of being - * reserved. - */ - for_each_populated_zone(zone) { - int highmem = is_highmem(zone); - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - TOI_TRACE_DEBUG(pfn, "_Flag Invalid"); - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - unsigned long y; - for (y = pfn; y < pfn + chunk_size; y++) { - page = pfn_to_page(y); - TOI_TRACE_DEBUG(y, "_Flag Free"); - ClearPagePageset1(page); - ClearPagePageset2(page); - } - num_free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page)) { - char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave"; - TOI_TRACE_DEBUG(pfn, "_Flag %s", desc); - num_nosave++; - continue; - } - - page = highmem ? saveable_highmem_page(zone, pfn) : - saveable_page(zone, pfn); - - if (!page) { - TOI_TRACE_DEBUG(pfn, "_Flag Nosave2"); - num_nosave++; - continue; - } - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(pfn, "_Unmodified"); - num_unmodified++; - continue; - } - - if (PagePageset2(page)) { - pagedir2.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS2"); - if (PageHighMem(page)) - inc_highmem_size(pagedir2); - else - SetPagePageset1Copy(page); - if (PageResave(page)) { - SetPagePageset1(page); - ClearPagePageset1Copy(page); - pagedir1.size++; - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } else { - pagedir1.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS1"); - SetPagePageset1(page); - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } - } - - if (!atomic_copy) - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, - "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" - " + Unmodified (%d) + NumFree (%d) = %d.\n", - pagedir1.size, pagedir2.size, num_nosave, num_unmodified, - num_free, pagedir1.size + pagedir2.size + num_nosave + num_free); -} - -void toi_recalculate_image_contents(int atomic_copy) -{ - memory_bm_clear(pageset1_map); - if (!atomic_copy) { - unsigned long pfn; - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); - pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) - ClearPagePageset1Copy(pfn_to_page(pfn)); - /* Need to call this before getting pageset1_size! 
*/ - toi_mark_pages_for_pageset2(); - } - memory_bm_position_reset(pageset2_map); - flag_image_pages(atomic_copy); - - if (!atomic_copy) { - storage_limit = toiActiveAllocator->storage_available(); - display_stats(0, 0); - } -} - -int try_allocate_extra_memory(void) -{ - unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance - - get_lowmem_size(pagedir2); - if (wanted > extra_pages_allocated) { - unsigned long got = toi_allocate_extra_pagedir_memory(wanted); - if (wanted < got) { - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Want %d extra pages for pageset1, got %d.\n", - wanted, got); - return 1; - } - } - return 0; -} - -/* update_image - * - * Allocate [more] memory and storage for the image. - */ -static void update_image(int ps2_recalc) -{ - int old_header_req; - unsigned long seek; - - if (try_allocate_extra_memory()) - return; - - if (ps2_recalc) - goto recalc; - - thaw_kernel_threads(); - - /* - * Allocate remaining storage space, if possible, up to the - * maximum we know we'll need. It's okay to allocate the - * maximum if the writer is the swapwriter, but - * we don't want to grab all available space on an NFS share. - * We therefore ignore the expected compression ratio here, - * thereby trying to allocate the maximum image size we could - * need (assuming compression doesn't expand the image), but - * don't complain if we can't get the full amount we're after. - */ - - do { - int result; - - old_header_req = header_storage_needed; - toiActiveAllocator->reserve_header_space(header_storage_needed); - - /* How much storage is free with the reservation applied? */ - storage_limit = toiActiveAllocator->storage_available(); - seek = min(storage_limit, main_storage_needed(0, 0)); - - result = toiActiveAllocator->allocate_storage(seek); - if (result) - printk("Failed to allocate storage (%d).\n", result); - - main_storage_allocated = - toiActiveAllocator->storage_allocated(); - - /* Need more header because more storage allocated? */ - header_storage_needed = get_header_storage_needed(0); - - } while (header_storage_needed > old_header_req); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - -recalc: - toi_recalculate_image_contents(0); -} - -/* attempt_to_freeze - * - * Try to freeze processes. - */ - -static int attempt_to_freeze(void) -{ - int result; - - /* Stop processes before checking again */ - toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing " - "filesystems."); - result = freeze_processes(); - - if (result) - set_abort_result(TOI_FREEZING_FAILED); - - result = freeze_kernel_threads(); - - if (result) - set_abort_result(TOI_FREEZING_FAILED); - - return result; -} - -/* eat_memory - * - * Try to free some memory, either to meet hard or soft constraints on the image - * characteristics. - * - * Hard constraints: - * - Pageset1 must be < half of memory; - * - We must have enough memory free at resume time to have pageset1 - * be able to be loaded in pages that don't conflict with where it has to - * be restored. - * Soft constraints - * - User specificied image size limit. - */ -static void eat_memory(void) -{ - unsigned long amount_wanted = 0; - int did_eat_memory = 0; - - /* - * Note that if we have enough storage space and enough free memory, we - * may exit without eating anything. We give up when the last 10 - * iterations ate no extra pages because we're not going to get much - * more anyway, but the few pages we get will take a lot of time. 
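The comment above describes a progress-bounded retry loop: keep reclaiming while it helps, and stop once a run of passes frees nothing, because further passes cost time without shrinking the image. Its skeleton looks like the following sketch, where reclaim_some() is a hypothetical stand-in for shrink_memory_mask():

#include <stdio.h>

/* Hypothetical reclaim step: returns the pages it managed to free. */
static unsigned long reclaim_some(void)
{
        static unsigned long budget = 37;
        unsigned long got = budget > 10 ? 10 : budget;

        budget -= got;
        return got;
}

int main(void)
{
        unsigned long wanted = 100, freed = 0;
        int barren = 0;         /* consecutive passes with no progress */

        while (freed < wanted && barren < 10) {
                unsigned long got = reclaim_some();

                if (got)
                        barren = 0;
                else
                        barren++;       /* stop after 10 empty passes */
                freed += got;
        }
        printf("freed %lu of %lu pages\n", freed, wanted);
        return 0;
}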
- * - * We freeze processes before beginning, and then unfreeze them if we - * need to eat memory until we think we have enough. If our attempts - * to freeze fail, we give up and abort. - */ - - amount_wanted = amount_needed(1); - - switch (image_size_limit) { - case -1: /* Don't eat any memory */ - if (amount_wanted > 0) { - set_abort_result(TOI_WOULD_EAT_MEMORY); - return; - } - break; - case -2: /* Free caches only */ - drop_pagecache(); - toi_recalculate_image_contents(0); - amount_wanted = amount_needed(1); - break; - default: - break; - } - - if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) && - image_size_limit != -1) { - unsigned long request = amount_wanted; - unsigned long high_req = max(highpages_ps1_to_free(), - any_to_free(1)); - unsigned long low_req = lowpages_ps1_to_free(); - unsigned long got = 0; - - toi_prepare_status(CLEAR_BAR, - "Seeking to free %ldMB of memory.", - MB(amount_wanted)); - - thaw_kernel_threads(); - - /* - * Ask for too many because shrink_memory_mask doesn't - * currently return enough most of the time. - */ - - if (low_req) - got = shrink_memory_mask(low_req, GFP_KERNEL); - if (high_req) - shrink_memory_mask(high_req - got, GFP_HIGHUSER); - - did_eat_memory = 1; - - toi_recalculate_image_contents(0); - - amount_wanted = amount_needed(1); - - printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &" - " %ld pages from anywhere, got %ld.\n", - high_req, low_req, - request - amount_wanted); - - toi_cond_pause(0, NULL); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - } - - if (did_eat_memory) - toi_recalculate_image_contents(0); -} - -/* toi_prepare_image - * - * Entry point to the whole image preparation section. - * - * We do four things: - * - Freeze processes; - * - Ensure image size constraints are met; - * - Complete all the preparation for saving the image, - * including allocation of storage. The only memory - * that should be needed when we're finished is that - * for actually storing the image (and we know how - * much is needed for that because the modules tell - * us). - * - Make sure that all dirty buffers are written out. - */ -#define MAX_TRIES 2 -int toi_prepare_image(void) -{ - int result = 1, tries = 1; - - main_storage_allocated = 0; - - // Force recalculation of the amount of header storage needed for fs info. - fs_info_space_needed(1); - - no_ps2_needed = 0; - - if (attempt_to_freeze()) - return 1; - - lock_device_hotplug(); - set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - - if (!extra_pd1_pages_allowance) - get_extra_pd1_allowance(); - - storage_limit = toiActiveAllocator->storage_available(); - - if (!storage_limit) { - printk(KERN_INFO "No storage available. Didn't try to prepare " - "an image.\n"); - display_failure_reason(0); - set_abort_result(TOI_NOSTORAGE_AVAILABLE); - return 1; - } - - if (build_attention_list()) { - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - return 1; - } - - toi_recalculate_image_contents(0); - - do { - toi_prepare_status(CLEAR_BAR, - "Preparing Image. Try %d.", tries); - - eat_memory(); - - if (test_result_state(TOI_ABORTED)) - break; - - update_image(0); - - tries++; - - } while (image_not_ready(1) && tries <= MAX_TRIES && - !test_result_state(TOI_ABORTED)); - - result = image_not_ready(0); - - /* TODO: Handle case where need to remove existing image and resave - * instead of adding to incremental image. 
*/ - - if (!test_result_state(TOI_ABORTED)) { - if (result) { - display_stats(1, 0); - display_failure_reason(tries > MAX_TRIES); - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - } else { - /* Pageset 2 needed? */ - if (!need_pageset2() && - test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { - no_ps2_needed = 1; - toi_recalculate_image_contents(0); - update_image(1); - } - - toi_cond_pause(1, "Image preparation complete."); - } - } - - return result ? result : allocate_checksum_pages(); -} diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h deleted file mode 100644 index f7f2b695c..000000000 --- a/kernel/power/tuxonice_prepare_image.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include - -extern int toi_prepare_image(void); -extern void toi_recalculate_image_contents(int storage_available); -extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); -extern long image_size_limit; -extern void toi_free_extra_pagedir_memory(void); -extern unsigned long extra_pd1_pages_allowance; -extern void free_attention_list(void); - -#define MIN_FREE_RAM 100 -#define MIN_EXTRA_PAGES_ALLOWANCE 500 - -#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) -#ifdef CONFIG_HIGHMEM -#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ - (1 << ZONE_HIGHMEM))) -#else -#define real_nr_free_high_pages() (0) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) - -/* For eat_memory function */ -#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) -#endif - -unsigned long get_header_storage_needed(int reset); -unsigned long any_to_free(int use_image_size_limit); -int try_allocate_extra_memory(void); diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c deleted file mode 100644 index 5bc56d3a1..000000000 --- a/kernel/power/tuxonice_prune.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * kernel/power/tuxonice_prune.c - * - * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file implements a TuxOnIce module that seeks to prune the - * amount of data written to disk. It builds a table of hashes - * of the uncompressed data, and writes the pfn of the previous page - * with the same contents instead of repeating the data when a match - * is found. - */ - -#include -#include -#include -#include -#include -#include - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -/* - * We never write a page bigger than PAGE_SIZE, so use a large number - * to indicate that data is a PFN. - */ -#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100) - -static unsigned long toi_pruned_pages; - -static struct toi_module_ops toi_prune_ops; -static struct toi_module_ops *next_driver; - -static char toi_prune_hash_algo_name[32] = "sha1"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - struct shash_desc desc; - char *digest; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. 
- */ -static int toi_prune_crypto_prepare(void) -{ - int cpu, ret, digestsize; - - if (!*toi_prune_hash_algo_name) { - printk(KERN_INFO "TuxOnIce: Pruning enabled but no " - "hash algorithm set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0); - if (IS_ERR(this->desc.tfm)) { - printk(KERN_INFO "TuxOnIce: Failed to allocate the " - "%s prune hash algorithm.\n", - toi_prune_hash_algo_name); - this->desc.tfm = NULL; - return 1; - } - - if (!digestsize) - digestsize = crypto_shash_digestsize(this->desc.tfm); - - this->digest = kmalloc(digestsize, GFP_KERNEL); - if (!this->digest) { - printk(KERN_INFO "TuxOnIce: Failed to allocate space " - "for digest output.\n"); - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - } - - this->desc.flags = 0; - - ret = crypto_shash_init(&this->desc); - if (ret < 0) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s prune hash algorithm.\n", - toi_prune_hash_algo_name); - kfree(this->digest); - this->digest = NULL; - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - return 1; - } - } - - return 0; -} - -static int toi_prune_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->desc.tfm) { - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - } - - if (this->digest) { - kfree(this->digest); - this->digest = NULL; - } - } - - return 0; -} - -/* - * toi_prune_init - */ - -static int toi_prune_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_pruned_pages = 0; - - next_driver = toi_get_next_filter(&toi_prune_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_prune_rw_init() - */ - -static int toi_prune_rw_init(int rw, int stream_number) -{ - if (toi_prune_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise prune " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "pruning the image.\n"); - toi_prune_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_prune_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be checked. - * - * Returns: 0 on success. Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. 
- */ -static int toi_prune_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(), write_data = 1; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - void *buffer_start; - u32 buf[4]; - - if (ctx->desc.tfm) { - - buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, &ctx->digest); - if (ret) { - printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret); - } else { - mutex_lock(&stats_lock); - - toi_pruned_pages++; - - mutex_unlock(&stats_lock); - - } - - TOI_UNMAP(buf_type, buffer_page); - } - - if (write_data) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - else - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_prune_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules or from a previously loaded page and - * fill the input buffer. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_prune_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->desc.tfm) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't handle - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, buf_type, buffer_page, &len); - - if (len == PRUNE_DATA_IS_PFN) { - buffer_start = kmap(buffer_page); - } - - return ret; -} - -/* - * toi_prune_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_prune_print_debug_stats(char *buffer, int size) -{ - int len; - - /* Output the number of pages pruned. */ - if (*toi_prune_hash_algo_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_prune_hash_algo_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (toi_pruned_pages) - len += scnprintf(buffer+len, size - len, " Pruned " - "%lu pages).\n", - toi_pruned_pages); - return len; -} - -/* - * toi_prune_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_prune_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_prune_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_prune_hash_algo_name) + 1; -} - -/* - * toi_prune_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. 
- */ -static int toi_prune_save_config_info(char *buffer) -{ - int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_pruned_pages; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_prune_hash_algo_name, len); - return offset + len; -} - -/* toi_prune_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for passing back to the - * resumed kernel. - */ -static void toi_prune_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_pruned_pages = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_prune_hash_algo_name, buffer + offset, len); -} - -static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->pruned_pages = toi_pruned_pages; -} - -static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_pruned_pages = bkd->pruned_pages; -} - -/* - * toi_expected_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 - we have no idea how many pages will be pruned. - */ - -static int toi_prune_expected_ratio(void) -{ - return 100; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_prune_ops = { - .type = FILTER_MODULE, - .name = "prune", - .directory = "prune", - .module = THIS_MODULE, - .initialise = toi_prune_init, - .memory_needed = toi_prune_memory_needed, - .print_debug_info = toi_prune_print_debug_stats, - .save_config_info = toi_prune_save_config_info, - .load_config_info = toi_prune_load_config_info, - .storage_needed = toi_prune_storage_needed, - .expected_compression = toi_prune_expected_ratio, - - .pre_atomic_restore = toi_prune_pre_atomic_restore, - .post_atomic_restore = toi_prune_post_atomic_restore, - - .rw_init = toi_prune_rw_init, - .rw_cleanup = toi_prune_rw_cleanup, - - .write_page = toi_prune_write_page, - .read_page = toi_prune_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_prune_load(void) -{ - return toi_register_module(&toi_prune_ops); -} - -late_initcall(toi_prune_load); diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c deleted file mode 100644 index d8539c275..000000000 --- a/kernel/power/tuxonice_storage.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * kernel/power/tuxonice_storage.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for talking to a userspace program that manages storage. 
- * - * The kernel side: - * - starts the userspace program; - * - sends messages telling it when to open and close the connection; - * - tells it when to quit; - * - * The user space side: - * - passes messages regarding status; - * - */ - -#include -#include - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_netlink.h" -#include "tuxonice_storage.h" -#include "tuxonice_ui.h" - -static struct user_helper_data usm_helper_data; -static struct toi_module_ops usm_ops; -static int message_received, usm_prepare_count; -static int storage_manager_last_action, storage_manager_action; - -static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USM_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) - return -EBUSY; - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USM_MSG_SUCCESS: - case USM_MSG_FAILED: - message_received = type; - complete(&usm_helper_data.wait_for_process); - break; - default: - printk(KERN_INFO "Storage manager doesn't recognise " - "message %d.\n", type); - } - - return 1; -} - -#ifdef CONFIG_NET -static int activations; - -int toi_activate_storage(int force) -{ - int tries = 1; - - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations++; - - if (activations > 1 && !force) - return 0; - - while ((!message_received || message_received == USM_MSG_FAILED) && - tries < 2) { - toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " - "%d.\n", tries); - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_CONNECT, - NULL, 0); - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&usm_helper_data.wait_for_process, - 2*HZ); - - tries++; - } - - return 0; -} - -int toi_deactivate_storage(int force) -{ - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations--; - - if (activations && !force) - return 0; - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_DISCONNECT, - NULL, 0); - - wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); - - if (!message_received || message_received == USM_MSG_FAILED) { - printk(KERN_INFO "Returning failure disconnecting storage.\n"); - return 1; - } - - return 0; -} -#endif - -static void storage_manager_simulate(void) -{ - printk(KERN_INFO "--- Storage manager simulate ---\n"); - toi_prepare_usm(); - schedule(); - printk(KERN_INFO "--- Activate storage 1 ---\n"); - toi_activate_storage(1); - schedule(); - printk(KERN_INFO "--- Deactivate storage 1 ---\n"); - toi_deactivate_storage(1); - schedule(); - printk(KERN_INFO "--- Cleanup usm ---\n"); - toi_cleanup_usm(); - schedule(); - printk(KERN_INFO "--- Storage manager simulate ends ---\n"); -} - -static int usm_storage_needed(void) -{ - return sizeof(int) + strlen(usm_helper_data.program) + 1; -} - -static int usm_save_config_info(char *buf) -{ - int len = strlen(usm_helper_data.program); - memcpy(buf, usm_helper_data.program, len + 1); - return sizeof(int) + len + 1; -} - 
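
A note on the layout used here: usm_load_config_info() below reads an int length from the start of the saved buffer and copies the program path from buf + sizeof(int), while usm_save_config_info() above writes the path at offset 0 and only accounts for the leading int in its return value, so the two sides disagree about where the string lives. The following is a minimal standalone sketch of the length-prefixed layout that the loader and usm_storage_needed() expect; the helper names and the example path are hypothetical, and a fixed-size array stands in for the image-header buffer.

#include <stdio.h>
#include <string.h>

/* 255 = the 254-character "program" sysfs limit plus a trailing NUL. */
#define USM_PROGRAM_MAX 255

/* Hypothetical save helper: length first, then the NUL-terminated path. */
static int usm_save_sketch(char *buf, const char *program)
{
    int len = (int) strlen(program) + 1;        /* include the NUL */

    memcpy(buf, &len, sizeof(int));
    memcpy(buf + sizeof(int), program, len);
    return sizeof(int) + len;                   /* header bytes consumed */
}

/* Hypothetical load helper: mirrors what usm_load_config_info() reads. */
static void usm_load_sketch(const char *buf, char *program)
{
    int len;

    memcpy(&len, buf, sizeof(int));
    if (len > USM_PROGRAM_MAX)                  /* don't trust a stored length blindly */
        len = USM_PROGRAM_MAX;
    memcpy(program, buf + sizeof(int), len);
    program[USM_PROGRAM_MAX - 1] = '\0';
}

int main(void)
{
    char header[sizeof(int) + USM_PROGRAM_MAX];
    char program[USM_PROGRAM_MAX] = "";

    usm_save_sketch(header, "/sbin/example-usm");   /* example path only */
    usm_load_sketch(header, program);
    printf("round-tripped: %s\n", program);
    return 0;
}

Whichever layout is chosen, both sides must agree byte for byte: the buffer is written into the image header at hibernate time and parsed again by the kernel that resumes, which would otherwise reload a garbage program path.
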
-static void usm_load_config_info(char *buf, int size) -{ - /* Don't load the saved path if one has already been set */ - if (usm_helper_data.program[0]) - return; - - memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); -} - -static int usm_memory_needed(void) -{ - /* ball park figure of 32 pages */ - return 32 * PAGE_SIZE; -} - -/* toi_prepare_usm - */ -int toi_prepare_usm(void) -{ - usm_prepare_count++; - - if (usm_prepare_count > 1 || !usm_ops.enabled) - return 0; - - usm_helper_data.pid = -1; - - if (!*usm_helper_data.program) - return 0; - - toi_netlink_setup(&usm_helper_data); - - if (usm_helper_data.pid == -1) - printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" - " start it.\n"); - - toi_activate_storage(0); - - return usm_helper_data.pid != -1; -} - -void toi_cleanup_usm(void) -{ - usm_prepare_count--; - - if (usm_helper_data.pid > -1 && !usm_prepare_count) { - toi_deactivate_storage(0); - toi_netlink_close(&usm_helper_data); - } -} - -static void storage_manager_activate(void) -{ - if (storage_manager_action == storage_manager_last_action) - return; - - if (storage_manager_action) - toi_prepare_usm(); - else - toi_cleanup_usm(); - - storage_manager_last_action = storage_manager_action; -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), - SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, - NULL), - SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, - 0, storage_manager_activate) -}; - -static struct toi_module_ops usm_ops = { - .type = MISC_MODULE, - .name = "usm", - .directory = "storage_manager", - .module = THIS_MODULE, - .storage_needed = usm_storage_needed, - .save_config_info = usm_save_config_info, - .load_config_info = usm_load_config_info, - .memory_needed = usm_memory_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* toi_usm_sysfs_init - * Description: Boot time initialisation for user interface. - */ -int toi_usm_init(void) -{ - usm_helper_data.nl = NULL; - usm_helper_data.program[0] = '\0'; - usm_helper_data.pid = -1; - usm_helper_data.skb_size = 0; - usm_helper_data.pool_limit = 6; - usm_helper_data.netlink_id = NETLINK_TOI_USM; - usm_helper_data.name = "userspace storage manager"; - usm_helper_data.rcv_msg = usm_user_rcv_msg; - usm_helper_data.interface_version = 2; - usm_helper_data.must_init = 0; - init_completion(&usm_helper_data.wait_for_process); - - return toi_register_module(&usm_ops); -} - -void toi_usm_exit(void) -{ - toi_netlink_close_complete(&usm_helper_data); - toi_unregister_module(&usm_ops); -} diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h deleted file mode 100644 index 0189c8888..000000000 --- a/kernel/power/tuxonice_storage.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_storage.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- */ - -#ifdef CONFIG_NET -int toi_prepare_usm(void); -void toi_cleanup_usm(void); - -int toi_activate_storage(int force); -int toi_deactivate_storage(int force); -extern int toi_usm_init(void); -extern void toi_usm_exit(void); -#else -static inline int toi_usm_init(void) { return 0; } -static inline void toi_usm_exit(void) { } - -static inline int toi_activate_storage(int force) -{ - return 0; -} - -static inline int toi_deactivate_storage(int force) -{ - return 0; -} - -static inline int toi_prepare_usm(void) { return 0; } -static inline void toi_cleanup_usm(void) { } -#endif - -enum { - USM_MSG_BASE = 0x10, - - /* Kernel -> Userspace */ - USM_MSG_CONNECT = 0x30, - USM_MSG_DISCONNECT = 0x31, - USM_MSG_SUCCESS = 0x40, - USM_MSG_FAILED = 0x41, - - USM_MSG_MAX, -}; diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c deleted file mode 100644 index 9f555c932..000000000 --- a/kernel/power/tuxonice_swap.c +++ /dev/null @@ -1,474 +0,0 @@ -/* - * kernel/power/tuxonice_swap.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of swap space as a - * backing store. - */ - -#include -#include -#include -#include -#include -#include - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_extent.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct toi_module_ops toi_swapops; - -/* For swapfile automatically swapon/off'd. */ -static char swapfilename[255] = ""; -static int toi_swapon_status; - -/* Swap Pages */ -static unsigned long swap_allocated; - -static struct sysinfo swapinfo; - -static int is_ram_backed(struct swap_info_struct *si) -{ - if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || - !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) - return 1; - - return 0; -} - -/** - * enable_swapfile: Swapon the user specified swapfile prior to hibernating. - * - * Activate the given swapfile if it wasn't already enabled. Remember whether - * we really did swapon it for swapoffing later. - */ -static void enable_swapfile(void) -{ - int activateswapresult = -EINVAL; - - if (swapfilename[0]) { - /* Attempt to swap on with maximum priority */ - activateswapresult = sys_swapon(swapfilename, 0xFFFF); - if (activateswapresult && activateswapresult != -EBUSY) - printk(KERN_ERR "TuxOnIce: The swapfile/partition " - "specified by /sys/power/tuxonice/swap/swapfile" - " (%s) could not be turned on (error %d). " - "Attempting to continue.\n", - swapfilename, activateswapresult); - if (!activateswapresult) - toi_swapon_status = 1; - } -} - -/** - * disable_swapfile: Swapoff any file swaponed at the start of the cycle. - * - * If we did successfully swapon a file at the start of the cycle, swapoff - * it now (finishing up). 
- */ -static void disable_swapfile(void) -{ - if (!toi_swapon_status) - return; - - sys_swapoff(swapfilename); - toi_swapon_status = 0; -} - -static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, - unsigned long start, unsigned long end) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " - "chain %p.", start << chain->bmap_shift, - end << chain->bmap_shift, chain); - - return toi_add_to_extent_chain(&chain->blocks, start, end); -} - - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long address, extent_min = 0, extent_max = 0; - int empty = 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " - "chain %d.", chain->allocator_index); - - if (!chain->allocations.first) - return 0; - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - toi_extent_for_each(&chain->allocations, extentpointer, address) { - swp_entry_t swap_address = (swp_entry_t) { address }; - struct block_device *bdev; - sector_t new_sector = map_swap_entry(swap_address, &bdev); - - if (empty) { - empty = 0; - extent_min = extent_max = new_sector; - continue; - } - - if (new_sector == extent_max + 1) { - extent_max++; - continue; - } - - if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block " - "chains.\n"); - return -ENOMEM; - } - - extent_min = new_sector; - extent_max = new_sector; - } - - if (!empty && - add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block chains.\n"); - return -ENOMEM; - } - - return 0; -} - -/* - * Like si_swapinfo, except that we don't include ram backed swap (compcache!) - * and don't need to use the spinlocks (userspace is stopped when this - * function is called). - */ -void si_swapinfo_no_compcache(void) -{ - unsigned int i; - - si_swapinfo(&swapinfo); - swapinfo.freeswap = 0; - swapinfo.totalswap = 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { - swapinfo.totalswap += si->inuse_pages; - swapinfo.freeswap += si->pages - si->inuse_pages; - } - } -} -/* - * We can't just remember the value from allocation time, because other - * processes might have allocated swap in the mean time. 
- */ -static unsigned long toi_swap_storage_available(void) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); - si_swapinfo_no_compcache(); - return swapinfo.freeswap + swap_allocated; -} - -static int toi_swap_initialise(int starting_cycle) -{ - if (!starting_cycle) - return 0; - - enable_swapfile(); - return 0; -} - -static void toi_swap_cleanup(int ending_cycle) -{ - if (!ending_cycle) - return; - - disable_swapfile(); -} - -static void toi_swap_free_storage(struct toi_bdev_info *chain) -{ - /* Free swap entries */ - struct hibernate_extent *extentpointer; - unsigned long extentvalue; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", - chain); - - swap_allocated -= chain->allocations.size; - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) - swap_free((swp_entry_t) { extentvalue }); - - toi_put_extent_chain(&chain->allocations); -} - -static void free_swap_range(unsigned long min, unsigned long max) -{ - int j; - - for (j = min; j <= max; j++) - swap_free((swp_entry_t) { j }); - swap_allocated -= (max - min + 1); -} - -/* - * Allocation of a single swap type. Swap priorities are handled at the higher - * level. - */ -static int toi_swap_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long gotten = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" - " allocate %lu pages from device %d.", request, - chain->allocator_index); - - while (gotten < request) { - swp_entry_t start, end; - if (0) { - /* Broken at the moment for SSDs */ - get_swap_range_of_type(chain->allocator_index, &start, &end, - request - gotten + 1); - } else { - start = end = get_swap_page_of_type(chain->allocator_index); - } - if (start.val) { - int added = end.val - start.val + 1; - if (toi_add_to_extent_chain(&chain->allocations, - start.val, end.val)) { - printk(KERN_INFO "Failed to allocate extent for " - "%lu-%lu.\n", start.val, end.val); - free_swap_range(start.val, end.val); - break; - } - gotten += added; - swap_allocated += added; - } else - break; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); - return gotten; -} - -static int toi_swap_register_storage(void) -{ - int i, result = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - struct toi_bdev_info *devinfo; - unsigned char *p; - unsigned char buf[256]; - struct fs_info *fs_info; - - if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) - continue; - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), - GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate devinfo struct for swap " - "device %d.\n", i); - return -ENOMEM; - } - - devinfo->bdev = si->bdev; - devinfo->allocator = &toi_swapops; - devinfo->allocator_index = i; - - fs_info = fs_info_from_block_dev(si->bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!fs_info) - printk("fs_info from block dev returned %d.\n", result); - devinfo->dev_t = si->bdev->bd_dev; - devinfo->prio = si->prio; - devinfo->bmap_shift = 3; - devinfo->blocks_per_page = 1; - - p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); - sprintf(devinfo->name, "swap on %s", p); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" - " Device %d (%lx), prio %d.", i, - (unsigned long) devinfo->dev_t, devinfo->prio); 
- toi_bio_ops.register_storage(devinfo); - } - - return 0; -} - -static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long extentvalue; - unsigned long i = 0, first_freed = 0; - - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) { - i++; - if (i > used) { - swap_free((swp_entry_t) { extentvalue }); - if (!first_freed) - first_freed = extentvalue; - } - } - - return first_freed; -} - -/* - * workspace_size - * - * Description: - * Returns the number of bytes of RAM needed for this - * code to do its work. (Used when calculating whether - * we have enough memory to be able to hibernate & resume). - * - */ -static int toi_swap_memory_needed(void) -{ - return 1; -} - -/* - * Print debug info - * - * Description: - */ -static int toi_swap_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); - if (swapfilename[0]) - len += scnprintf(buffer+len, size-len, - " Attempting to automatically swapon: %s.\n", - swapfilename); - - si_swapinfo_no_compcache(); - - len += scnprintf(buffer+len, size-len, - " Swap available for image: %lu pages.\n", - swapinfo.freeswap + swap_allocated); - - return len; -} - -static int header_locations_read_sysfs(const char *page, int count) -{ - int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; - struct inode *swapf = NULL; - int zone; - char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); - char *path, *output = (char *) page; - int path_len; - - if (!page) - return 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - - if (!si || !(si->flags & SWP_WRITEOK)) - continue; - - if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { - haveswap = 1; - if (!printedpartitionsmessage) { - len += sprintf(output + len, - "For swap partitions, simply use the " - "format: resume=swap:/dev/hda1.\n"); - printedpartitionsmessage = 1; - } - } else { - path_len = 0; - - path = d_path(&si->swap_file->f_path, path_page, - PAGE_SIZE); - path_len = snprintf(path_page, PAGE_SIZE, "%s", path); - - haveswap = 1; - swapf = si->swap_file->f_mapping->host; - zone = bmap(swapf, 0); - if (!zone) { - len += sprintf(output + len, - "Swapfile %s has been corrupted. 
Reuse" - " mkswap on it and try again.\n", - path_page); - } else { - char name_buffer[BDEVNAME_SIZE]; - len += sprintf(output + len, - "For swapfile `%s`," - " use resume=swap:/dev/%s:0x%x.\n", - path_page, - bdevname(si->bdev, name_buffer), - zone << (swapf->i_blkbits - 9)); - } - } - } - - if (!haveswap) - len = sprintf(output, "You need to turn on swap partitions " - "before examining this file.\n"); - - toi_free_page(10, (unsigned long) path_page); - return len; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), - SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, - header_locations_read_sysfs, NULL, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, - attempt_to_parse_resume_device2), -}; - -static struct toi_bio_allocator_ops toi_bio_swapops = { - .register_storage = toi_swap_register_storage, - .storage_available = toi_swap_storage_available, - .allocate_storage = toi_swap_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_swap_free_storage, - .free_unused_storage = toi_swap_free_unused_storage, -}; - -static struct toi_module_ops toi_swapops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "swap storage", - .directory = "swap", - .module = THIS_MODULE, - .memory_needed = toi_swap_memory_needed, - .print_debug_info = toi_swap_print_debug_stats, - .initialise = toi_swap_initialise, - .cleanup = toi_swap_cleanup, - .bio_allocator_ops = &toi_bio_swapops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_swap_load(void) -{ - return toi_register_module(&toi_swapops); -} - -late_initcall(toi_swap_load); diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c deleted file mode 100644 index 77f36dbeb..000000000 --- a/kernel/power/tuxonice_sysfs.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.c - * - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains support for sysfs entries for tuning TuxOnIce. - * - * We have a generic handler that deals with the most common cases, and - * hooks for special handlers to use. - */ - -#include - -#include "tuxonice_sysfs.h" -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_alloc.h" - -static int toi_sysfs_initialised; - -static void toi_initialise_sysfs(void); - -static struct toi_sysfs_data sysfs_params[]; - -#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) - -static void toi_main_wrapper(void) -{ - toi_try_hibernate(); -} - -static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - int len = 0; - int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; - - if (full_prep && toi_start_anything(0)) - return -EBUSY; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - len = (sysfs_data->data.special.read_sysfs) ? 
- (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) - : 0; - break; - case TOI_SYSFS_DATA_BIT: - len = sprintf(page, "%d\n", - -test_bit(sysfs_data->data.bit.bit, - sysfs_data->data.bit.bit_vector)); - break; - case TOI_SYSFS_DATA_INTEGER: - len = sprintf(page, "%d\n", - *(sysfs_data->data.integer.variable)); - break; - case TOI_SYSFS_DATA_LONG: - len = sprintf(page, "%ld\n", - *(sysfs_data->data.a_long.variable)); - break; - case TOI_SYSFS_DATA_UL: - len = sprintf(page, "%lu\n", - *(sysfs_data->data.ul.variable)); - break; - case TOI_SYSFS_DATA_STRING: - len = sprintf(page, "%s\n", - sysfs_data->data.string.variable); - break; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_cleanup_usm(); - - if (full_prep) - toi_finish_anything(0); - - return len; -} - -#define BOUND(_variable, _type) do { \ - if (*_variable < sysfs_data->data._type.minimum) \ - *_variable = sysfs_data->data._type.minimum; \ - else if (*_variable > sysfs_data->data._type.maximum) \ - *_variable = sysfs_data->data._type.maximum; \ -} while (0) - -static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, - const char *my_buf, size_t count) -{ - int assigned_temp_buffer = 0, result = count; - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - - if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) - return -EBUSY; - - ((char *) my_buf)[count] = 0; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - if (sysfs_data->data.special.write_sysfs) - result = (sysfs_data->data.special.write_sysfs)(my_buf, - count); - break; - case TOI_SYSFS_DATA_BIT: - { - unsigned long value; - result = kstrtoul(my_buf, 0, &value); - if (result) - break; - if (value) - set_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - else - clear_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - } - break; - case TOI_SYSFS_DATA_INTEGER: - { - long temp; - result = kstrtol(my_buf, 0, &temp); - if (result) - break; - *(sysfs_data->data.integer.variable) = (int) temp; - BOUND(sysfs_data->data.integer.variable, integer); - break; - } - case TOI_SYSFS_DATA_LONG: - { - long *variable = - sysfs_data->data.a_long.variable; - result = kstrtol(my_buf, 0, variable); - if (result) - break; - BOUND(variable, a_long); - break; - } - case TOI_SYSFS_DATA_UL: - { - unsigned long *variable = - sysfs_data->data.ul.variable; - result = kstrtoul(my_buf, 0, variable); - if (result) - break; - BOUND(variable, ul); - break; - } - break; - case TOI_SYSFS_DATA_STRING: - { - int copy_len = count; - char *variable = - sysfs_data->data.string.variable; - - if (sysfs_data->data.string.max_length && - (copy_len > sysfs_data->data.string.max_length)) - copy_len = sysfs_data->data.string.max_length; - - if (!variable) { - variable = (char *) toi_get_zeroed_page(31, - TOI_ATOMIC_GFP); - sysfs_data->data.string.variable = variable; - assigned_temp_buffer = 1; - } - strncpy(variable, my_buf, copy_len); - if (copy_len && my_buf[copy_len - 1] == '\n') - variable[count - 1] = 0; - variable[count] = 0; - } - break; - } - - if (!result) - result = count; - - /* Side effect routine? 
*/ - if (result == count && sysfs_data->write_side_effect) - sysfs_data->write_side_effect(); - - /* Free temporary buffers */ - if (assigned_temp_buffer) { - toi_free_page(31, - (unsigned long) sysfs_data->data.string.variable); - sysfs_data->data.string.variable = NULL; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_cleanup_usm(); - - toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); - - return result; -} - -static struct sysfs_ops toi_sysfs_ops = { - .show = &toi_attr_show, - .store = &toi_attr_store, -}; - -static struct kobj_type toi_ktype = { - .sysfs_ops = &toi_sysfs_ops, -}; - -struct kobject *tuxonice_kobj; - -/* Non-module sysfs entries. - * - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_HIBERNATING, toi_main_wrapper), - SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_RESUMING, toi_try_resume) -}; - -void remove_toi_sysdir(struct kobject *kobj) -{ - if (!kobj) - return; - - kobject_put(kobj); -} - -struct kobject *make_toi_sysdir(char *name) -{ - struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); - - if (!kobj) { - printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " - "dir!\n"); - return NULL; - } - - kobj->ktype = &toi_ktype; - - return kobj; -} - -/* toi_register_sysfs_file - * - * Helper for registering a new /sysfs/tuxonice entry. - */ - -int toi_register_sysfs_file( - struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - int result; - - if (!toi_sysfs_initialised) - toi_initialise_sysfs(); - - result = sysfs_create_file(kobj, &toi_sysfs_data->attr); - if (result) - printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " - "returned %d.\n", - toi_sysfs_data->attr.name, result); - kobj->ktype = &toi_ktype; - - return result; -} - -/* toi_unregister_sysfs_file - * - * Helper for removing unwanted /sys/power/tuxonice entries. - * - */ -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - sysfs_remove_file(kobj, &toi_sysfs_data->attr); -} - -void toi_cleanup_sysfs(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (!toi_sysfs_initialised) - return; - - for (i = 0; i < numfiles; i++) - toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - kobject_put(tuxonice_kobj); - toi_sysfs_initialised = 0; -} - -/* toi_initialise_sysfs - * - * Initialise the /sysfs/tuxonice directory. 
- */ - -static void toi_initialise_sysfs(void) -{ - int i; - int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (toi_sysfs_initialised) - return; - - /* Make our TuxOnIce directory a child of /sys/power */ - tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); - if (!tuxonice_kobj) - return; - - toi_sysfs_initialised = 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); -} - -int toi_sysfs_init(void) -{ - toi_initialise_sysfs(); - return 0; -} - -void toi_sysfs_exit(void) -{ - toi_cleanup_sysfs(); -} diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h deleted file mode 100644 index 1de954ce1..000000000 --- a/kernel/power/tuxonice_sysfs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#include - -struct toi_sysfs_data { - struct attribute attr; - int type; - int flags; - union { - struct { - unsigned long *bit_vector; - int bit; - } bit; - struct { - int *variable; - int minimum; - int maximum; - } integer; - struct { - long *variable; - long minimum; - long maximum; - } a_long; - struct { - unsigned long *variable; - unsigned long minimum; - unsigned long maximum; - } ul; - struct { - char *variable; - int max_length; - } string; - struct { - int (*read_sysfs) (const char *buffer, int count); - int (*write_sysfs) (const char *buffer, int count); - void *data; - } special; - } data; - - /* Side effects routine. Used, eg, for reparsing the - * resume= entry when it changes */ - void (*write_side_effect) (void); - struct list_head sysfs_data_list; -}; - -enum { - TOI_SYSFS_DATA_NONE = 1, - TOI_SYSFS_DATA_CUSTOM, - TOI_SYSFS_DATA_BIT, - TOI_SYSFS_DATA_INTEGER, - TOI_SYSFS_DATA_UL, - TOI_SYSFS_DATA_LONG, - TOI_SYSFS_DATA_STRING -}; - -#define SYSFS_WRITEONLY 0200 -#define SYSFS_READONLY 0444 -#define SYSFS_RW 0644 - -#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_BIT, \ - .flags = _flags, \ - .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } - -#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_INTEGER, \ - .flags = _flags, \ - .data = { .integer = { .variable = _int, .minimum = _min, \ - .maximum = _max } }, \ - .write_side_effect = _wse } - -#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_UL, \ - .flags = _flags, \ - .data = { .ul = { .variable = _ul, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_LONG, \ - .flags = _flags, \ - .data = { .a_long = { .variable = _long, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_STRING, \ - .flags = _flags, \ - .data = { .string = { .variable = _string, .max_length = _max_len } }, \ - .write_side_effect = _wse } - -#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_CUSTOM, \ - .flags = _flags, \ - .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \ - 
.write_side_effect = _wse } - -#define SYSFS_NONE(_name, _wse) { \ - .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ - .type = TOI_SYSFS_DATA_NONE, \ - .write_side_effect = _wse, \ -} - -/* Flags */ -#define SYSFS_NEEDS_SM_FOR_READ 1 -#define SYSFS_NEEDS_SM_FOR_WRITE 2 -#define SYSFS_HIBERNATE 4 -#define SYSFS_RESUME 8 -#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) -#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_NEEDS_SM_FOR_BOTH \ - (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) - -int toi_register_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); - -extern struct kobject *tuxonice_kobj; - -struct kobject *make_toi_sysdir(char *name); -void remove_toi_sysdir(struct kobject *obj); -extern void toi_cleanup_sysfs(void); - -extern int toi_sysfs_init(void); -extern void toi_sysfs_exit(void); diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c deleted file mode 100644 index 76152f3ff..000000000 --- a/kernel/power/tuxonice_ui.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * kernel/power/tuxonice_ui.c - * - * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001,2002 Pavel Machek - * Copyright (C) 2002-2003 Florent Chabaud - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" -#include "tuxonice_builtin.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ -struct ui_ops *toi_current_ui; - -/** - * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress, either from userui or /dev/console if userui isn't - * available. The non-userui path is particularly for at boot-time, prior - * to userui being started, when we have an important warning to give to - * the user. - */ -static char toi_wait_for_keypress(int timeout) -{ - if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) - return ' '; - - return toi_wait_for_keypress_dev_console(timeout); -} - -/* toi_early_boot_message() - * Description: Handle errors early in the process of booting. - * The user may press C to continue booting, perhaps - * invalidating the image, or space to reboot. - * This works from either the serial console or normally - * attached keyboard. - * - * Note that we come in here from init, while the kernel is - * locked. If we want to get events from the serial console, - * we need to temporarily unlock the kernel. - * - * toi_early_boot_message may also be called post-boot. - * In this case, it simply printks the message and returns. - * - * Arguments: int Whether we are able to erase the image. - * int default_answer. What to do when we timeout. 
This - * will normally be continue, but the user might - * provide command line options (__setup) to override - * particular cases. - * Char *. Pointer to a string explaining why we're moaning. - */ - -#define say(message, a...) printk(KERN_EMERG message, ##a) - -void toi_early_boot_message(int message_detail, int default_answer, - char *warning_reason, ...) -{ -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - unsigned long orig_state = get_toi_state(), continue_req = 0; - unsigned long orig_loglevel = console_loglevel; - int can_ask = 1; -#else - int can_ask = 0; -#endif - - va_list args; - int printed_len; - - if (!toi_wait) { - set_toi_state(TOI_CONTINUE_REQ); - can_ask = 0; - } - - if (warning_reason) { - va_start(args, warning_reason); - printed_len = vsnprintf(local_printf_buf, - sizeof(local_printf_buf), - warning_reason, - args); - va_end(args); - } - - if (!test_toi_state(TOI_BOOT_TIME)) { - printk("TuxOnIce: %s\n", local_printf_buf); - return; - } - - if (!can_ask) { - continue_req = !!default_answer; - goto post_ask; - } - -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - console_loglevel = 7; - - say("=== TuxOnIce ===\n\n"); - if (warning_reason) { - say("BIG FAT WARNING!! %s\n\n", local_printf_buf); - switch (message_detail) { - case 0: - say("If you continue booting, note that any image WILL" - "NOT BE REMOVED.\nTuxOnIce is unable to do so " - "because the appropriate modules aren't\n" - "loaded. You should manually remove the image " - "to avoid any\npossibility of corrupting your " - "filesystem(s) later.\n"); - break; - case 1: - say("If you want to use the current TuxOnIce image, " - "reboot and try\nagain with the same kernel " - "that you hibernated from. If you want\n" - "to forget that image, continue and the image " - "will be erased.\n"); - break; - } - say("Press SPACE to reboot or C to continue booting with " - "this kernel\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", - toi_wait, - default_answer == TOI_CONTINUE_REQ ? - "continue booting" : "reboot"); - } else { - say("BIG FAT WARNING!!\n\n" - "You have tried to resume from this image before.\n" - "If it failed once, it may well fail again.\n" - "Would you like to remove the image and boot " - "normally?\nThis will be equivalent to entering " - "noresume on the\nkernel command line.\n\n" - "Press SPACE to remove the image or C to continue " - "resuming.\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", toi_wait, - !!default_answer ? - "continue resuming" : "remove the image"); - } - console_loglevel = orig_loglevel; - - set_toi_state(TOI_SANITY_CHECK_PROMPT); - clear_toi_state(TOI_CONTINUE_REQ); - - if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ - continue_req = !!default_answer; - else - continue_req = test_toi_state(TOI_CONTINUE_REQ); - -#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ - -post_ask: - if ((warning_reason) && (!continue_req)) - kernel_restart(NULL); - - restore_toi_state(orig_state); - if (continue_req) - set_toi_state(TOI_CONTINUE_REQ); -} - -#undef say - -/* - * User interface specific /sys/power/tuxonice entries. 
- */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_INT("default_console_level", SYSFS_RW, - &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), - SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, - 1 << 30, 0), - SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, - 0) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "printk ui", - .directory = "user_interface", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_register_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui) { - printk(KERN_INFO "Only one TuxOnIce user interface module can " - "be loaded at a time."); - return -EBUSY; - } - - toi_current_ui = this_ui; - - return 0; -} - -void toi_remove_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui != this_ui) - return; - - toi_current_ui = NULL; -} - -/* toi_console_sysfs_init - * Description: Boot time initialisation for user interface. - */ - -int toi_ui_init(void) -{ - return toi_register_module(&userui_ops); -} - -void toi_ui_exit(void) -{ - toi_unregister_module(&userui_ops); -} diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h deleted file mode 100644 index 4934e3a91..000000000 --- a/kernel/power/tuxonice_ui.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * kernel/power/tuxonice_ui.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - */ - -enum { - DONT_CLEAR_BAR, - CLEAR_BAR -}; - -enum { - /* Userspace -> Kernel */ - USERUI_MSG_ABORT = 0x11, - USERUI_MSG_SET_STATE = 0x12, - USERUI_MSG_GET_STATE = 0x13, - USERUI_MSG_GET_DEBUG_STATE = 0x14, - USERUI_MSG_SET_DEBUG_STATE = 0x15, - USERUI_MSG_SPACE = 0x18, - USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, - USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, - USERUI_MSG_GET_LOGLEVEL = 0x1C, - USERUI_MSG_SET_LOGLEVEL = 0x1D, - USERUI_MSG_PRINTK = 0x1E, - - /* Kernel -> Userspace */ - USERUI_MSG_MESSAGE = 0x21, - USERUI_MSG_PROGRESS = 0x22, - USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, - - USERUI_MSG_MAX, -}; - -struct userui_msg_params { - u32 a, b, c, d; - char text[255]; -}; - -struct ui_ops { - char (*wait_for_key) (int timeout); - u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); - void (*prepare_status) (int clearbar, const char *fmt, ...); - void (*cond_pause) (int pause, char *message); - void (*abort)(int result_code, const char *fmt, ...); - void (*prepare)(void); - void (*cleanup)(void); - void (*message)(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...); -}; - -extern struct ui_ops *toi_current_ui; - -#define toi_update_status(val, max, fmt, args...) \ - (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \ - max) - -#define toi_prepare_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->prepare)(); \ - } while (0) - -#define toi_cleanup_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->cleanup)(); \ - } while (0) - -#define abort_hibernate(result, fmt, args...) \ - do { if (toi_current_ui) \ - (toi_current_ui->abort)(result, fmt, ##args); \ - else { \ - set_abort_result(result); \ - } \ - } while (0) - -#define toi_cond_pause(pause, message) \ - do { if (toi_current_ui) \ - (toi_current_ui->cond_pause)(pause, message); \ - } while (0) - -#define toi_prepare_status(clear, fmt, args...) 
\ - do { if (toi_current_ui) \ - (toi_current_ui->prepare_status)(clear, fmt, ##args); \ - else \ - printk(KERN_INFO fmt "%s", ##args, "\n"); \ - } while (0) - -#define toi_message(sn, lev, log, fmt, a...) \ -do { \ - if (toi_current_ui && (!sn || test_debug_state(sn))) \ - toi_current_ui->message(sn, lev, log, fmt, ##a); \ -} while (0) - -__exit void toi_ui_cleanup(void); -extern int toi_ui_init(void); -extern void toi_ui_exit(void); -extern int toi_register_ui_ops(struct ui_ops *this_ui); -extern void toi_remove_ui_ops(struct ui_ops *this_ui); diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c deleted file mode 100644 index 6aa5ac3eb..000000000 --- a/kernel/power/tuxonice_userui.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * kernel/power/user_ui.c - * - * Copyright (C) 2005-2007 Bernard Blackham - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ - -static struct user_helper_data ui_helper_data; -static struct toi_module_ops userui_ops; -static int orig_kmsg; - -static char lastheader[512]; -static int lastheader_message_len; -static int ui_helper_changed; /* Used at resume-time so don't overwrite value - set from initrd/ramfs. */ - -/* Number of distinct progress amounts that userspace can display */ -static int progress_granularity = 30; - -static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); -static int userui_wait_should_wake; - -#define toi_stop_waiting_for_userui_key() \ -{ \ - userui_wait_should_wake = true; \ - wake_up_interruptible(&userui_wait_for_key); \ -} - -/** - * ui_nl_set_state - Update toi_action based on a message from userui. - * - * @n: The bit (1 << bit) to set. - */ -static void ui_nl_set_state(int n) -{ - /* Only let them change certain settings */ - static const u32 toi_action_mask = - (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | - (1 << TOI_LOGALL) | - (1 << TOI_SINGLESTEP) | - (1 << TOI_PAUSE_NEAR_PAGESET_END); - static unsigned long new_action; - - new_action = (toi_bkd.toi_action & (~toi_action_mask)) | - (n & toi_action_mask); - - printk(KERN_DEBUG "n is %x. Action flags being changed from %lx " - "to %lx.", n, toi_bkd.toi_action, new_action); - toi_bkd.toi_action = new_action; - - if (!test_action_state(TOI_PAUSE) && - !test_action_state(TOI_SINGLESTEP)) - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_post_atomic_restore - Tell userui that atomic restore just happened. - * - * Tell userui that atomic restore just occured, so that it can do things like - * redrawing the screen, re-getting settings and so on. 
- */ -static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); -} - -/** - * userui_storage_needed - Report how much memory in image header is needed. - */ -static int userui_storage_needed(void) -{ - return sizeof(ui_helper_data.program) + 1 + sizeof(int); -} - -/** - * userui_save_config_info - Fill buffer with config info for image header. - * - * @buf: Buffer into which to put the config info we want to save. - */ -static int userui_save_config_info(char *buf) -{ - *((int *) buf) = progress_granularity; - memcpy(buf + sizeof(int), ui_helper_data.program, - sizeof(ui_helper_data.program)); - return sizeof(ui_helper_data.program) + sizeof(int) + 1; -} - -/** - * userui_load_config_info - Restore config info from buffer. - * - * @buf: Buffer containing header info loaded. - * @size: Size of data loaded for this module. - */ -static void userui_load_config_info(char *buf, int size) -{ - progress_granularity = *((int *) buf); - size -= sizeof(int); - - /* Don't load the saved path if one has already been set */ - if (ui_helper_changed) - return; - - if (size > sizeof(ui_helper_data.program)) - size = sizeof(ui_helper_data.program); - - memcpy(ui_helper_data.program, buf + sizeof(int), size); - ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; -} - -/** - * set_ui_program_set: Record that userui program was changed. - * - * Side effect routine for when the userui program is set. In an initrd or - * ramfs, the user may set a location for the userui program. If this happens, - * we don't want to reload the value that was saved in the image header. This - * routine allows us to flag that we shouldn't restore the program name from - * the image header. - */ -static void set_ui_program_set(void) -{ - ui_helper_changed = 1; -} - -/** - * userui_memory_needed - Tell core how much memory to reserve for us. - */ -static int userui_memory_needed(void) -{ - /* ball park figure of 128 pages */ - return 128 * PAGE_SIZE; -} - -/** - * userui_update_status - Update the progress bar and (if on) in-bar message. - * - * @value: Current progress percentage numerator. - * @maximum: Current progress percentage denominator. - * @fmt: Message to be displayed in the middle of the progress bar. - * - * Note that a NULL message does not mean that any previous message is erased! - * For that, you need toi_prepare_status with clearbar on. - * - * Returns an unsigned long, being the next numerator (as determined by the - * maximum and progress granularity) where status needs to be updated. - * This is to reduce unnecessary calls to update_status. - */ -static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) -{ - static u32 last_step = 9999; - struct userui_msg_params msg; - u32 this_step, next_update; - int bitshift; - - if (ui_helper_data.pid == -1) - return 0; - - if ((!maximum) || (!progress_granularity)) - return maximum; - - if (value < 0) - value = 0; - - if (value > maximum) - value = maximum; - - /* Try to avoid math problems - we can't do 64 bit math here - * (and shouldn't need it - anyone got screen resolution - * of 65536 pixels or more?) 
*/ - bitshift = fls(maximum) - 16; - if (bitshift > 0) { - u32 temp_maximum = maximum >> bitshift; - u32 temp_value = value >> bitshift; - this_step = (u32) - (temp_value * progress_granularity / temp_maximum); - next_update = (((this_step + 1) * temp_maximum / - progress_granularity) + 1) << bitshift; - } else { - this_step = (u32) (value * progress_granularity / maximum); - next_update = ((this_step + 1) * maximum / - progress_granularity) + 1; - } - - if (this_step == last_step) - return next_update; - - memset(&msg, 0, sizeof(msg)); - - msg.a = this_step; - msg.b = progress_granularity; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, - &msg, sizeof(msg)); - last_step = this_step; - - return next_update; -} - -/** - * userui_message - Display a message without necessarily logging it. - * - * @section: Type of message. Messages can be filtered by type. - * @level: Degree of importance of the message. Lower values = higher priority. - * @normally_logged: Whether logged even if log_everything is off. - * @fmt: Message (and parameters). - * - * This function is intended to do the same job as printk, but without normally - * logging what is printed. The point is to be able to get debugging info on - * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" - * - * It may be called from an interrupt context - can't sleep! - */ -static void userui_message(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...) -{ - struct userui_msg_params msg; - - if ((level) && (level > console_loglevel)) - return; - - memset(&msg, 0, sizeof(msg)); - - msg.a = section; - msg.b = level; - msg.c = normally_logged; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - if (test_action_state(TOI_LOGALL)) - printk(KERN_INFO "%s\n", msg.text); - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, - &msg, sizeof(msg)); -} - -/** - * wait_for_key_via_userui - Wait for userui to receive a keypress. - */ -static void wait_for_key_via_userui(void) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&userui_wait_for_key, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake); - userui_wait_should_wake = false; - - set_current_state(TASK_RUNNING); - remove_wait_queue(&userui_wait_for_key, &wait); -} - -/** - * userui_prepare_status - Display high level messages. - * - * @clearbar: Whether to clear the progress bar. - * @fmt...: New message for the title. - * - * Prepare the 'nice display', drawing the header and version, along with the - * current action and perhaps also resetting the progress bar. - */ -static void userui_prepare_status(int clearbar, const char *fmt, ...) -{ - va_list args; - - if (fmt) { - va_start(args, fmt); - lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); - va_end(args); - } - - if (clearbar) - toi_update_status(0, 1, NULL); - - if (ui_helper_data.pid == -1) - printk(KERN_EMERG "%s\n", lastheader); - else - toi_message(0, TOI_STATUS, 1, lastheader, NULL); -} - -/** - * toi_wait_for_keypress - Wait for keypress via userui. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress from userui. - * - * FIXME: Implement timeout? 
- */ -static char userui_wait_for_keypress(int timeout) -{ - char key = '\0'; - - if (ui_helper_data.pid != -1) { - wait_for_key_via_userui(); - key = ' '; - } - - return key; -} - -/** - * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. - * - * @result_code: Reason why we're aborting (1 << bit). - * @fmt: Message to display if telling the user what's going on. - * - * Abort a cycle. If this wasn't at the user's request (and we're displaying - * output), tell the user why and wait for them to acknowledge the message. - */ -static void userui_abort_hibernate(int result_code, const char *fmt, ...) -{ - va_list args; - int printed_len = 0; - - set_result_state(result_code); - - if (test_result_state(TOI_ABORTED)) - return; - - set_result_state(TOI_ABORTED); - - if (test_result_state(TOI_ABORT_REQUESTED)) - return; - - va_start(args, fmt); - printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), - fmt, args); - va_end(args); - if (ui_helper_data.pid != -1) - printed_len = sprintf(local_printf_buf + printed_len, - " (Press SPACE to continue)"); - - toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); - - if (ui_helper_data.pid != -1) - userui_wait_for_keypress(0); -} - -/** - * request_abort_hibernate - Abort hibernating or resuming at user request. - * - * Handle the user requesting the cancellation of a hibernation or resume by - * pressing escape. - */ -static void request_abort_hibernate(void) -{ - if (test_result_state(TOI_ABORT_REQUESTED) || - !test_action_state(TOI_CAN_CANCEL)) - return; - - if (test_toi_state(TOI_NOW_RESUMING)) { - toi_prepare_status(CLEAR_BAR, "Escape pressed. " - "Powering down again."); - set_toi_state(TOI_STOP_RESUME); - while (!test_toi_state(TOI_IO_STOPPED)) - schedule(); - if (toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(0); - toi_power_down(); - } - - toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" - " ABORTING HIBERNATION ---"); - set_abort_result(TOI_ABORT_REQUESTED); - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_user_rcv_msg - Receive a netlink message from userui. - * - * @skb: skb received. - * @nlh: Netlink header received. 
- */ -static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USERUI_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { - printk(KERN_INFO "Got NOFREEZE_ME request when " - "ui_helper_data.pid is %d.\n", ui_helper_data.pid); - return -EBUSY; - } - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USERUI_MSG_ABORT: - request_abort_hibernate(); - return 0; - case USERUI_MSG_GET_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_STATE, &toi_bkd.toi_action, - sizeof(toi_bkd.toi_action)); - return 0; - case USERUI_MSG_GET_DEBUG_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_DEBUG_STATE, - &toi_bkd.toi_debug_state, - sizeof(toi_bkd.toi_debug_state)); - return 0; - case USERUI_MSG_SET_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - ui_nl_set_state(*data); - return 0; - case USERUI_MSG_SET_DEBUG_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_debug_state = (*data); - return 0; - case USERUI_MSG_SPACE: - toi_stop_waiting_for_userui_key(); - return 0; - case USERUI_MSG_GET_POWERDOWN_METHOD: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_POWERDOWN_METHOD, - &toi_poweroff_method, - sizeof(toi_poweroff_method)); - return 0; - case USERUI_MSG_SET_POWERDOWN_METHOD: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) - return -EINVAL; - toi_poweroff_method = (unsigned long)(*data); - return 0; - case USERUI_MSG_GET_LOGLEVEL: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_LOGLEVEL, - &toi_bkd.toi_default_console_level, - sizeof(toi_bkd.toi_default_console_level)); - return 0; - case USERUI_MSG_SET_LOGLEVEL: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_default_console_level = (*data); - return 0; - case USERUI_MSG_PRINTK: - printk(KERN_INFO "%s", (char *) data); - return 0; - } - - /* Unhandled here */ - return 1; -} - -/** - * userui_cond_pause - Possibly pause at user request. - * - * @pause: Whether to pause or just display the message. - * @message: Message to display at the start of pausing. - * - * Potentially pause and wait for the user to tell us to continue. We normally - * only pause when @pause is set. While paused, the user can do things like - * changing the loglevel, toggling the display of debugging sections and such - * like. - */ -static void userui_cond_pause(int pause, char *message) -{ - int displayed_message = 0, last_key = 0; - - while (last_key != 32 && - ui_helper_data.pid != -1 && - ((test_action_state(TOI_PAUSE) && pause) || - (test_action_state(TOI_SINGLESTEP)))) { - if (!displayed_message) { - toi_prepare_status(DONT_CLEAR_BAR, - "%s Press SPACE to continue.%s", - message ? message : "", - (test_action_state(TOI_SINGLESTEP)) ? - " Single step on." : ""); - displayed_message = 1; - } - last_key = userui_wait_for_keypress(0); - } - schedule(); -} - -/** - * userui_prepare_console - Prepare the console for use. - * - * Prepare a console for use, saving current kmsg settings and attempting to - * start userui. Console loglevel changes are handled by userui. 
- */ -static void userui_prepare_console(void) -{ - orig_kmsg = vt_kmsg_redirect(fg_console + 1); - - ui_helper_data.pid = -1; - - if (!userui_ops.enabled) { - printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); - return; - } - - if (*ui_helper_data.program) - toi_netlink_setup(&ui_helper_data); - else - printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); -} - -/** - * userui_cleanup_console - Cleanup after a cycle. - * - * Tell userui to cleanup, and restore kmsg_redirect to its original value. - */ - -static void userui_cleanup_console(void) -{ - if (ui_helper_data.pid > -1) - toi_netlink_close(&ui_helper_data); - - vt_kmsg_redirect(orig_kmsg); -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, - TOI_CAN_CANCEL, 0), - SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAUSE, 0), - SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), - SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, - 2048, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, - set_ui_program_set), - SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_MODULE, - .name = "userui", - .shared_directory = "user_interface", - .module = THIS_MODULE, - .storage_needed = userui_storage_needed, - .save_config_info = userui_save_config_info, - .load_config_info = userui_load_config_info, - .memory_needed = userui_memory_needed, - .post_atomic_restore = userui_post_atomic_restore, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -static struct ui_ops my_ui_ops = { - .update_status = userui_update_status, - .message = userui_message, - .prepare_status = userui_prepare_status, - .abort = userui_abort_hibernate, - .cond_pause = userui_cond_pause, - .prepare = userui_prepare_console, - .cleanup = userui_cleanup_console, - .wait_for_key = userui_wait_for_keypress, -}; - -/** - * toi_user_ui_init - Boot time initialisation for user interface. - * - * Invoked from the core init routine. 
- */ -static __init int toi_user_ui_init(void) -{ - int result; - - ui_helper_data.nl = NULL; - strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); - ui_helper_data.pid = -1; - ui_helper_data.skb_size = sizeof(struct userui_msg_params); - ui_helper_data.pool_limit = 6; - ui_helper_data.netlink_id = NETLINK_TOI_USERUI; - ui_helper_data.name = "userspace ui"; - ui_helper_data.rcv_msg = userui_user_rcv_msg; - ui_helper_data.interface_version = 8; - ui_helper_data.must_init = 0; - ui_helper_data.not_ready = userui_cleanup_console; - init_completion(&ui_helper_data.wait_for_process); - result = toi_register_module(&userui_ops); - if (!result) { - result = toi_register_ui_ops(&my_ui_ops); - if (result) - toi_unregister_module(&userui_ops); - } - - return result; -} - -late_initcall(toi_user_ui_init); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8c1204fa3..a787aa942 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,8 +15,8 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -ifdef CONFIG_SCHED_BFS -obj-y += bfs.o clock.o +ifdef CONFIG_SCHED_MUQSS +obj-y += MuQSS.o clock.o else obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c new file mode 100644 index 000000000..6a656ad4b --- /dev/null +++ b/kernel/sched/MuQSS.c @@ -0,0 +1,8247 @@ +/* + * kernel/sched/MuQSS.c, was kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz + * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes + * a whole lot of those previous things. + * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS + * scheduler by Con Kolivas. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef CONFIG_PARAVIRT +#include +#endif + +#include "cpupri.h" +#include "../workqueue_internal.h" +#include "../smpboot.h" + +#define CREATE_TRACE_POINTS +#include + +#include "MuQSS.h" + +#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) +#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ + (policy) == SCHED_RR) +#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) + +#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) +#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) +#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) + +#define is_iso_policy(policy) ((policy) == SCHED_ISO) +#define iso_task(p) unlikely(is_iso_policy((p)->policy)) +#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) + +#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) + +#define ISO_PERIOD (5 * HZ) + +#define STOP_PRIO (MAX_RT_PRIO - 1) + +/* + * Some helpers for converting to/from various scales. Use shifts to get + * approximate multiples of ten for less overhead. + */ +#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ)) +#define JIFFY_NS (1073741824 / HZ) +#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) +#define HALF_JIFFY_NS (1073741824 / HZ / 2) +#define HALF_JIFFY_US (1048576 / HZ / 2) +#define MS_TO_NS(TIME) ((TIME) << 20) +#define MS_TO_US(TIME) ((TIME) << 10) +#define NS_TO_MS(TIME) ((TIME) >> 20) +#define NS_TO_US(TIME) ((TIME) >> 10) + +#define RESCHED_US (100) /* Reschedule if less than this many μs left */ + +void print_scheduler_version(void) +{ + printk(KERN_INFO "MuQSS CPU scheduler v0.114 by Con Kolivas.\n"); +} + +/* + * This is the time all tasks within the same priority round robin. + * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. + * Tunable via /proc interface. + */ +#ifdef CONFIG_PCK_INTERACTIVE +int rr_interval __read_mostly = 3; +#else +int rr_interval __read_mostly = 6; +#endif + +/* Tunable to choose whether to prioritise latency or throughput, simple + * binary yes or no */ + +int sched_interactive __read_mostly = 1; + +/* + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks + * are allowed to run five seconds as real time tasks. This is the total over + * all online cpus. + */ +#ifdef CONFIG_PCK_INTERACTIVE +int sched_iso_cpu __read_mostly = 25; +#else +int sched_iso_cpu __read_mostly = 70; +#endif + +/* + * The relative length of deadline for each priority(nice) level. + */ +static int prio_ratios[NICE_WIDTH] __read_mostly; + +/* + * The quota handed out to tasks of all priority levels when refilling their + * time_slice. + */ +static inline int timeslice(void) +{ + return MS_TO_US(rr_interval); +} + +/* + * The global runqueue data that all CPUs work off. Contains either atomic + * variables and a cpu bitmap set atomically. 
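+ *
+ * Illustrative note (an editor's addition, not part of the original
+ * patch): the practical effect is that writers such as inc_qnr() and
+ * dec_qnr() further down do a bare atomic_inc()/atomic_dec() on
+ * grq.qnr, and readers such as queued_notrunning() a bare
+ * atomic_read(), so no BFS-style global runqueue lock is ever taken
+ * around this structure.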
+ */ +struct global_rq { +#ifdef CONFIG_SMP + atomic_t nr_running ____cacheline_aligned_in_smp; + atomic_t nr_uninterruptible ____cacheline_aligned_in_smp; + atomic64_t nr_switches ____cacheline_aligned_in_smp; + atomic_t qnr ____cacheline_aligned_in_smp; /* queued not running */ +#else + atomic_t nr_running ____cacheline_aligned; + atomic_t nr_uninterruptible ____cacheline_aligned; + atomic64_t nr_switches ____cacheline_aligned; + atomic_t qnr ____cacheline_aligned; /* queued not running */ +#endif +#ifdef CONFIG_SMP + cpumask_t cpu_idle_map; +#endif +}; + +#ifdef CONFIG_SMP +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; +}; + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +#endif /* CONFIG_SMP */ + +/* There can be only one */ +#ifdef CONFIG_SMP +static struct global_rq grq ____cacheline_aligned_in_smp; +#else +static struct global_rq grq ____cacheline_aligned; +#endif + +static DEFINE_MUTEX(sched_hotcpu_mutex); + +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_SMP +struct rq *cpu_rq(int cpu) +{ + return &per_cpu(runqueues, (cpu)); +} +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +/* + * sched_domains_mutex serialises calls to init_sched_domains, + * detach_destroy_domains and partition_sched_domains. + */ +DEFINE_MUTEX(sched_domains_mutex); + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} +#else +struct rq *uprq; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +static inline int cpu_of(struct rq *rq) +{ + return rq->cpu; +} +#else /* CONFIG_SMP */ +static inline int cpu_of(struct rq *rq) +{ + return 0; +} +#endif + +#include "stats.h" + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + +/* + * All common locking functions performed on rq->lock. rq->clock is local to + * the CPU accessing it so it can be modified just with interrupts disabled + * when we're not updating niffies. + * Looking up task_rq must be done under rq->lock to be safe. + */ +static void update_rq_clock_task(struct rq *rq, s64 delta); + +static inline void update_rq_clock(struct rq *rq) +{ + s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + + if (unlikely(delta < 0)) + return; + rq->clock += delta; + update_rq_clock_task(rq, delta); +} + +/* + * Niffies are a globally increasing nanosecond counter. 
They're only used by + * update_load_avg and time_slice_expired, however deadlines are based on them + * across CPUs. Update them whenever we will call one of those functions, and + * synchronise them across CPUs whenever we hold both runqueue locks. + */ +static inline void update_clocks(struct rq *rq) +{ + s64 ndiff, minndiff; + long jdiff; + + update_rq_clock(rq); + ndiff = rq->clock - rq->old_clock; + rq->old_clock = rq->clock; + jdiff = jiffies - rq->last_jiffy; + + /* Subtract any niffies added by balancing with other rqs */ + ndiff -= rq->niffies - rq->last_niffy; + minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; + if (minndiff < 0) + minndiff = 0; + ndiff = max(ndiff, minndiff); + rq->niffies += ndiff; + rq->last_niffy = rq->niffies; + if (jdiff) { + rq->last_jiffy += jdiff; + rq->last_jiffy_niffies = rq->niffies; + } +} + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} + +static inline void rq_lock(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); +} + +static inline int rq_trylock(struct rq *rq) + __acquires(rq->lock) +{ + return raw_spin_trylock(&rq->lock); +} + +static inline void rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline struct rq *this_rq_lock(void) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; +} + +/* + * Any time we have two runqueues locked we use that as an opportunity to + * synchronise niffies to the highest value as idle ticks may have artificially + * kept niffies low on one CPU and the truth can only be later. + */ +static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) +{ + if (rq1->niffies > rq2->niffies) + rq2->niffies = rq1->niffies; + else + rq1->niffies = rq2->niffies; +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ + +/* For when we know rq1 != rq2 */ +static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } +} + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else + __double_rq_lock(rq1, rq2); + synchronise_niffies(rq1, rq2); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
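+ *
+ * Usage sketch (an editor's illustration, not part of the original
+ * patch), showing the interrupt handling the caller owes:
+ *
+ *	local_irq_disable();
+ *	double_rq_lock(rq1, rq2);
+ *	...	(work on both runqueues, niffies now synchronised)
+ *	double_rq_unlock(rq1, rq2);
+ *	local_irq_enable();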
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* Must be sure rq1 != rq2 and irqs are disabled */ +static inline void lock_second_rq(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (unlikely(!raw_spin_trylock(&rq2->lock))) { + raw_spin_unlock(&rq1->lock); + __double_rq_lock(rq1, rq2); + } + synchronise_niffies(rq1, rq2); +} + +static inline void lock_all_rqs(void) +{ + int cpu; + + preempt_disable(); + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_lock(&rq->lock); + } +} + +static inline void unlock_all_rqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_unlock(&rq->lock); + } + preempt_enable(); +} + +/* Specially nest trylock an rq */ +static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) +{ + if (unlikely(!do_raw_spin_trylock(&rq->lock))) + return false; + spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); + synchronise_niffies(this_rq, rq); + return true; +} + +/* Unlock a specially nested trylocked rq */ +static inline void unlock_rq(struct rq *rq) +{ + spin_release(&rq->lock.dep_map, 1, _RET_IP_); + do_raw_spin_unlock(&rq->lock); +} + +static inline void rq_lock_irq(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); +} + +static inline void rq_unlock_irq(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock_irq(&rq->lock); +} + +static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, *flags); +} + +static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) + __releases(rq->lock) +{ + raw_spin_unlock_irqrestore(&rq->lock, *flags); +} + +static inline struct rq +*task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + while (42) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } + return rq; +} + +static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + while (42) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + } + return rq; +} + +static inline void __task_rq_unlock(struct rq *rq) +{ + rq_unlock(rq); +} + +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, mask) \ + ({ \ + typeof(ptr) _ptr = (ptr); \ + typeof(mask) _mask = (mask); \ + typeof(*_ptr) _old, _val = *_ptr; \ + \ + for (;;) { \ + _old = cmpxchg(_ptr, _val, _val | _mask); \ + if (_old == _val) \ + break; \ + _val = _old; \ + } \ + _old; \ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * 
spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif +#endif + +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + struct wake_q_node *node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means + * its already queued (either by us or someone else) and will get the + * wakeup due to that. + * + * This cmpxchg() implies a full barrier, which pairs with the write + * barrier implied by the wakeup in wake_up_q(). + */ + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + return; + + get_task_struct(task); + + /* + * The head is context local, there can be no concurrency. + */ + *head->lastp = node; + head->lastp = &node->next; +} + +void wake_up_q(struct wake_q_head *head) +{ + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + + task = container_of(node, struct task_struct, wake_q); + BUG_ON(!task); + /* task can safely be re-inserted now */ + node = node->next; + task->wake_q.next = NULL; + + /* + * wake_up_process() implies a wmb() to pair with the queueing + * in wake_q_add() so as not to miss wakeups. + */ + wake_up_process(task); + put_task_struct(task); + } +} + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ + next->on_cpu = 1; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +void resched_task(struct task_struct *p) +{ + int cpu; +#ifdef CONFIG_LOCKDEP + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); +#endif + if (test_tsk_need_resched(p)) + return; + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); + set_preempt_need_resched(); + return; + } + + if (set_nr_and_not_polling(p)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + +/* + * A task that is not running or queued will not have a node set. + * A task that is queued but not running will have a node set. + * A task that is currently running will have ->on_cpu set but no node set. + */ +static inline bool task_queued(struct task_struct *p) +{ + return !skiplist_node_empty(&p->node); +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +static inline void resched_if_idle(struct rq *rq); + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. 
+ * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + +#ifdef CONFIG_SMP + /* + * If prev was marked as migrating to another CPU in return_task, drop + * the local runqueue lock but leave interrupts disabled and grab the + * remote lock we're migrating it to before enabling them. + */ + if (unlikely(task_on_rq_migrating(prev))) { + sched_info_dequeued(rq, prev); + /* + * We move the ownership of prev to the new cpu now. ttwu can't + * activate prev to the wrong cpu since it has to grab this + * runqueue in ttwu_remote. + */ + task_thread_info(prev)->cpu = prev->wake_cpu; + raw_spin_unlock(&rq->lock); + + raw_spin_lock(&prev->pi_lock); + rq = __task_rq_lock(prev); + /* Check that someone else hasn't already queued prev */ + if (likely(!task_queued(prev))) { + enqueue_task(rq, prev, 0); + prev->on_rq = TASK_ON_RQ_QUEUED; + /* Wake up the CPU if it's not already running */ + resched_if_idle(rq); + } + raw_spin_unlock(&prev->pi_lock); + } +#endif + raw_spin_unlock_irq(&rq->lock); +} + +static inline bool deadline_before(u64 deadline, u64 time) +{ + return (deadline < time); +} + +/* + * Deadline is "now" in niffies + (offset by priority). Setting the deadline + * is the key to everything. It distributes cpu fairly amongst tasks of the + * same nice value, it proportions cpu according to nice level, it means the + * task that last woke up the longest ago has the earliest deadline, thus + * ensuring that interactive tasks get low latency on wake up. The CPU + * proportion works out to the square of the virtual deadline difference, so + * this equation will give nice 19 3% CPU compared to nice 0. + */ +static inline u64 prio_deadline_diff(int user_prio) +{ + return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); +} + +static inline u64 task_deadline_diff(struct task_struct *p) +{ + return prio_deadline_diff(TASK_USER_PRIO(p)); +} + +static inline u64 static_deadline_diff(int static_prio) +{ + return prio_deadline_diff(USER_PRIO(static_prio)); +} + +static inline int longest_deadline_diff(void) +{ + return prio_deadline_diff(39); +} + +static inline int ms_longest_deadline_diff(void) +{ + return NS_TO_MS(longest_deadline_diff()); +} + +static inline int rq_load(struct rq *rq) +{ + return rq->sl->entries + !rq_idle(rq); +} + +static inline bool rq_local(struct rq *rq); + +/* + * Update the load average for feeding into cpu frequency governors. Use a + * rough estimate of a rolling average with ~ time constant of 32ms. + * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 + * Make sure a call to update_clocks has been made before calling this to get + * an updated rq->niffies. 
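+ *
+ * Worked example (an editor's illustration with assumed numbers, not
+ * part of the original patch): after a 32768us (~32ms) gap,
+ * us_interval = 32768, so the old average loses
+ * 32768 * 5 / 262144 = 5/8 of its value (exactly the 80/128 ~ 0.63
+ * weighting above), and the fresh curload * curload *
+ * SCHED_CAPACITY_SCALE contribution is folded in with the same 5/8
+ * weight.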
+ */ +static void update_load_avg(struct rq *rq) +{ + /* rq clock can go backwards so skip update if that happens */ + if (likely(rq->clock > rq->load_update)) { + unsigned long us_interval = (rq->clock - rq->load_update) >> 10; + long load, curload = rq_load(rq); + + load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); + if (unlikely(load < 0)) + load = 0; + load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; + rq->load_avg = load; + } else + return; + + rq->load_update = rq->clock; + if (likely(rq_local(rq))) + cpufreq_trigger(rq->niffies, rq->load_avg); +} + +/* + * Removing from the runqueue. Enter with rq locked. Deleting a task + * from the skip list is done via the stored node reference in the task struct + * and does not require a full look up. Thus it occurs in O(k) time where k + * is the "level" of the list the task was stored at - usually < 4, max 8. + */ +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +{ + skiplist_delete(rq->sl, &p->node); + rq->best_key = rq->node.next[0]->key; + update_clocks(rq); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(task_rq(p), p); + update_load_avg(rq); +} + +#ifdef CONFIG_PREEMPT_RCU +static bool rcu_read_critical(struct task_struct *p) +{ + return p->rcu_read_unlock_special.b.blocked; +} +#else /* CONFIG_PREEMPT_RCU */ +#define rcu_read_critical(p) (false) +#endif /* CONFIG_PREEMPT_RCU */ + +/* + * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as + * an idle task, we ensure none of the following conditions are met. + */ +static bool idleprio_suitable(struct task_struct *p) +{ + return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && + !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); +} + +/* + * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check + * that the iso_refractory flag is not set. + */ +static inline bool isoprio_suitable(struct rq *rq) +{ + return !rq->iso_refractory; +} + +/* + * Adding to the runqueue. Enter with rq locked. + */ +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +{ + unsigned int randseed; + u64 sl_id; + + if (!rt_task(p)) { + /* Check it hasn't gotten rt from PI */ + if ((idleprio_task(p) && idleprio_suitable(p)) || + (iso_task(p) && isoprio_suitable(rq))) + p->prio = p->normal_prio; + else + p->prio = NORMAL_PRIO; + } + /* + * The sl_id key passed to the skiplist generates a sorted list. + * Realtime and sched iso tasks run FIFO so they only need be sorted + * according to priority. The skiplist will put tasks of the same + * key inserted later in FIFO order. Tasks of sched normal, batch + * and idleprio are sorted according to their deadlines. Idleprio + * tasks are offset by an impossibly large deadline value ensuring + * they get sorted into last positions, but still according to their + * own deadlines. This creates a "landscape" of skiplists running + * from priority 0 realtime in first place to the lowest priority + * idleprio tasks last. Skiplist insertion is an O(log n) process. + */ + if (p->prio <= ISO_PRIO) + sl_id = p->prio; + else { + sl_id = p->deadline; + if (idleprio_task(p)) { + if (p->prio == IDLE_PRIO) + sl_id |= 0xF000000000000000; + else + sl_id += longest_deadline_diff(); + } + } + /* + * Some architectures don't have better than microsecond resolution + * so mask out ~microseconds as the random seed for skiplist insertion. 
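+ *
+ * For example (an editor's note, assuming a clock with at least
+ * microsecond resolution): niffies counts nanoseconds, so
+ * (rq->niffies >> 10) advances roughly once per microsecond, and each
+ * enqueue typically sees a fresh 32-bit seed for the skiplist level
+ * generator.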
+ */
+	update_clocks(rq);
+	if (!(flags & ENQUEUE_RESTORE))
+		sched_info_queued(rq, p);
+	randseed = (rq->niffies >> 10) & 0xFFFFFFFF;
+	skiplist_insert(rq->sl, &p->node, sl_id, p, randseed);
+	rq->best_key = rq->node.next[0]->key;
+	update_load_avg(rq);
+}
+
+/*
+ * Returns the relative length of a task's deadline compared to the shortest
+ * deadline, which is that of nice -20.
+ */
+static inline int task_prio_ratio(struct task_struct *p)
+{
+	return prio_ratios[TASK_USER_PRIO(p)];
+}
+
+/*
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
+ * length. CPU distribution is handled by giving different deadlines to
+ * tasks of different priorities. Use 128 as the base value for fast shifts.
+ */
+static inline int task_timeslice(struct task_struct *p)
+{
+	return (rr_interval * task_prio_ratio(p) / 128);
+}
+
+/*
+ * qnr is the "queued but not running" count, which is the total number of
+ * tasks on the global runqueue list waiting for cpu time but not actually
+ * currently running on a cpu.
+ */
+static inline void inc_qnr(void)
+{
+	atomic_inc(&grq.qnr);
+}
+
+static inline void dec_qnr(void)
+{
+	atomic_dec(&grq.qnr);
+}
+
+static inline int queued_notrunning(void)
+{
+	return atomic_read(&grq.qnr);
+}
+
+#ifdef CONFIG_SMP
+/* Entered with rq locked */
+static inline void resched_if_idle(struct rq *rq)
+{
+	if (rq_idle(rq))
+		resched_task(rq->curr);
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+	return (rq->cpu == smp_processor_id());
+}
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and, if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues are done locklessly, as the occasional
+ * wrong value would be harmless. */
+static int best_smt_bias(struct rq *this_rq)
+{
+	int other_cpu, best_bias = 0;
+
+	for_each_cpu(other_cpu, &this_rq->thread_mask) {
+		struct rq *rq = cpu_rq(other_cpu);
+
+		if (rq_idle(rq))
+			continue;
+		if (unlikely(!rq->online))
+			continue;
+		if (!rq->rq_mm)
+			continue;
+		if (likely(rq->rq_smt_bias > best_bias))
+			best_bias = rq->rq_smt_bias;
+	}
+	return best_bias;
+}
+
+static int task_prio_bias(struct task_struct *p)
+{
+	if (rt_task(p))
+		return 1 << 30;
+	else if (task_running_iso(p))
+		return 1 << 29;
+	else if (task_running_idle(p))
+		return 0;
+	return MAX_PRIO - p->static_prio;
+}
+
+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq)
+{
+	return true;
+}
+
+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule;
+
+/* We've already decided p can run on this CPU; now test whether it shouldn't
+ * for SMT nice reasons.
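+ *
+ * Worked example (an editor's illustration with assumed nice values,
+ * MAX_PRIO = 140): a nice 0 task has
+ * task_prio_bias() = MAX_PRIO - 120 = 20, so against a sibling running
+ * a nice -10 task (bias 30) it fails the task_bias >= best_bias test
+ * and only runs when the 25% dither check (best_bias % 4 == 1) lets it
+ * through.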
*/ +static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) +{ + int best_bias, task_bias; + + /* Kernel threads always run */ + if (unlikely(!p->mm)) + return true; + if (rt_task(p)) + return true; + if (!idleprio_suitable(p)) + return true; + best_bias = best_smt_bias(this_rq); + /* The smt siblings are all idle or running IDLEPRIO */ + if (best_bias < 1) + return true; + task_bias = task_prio_bias(p); + if (task_bias < 1) + return false; + if (task_bias >= best_bias) + return true; + /* Dither 25% cpu of normal tasks regardless of nice difference */ + if (best_bias % 4 == 1) + return true; + /* Sorry, you lose */ + return false; +} +#else /* CONFIG_SMT_NICE */ +#define smt_schedule(p, this_rq) (true) +#endif /* CONFIG_SMT_NICE */ + +static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) +{ + set_bit(cpu, (volatile unsigned long *)cpumask); +} + +/* + * The cpu_idle_map stores a bitmap of all the CPUs currently idle to + * allow easy lookup of whether any suitable idle CPUs are available. + * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the + * idle_cpus variable than to do a full bitmask check when we are busy. The + * bits are set atomically but read locklessly as occasional false positive / + * negative is harmless. + */ +static inline void set_cpuidle_map(int cpu) +{ + if (likely(cpu_online(cpu))) + atomic_set_cpu(cpu, &grq.cpu_idle_map); +} + +static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) +{ + clear_bit(cpu, (volatile unsigned long *)cpumask); +} + +static inline void clear_cpuidle_map(int cpu) +{ + atomic_clear_cpu(cpu, &grq.cpu_idle_map); +} + +static bool suitable_idle_cpus(struct task_struct *p) +{ + return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); +} + +/* + * Resched current on rq. We don't know if rq is local to this CPU nor if it + * is locked so we do not use an intermediate variable for the task to avoid + * having it dereferenced. + */ +static void resched_curr(struct rq *rq) +{ + int cpu; + + if (test_tsk_need_resched(rq->curr)) + return; + + rq->preempt = rq->curr; + cpu = rq->cpu; + + /* We're doing this without holding the rq lock if it's not task_rq */ + + if (cpu == smp_processor_id()) { + set_tsk_need_resched(rq->curr); + set_preempt_need_resched(); + return; + } + + if (set_nr_and_not_polling(rq->curr)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + +#define CPUIDLE_DIFF_THREAD (1) +#define CPUIDLE_DIFF_CORE (2) +#define CPUIDLE_CACHE_BUSY (4) +#define CPUIDLE_DIFF_CPU (8) +#define CPUIDLE_THREAD_BUSY (16) +#define CPUIDLE_DIFF_NODE (32) + +/* + * The best idle CPU is chosen according to the CPUIDLE ranking above where the + * lowest value would give the most suitable CPU to schedule p onto next. The + * order works out to be the following: + * + * Same thread, idle or busy cache, idle or busy threads + * Other core, same cache, idle or busy cache, idle threads. + * Same node, other CPU, idle cache, idle threads. + * Same node, other CPU, busy cache, idle threads. + * Other core, same cache, busy threads. + * Same node, other CPU, busy threads. + * Other node, other CPU, idle cache, idle threads. + * Other node, other CPU, busy cache, idle threads. + * Other node, other CPU, busy threads. 
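+ *
+ * Numeric illustration (an editor's note, not from the original patch):
+ * a candidate sharing this core whose sibling threads are busy scores
+ * CPUIDLE_DIFF_THREAD | CPUIDLE_THREAD_BUSY = 17, while an idle
+ * candidate on another core behind the same cache, with idle cache and
+ * threads, scores only CPUIDLE_DIFF_CORE = 2, so the other core wins
+ * exactly as the ordering above states.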
+ */ +static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) +{ + int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | + CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | + CPUIDLE_DIFF_THREAD; + int cpu_tmp; + + if (cpumask_test_cpu(best_cpu, tmpmask)) + goto out; + + for_each_cpu(cpu_tmp, tmpmask) { + int ranking, locality; + struct rq *tmp_rq; + + ranking = 0; + tmp_rq = cpu_rq(cpu_tmp); + + locality = rq->cpu_locality[cpu_tmp]; +#ifdef CONFIG_NUMA + if (locality > 3) + ranking |= CPUIDLE_DIFF_NODE; + else +#endif + if (locality > 2) + ranking |= CPUIDLE_DIFF_CPU; +#ifdef CONFIG_SCHED_MC + else if (locality == 2) + ranking |= CPUIDLE_DIFF_CORE; + else if (!(tmp_rq->cache_idle(tmp_rq))) + ranking |= CPUIDLE_CACHE_BUSY; +#endif +#ifdef CONFIG_SCHED_SMT + if (locality == 1) + ranking |= CPUIDLE_DIFF_THREAD; + if (!(tmp_rq->siblings_idle(tmp_rq))) + ranking |= CPUIDLE_THREAD_BUSY; +#endif + if (ranking < best_ranking) { + best_cpu = cpu_tmp; + best_ranking = ranking; + } + } +out: + return best_cpu; +} + +bool cpus_share_cache(int this_cpu, int that_cpu) +{ + struct rq *this_rq = cpu_rq(this_cpu); + + return (this_rq->cpu_locality[that_cpu] < 3); +} + +/* As per resched_curr but only will resched idle task */ +static inline void resched_idle(struct rq *rq) +{ + if (test_tsk_need_resched(rq->idle)) + return; + + rq->preempt = rq->idle; + + set_tsk_need_resched(rq->idle); + + if (rq_local(rq)) { + set_preempt_need_resched(); + return; + } + + smp_send_reschedule(rq->cpu); +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) +{ + cpumask_t tmpmask; + struct rq *rq; + int best_cpu; + + cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); + best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); + rq = cpu_rq(best_cpu); + if (!smt_schedule(p, rq)) + return NULL; + resched_idle(rq); + return rq; +} + +static inline void resched_suitable_idle(struct task_struct *p) +{ + if (suitable_idle_cpus(p)) + resched_best_idle(p, task_cpu(p)); +} + +static inline struct rq *rq_order(struct rq *rq, int cpu) +{ + return rq->rq_order[cpu]; +} +#else /* CONFIG_SMP */ +static inline void set_cpuidle_map(int cpu) +{ +} + +static inline void clear_cpuidle_map(int cpu) +{ +} + +static inline bool suitable_idle_cpus(struct task_struct *p) +{ + return uprq->curr == uprq->idle; +} + +static inline void resched_suitable_idle(struct task_struct *p) +{ +} + +static inline void resched_curr(struct rq *rq) +{ + resched_task(rq->curr); +} + +static inline void resched_if_idle(struct rq *rq) +{ +} + +static inline bool rq_local(struct rq *rq) +{ + return true; +} + +static inline struct rq *rq_order(struct rq *rq, int cpu) +{ + return rq; +} + +static inline bool smt_schedule(struct task_struct *p, struct rq *rq) +{ + return true; +} +#endif /* CONFIG_SMP */ + +static inline int normal_prio(struct task_struct *p) +{ + if (has_rt_policy(p)) + return MAX_RT_PRIO - 1 - p->rt_priority; + if (idleprio_task(p)) + return IDLE_PRIO; + if (iso_task(p)) + return ISO_PRIO; + return NORMAL_PRIO; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks as it will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. 
Otherwise, update priority
+ * to the normal priority:
+ */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with rq locked.
+ */
+static void activate_task(struct task_struct *p, struct rq *rq)
+{
+	resched_if_idle(rq);
+
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+				     (rq->niffies - p->last_ran) >> 20);
+	}
+
+	p->prio = effective_prio(p);
+	if (task_contributes_to_load(p))
+		atomic_dec(&grq.nr_uninterruptible);
+
+	enqueue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	atomic_inc(&grq.nr_running);
+	inc_qnr();
+}
+
+/*
+ * deactivate_task - If it's running, it's not on the runqueue and we can just
+ * decrement the nr_running. Enter with rq locked.
+ */
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
+{
+	if (task_contributes_to_load(p))
+		atomic_inc(&grq.nr_uninterruptible);
+
+	p->on_rq = 0;
+	atomic_dec(&grq.nr_running);
+	sched_info_dequeued(rq, p);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	struct rq *rq = task_rq(p);
+	bool queued;
+
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold either p->pi_lock or rq->lock, when changing
+	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+	 *
+	 * Furthermore, all task_rq users should acquire both locks, see
+	 * task_rq_lock().
+	 */
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
+	if (p->wake_cpu == cpu)
+		return;
+	trace_sched_migrate_task(p, cpu);
+	perf_event_task_migrate(p);
+
+	/*
+	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+	 * successfully executed on another CPU. We must ensure that updates of
+	 * per-task data have been completed by this moment.
+	 */
+	smp_wmb();
+
+	if (task_running(rq, p)) {
+		/*
+		 * We should only be calling this on a running task if we're
+		 * holding rq lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+
+		/*
+		 * We can't change the task_thread_info cpu on a running task
+		 * as p will still be protected by the rq lock of the cpu it
+		 * is still running on, so we set the wake_cpu for it to be
+		 * lazily updated once off the cpu.
+		 */
+		p->wake_cpu = cpu;
+		return;
+	}
+
+	if ((queued = task_queued(p)))
+		dequeue_task(rq, p, 0);
+	task_thread_info(p)->cpu = p->wake_cpu = cpu;
+	if (queued)
+		enqueue_task(cpu_rq(cpu), p, 0);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Move a task off the runqueue and take it to a cpu where it will
+ * become the running task.
+ */
+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
+{
+	struct rq *p_rq = task_rq(p);
+
+	dequeue_task(p_rq, p, DEQUEUE_SAVE);
+	if (p_rq != rq) {
+		sched_info_dequeued(p_rq, p);
+		sched_info_queued(rq, p);
+	}
+	set_task_cpu(p, cpu);
+	dec_qnr();
+}
+
+/*
+ * Returns a descheduling task to the runqueue unless it is being
+ * deactivated.
+ */
+static inline void return_task(struct task_struct *p, struct rq *rq,
+			       int cpu, bool deactivate)
+{
+	if (deactivate)
+		deactivate_task(p, rq);
+	else {
+		inc_qnr();
+#ifdef CONFIG_SMP
+		/*
+		 * set_task_cpu was called on the running task that doesn't
+		 * want to deactivate, so it has to be enqueued to a different
+		 * CPU and we need its lock. Tag it to be moved as the
Tag it to be moved with as the + * lock is dropped in finish_lock_switch. + */ + if (unlikely(p->wake_cpu != cpu)) + p->on_rq = TASK_ON_RQ_MIGRATING; + else +#endif + enqueue_task(rq, p, ENQUEUE_RESTORE); + } +} + +/* Enter with rq lock held. We know p is on the local cpu */ +static inline void __set_tsk_resched(struct task_struct *p) +{ + set_tsk_need_resched(p); + set_preempt_need_resched(); +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + * + * Return: 1 if the task is currently executing. 0 otherwise. + */ +inline int task_curr(const struct task_struct *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +#ifdef CONFIG_SMP +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ + int running, queued; + unsigned long flags; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since this will return false + * if the runqueue has changed and p is actually now + * running somewhere else! + */ + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + trace_sched_wait_task(p); + running = task_running(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &flags); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(queued)) { + ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! 
+ */ + break; + } + + return ncsw; +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesn't have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. + */ +void kick_process(struct task_struct *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kick_process); +#endif + +/* + * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the + * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or + * between themselves, they cooperatively multitask. An idle rq scores as + * prio PRIO_LIMIT so it is always preempted. + */ +static inline bool +can_preempt(struct task_struct *p, int prio, u64 deadline) +{ + /* Better static priority RT task or better policy preemption */ + if (p->prio < prio) + return true; + if (p->prio > prio) + return false; + if (p->policy == SCHED_BATCH) + return false; + /* SCHED_NORMAL and ISO will preempt based on deadline */ + if (!deadline_before(p->deadline, deadline)) + return false; + return true; +} + +#ifdef CONFIG_SMP +/* + * Check to see if p can run on cpu, and if not, whether there are any online + * CPUs it can run on instead. + */ +static inline bool needs_other_cpu(struct task_struct *p, int cpu) +{ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) + return true; + return false; +} +#define cpu_online_map (*(cpumask_t *)cpu_online_mask) + +static void try_preempt(struct task_struct *p, struct rq *this_rq) +{ + int i, this_entries = rq_load(this_rq); + cpumask_t tmp; + + if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) + return; + + /* IDLEPRIO tasks never preempt anything but idle */ + if (p->policy == SCHED_IDLEPRIO) + return; + + cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); + + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *rq = this_rq->rq_order[i]; + + if (!cpumask_test_cpu(rq->cpu, &tmp)) + continue; + + if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) + continue; + if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { + resched_curr(rq); + return; + } + } +} + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check); +#else /* CONFIG_SMP */ +static inline bool needs_other_cpu(struct task_struct *p, int cpu) +{ + return false; +} + +static void try_preempt(struct task_struct *p, struct rq *this_rq) +{ + if (p->policy == SCHED_IDLEPRIO) + return; + if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) + resched_curr(uprq); +} + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} +#endif /* CONFIG_SMP */ + +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x04 /* internal use, task got migrated */ + +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +{ +#ifdef CONFIG_SCHEDSTATS + struct rq *rq = this_rq(); + +#ifdef CONFIG_SMP + int 
this_cpu = smp_processor_id(); + + if (cpu == this_cpu) + schedstat_inc(rq, ttwu_local); + else { + struct sched_domain *sd; + + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + rcu_read_unlock(); + } + +#endif /* CONFIG_SMP */ + + schedstat_inc(rq, ttwu_count); +#endif /* CONFIG_SCHEDSTATS */ +} + +static inline void ttwu_activate(struct rq *rq, struct task_struct *p) +{ + activate_task(p, rq); + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/* + * Mark the task runnable and perform wakeup-preemption. + */ +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption if there are no idle cpus, + * instead waiting for current to deschedule. + */ + if (wake_flags & WF_SYNC) + resched_suitable_idle(p); + else + try_preempt(p, rq); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ + lockdep_assert_held(&rq->lock); + +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + atomic_dec(&grq.nr_uninterruptible); +#endif + + ttwu_activate(rq, p); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (likely(task_on_rq_queued(p))) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +#ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + +void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; + unsigned long flags; + + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + + while (llist) { + int wake_flags = 0; + + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); + + ttwu_do_activate(rq, p, wake_flags); + } + + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void scheduler_ipi(void) +{ + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); + + if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) + return; + + /* + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. 
+ */ + irq_enter(); + sched_ttwu_pending(); + irq_exit(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +{ + struct rq *rq = cpu_rq(cpu); + + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } +} + +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + rq_lock_irqsave(rq, &flags); + if (likely(is_idle_task(rq->curr))) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + rq_unlock_irqrestore(rq, &flags); + } + +out: + rcu_read_unlock(); +} + +static int valid_task_cpu(struct task_struct *p) +{ + cpumask_t valid_mask; + + if (p->flags & PF_KTHREAD) + cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_online_mask); + else + cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask); + + if (unlikely(!cpumask_weight(&valid_mask))) { + /* Hotplug boot threads do this before the CPU is up */ + WARN_ON(sched_smp_initialized); + return cpumask_any(tsk_cpus_allowed(p)); + } + return cpumask_any(&valid_mask); +} + +/* + * For a task that's just being woken up we have a valuable balancing + * opportunity so choose the nearest cache most lightly loaded runqueue. + * Entered with rq locked and returns with the chosen runqueue locked. + */ +static inline int select_best_cpu(struct task_struct *p) +{ + unsigned int idlest = ~0U; + struct rq *rq = NULL; + int i; + + if (suitable_idle_cpus(p)) { + int cpu = task_cpu(p); + + if (unlikely(needs_other_cpu(p, cpu))) + cpu = valid_task_cpu(p); + rq = resched_best_idle(p, cpu); + if (likely(rq)) + return rq->cpu; + } + + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = task_rq(p)->rq_order[i]; + int entries; + + if (!other_rq->online) + continue; + if (needs_other_cpu(p, other_rq->cpu)) + continue; + entries = rq_load(other_rq); + if (entries >= idlest) + continue; + idlest = entries; + rq = other_rq; + } + if (unlikely(!rq)) + return smp_processor_id(); + return rq->cpu; +} +#else /* CONFIG_SMP */ +static int valid_task_cpu(struct task_struct *p) +{ + return 0; +} + +static inline int select_best_cpu(struct task_struct *p) +{ + return 0; +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) +{ + return NULL; +} +#endif /* CONFIG_SMP */ + +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) +{ + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) + if (!cpus_share_cache(smp_processor_id(), cpu)) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ + ttwu_queue_remote(p, cpu, wake_flags); + return; + } +#endif + rq_lock(rq); + ttwu_do_activate(rq, p, wake_flags); + rq_unlock(rq); +} + +/*** + * try_to_wake_up - wake up a thread + * @p: the thread to be awakened + * @state: the mask of task states that can be woken + * @wake_flags: wake modifier flags (WF_*) + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * Return: %true if @p was woken up, %false if it was already running. + * or @state didn't match @p's state. 
+ */ +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +{ + unsigned long flags; + int cpu, success = 0; + + /* + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller cannot be + * reordered with p->state check below. This pairs with mb() in + * set_current_state() the waiting thread does. + */ + smp_mb__before_spinlock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); + /* state is a volatile long, why, I don't understand */ + if (!((unsigned int)p->state & state)) + goto out; + + trace_sched_waking(p); + + success = 1; /* we're going to change ->state */ + cpu = task_cpu(p); + + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] p->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; + +#ifdef CONFIG_SMP + /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. + */ + smp_rmb(); + + /* + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until it's done referencing the task. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); + + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + + cpu = select_best_cpu(p); + if (task_cpu(p) != cpu) + set_task_cpu(p, cpu); +#endif /* CONFIG_SMP */ + + ttwu_queue(p, cpu, wake_flags); +stat: + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); +out: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return success; +} + +/** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not already there. The caller must + * ensure that rq is locked and that @p is not the current task. + * rq stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task.
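+ * + * (Lock order is p->pi_lock before rq->lock, which is why the slow path + * must drop the rq lock before taking pi_lock and then retake both in + * that order.)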
+ */ + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + + if (!(p->state & TASK_NORMAL)) + goto out; + + trace_sched_waking(p); + + if (!task_on_rq_queued(p)) + ttwu_activate(rq, p); + + ttwu_do_wakeup(rq, p, 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); +} + +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +int wake_up_process(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_NORMAL, 0); +} +EXPORT_SYMBOL(wake_up_process); + +int wake_up_state(struct task_struct *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +static void time_slice_expired(struct task_struct *p, struct rq *rq); + +/* + * Perform scheduler-related setup for a newly forked process p. + * p is forked by current. + */ +int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) +{ + unsigned long flags; + int cpu = get_cpu(); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + /* + * We mark the process as NEW here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_NEW; + + /* Should be reset in fork.c but done here for ease of MuQSS patching */ + p->on_cpu = + p->on_rq = + p->utime = + p->stime = + p->utimescaled = + p->stimescaled = + p->sched_time = + p->stime_ns = + p->utime_ns = 0; + skiplist_node_init(&p->node); + + /* + * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + p->policy = SCHED_NORMAL; + p->normal_prio = normal_prio(p); + } + + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; + } + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } + + /* + * Silence PROVE_RCU.
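+ * Holding pi_lock across set_task_cpu() here appears to serve only to + * satisfy its locking assertions; the freshly forked task is not yet + * visible to any other CPU.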
+ */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +#ifdef CONFIG_SCHED_INFO + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif + init_task_preempt_count(p); + + put_cpu(); + return 0; +} + +#ifdef CONFIG_SCHEDSTATS + +DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; + +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. + */ + if (!strcmp(str, "enable")) { + __sched_schedstats = true; + ret = 1; + } else if (!strcmp(str, "disable")) { + __sched_schedstats = false; + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void wake_up_new_task(struct task_struct *p) +{ + struct task_struct *parent, *rq_curr; + struct rq *rq, *new_rq; + unsigned long flags; + + parent = p->parent; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; + /* Task_rq can't change yet on a new task */ + new_rq = rq = task_rq(p); + if (unlikely(needs_other_cpu(p, task_cpu(p)))) { + set_task_cpu(p, valid_task_cpu(p)); + new_rq = task_rq(p); + } + + double_rq_lock(rq, new_rq); + update_clocks(rq); + rq_curr = rq->curr; + + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = rq_curr->normal_prio; + + activate_task(p, rq); + trace_sched_wakeup_new(p); + + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. If it's negative, it won't + * matter since that's the same as being 0. rq->rq_deadline is only + * modified within schedule() so it is always equal to + * current->deadline. 
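+ * + * For example, a parent with 6ms of slice left keeps 3ms and the child + * starts with the other 3ms, leaving the total outstanding unchanged.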
+ */ + p->last_ran = rq_curr->last_ran; + if (likely(rq_curr->policy != SCHED_FIFO)) { + rq_curr->time_slice /= 2; + if (unlikely(rq_curr->time_slice < RESCHED_US)) { + /* + * Forking task has run out of timeslice. Reschedule it and + * start its child with a new time slice and deadline. The + * child will end up running first because its deadline will + * be slightly earlier. + */ + rq_curr->time_slice = 0; + __set_tsk_resched(rq_curr); + time_slice_expired(p, new_rq); + if (suitable_idle_cpus(p)) + resched_best_idle(p, task_cpu(p)); + else if (unlikely(rq != new_rq)) + try_preempt(p, new_rq); + } else { + p->time_slice = rq_curr->time_slice; + if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + __set_tsk_resched(rq_curr); + } else + try_preempt(p, new_rq); + } + } else { + time_slice_expired(p, new_rq); + try_preempt(p, new_rq); + } + double_rq_unlock(rq, new_rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; + +void preempt_notifier_inc(void) +{ + static_key_slow_inc(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_inc); + +void preempt_notifier_dec(void) +{ + static_key_slow_dec(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_dec); + +/** + * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + if (!static_key_false(&preempt_notifier_key)) + WARN(1, "registering preempt_notifier while notifiers disabled\n"); + + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is *not* safe to call from within a preemption notifier. 
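+ * (It would delete the entry out from under the hlist walk in the + * fire_sched_{in,out}_preempt_notifiers() paths.)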
+ */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + if (static_key_false(&preempt_notifier_key)) + __fire_sched_in_preempt_notifiers(curr); +} + +static void +__fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +static __always_inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + if (static_key_false(&preempt_notifier_key)) + __fire_sched_out_preempt_notifiers(curr, next); +} + +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + sched_info_switch(rq, prev, next); + perf_event_task_sched_out(prev, next); + fire_sched_out_preempt_notifiers(prev, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. + */ +static struct rq *finish_task_switch(struct task_struct *prev) + __releases(rq->lock) +{ + struct rq *rq = this_rq(); + struct mm_struct *mm = rq->prev_mm; + long prev_state; + + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. 
+ */ + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, + "corrupted preempt_count: %s/%d/0x%x\n", + current->comm, current->pid, preempt_count())) + preempt_count_set(FORK_PREEMPT_COUNT); + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could race with its RUNNING -> DEAD + * transition, resulting in a double drop. + */ + prev_state = prev->state; + vtime_task_switch(prev); + perf_event_task_sched_in(prev, current); + finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); + + fire_sched_in_preempt_notifiers(current); + if (mm) + mmdrop(mm); + if (unlikely(prev_state == TASK_DEAD)) { + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); + put_task_struct(prev); + } + return rq; +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage __visible void schedule_tail(struct task_struct *prev) + __releases(rq->lock) +{ + struct rq *rq; + + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ + + rq = finish_task_switch(prev); + preempt_enable(); + + if (current->set_child_tid) + put_user(task_pid_vnr(current), current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new thread's register state. + */ +static __always_inline struct rq * +context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + struct mm_struct *mm, *oldmm; + + prepare_task_switch(rq, prev, next); + + mm = next->mm; + oldmm = prev->active_mm; + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. + */ + arch_start_context_switch(prev); + + if (!mm) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm_irqs_off(oldmm, mm, next); + + if (!prev->mm) { + prev->active_mm = NULL; + rq->prev_mm = oldmm; + } + /* + * The runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + barrier(); + + return finish_task_switch(prev); +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, total number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + return atomic_read(&grq.nr_running); +} + +static unsigned long nr_uninterruptible(void) +{ + return atomic_read(&grq.nr_uninterruptible); +} + +/* + * Check if only the current task is running on the cpu.
+ * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) + */ +bool single_task_running(void) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + + if (rq_load(rq) == 1) + return true; + else + return false; +} +EXPORT_SYMBOL(single_task_running); + +unsigned long long nr_context_switches(void) +{ + return (unsigned long long)atomic64_read(&grq.nr_switches); +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_possible_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +unsigned long nr_iowait_cpu(int cpu) +{ + struct rq *this = cpu_rq(cpu); + return atomic_read(&this->nr_iowait); +} + +unsigned long nr_active(void) +{ + return nr_running() + nr_uninterruptible(); +} + +/* + * I/O wait is the number of running or queued tasks with their ->rq pointer + * set to this cpu as being the CPU they're more likely to run on. + */ +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *rq = this_rq(); + + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq_load(rq); +} + +/* Variables and functions for calc_load */ +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +/* + * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. + */ +void calc_global_load(unsigned long ticks) +{ + long active; + + if (time_before(jiffies, calc_load_update)) + return; + active = nr_active() * FIXED_1; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update = jiffies + LOAD_FREQ; +} + +DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); + +EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. 
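+ * + * On 32-bit, irq_time_read() additionally uses a seqcount so that a + * reader never sees a torn 64-bit value.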
+ */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void irqtime_account_irq(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to the ksoftirqd thread + * in that case, so as not to confuse the scheduler with a special task + * that does not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(irqtime_account_irq); + +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compiler should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight misattribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops.
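+ * + * e.g. if 3ms of irq time arrived while clock_task was only due to + * advance 2ms, we consume 2ms of it now and leave the remaining 1ms + * for the next update.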
+ */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_key_false((¶virt_steal_rq_enabled))) { + s64 steal = paravirt_steal_clock(cpu_of(rq)); + + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + rq->prev_steal_time_rq += steal; + + delta -= steal; + } +#endif + rq->clock_task += delta; +} + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static void irqtime_account_hi_si(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + u64 latest_ns; + + latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)); + if (latest_ns > cpustat[CPUTIME_IRQ]) + cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy; + + latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)); + if (latest_ns > cpustat[CPUTIME_SOFTIRQ]) + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy; +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +static inline void irqtime_account_hi_si(void) +{ +} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_key_false(¶virt_steal_enabled)) { + u64 steal; + cputime_t steal_ct; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + + account_steal_time(steal_ct); + return steal_ct; + } +#endif + return false; +} + +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + cputime_t utime, stime; + struct task_struct *t; + unsigned int seq, nextseq; + unsigned long flags; + + rcu_read_lock(); + /* Attempt a lockless read on the first round. */ + nextseq = 0; + do { + seq = nextseq; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + rcu_read_unlock(); +} + +/* + * On each tick, add the number of nanoseconds to the unbanked variables and + * once one tick's worth has accumulated, account it allowing for accurate + * sub-tick accounting and totals. 
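+ * + * e.g. at HZ=1000 (JIFFY_NS of 1,000,000), two successive 600,000ns + * idle periods bank one jiffy of CPUTIME_IDLE and carry the remaining + * 200,000ns forward.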
+ */ +static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; + + if (atomic_read(&rq->nr_iowait) > 0) { + rq->iowait_ns += ns; + if (rq->iowait_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->iowait_ns); + cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks; + rq->iowait_ns %= JIFFY_NS; + } + } else { + rq->idle_ns += ns; + if (rq->idle_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->idle_ns); + cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks; + rq->idle_ns %= JIFFY_NS; + } + } + acct_update_integrals(idle); +} + +static void pc_system_time(struct rq *rq, struct task_struct *p, + int hardirq_offset, unsigned long ns) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; + + p->stime_ns += ns; + if (p->stime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->stime_ns); + p->stime_ns %= JIFFY_NS; + p->stime += (__force u64)cputime_one_jiffy * ticks; + p->stimescaled += one_jiffy_scaled * ticks; + account_group_system_time(p, cputime_one_jiffy * ticks); + } + p->sched_time += ns; + account_group_exec_runtime(p, ns); + + if (hardirq_count() - hardirq_offset) { + rq->irq_ns += ns; + if (rq->irq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->irq_ns); + cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->irq_ns %= JIFFY_NS; + } + } else if (in_serving_softirq()) { + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; + } + } else { + rq->system_ns += ns; + if (rq->system_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->system_ns); + cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks; + rq->system_ns %= JIFFY_NS; + } + } + acct_update_integrals(p); +} + +static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; + + p->utime_ns += ns; + if (p->utime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->utime_ns); + p->utime_ns %= JIFFY_NS; + p->utime += (__force u64)cputime_one_jiffy * ticks; + p->utimescaled += one_jiffy_scaled * ticks; + account_group_user_time(p, cputime_one_jiffy * ticks); + } + p->sched_time += ns; + account_group_exec_runtime(p, ns); + + if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + */ + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; + } + } + + if (task_nice(p) > 0 || idleprio_task(p)) { + rq->nice_ns += ns; + if (rq->nice_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->nice_ns); + cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks; + rq->nice_ns %= JIFFY_NS; + } + } else { + rq->user_ns += ns; + if (rq->user_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->user_ns); + cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks; + rq->user_ns %= JIFFY_NS; + } + } + acct_update_integrals(p); +} + +/* + * This is called on clock ticks. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + * CPU scheduler quota accounting is also performed here in microseconds. 
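+ * With the default 6ms rr_interval a full slice is only 6000us, so the + * microsecond-based time_slice arithmetic fits comfortably in an int.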
+ */ +static void +update_cpu_clock_tick(struct rq *rq, struct task_struct *p) +{ + s64 account_ns = rq->niffies - p->last_ran; + struct task_struct *idle = rq->idle; + + if (steal_account_process_tick()) + goto ts_account; + + /* Accurate tick timekeeping */ + if (user_mode(get_irq_regs())) + pc_user_time(rq, p, account_ns); + else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { + pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); + } else + pc_idle_time(rq, idle, account_ns); + + if (sched_clock_irqtime) + irqtime_account_hi_si(); + +ts_account: + /* time_slice accounting is done in usecs to avoid overflow on 32bit */ + if (p->policy != SCHED_FIFO && p != idle) + p->time_slice -= NS_TO_US(account_ns); + + p->last_ran = rq->niffies; +} + +/* + * This is called on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + * CPU scheduler quota accounting is also performed here in microseconds. + */ +static void +update_cpu_clock_switch(struct rq *rq, struct task_struct *p) +{ + s64 account_ns = rq->niffies - p->last_ran; + struct task_struct *idle = rq->idle; + + /* Accurate subtick timekeeping */ + if (p != idle) + pc_user_time(rq, p, account_ns); + else + pc_idle_time(rq, idle, account_ns); + + /* time_slice accounting is done in usecs to avoid overflow on 32bit */ + if (p->policy != SCHED_FIFO && p != idle) + p->time_slice -= NS_TO_US(account_ns); +} + +/* + * Return any ns on the sched_clock that have not yet been accounted in + * @p in case that task is currently running. + * + * Called with task_rq_lock(p) held. + */ +static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ + u64 ns = 0; + + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (p == rq->curr && task_on_rq_queued(p)) { + update_clocks(rq); + ns = rq->niffies - p->last_ran; + } + + return ns; +} + +/* + * Return accounted runtime for the task. + * Return separately the current's pending runtime that have not been + * accounted yet. + * + */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns; + +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) + /* + * 64-bit doesn't need locks to atomically read a 64bit value. + * So we have a optimization chance when the task's delta_exec is 0. + * Reading ->on_cpu is racy, but this is ok. + * + * If we race with it leaving cpu, we'll take a lock. So we're correct. + * If we race with it entering cpu, unaccounted time is 0. This is + * indistinguishable from the read occurring a few cycles earlier. + * If we see ->on_cpu without ->on_rq, the task is leaving, and has + * been accounted, so we're correct here as well. + */ + if (!p->on_cpu || !task_on_rq_queued(p)) + return tsk_seruntime(p); +#endif + + rq = task_rq_lock(p, &flags); + ns = p->sched_time + do_task_delta_exec(p, rq); + task_rq_unlock(rq, p, &flags); + + return ns; +} + +/* Compatibility crap */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ +} + +void account_idle_time(cputime_t cputime) +{ +} + +/* + * Account guest cpu time to a process. 
+ * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + /* Add guest time to process. */ + p->utime += (__force u64)cputime; + p->utimescaled += (__force u64)cputime_scaled; + account_group_user_time(p, cputime); + p->gtime += (__force u64)cputime; + + /* Add guest time to cpustat. */ + if (task_nice(p) > 0) { + cpustat[CPUTIME_NICE] += (__force u64)cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime; + } else { + cpustat[CPUTIME_USER] += (__force u64)cputime; + cpustat[CPUTIME_GUEST] += (__force u64)cputime; + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + /* Add system time to process. */ + p->stime += (__force u64)cputime; + p->stimescaled += (__force u64)cputime_scaled; + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 += (__force u64)cputime; + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * This is for guest only now. + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) + account_guest_time(p, cputime, cputime_scaled); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + cpustat[CPUTIME_STEAL] += (__force u64)cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +static void account_idle_times(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += (__force u64)cputime; + else + cpustat[CPUTIME_IDLE] += (__force u64)cputime; +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + +void account_process_tick(struct task_struct *p, int user_tick) +{ +} + +/* + * Account multiple ticks of steal time. + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of idle ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_times(jiffies_to_cputime(ticks)); +} +#endif + +/* + * Functions to test for when SCHED_ISO tasks have used their allocated + * quota as real time scheduling and convert them back to SCHED_NORMAL.
All + * data is modified only by the local runqueue during scheduler_tick with + * interrupts disabled. + */ + +/* + * Test if SCHED_ISO tasks have run longer than their alloted period as RT + * tasks and set the refractory flag if necessary. There is 10% hysteresis + * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a + * slow division. + */ +static inline void iso_tick(struct rq *rq) +{ + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + rq->iso_ticks += 100; + if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { + rq->iso_refractory = true; + if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) + rq->iso_ticks = ISO_PERIOD * 100; + } +} + +/* No SCHED_ISO task was running so decrease rq->iso_ticks */ +static inline void no_iso_tick(struct rq *rq, int ticks) +{ + if (rq->iso_ticks > 0 || rq->iso_refractory) { + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; + if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { + rq->iso_refractory = false; + if (unlikely(rq->iso_ticks < 0)) + rq->iso_ticks = 0; + } + } +} + +/* This manages tasks that have run out of timeslice during a scheduler_tick */ +static void task_running_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + /* + * If a SCHED_ISO task is running we increment the iso_ticks. In + * order to prevent SCHED_ISO tasks from causing starvation in the + * presence of true RT tasks we account those as iso_ticks as well. + */ + if (rt_task(p) || task_running_iso(p)) + iso_tick(rq); + else + no_iso_tick(rq, 1); + + /* SCHED_FIFO tasks never run out of timeslice. */ + if (p->policy == SCHED_FIFO) + return; + + if (iso_task(p)) { + if (task_running_iso(p)) { + if (rq->iso_refractory) { + /* + * SCHED_ISO task is running as RT and limit + * has been hit. Force it to reschedule as + * SCHED_NORMAL by zeroing its time_slice + */ + p->time_slice = 0; + } + } else if (!rq->iso_refractory) { + /* Can now run again ISO. Reschedule to pick up prio */ + goto out_resched; + } + } + + /* + * Tasks that were scheduled in the first half of a tick are not + * allowed to run into the 2nd half of the next tick if they will + * run out of time slice in the interim. Otherwise, if they have + * less than RESCHED_US μs of time slice left they will be rescheduled. + */ + if (p->time_slice - rq->dither >= RESCHED_US) + return; +out_resched: + rq_lock(rq); + __set_tsk_resched(p); + rq_unlock(rq); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +void scheduler_tick(void) +{ + int cpu __maybe_unused = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + sched_clock_tick(); + update_rq_clock(rq); + update_load_avg(rq); + update_cpu_clock_tick(rq, rq->curr); + if (!rq_idle(rq)) + task_running_tick(rq); + else + no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy); + rq->last_scheduler_tick = rq->last_jiffy; + rq->last_tick = rq->clock; + perf_event_task_tick(); +} + +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. 
+ */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} + +void preempt_count_add(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; +#endif + __preempt_count_add(val); +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Spinlock count overflowing soon? + */ + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); +#endif + preempt_latency_start(val); +} +EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); + +/* + * If the value passed in equals the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + +void preempt_count_sub(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + + preempt_latency_stop(val); + __preempt_count_sub(val); +} +EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub); + +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } +#endif + +/* + * The time_slice is only refilled when it is empty and that is when we set a + * new deadline. Make sure update_clocks has been called recently to update + * rq->niffies. + */ +static void time_slice_expired(struct task_struct *p, struct rq *rq) +{ + p->time_slice = timeslice(); + p->deadline = rq->niffies + task_deadline_diff(p); +#ifdef CONFIG_SMT_NICE + if (!p->mm) + p->smt_bias = 0; + else if (rt_task(p)) + p->smt_bias = 1 << 30; + else if (task_running_iso(p)) + p->smt_bias = 1 << 29; + else if (idleprio_task(p)) { + if (task_running_idle(p)) + p->smt_bias = 0; + else + p->smt_bias = 1; + } else if (--p->smt_bias < 1) + p->smt_bias = MAX_PRIO - p->static_prio; +#endif +} + +/* + * Timeslices below RESCHED_US are considered as good as expired as there's no + * point rescheduling when there's so little time left. SCHED_BATCH tasks + * have been flagged as not latency sensitive and likely to be fully CPU + * bound so every time they're rescheduled they have their time_slice + * refilled, but get a new later deadline to have little effect on + * SCHED_NORMAL tasks. + */ +static inline void check_deadline(struct task_struct *p, struct rq *rq) +{ + if (p->time_slice < RESCHED_US || batch_task(p)) + time_slice_expired(p, rq); +} + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/* + * Task selection with skiplists is a simple matter of picking off the first + * task in the sorted list, an O(1) operation. The lookup is amortised O(1) + * being bound to the number of processors. + * + * Runqueues are selectively locked based on their unlocked data and then + * unlocked if not needed. At most 3 locks will be held at any time and are + * released as soon as they're no longer needed. All balancing between CPUs + * is thus done here in an extremely simple first-come, best-fit manner. + * + * This iterates over runqueues in cache locality order. In interactive mode + * it iterates over all CPUs and finds the task with the best key/deadline.
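+ * (Keys sort ascending, so the lowest key denotes the most entitled + * task and wins.)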
+ * In non-interactive mode it will only take a task if it's from the current + * runqueue or a runqueue with more tasks than the current one with a better + * key/deadline. + */ +static inline struct +task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) +{ + struct task_struct *edt = idle; + struct rq *locked = NULL; + int i, best_entries = 0; + u64 best_key = ~0ULL; + + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = rq_order(rq, i); + int entries = other_rq->sl->entries; + struct task_struct *p; + u64 key; + + /* + * Check for queued entres lockless first. The local runqueue + * is locked so entries will always be accurate. + */ + if (!sched_interactive) { + if (entries <= best_entries) + continue; + } else if (!entries) + continue; + + /* if (i) implies other_rq != rq */ + if (i) { + /* Check for best id queued lockless first */ + if (other_rq->best_key >= best_key) + continue; + + if (unlikely(!trylock_rq(rq, other_rq))) + continue; + + /* Need to reevaluate entries after locking */ + entries = other_rq->sl->entries; + if (unlikely(!entries)) { + unlock_rq(other_rq); + continue; + } + } + key = other_rq->node.next[0]->key; + /* Reevaluate key after locking */ + if (unlikely(key >= best_key)) { + if (i) + unlock_rq(other_rq); + continue; + } + + p = other_rq->node.next[0]->value; + if (!smt_schedule(p, rq)) { + if (i) + unlock_rq(other_rq); + continue; + } + + /* Make sure affinity is ok */ + if (i) { + if (needs_other_cpu(p, cpu)) { + unlock_rq(other_rq); + continue; + } + if (locked) + unlock_rq(locked); + locked = other_rq; + } + + best_entries = entries; + best_key = key; + edt = p; + } + + if (likely(edt != idle)) + take_task(rq, cpu, edt); + + if (locked) + unlock_rq(locked); + + return edt; +} + +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ + if (oops_in_progress) + return; + + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); + + debug_show_held_locks(prev); + print_modules(); + if (irqs_disabled()) + print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ +#ifdef CONFIG_SCHED_STACK_END_CHECK + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); +#endif + + if (unlikely(in_atomic_preempt_off())) { + __schedule_bug(prev); + preempt_count_set(PREEMPT_DISABLED); + } + rcu_sleep_check(); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + + schedstat_inc(this_rq(), sched_count); +} + +/* + * The currently running task's information is all stored in rq local data + * which is only modified by the local CPU. 
+ */ +static inline void set_rq_task(struct rq *rq, struct task_struct *p) +{ + rq->rq_deadline = p->deadline; + rq->rq_prio = p->prio; +#ifdef CONFIG_SMT_NICE + rq->rq_mm = p->mm; + rq->rq_smt_bias = p->smt_bias; +#endif +} + +#ifdef CONFIG_SMT_NICE +static void check_no_siblings(struct rq __maybe_unused *this_rq) {} +static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} +static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; +static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; + +/* Iterate over smt siblings when we've scheduled a process on cpu and decide + * whether they should continue running or be descheduled. */ +static void check_smt_siblings(struct rq *this_rq) +{ + int other_cpu; + + for_each_cpu(other_cpu, &this_rq->thread_mask) { + struct task_struct *p; + struct rq *rq; + + rq = cpu_rq(other_cpu); + if (rq_idle(rq)) + continue; + if (unlikely(!rq->online)) + continue; + p = rq->curr; + if (!smt_schedule(p, this_rq)) { + set_tsk_need_resched(p); + smp_send_reschedule(other_cpu); + } + } +} + +static void wake_smt_siblings(struct rq *this_rq) +{ + int other_cpu; + + if (!queued_notrunning()) + return; + + for_each_cpu(other_cpu, &this_rq->thread_mask) { + struct rq *rq; + + rq = cpu_rq(other_cpu); + if (unlikely(!rq->online)) + continue; + if (rq_idle(rq)) { + struct task_struct *p = rq->curr; + + set_tsk_need_resched(p); + smp_send_reschedule(other_cpu); + } + } +} +#else +static void check_siblings(struct rq __maybe_unused *this_rq) {} +static void wake_siblings(struct rq __maybe_unused *this_rq) {} +#endif + +/* + * schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space + * + * WARNING: must be called with preemption disabled! + */ +static void __sched notrace __schedule(bool preempt) +{ + struct task_struct *prev, *next, *idle; + unsigned long *switch_count; + bool deactivate = false; + struct rq *rq; + u64 niffies; + int cpu; + + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + prev = rq->curr; + idle = rq->idle; + + /* + * do_exit() calls schedule() with preemption disabled as an exception; + * however we must fix that up, otherwise the next task will see an + * inconsistent (higher) preempt count. + * + * It also avoids the below schedule_debug() test from complaining + * about this. 
+ */ + if (unlikely(prev->state == TASK_DEAD)) + preempt_enable_no_resched_notrace(); + + schedule_debug(prev); + + local_irq_disable(); + rcu_note_context_switch(); + + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller to avoid the race with signal_wake_up(). + */ + smp_mb__before_spinlock(); + rq_lock(rq); +#ifdef CONFIG_SMP + if (rq->preempt) { + /* + * Make sure resched_curr hasn't triggered a preemption + * locklessly on a task that has since scheduled away. Spurious + * wakeup of idle is okay though. + */ + if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { + rq->preempt = NULL; + clear_preempt_need_resched(); + rq_unlock_irq(rq); + return; + } + rq->preempt = NULL; + } +#endif + + switch_count = &prev->nivcsw; + if (!preempt && prev->state) { + if (unlikely(signal_pending_state(prev->state, prev))) { + prev->state = TASK_RUNNING; + } else { + deactivate = true; + prev->on_rq = 0; + + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev); + if (to_wakeup) { + /* This shouldn't happen, but does */ + if (WARN_ONCE((to_wakeup == prev), "Waking up prev as worker\n")) + deactivate = false; + else + try_to_wake_up_local(to_wakeup); + } + } + } + switch_count = &prev->nvcsw; + } + + /* + * Store the niffy value here for use by the next task's last_ran + * below to avoid losing niffies due to update_clocks being called + * again after this point. + */ + update_clocks(rq); + niffies = rq->niffies; + update_cpu_clock_switch(rq, prev); + if (rq->clock - rq->last_tick > HALF_JIFFY_NS) + rq->dither = 0; + else + rq->dither = HALF_JIFFY_US; + + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + + if (idle != prev) { + check_deadline(prev, rq); + return_task(prev, rq, cpu, deactivate); + } + + if (unlikely(!queued_notrunning())) { + next = idle; + schedstat_inc(rq, sched_goidle); + set_cpuidle_map(cpu); + update_load_avg(rq); + } else { + next = earliest_deadline_task(rq, cpu, idle); + if (likely(next->prio != PRIO_LIMIT)) + clear_cpuidle_map(cpu); + else { + set_cpuidle_map(cpu); + update_load_avg(rq); + } + } + + set_rq_task(rq, next); + next->last_ran = niffies; + + if (likely(prev != next)) { + /* + * Don't reschedule an idle task or deactivated tasks + */ + if (prev != idle && !deactivate) + resched_suitable_idle(prev); + if (next != idle) + check_siblings(rq); + else + wake_siblings(rq); + atomic64_inc(&grq.nr_switches); + rq->curr = next; + ++*switch_count; + + trace_sched_switch(preempt, prev, next); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + } else { + check_siblings(rq); + rq_unlock_irq(rq); + } +} + +static inline void sched_submit_work(struct task_struct *tsk) +{ + if (!tsk->state || tsk_is_pi_blocked(tsk) || + preempt_count() || + signal_pending_state(tsk->state, tsk)) + return; + + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. 
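+	 *
+	 * A minimal sketch of the hang this avoids (illustrative only,
+	 * not a call chain from this file):
+	 *
+	 *	blk_start_plug(&plug);
+	 *	submit_bio(bio);	<- sits on the plug list, unissued
+	 *	mutex_lock(&lock);	<- sleeps; without the flush below
+	 *				   the bio is never issued, and the
+	 *				   lock holder may be waiting on it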
+ */ + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} + +asmlinkage __visible void __sched schedule(void) +{ + struct task_struct *tsk = current; + + sched_submit_work(tsk); + do { + preempt_disable(); + __schedule(false); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} + +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +asmlinkage __visible void __sched schedule_user(void) +{ + /* + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. + * + * NB: There are buggy callers of this function. Ideally we + * should warn if prev_state != IN_USER, but that will trigger + * too frequently to make sense yet. + */ + enum ctx_state prev_state = exception_enter(); + schedule(); + exception_exit(prev_state); +} +#endif + +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + sched_preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + +static void __sched notrace preempt_schedule_common(void) +{ + do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ + preempt_disable_notrace(); + preempt_latency_start(1); + __schedule(true); + preempt_latency_stop(1); + preempt_enable_no_resched_notrace(); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + } while (need_resched()); +} + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage __visible void __sched notrace preempt_schedule(void) +{ + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (likely(!preemptible())) + return; + + preempt_schedule_common(); +} +NOKPROBE_SYMBOL(preempt_schedule); +EXPORT_SYMBOL(preempt_schedule); + +/** + * preempt_schedule_notrace - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. 
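+ *
+ * One possible path, sketched for illustration:
+ *
+ *	traced_function()
+ *	  preempt_enable_notrace()	<- NEED_RESCHED is set
+ *	    preempt_schedule_notrace()
+ *	      exception_enter()		<- leave user context state
+ *	      __schedule(true)
+ *	      exception_exit()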
+ */ +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ + preempt_disable_notrace(); + preempt_latency_start(1); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(true); + exception_exit(prev_ctx); + + preempt_latency_stop(1); + preempt_enable_no_resched_notrace(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_notrace); + +#endif /* CONFIG_PREEMPT */ + +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. + */ +asmlinkage __visible void __sched preempt_schedule_irq(void) +{ + enum ctx_state prev_state; + + /* Catch callers which need to be fixed */ + BUG_ON(preempt_count() || !irqs_disabled()); + + prev_state = exception_enter(); + + do { + preempt_disable(); + local_irq_enable(); + __schedule(true); + local_irq_disable(); + sched_preempt_enable_no_resched(); + } while (need_resched()); + + exception_exit(prev_state); +} + +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, + void *key) +{ + return try_to_wake_up(curr->private, mode, wake_flags); +} +EXPORT_SYMBOL(default_wake_function); + +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. + */ +void rt_mutex_setprio(struct task_struct *p, int prio) +{ + struct rq *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = __task_rq_lock(p); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. 
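+	 *
+	 * For contrast, the ordinary inheritance sequence this function
+	 * serves looks roughly like (sketch):
+	 *
+	 *	low-prio task L holds an rt_mutex
+	 *	high-prio task H blocks on that mutex
+	 *	  -> rt_mutex_setprio(L, H->prio)		(boost L)
+	 *	L unlocks the mutex
+	 *	  -> rt_mutex_setprio(L, L->normal_prio)	(deboost L)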
+ */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + + trace_sched_pi_setprio(p, prio); + oldprio = p->prio; + p->prio = prio; + if (task_running(rq, p)){ + if (prio > oldprio) + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); + if (prio < oldprio) + try_preempt(p, rq); + } +out_unlock: + __task_rq_unlock(rq); +} + +#endif + +/* + * Adjust the deadline for when the priority is to change, before it's + * changed. + */ +static inline void adjust_deadline(struct task_struct *p, int new_prio) +{ + p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); +} + +void set_user_nice(struct task_struct *p, long nice) +{ + int new_static, old_static; + unsigned long flags; + struct rq *rq; + + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + return; + new_static = NICE_TO_PRIO(nice); + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL/SCHED_BATCH: + */ + if (has_rt_policy(p)) { + p->static_prio = new_static; + goto out_unlock; + } + + adjust_deadline(p, new_static); + old_static = p->static_prio; + p->static_prio = new_static; + p->prio = effective_prio(p); + + if (task_queued(p)) { + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); + if (new_static < old_static) + try_preempt(p, rq); + } else if (task_running(rq, p)) { + set_rq_task(rq, p); + if (old_static < new_static) + resched_task(p); + } +out_unlock: + task_rq_unlock(rq, p, &flags); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = nice_to_rlimit(nice); + + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || + capable(CAP_SYS_NICE)); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); + nice = task_nice(current) + increment; + + nice = clamp_val(nice, MIN_NICE, MAX_NICE); + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * Return: The priority value as seen by users in /proc. + * RT tasks are offset by -100. Normal tasks are centered around 1, value goes + * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). 
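+ *
+ * Worked example (approximate; the scaling depends on the rr_interval
+ * tunables feeding ms_longest_deadline_diff()): a SCHED_NORMAL task
+ * whose deadline lies half of ms_longest_deadline_diff() away gets
+ * delta = 40 / 2 = 20 added, and an idleprio task gets a further +40.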
+ */ +int task_prio(const struct task_struct *p) +{ + int delta, prio = p->prio - MAX_RT_PRIO; + + /* rt tasks and iso tasks */ + if (prio <= 0) + goto out; + + /* Convert to ms to avoid overflows */ + delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); + delta = delta * 40 / ms_longest_deadline_diff(); + if (delta > 0 && delta <= 80) + prio += delta; + if (idleprio_task(p)) + prio += 40; +out: + return prio; +} + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + * + * Return: The idle task for the cpu @cpu. + */ +struct task_struct *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise. + */ +static inline struct task_struct *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_vpid(pid) : current; +} + +/* Actually do priority change: must hold rq lock. */ +static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, + int prio, bool keep_boost) +{ + int oldrtprio, oldprio; + + p->policy = policy; + oldrtprio = p->rt_priority; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + oldprio = p->prio; + /* + * Keep a potential priority boosting if called from + * sched_setscheduler(). + */ + if (keep_boost) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); + } else + p->prio = p->normal_prio; + + if (task_running(rq, p)) { + set_rq_task(rq, p); + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); + if (p->prio < oldprio || p->rt_priority > oldrtprio) + try_preempt(p, rq); + } +} + +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); + rcu_read_unlock(); + return match; +} + +static int +__sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool user, bool pi) +{ + struct sched_param zero_param = { .sched_priority = 0 }; + unsigned long flags, rlim_rtprio = 0; + int retval, oldpolicy = -1; + int reset_on_fork; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); + + if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { + unsigned long lflags; + + if (!lock_task_sighand(p, &lflags)) + return -ESRCH; + rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); + unlock_task_sighand(p, &lflags); + if (rlim_rtprio) + goto recheck; + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + * We also set the parameter to zero to pass the checks. 
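+		 *
+		 * For example (illustrative values): an unprivileged
+		 * caller asking for SCHED_FIFO priority 50 with a zero
+		 * RLIMIT_RTPRIO ends up with SCHED_ISO, priority 0,
+		 * where mainline would return -EPERM instead.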
+ */ + policy = SCHED_ISO; + param = &zero_param; + } +recheck: + /* double check policy once rq lock held */ + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; + policy = oldpolicy = p->policy; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (!SCHED_RANGE(policy)) + return -EINVAL; + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and + * SCHED_BATCH is 0. + */ + if (param->sched_priority < 0 || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) + return -EINVAL; + if (is_rt_policy(policy) != (param->sched_priority != 0)) + return -EINVAL; + + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (user && !capable(CAP_SYS_NICE)) { + if (is_rt_policy(policy)) { + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } else { + switch (p->policy) { + /* + * Can only downgrade policies but not back to + * SCHED_NORMAL + */ + case SCHED_ISO: + if (policy == SCHED_ISO) + goto out; + if (policy != SCHED_NORMAL) + return -EPERM; + break; + case SCHED_BATCH: + if (policy == SCHED_BATCH) + goto out; + if (policy != SCHED_IDLEPRIO) + return -EPERM; + break; + case SCHED_IDLEPRIO: + if (policy == SCHED_IDLEPRIO) + goto out; + return -EPERM; + default: + break; + } + } + + /* can't change other user's priorities */ + if (!check_same_owner(p)) + return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; + } + + if (user) { + retval = security_task_setscheduler(p); + if (retval) + return retval; + } + + /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + * + * To be able to change p->policy safely, the runqueue lock must be + * held. + */ + rq = task_rq_lock(p, &flags); + + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + task_rq_unlock(rq, p, &flags); + return -EINVAL; + } + + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!is_rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + task_rq_unlock(rq, p, &flags); + return 0; + } + + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &flags); + goto recheck; + } + p->sched_reset_on_fork = reset_on_fork; + + __setscheduler(p, rq, policy, param->sched_priority, pi); + task_rq_unlock(rq, p, &flags); + + if (pi) + rt_mutex_adjust_pi(p); +out: + return 0; +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + * + * NOTE that the task may be already dead. 
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true, true);
+}
+
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	const struct sched_param param = { .sched_priority = attr->sched_priority };
+	int policy = attr->sched_policy;
+
+	return __sched_setscheduler(p, policy, &param, true, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+			       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param lparam;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we don't know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+	/* sched/core.c uses zero here but we already know ret is zero */
+	return 0;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	return -E2BIG;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+				       struct sched_param __user *param)
+{
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY	-1
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+			       unsigned int, flags)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || flags)
+		return -EINVAL;
+
+	retval = sched_copy_attr(uattr, &attr);
+	if (retval)
+		return retval;
+
+	if ((int)attr.sched_policy < 0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	rcu_read_unlock();
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+	struct sched_param lp = { .sched_priority = 0 };
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	if (has_rt_policy(p))
+		lp.sched_priority = p->rt_priority;
+	rcu_read_unlock();
+
+	/*
+	 * This one might sleep, we cannot do it with a spinlock held ...
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get incomplete information.
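+	 *
+	 * Example (sketch; SCHED_ATTR_SIZE_VER0 is 48 bytes): an old
+	 * binary passes usize == 48 while the kernel struct is larger.
+	 * If every byte of the kernel copy past offset 48 is zero,
+	 * attr->size is clamped to 48 and the copy succeeds; otherwise
+	 * -EFBIG tells user-space its view is too old for this task.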
+ */ + if (usize < sizeof(*attr)) { + unsigned char *addr; + unsigned char *end; + + addr = (void *)attr + usize; + end = (void *)attr + sizeof(*attr); + + for (; addr < end; addr++) { + if (*addr) + return -EFBIG; + } + + attr->size = usize; + } + + ret = copy_to_user(uattr, attr, attr->size); + if (ret) + return -EFAULT; + + /* sched/core.c uses zero here but we already know ret is zero */ + return ret; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. + */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || size > PAGE_SIZE || + size < SCHED_ATTR_SIZE_VER0 || flags) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + attr.sched_policy = p->policy; + if (rt_task(p)) + attr.sched_priority = p->rt_priority; + else + attr.sched_nice = task_nice(p); + + rcu_read_unlock(); + + retval = sched_read_attr(uattr, &attr, size); + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + cpumask_var_t cpus_allowed, new_mask; + struct task_struct *p; + int retval; + + get_online_cpus(); + rcu_read_lock(); + + p = find_process_by_pid(pid); + if (!p) { + rcu_read_unlock(); + put_online_cpus(); + return -ESRCH; + } + + /* Prevent p going away */ + get_task_struct(p); + rcu_read_unlock(); + + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + retval = -EPERM; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } + + retval = security_task_setscheduler(p); + if (retval) + goto out_unlock; + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); +again: + retval = __set_cpus_allowed_ptr(p, new_mask, true); + + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + } +out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: + put_task_struct(p); + put_online_cpus(); + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + cpumask_t *new_mask) +{ + if (len < sizeof(cpumask_t)) { + memset(new_mask, 0, sizeof(cpumask_t)); + } else if (len > sizeof(cpumask_t)) { + len = sizeof(cpumask_t); + } + return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; +} + + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; +} + +long sched_getaffinity(pid_t pid, cpumask_t *mask) +{ + struct task_struct *p; + unsigned long flags; + int retval; + + get_online_cpus(); + rcu_read_lock(); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +out_unlock: + rcu_read_unlock(); + put_online_cpus(); + + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) + return -EINVAL; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) + ret = -EFAULT; + else + ret = retlen; + } + free_cpumask_var(mask); + + return ret; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. It does this by + * scheduling away the current task. If it still has the earliest deadline + * it will be scheduled again as the next task. + * + * Return: 0. + */ +SYSCALL_DEFINE0(sched_yield) +{ + struct task_struct *p; + struct rq *rq; + + p = current; + rq = this_rq_lock(); + time_slice_expired(p, rq); + schedstat_inc(task_rq(p), yld_count); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); + sched_preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +int __sched _cond_resched(void) +{ + if (should_resched(0)) { + preempt_schedule_common(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(_cond_resched); + +/* + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). 
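+ *
+ * Typical usage, sketched with hypothetical helpers (more_work() and
+ * do_chunk() are placeholders; the lock may have been dropped and
+ * retaken whenever this returns 1):
+ *
+ *	spin_lock(&lock);
+ *	while (more_work()) {
+ *		do_chunk();
+ *		cond_resched_lock(&lock);
+ *	}
+ *	spin_unlock(&lock);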
+ */ +int __cond_resched_lock(spinlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held(lock); + + if (spin_needbreak(lock) || resched) { + spin_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + spin_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_lock); + +int __sched __cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { + local_bh_enable(); + preempt_schedule_common(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(__cond_resched_softirq); + +/** + * yield - yield the current processor to other threads. + * + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Return: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. + */ +int __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *rq_p; + struct rq *rq, *p_rq; + unsigned long flags; + int yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (task_running(p_rq, p) || p->state) { + yielded = -ESRCH; + goto out_irq; + } + + double_rq_lock(rq, p_rq); + if (unlikely(task_rq(p) != p_rq)) { + double_rq_unlock(rq, p_rq); + goto again; + } + + yielded = 1; + rq_p = rq->curr; + if (p->deadline > rq_p->deadline) + p->deadline = rq_p->deadline; + p->time_slice += rq_p->time_slice; + if (p->time_slice > timeslice()) + p->time_slice = timeslice(); + time_slice_expired(rq_p, rq); + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); + + if (yielded > 0) + schedule(); + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. 
+ * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ + +long __sched io_schedule_timeout(long timeout) +{ + int old_iowait = current->in_iowait; + struct rq *rq; + long ret; + + current->in_iowait = 1; + blk_schedule_flush_plug(current); + + delayacct_blkio_start(); + rq = raw_rq(); + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + current->in_iowait = old_iowait; + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); + + return ret; +} +EXPORT_SYMBOL(io_schedule_timeout); + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLEPRIO: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLEPRIO: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct task_struct *p; + unsigned int time_slice; + unsigned long flags; + struct timespec t; + struct rq *rq; + int retval; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + rcu_read_lock(); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + rq = task_rq_lock(p, &flags); + time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); + task_rq_unlock(rq, p, &flags); + + rcu_read_unlock(); + t = ns_to_timespec(time_slice); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + +void sched_show_task(struct task_struct *p) +{ + unsigned long free = 0; + int ppid; + unsigned long state = p->state; + + if (state) + state = __ffs(state) + 1; + printk(KERN_INFO "%-15.15s %c", p->comm, + state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 + if (state == TASK_RUNNING) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + free = stack_not_used(p); +#endif + ppid = 0; + rcu_read_lock(); + if (pid_alive(p)) + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), ppid, + (unsigned long)task_thread_info(p)->flags); + + print_worker_info(KERN_INFO, p); + show_stack(p, NULL); +} + +void show_state_filter(unsigned long state_filter) +{ + struct task_struct *g, *p; + +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); +#else + printk(KERN_INFO + " task PC stack pid father\n"); +#endif + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. + */ + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); + if (!state_filter || (p->state & state_filter)) + sched_show_task(p); + } + + rcu_read_unlock(); + /* + * Only show locks if all tasks are dumped: + */ + if (!state_filter) + debug_show_all_locks(); +} + +void dump_cpu_task(int cpu) +{ + pr_info("Task dump for CPU %d:\n", cpu); + sched_show_task(cpu_curr(cpu)); +} + +#ifdef CONFIG_SMP +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +{ + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); +} + +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&p->pi_lock); + + cpumask_copy(tsk_cpus_allowed(p), new_mask); + + if (task_queued(p)) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + } + if (needs_other_cpu(p, task_cpu(p))) + set_task_cpu(p, valid_task_cpu(p)); +} +#endif + +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. + */ +void init_idle(struct task_struct *idle, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); + idle->last_ran = rq->niffies; + idle->state = TASK_RUNNING; + /* Setting prio to illegal value shouldn't matter when never queued */ + idle->prio = PRIO_LIMIT; + + kasan_unpoison_task_stack(idle); + +#ifdef CONFIG_SMP + /* + * It's possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialisation. + */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMT_NICE + idle->smt_bias = 0; +#endif +#endif + set_rq_task(rq, idle); + + /* Silence PROVE_RCU */ + rcu_read_lock(); + set_task_cpu(idle, cpu); + rcu_read_unlock(); + + rq->curr = rq->idle = idle; + idle->on_rq = TASK_ON_RQ_QUEUED; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); + + /* Set the preempt count _outside_ the spinlocks! 
*/
+	init_idle_preempt_count(idle, cpu);
+
+	ftrace_graph_init_idle_task(idle, cpu);
+#ifdef CONFIG_SMP
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
+			      const struct cpumask __maybe_unused *trial)
+{
+	return 1;
+}
+
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary. Thus, cpusets are not
+	 * applicable for such threads. This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY)
+		ret = -EINVAL;
+
+	return ret;
+}
+
+void resched_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	rq_lock_irqsave(rq, &flags);
+	resched_task(cpu_curr(cpu));
+	rq_unlock_irqrestore(rq, &flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+void nohz_balance_enter_idle(int cpu)
+{
+}
+
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int i, cpu = smp_processor_id();
+	struct sched_domain *sd;
+
+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+		return cpu;
+
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (cpu == i)
+				continue;
+
+			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
+	}
+
+	if (!is_housekeeping_cpu(cpu))
+		cpu = housekeeping_any_cpu();
+unlock:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU.
In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + if (cpu == smp_processor_id()) + return; + + if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + +void wake_up_nohz_cpu(int cpu) +{ + wake_up_idle_cpu(cpu); +} +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool running_wrong = false; + struct cpumask old_mask; + bool queued = false; + unsigned long flags; + struct rq *rq; + int ret = 0; + + rq = task_rq_lock(p, &flags); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } + + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + + cpumask_copy(&old_mask, tsk_cpus_allowed(p)); + if (cpumask_equal(&old_mask, new_mask)) + goto out; + + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + ret = -EINVAL; + goto out; + } + + queued = task_queued(p); + + do_set_cpus_allowed(p, new_mask); + + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + tsk_nr_cpus_allowed(p) != 1); + } + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + if (task_running(rq, p)) { + /* Task is running on the wrong cpu now, reschedule it. */ + if (rq == this_rq()) { + set_tsk_need_resched(p); + running_wrong = true; + } else + resched_task(p); + } else { + int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + struct rq *dest_rq = cpu_rq(dest_cpu); + + /* Switch rq locks here */ + lock_second_rq(rq, dest_rq); + set_task_cpu(p, dest_cpu); + rq_unlock(rq); + + rq = dest_rq; + } +out: + if (queued && !cpumask_subset(new_mask, &old_mask)) + try_preempt(p, rq); + if (running_wrong) + preempt_disable(); + task_rq_unlock(rq, p, &flags); + + if (running_wrong) { + __schedule(true); + preempt_enable(); + } + + return ret; +} + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Run through task list and find tasks affined to the dead cpu, then remove + * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold + * cpu 0 and src_cpu's runqueue locks. 
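+ *
+ * Example (hypothetical numbers): when cpu 3 goes offline, a task
+ * affined only to cpu 3 gets cpu 3 cleared from and cpu 0 set in its
+ * allowed mask, and p->zerobound is flagged so that unbind_zero() can
+ * give cpu 3 back when it comes online again.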
+ */ +static void bind_zero(int src_cpu) +{ + struct task_struct *p, *t; + int bound = 0; + + if (src_cpu == 0) + return; + + do_each_thread(t, p) { + if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) { + bool local = (task_cpu(p) == src_cpu); + + /* task_running is the cpu stopper thread */ + if (local && task_running(task_rq(p), p)) + continue; + atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p)); + atomic_set_cpu(0, tsk_cpus_allowed(p)); + p->zerobound = true; + bound++; + if (local) + set_task_cpu(p, 0); + } + } while_each_thread(t, p); + + if (bound) { + printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", + bound, src_cpu); + } +} + +/* Find processes with the zerobound flag and reenable their affinity for the + * CPU coming alive. */ +static void unbind_zero(int src_cpu) +{ + int unbound = 0, zerobound = 0; + struct task_struct *p, *t; + + if (src_cpu == 0) + return; + + do_each_thread(t, p) { + if (!p->mm) + p->zerobound = false; + if (p->zerobound) { + unbound++; + cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p)); + /* Once every CPU affinity has been re-enabled, remove + * the zerobound flag */ + if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) { + p->zerobound = false; + zerobound++; + } + } + } while_each_thread(t, p); + + if (unbound) { + printk(KERN_INFO "Added affinity for %d processes to cpu %d\n", + unbound, src_cpu); + } + if (zerobound) { + printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n", + zerobound); + } +} + +/* + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) { + switch_mm_irqs_off(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } + mmdrop(mm); +} +#else /* CONFIG_HOTPLUG_CPU */ +static void unbind_zero(int src_cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ + +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param stop_param = { .sched_priority = STOP_PRIO }; + struct sched_param start_param = { .sched_priority = 0 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling policy so that + * it can die in pieces. + */ + sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); + } +} + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. 
+ */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +#define CPU_LOAD_IDX_MAX 5 +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + 
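+/*
+ * The tree built above shows up under /proc/sys/kernel/sched_domain/;
+ * roughly (the exact layout depends on this machine's topology):
+ *
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/flags
+ *	/proc/sys/kernel/sched_domain/cpu0/domain1/...
+ *	/proc/sys/kernel/sched_domain/cpu1/...
+ */
+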
+/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ + +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + cpumask_set_cpu(cpu_of(rq), rq->rd->online); + rq->online = true; + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + int cpu = cpu_of(rq); + + cpumask_clear_cpu(cpu, rq->rd->online); + rq->online = false; + clear_cpuidle_map(cpu); + } +} + +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + +#ifdef CONFIG_SCHED_DEBUG + +static __read_mostly int sched_debug_enabled; + +static int __init sched_debug_setup(char *str) +{ + sched_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + struct cpumask *groupmask) +{ + cpumask_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; + } + + printk(KERN_CONT "span %*pbl level %s\n", + cpumask_pr_args(sched_domain_span(sd)), sd->name); + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + + printk(KERN_CONT "\n"); + + if (!cpumask_equal(sched_domain_span(sd), groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + if (!sched_debug_enabled) + return; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + for (;;) { + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) + break; + level++; + sd = sd->parent; + if (!sd) + break; + } +} +#else /* !CONFIG_SCHED_DEBUG */ +# define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpumask_weight(sched_domain_span(sd)) == 1) + return 1; + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_AFFINE)) + return 0; + + return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) + return 0; + + if (~cflags & pflags) + return 0; + + return 1; +} + +static void free_rootdomain(struct rcu_head *rcu) +{ + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); + + cpupri_cleanup(&rd->cpupri); + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + struct root_domain *old_rd = NULL; + unsigned long flags; + + rq_lock_irqsave(rq, &flags); + + if 
(rq->rd) { + old_rd = rq->rd; + + if (cpumask_test_cpu(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpumask_clear_cpu(rq->cpu, old_rd->span); + + /* + * If we dont want to free the old_rd yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + + rq_unlock_irqrestore(rq, &flags); + + if (old_rd) + call_rcu_sched(&old_rd->rcu, free_rootdomain); +} + +static int init_rootdomain(struct root_domain *rd) +{ + memset(rd, 0, sizeof(*rd)); + + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri) != 0) + goto free_rto_mask; + return 0; + +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; +} + +static void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain); + + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + if (init_rootdomain(rd) != 0) { + kfree(rd); + return NULL; + } + + return rd; +} + +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + + if (sd_parent_degenerate(tmp, parent)) { + tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + /* + * Transfer SD_PREFER_SIBLING down in case of a + * degenerate parent; the spans match for this + * so the property transfers. 
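+ *
+ * (worked example, added for clarity: if a parent spans exactly
+ * the same CPUs as tmp and sets no flags that tmp lacks,
+ * sd_parent_degenerate() is true, the parent is spliced out of
+ * the parent/child chain here, and any SD_PREFER_SIBLING it
+ * carried moves down to tmp.)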
+ */ + if (parent->flags & SD_PREFER_SIBLING) + tmp->flags |= SD_PREFER_SIBLING; + destroy_sched_domain(parent, cpu); + } else + tmp = tmp->parent; + } + + if (sd && sd_degenerate(sd)) { + tmp = sd; + sd = sd->parent; + destroy_sched_domain(tmp, cpu); + if (sd) + sd->child = NULL; + } + + sched_domain_debug(sd, cpu); + + rq_attach_root(rq, rd); + tmp = rq->sd; + rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); +} + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ret; + + alloc_bootmem_cpumask_var(&cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } + return 1; +} + +__setup("isolcpus=", isolated_cpu_setup); + +struct s_data { + struct sched_domain ** __percpu sd; + struct root_domain *rd; +}; + +enum s_alloc { + sa_rootdomain, + sa_sd, + sa_sd_storage, + sa_none, +}; + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +static int default_relax_domain_level = -1; +int sched_domain_level_max; + +static int __init setup_relax_domain_level(char *str) +{ + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } +} + +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_rootdomain: + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); /* fall through */ + case sa_none: + break; + } +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) +{ + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; + d->rd = alloc_rootdomain(); + if (!d->rd) + return sa_sd; + return sa_rootdomain; +} + +/* + * NULL the sd_data elements we've used to build the sched_domain + * structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; +} + +#ifdef CONFIG_NUMA +static int sched_domains_numa_levels; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; +#endif + +/* + * SD_flags allowed in topology descriptions. 
+ * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) + +static struct sched_domain * +sd_init(struct sched_domain_topology_level *tl, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 125, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 1*SD_WAKE_AFFINE + | 0*SD_SHARE_CPUCAPACITY + | 0*SD_SHARE_PKG_RESOURCES + | 0*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | 0*SD_NUMA + | sd_flags + , + + .last_balance = jiffies, + .balance_interval = sd_weight, + .smt_gain = 0, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif + }; + + /* + * Convert topological properties into behaviour. + */ + + if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; + + return sd; +} + +/* + * Topology list, bottom-up. 
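+ *
+ * (clarifying note, added: "bottom-up" means smallest spans first,
+ * i.e. SMT siblings, then cores sharing a cache (MC), then the
+ * whole package (DIE), with NUMA levels appended at runtime by
+ * sched_init_numa().)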
+ */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + +static void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. + * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. + */ + next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { + for (j = 0; j < nr_node_ids; j++) { + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; + } + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; + } + /* + * 'level' contains the number of unique distances, excluding the + * identity distance node_distance(i,i). + * + * The sched_domains_numa_distance[] array includes the actual distance + * numbers. + */ + + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. 
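+ *
+ * (worked example for the selection sort above, added for
+ * illustration: on a machine whose node_distance() table holds 10
+ * on the diagonal and the values 20 and 30 off it, the sort finds
+ * the unique remote distances {20, 30}, so level == 2 and two NUMA
+ * topology levels are appended.)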
+ */ + sched_domains_numa_levels = 0; + + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + if (!sched_domains_numa_masks) + return; + + /* + * Now for each level, construct a mask per node which contains all + * cpus of nodes that are that many hops away from us. + */ + for (i = 0; i < level; i++) { + sched_domains_numa_masks[i] = + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!sched_domains_numa_masks[i]) + return; + + for (j = 0; j < nr_node_ids; j++) { + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!mask) + return; + + sched_domains_numa_masks[i][j] = mask; + + for_each_node(k) { + if (node_distance(j, k) > sched_domains_numa_distance[i]) + continue; + + cpumask_or(mask, mask, cpumask_of_node(k)); + } + } + } + + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level + 1) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; + + /* + * .. and append 'j' levels of NUMA goodness. + */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, + .flags = SDTL_OVERLAP, + .numa_level = j, + SD_INIT_NAME(NUMA) + }; + } + + sched_domain_topology = tl; + + sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ + int node = cpu_to_node(cpu); + int i, j; + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +#else +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } +#endif /* CONFIG_NUMA */ + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for_each_sd_topology(tl) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for_each_sd_topology(tl) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + } + free_percpu(sdd->sd); + sdd->sd = NULL; + } +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *child, int cpu) +{ + struct sched_domain *sd = sd_init(tl, cpu); + if (!sd) + return child; + + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + 
child->parent = sd; + sd->child = child; + + if (!cpumask_subset(sched_domain_span(child), + sched_domain_span(sd))) { + pr_err("BUG: arch topology borken\n"); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" the %s domain not a subset of the %s domain\n", + child->name, sd->name); +#endif + /* Fixup, ensure @sd has at least @child cpus. */ + cpumask_or(sched_domain_span(sd), + sched_domain_span(sd), + sched_domain_span(child)); + } + + } + set_domain_attribute(sd, attr); + + return sd; +} + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) +{ + enum s_alloc alloc_state; + struct sched_domain *sd; + struct s_data d; + int i, ret = -ENOMEM; + + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + + /* Set up domains for cpus specified by the cpu_map. */ + for_each_cpu(i, cpu_map) { + struct sched_domain_topology_level *tl; + + sd = NULL; + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; + if (tl->flags & SDTL_OVERLAP) + sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; + } + } + + /* Calculate CPU capacity for physical packages and nodes */ + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); + } + } + + /* Attach the domains */ + rcu_read_lock(); + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d.sd, i); + cpu_attach_domain(sd, d.rd, i); + } + rcu_read_unlock(); + + ret = 0; +error: + __free_domain_allocs(&d, alloc_state, cpu_map); + return ret; +} + +static cpumask_var_t *doms_cur; /* current sched domains */ +static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +static cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __weak arch_update_cpu_topology(void) +{ + return 0; +} + +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. 
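+ *
+ * (example, added for clarity: booting with isolcpus=2,3 removes
+ * CPUs 2 and 3 from doms_cur[0] via the cpu_isolated_map andnot
+ * below, so no sched domain is ever attached to them.)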
+ */ +static int init_sched_domains(const struct cpumask *cpu_map) +{ + int err; + + arch_update_cpu_topology(); + ndoms_cur = 1; + doms_cur = alloc_sched_domains(ndoms_cur); + if (!doms_cur) + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + err = build_sched_domains(doms_cur[0], NULL); + register_sched_domain_sysctl(); + + return err; +} + +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ + int i; + + rcu_read_lock(); + for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); + rcu_read_unlock(); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new == NULL it will be replaced with cpu_online_mask. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + int i, j, n; + int new_topology; + + mutex_lock(&sched_domains_mutex); + + /* always unregister in case we don't destroy any domains */ + unregister_sched_domain_sysctl(); + + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + + n = doms_new ? 
ndoms_new : 0; + + /* Destroy deleted domains */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) + goto match1; + } + /* no match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur[i]); +match1: + ; + } + + n = ndoms_cur; + if (doms_new == NULL) { + n = 0; + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); + } + + /* Build new domains */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) + goto match2; + } + /* no match - add a new doms_new */ + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); +match2: + ; + } + + /* Remember the new sched domains */ + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ + doms_cur = doms_new; + dattr_cur = dattr_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + mutex_unlock(&sched_domains_mutex); +} + +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + +/* + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. + */ +static void cpuset_cpu_active(void) +{ + if (cpuhp_tasks_frozen) { + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + return; + } + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + } + + cpuset_update_active_cpus(true); +} + +static int cpuset_cpu_inactive(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) { + cpuset_update_active_cpus(false); + } else { + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + } + return 0; +} + +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. + */ + rq_lock_irqsave(rq, &flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + unbind_zero(cpu); + rq_unlock_irqrestore(rq, &flags); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. 
+ * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +int sched_cpu_starting(unsigned int __maybe_unused cpu) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(rq, cpu_rq(0)); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + bind_zero(cpu); + double_rq_unlock(rq, cpu_rq(0)); + local_irq_restore(flags); + + return 0; +} +#endif + +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +/* + * Cheaper version of the below functions in case support for SMT and MC is + * compiled in but CPUs have no siblings. + */ +static bool sole_cpu_idle(struct rq *rq) +{ + return rq_idle(rq); +} +#endif +#ifdef CONFIG_SCHED_SMT +static const cpumask_t *thread_cpumask(int cpu) +{ + return topology_sibling_cpumask(cpu); +} +/* All this CPU's SMT siblings are idle */ +static bool siblings_cpu_idle(struct rq *rq) +{ + return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map); +} +#endif +#ifdef CONFIG_SCHED_MC +static const cpumask_t *core_cpumask(int cpu) +{ + return topology_core_cpumask(cpu); +} +/* All this CPU's shared cache siblings are idle */ +static bool cache_cpu_idle(struct rq *rq) +{ + return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map); +} +#endif + +enum sched_domain_level { + SD_LV_NONE = 0, + SD_LV_SIBLING, + SD_LV_MC, + SD_LV_BOOK, + SD_LV_CPU, + SD_LV_NODE, + SD_LV_ALLNODES, + SD_LV_MAX +}; + +void __init sched_init_smp(void) +{ + struct sched_domain *sd; + int cpu, other_cpu; +#ifdef CONFIG_SCHED_SMT + bool smt_threads = false; +#endif + cpumask_var_t non_isolated_cpus; + struct rq *rq; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + + sched_init_numa(); + + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ + mutex_lock(&sched_domains_mutex); + init_sched_domains(cpu_active_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + BUG(); + free_cpumask_var(non_isolated_cpus); + + mutex_lock(&sched_domains_mutex); + local_irq_disable(); + lock_all_rqs(); + /* + * Set up the relative cache distance of each online cpu from each + * other in a simple array for quick lookup. Locality is determined + * by the closest sched_domain that CPUs are separated by. CPUs with + * shared cache in SMT and MC are treated as local. Separate CPUs + * (within the same package or physically) within the same node are + * treated as not local. CPUs not even in the same domain (different + * nodes) are treated as very distant. 
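+ *
+ * (summary of the scale used below, added for clarity: 0 means the
+ * same CPU, 1 an SMT sibling, 2 a core sharing cache, 3 another
+ * core in the same node, 4 a CPU in a different node; lower is
+ * closer.)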
+ */ + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + + /* First check if this cpu is in the same node */ + for_each_domain(cpu, sd) { + if (sd->level > SD_LV_MC) + continue; + /* Set locality to local node if not already found lower */ + for_each_cpu(other_cpu, sched_domain_span(sd)) { + if (rq->cpu_locality[other_cpu] > 3) + rq->cpu_locality[other_cpu] = 3; + } + } + + /* + * Each runqueue has its own function in case it doesn't have + * siblings of its own allowing mixed topologies. + */ +#ifdef CONFIG_SCHED_MC + for_each_cpu(other_cpu, core_cpumask(cpu)) { + if (rq->cpu_locality[other_cpu] > 2) + rq->cpu_locality[other_cpu] = 2; + } + if (cpumask_weight(core_cpumask(cpu)) > 1) { + cpumask_copy(&rq->core_mask, core_cpumask(cpu)); + cpumask_clear_cpu(cpu, &rq->core_mask); + rq->cache_idle = cache_cpu_idle; + } +#endif +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(thread_cpumask(cpu)) > 1) { + cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); + cpumask_clear_cpu(cpu, &rq->thread_mask); + for_each_cpu(other_cpu, thread_cpumask(cpu)) + rq->cpu_locality[other_cpu] = 1; + rq->siblings_idle = siblings_cpu_idle; + smt_threads = true; + } +#endif + } + for_each_possible_cpu(cpu) { + int total_cpus = 1, locality; + + rq = cpu_rq(cpu); + for (locality = 1; locality <= 4; locality++) { + for_each_possible_cpu(other_cpu) { + if (rq->cpu_locality[other_cpu] == locality) + rq->rq_order[total_cpus++] = cpu_rq(other_cpu); + } + } + } +#ifdef CONFIG_SMT_NICE + if (smt_threads) { + check_siblings = &check_smt_siblings; + wake_siblings = &wake_smt_siblings; + smt_schedule = &smt_should_schedule; + } +#endif + unlock_all_rqs(); + local_irq_enable(); + mutex_unlock(&sched_domains_mutex); + + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + + for_each_online_cpu(other_cpu) { + if (other_cpu <= cpu) + continue; + printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); + } + } + sched_smp_initialized = true; +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +#ifdef CONFIG_CGROUP_SCHED +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; +}; + +/* + * Default task group. + * Every task in system belongs to this group at bootup. 
+ */ +struct task_group root_task_group; +LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; +#endif /* CONFIG_CGROUP_SCHED */ + +void __init sched_init(void) +{ +#ifdef CONFIG_SMP + int cpu_ids; +#endif + int i; + struct rq *rq; + + prio_ratios[0] = 128; + for (i = 1 ; i < NICE_WIDTH ; i++) + prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; + + atomic_set(&grq.nr_running, 0); + atomic_set(&grq.nr_uninterruptible, 0); + atomic64_set(&grq.nr_switches, 0); + skiplist_node_init(&init_task.node); + +#ifdef CONFIG_SMP + init_defrootdomain(); + atomic_set(&grq.qnr, 0); + cpumask_clear(&grq.cpu_idle_map); +#else + uprq = &per_cpu(runqueues, 0); +#endif + +#ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + + list_add(&root_task_group.list, &task_groups); + INIT_LIST_HEAD(&root_task_group.children); + INIT_LIST_HEAD(&root_task_group.siblings); +#endif /* CONFIG_CGROUP_SCHED */ + for_each_possible_cpu(i) { + rq = cpu_rq(i); + skiplist_init(&rq->node); + rq->sl = new_skiplist(&rq->node); + raw_spin_lock_init(&rq->lock); + rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; + rq->last_jiffy = jiffies; + rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = + rq->iowait_ns = rq->idle_ns = 0; + rq->dither = 0; + set_rq_task(rq, &init_task); + rq->iso_ticks = 0; + rq->iso_refractory = false; +#ifdef CONFIG_SMP + rq->sd = NULL; + rq->rd = NULL; + rq->online = false; + rq->cpu = i; + rq_attach_root(rq, &def_root_domain); +#endif + atomic_set(&rq->nr_iowait, 0); + } + +#ifdef CONFIG_SMP + cpu_ids = i; + /* + * Set the base locality for cpu cache distance calculation to + * "distant" (3). Make sure the distance from a CPU to itself is 0. + */ + for_each_possible_cpu(i) { + int j; + + rq = cpu_rq(i); +#ifdef CONFIG_SCHED_SMT + rq->siblings_idle = sole_cpu_idle; +#endif +#ifdef CONFIG_SCHED_MC + rq->cache_idle = sole_cpu_idle; +#endif + rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); + for_each_possible_cpu(j) { + if (i == j) + rq->cpu_locality[j] = 0; + else + rq->cpu_locality[j] = 4; + } + rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); + rq->rq_order[0] = rq; + for (j = 1; j < cpu_ids; j++) + rq->rq_order[j] = cpu_rq(j); + } +#endif + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, smp_processor_id()); + +#ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + idle_thread_set_boot_cpu(); +#endif /* SMP */ + + init_schedstats(); +} + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = preempt_count() + rcu_preempt_depth(); + + return (nested == preempt_offset); +} + +void __might_sleep(const char *file, int line, int preempt_offset) +{ + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change); + + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; /* ratelimiting */ + + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif + dump_stack(); +} +EXPORT_SYMBOL(___might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +static inline void normalise_rt_tasks(void) +{ + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + /* + * Only normalize user tasks: + */ + if (p->flags & PF_KTHREAD) + continue; + + if (!rt_task(p) && !iso_task(p)) + continue; + + rq = task_rq_lock(p, &flags); + __setscheduler(p, rq, SCHED_NORMAL, 0, false); + task_rq_unlock(rq, p, &flags); + } + read_unlock(&tasklist_lock); +} + +void normalize_rt_tasks(void) +{ + normalise_rt_tasks(); +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +/* + * These functions are only useful for the IA64 MCA handling, or kdb. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + * + * Return: The current task for @cpu. 
+ */ +struct task_struct *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronised, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, struct task_struct *p) +{ + cpu_curr(cpu) = p; +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_common_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + vtime_account_user(prev); +#endif + arch_vtime_task_switch(prev); +} +#endif + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} +EXPORT_SYMBOL_GPL(task_cputime_adjusted); + +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} + +void vtime_account_system_irqsafe(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + vtime_account_system(tsk); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); + else + vtime_account_system(tsk); +} +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +/* + * Perform (stime * rtime) / total, but avoid multiplication overflow by + * losing precision when the numbers are big. + */ +static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) +{ + u64 scaled; + + for (;;) { + /* Make sure "rtime" is the bigger of stime/rtime */ + if (stime > rtime) { + u64 tmp = rtime; rtime = stime; stime = tmp; + } + + /* Make sure 'total' fits in 32 bits */ + if (total >> 32) + goto drop_precision; + + /* Does rtime (and thus stime) fit in 32 bits? */ + if (!(rtime >> 32)) + break; + + /* Can we just balance rtime/stime rather than dropping bits? 
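+	 *
+	 * (illustrative, added: halving rtime while doubling stime
+	 * keeps the product stime * rtime roughly unchanged, and the
+	 * stime >> 31 check ensures the doubled stime still fits in
+	 * 32 bits for the exact 32x32->64 multiply below.)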
*/ + if (stime >> 31) + goto drop_precision; + + /* We can grow stime and shrink rtime and try to make them both fit */ + stime <<= 1; + rtime >>= 1; + continue; + +drop_precision: + /* We drop from rtime, it has more bits than stime */ + rtime >>= 1; + total >>= 1; + } + + /* + * Make sure gcc understands that this is a 32x32->64 multiply, + * followed by a 64/32->64 divide. + */ + scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); + return (__force cputime_t) scaled; +} + +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ +static void cputime_adjust(struct task_cputime *curr, + struct prev_cputime *prev, + cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, stime, utime, total; + + stime = curr->stime; + total = stime + curr->utime; + + /* + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. + */ + rtime = nsecs_to_cputime(curr->sum_exec_runtime); + + /* + * Update userspace visible utime/stime values only if actual execution + * time is bigger than already exported. Note that can happen, that we + * provided bigger values due to scaling inaccuracy on big numbers. + */ + if (prev->stime + prev->utime >= rtime) + goto out; + + if (total) { + stime = scale_stime((__force u64)stime, + (__force u64)rtime, (__force u64)total); + utime = rtime - stime; + } else { + stime = rtime; + utime = 0; + } + + /* + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. + */ + prev->stime = max(prev->stime, stime); + prev->utime = max(prev->utime, utime); + +out: + *ut = prev->utime; + *st = prev->stime; +} + +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); + cputime_adjust(&cputime, &p->prev_cputime, ut, st); +} +EXPORT_SYMBOL_GPL(task_cputime_adjusted); + +/* + * Must be called with siglock held. 
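+ *
+ * (added note: callers typically take the lock via
+ * lock_task_sighand() before the group's prev_cputime is read and
+ * updated here.)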
+ */ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +void init_idle_bootup_task(struct task_struct *idle) +{} + +#ifdef CONFIG_SCHED_DEBUG +void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +{} + +void proc_sched_set_task(struct task_struct *p) +{} +#endif + +#ifdef CONFIG_SMP +#define SCHED_LOAD_SHIFT (10) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} +#endif + +#ifdef CONFIG_CGROUP_SCHED +static void sched_free_group(struct task_group *tg) +{ + kmem_cache_free(task_group_cache, tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ + struct task_group *tg; + + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); + if (!tg) + return ERR_PTR(-ENOMEM); + + return tg; +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ +} + +/* rcu callback to free various structures associated with a task group */ +static void sched_free_group_rcu(struct rcu_head *rhp) +{ + /* now it should be safe to free those cfs_rqs */ + sched_free_group(container_of(rhp, struct task_group, rcu)); +} + +void sched_destroy_group(struct task_group *tg) +{ + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, sched_free_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{ +} + +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + +static struct cgroup_subsys_state * +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; + + if (!parent) { + /* This is early initialization for the top cgroup */ + return &root_task_group.css; + } + + tg = sched_create_group(parent); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + return &tg->css; +} + +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + sched_offline_group(tg); +} + +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + /* + * Relies on the RCU grace period between css_released() and this. 
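+ *
+ * (added note: in other words, by the time this css_free callback
+ * runs, an RCU grace period has elapsed since css_released(), so
+ * the task_group is no longer reachable and sched_free_group()
+ * below is safe.)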
+ */ + sched_free_group(tg); +} + +static void cpu_cgroup_fork(struct task_struct *task) +{ +} + +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) +{ + return 0; +} + +static void cpu_cgroup_attach(struct cgroup_taskset *tset) +{ +} + +static struct cftype cpu_files[] = { + { } /* terminate */ +}; + +struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, + .css_free = cpu_cgroup_css_free, + .fork = cpu_cgroup_fork, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .legacy_cftypes = cpu_files, + .early_init = true, +}; +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h new file mode 100644 index 000000000..4e3115dac --- /dev/null +++ b/kernel/sched/MuQSS.h @@ -0,0 +1,274 @@ +#include +#include +#include +#include + +#ifndef MUQSS_SCHED_H +#define MUQSS_SCHED_H + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 + +/* + * This is the main, per-CPU runqueue data structure. + * This data should only be modified by the local cpu. + */ +struct rq { + struct task_struct *curr, *idle, *stop; + struct mm_struct *prev_mm; + + raw_spinlock_t lock; + + /* Stored data about rq->curr to work outside rq lock */ + u64 rq_deadline; + int rq_prio; + + /* Best queued id for use outside lock */ + u64 best_key; + + unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ + unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ + u64 niffies; /* Last time this RQ updated rq clock */ + u64 last_niffy; /* Last niffies as updated by local clock */ + u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ + + u64 load_update; /* When we last updated load */ + unsigned long load_avg; /* Rolling load average */ +#ifdef CONFIG_SMT_NICE + struct mm_struct *rq_mm; + int rq_smt_bias; /* Policy/nice level bias across smt siblings */ +#endif + /* Accurate timekeeping data */ + unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, + iowait_ns, idle_ns; + atomic_t nr_iowait; + + skiplist_node node; + skiplist *sl; +#ifdef CONFIG_SMP + struct task_struct *preempt; /* Preempt triggered on this task */ + + int cpu; /* cpu of this runqueue */ + bool online; + + struct root_domain *rd; + struct sched_domain *sd; + int *cpu_locality; /* CPU relative cache distance */ + struct rq **rq_order; /* RQs ordered by relative cache distance */ + +#ifdef CONFIG_SCHED_SMT + cpumask_t thread_mask; + bool (*siblings_idle)(struct rq *rq); + /* See if all smt siblings are idle */ +#endif /* CONFIG_SCHED_SMT */ +#ifdef CONFIG_SCHED_MC + cpumask_t core_mask; + bool (*cache_idle)(struct rq *rq); + /* See if all cache siblings are idle */ +#endif /* CONFIG_SCHED_MC */ +#endif /* CONFIG_SMP */ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif /* CONFIG_PARAVIRT */ +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ + + u64 clock, old_clock, last_tick; + u64 clock_task; + int dither; + + int iso_ticks; + bool iso_refractory; + +#ifdef CONFIG_SCHEDSTATS + + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif /* CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif +}; + +#ifdef CONFIG_SMP +struct rq *cpu_rq(int cpu); +#endif + +#ifndef CONFIG_SMP +extern struct rq *uprq; +#define cpu_rq(cpu) (uprq) +#define this_rq() (uprq) +#define raw_rq() (uprq) +#define task_rq(p) (uprq) +#define cpu_curr(cpu) ((uprq)->curr) +#else /* CONFIG_SMP */ +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#define this_rq() this_cpu_ptr(&runqueues) +#define raw_rq() raw_cpu_ptr(&runqueues) +#endif /* CONFIG_SMP */ + +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x20 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif + +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return READ_ONCE(rq->clock); +} + +static inline u64 rq_clock(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + return rq->clock_task; +} + +extern struct mutex sched_domains_mutex; +extern struct static_key_false sched_schedstats; + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. 
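+ *
+ * (usage sketch, added, not from the original header:
+ *
+ *	rcu_read_lock();
+ *	for_each_domain(cpu, sd)
+ *		span_weight = cpumask_weight(sched_domain_span(sd));
+ *	rcu_read_unlock();
+ *
+ * any code walking rq->sd this way must hold an RCU read lock or
+ * run with preemption disabled, as stated above.)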
+ */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + +#ifdef CONFIG_SMP +extern void sched_ttwu_pending(void); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); +#else +static inline void sched_ttwu_pending(void) { } +#endif + +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +static inline void cpufreq_trigger(u64 time, unsigned long util) +{ + struct update_util_data *data; + + if (util > SCHED_CAPACITY_SCALE) + util = SCHED_CAPACITY_SCALE; + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, SCHED_CAPACITY_SCALE); +} +#else +static inline void cpufreq_trigger(u64 time, unsigned long util) +{ +} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif + +#endif /* MUQSS_SCHED_H */ diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c deleted file mode 100644 index bb5bac4b2..000000000 --- a/kernel/sched/bfs.c +++ /dev/null @@ -1,7671 +0,0 @@ -/* - * kernel/sched/bfs.c, was kernel/sched.c - * - * Kernel scheduler and related syscalls - * - * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz - * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes - * a whole lot of those previous things. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#ifdef CONFIG_PARAVIRT -#include -#endif - -#include "cpupri.h" -#include "../workqueue_internal.h" -#include "../smpboot.h" - -#define CREATE_TRACE_POINTS -#include - -#include "bfs_sched.h" - -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -#define rt_task(p) rt_prio((p)->prio) -#define rt_queue(rq) rt_prio((rq)->rq_prio) -#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ - (policy) == SCHED_RR) -#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) - -#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) -#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) -#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy))) - -#define is_iso_policy(policy) ((policy) == SCHED_ISO) -#define iso_task(p) unlikely(is_iso_policy((p)->policy)) -#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy)) -#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO) - -#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) - -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) - -#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) -#define STOP_PRIO (MAX_RT_PRIO - 1) - -/* - * Some helpers for converting to/from various scales. Use shifts to get - * approximate multiples of ten for less overhead. - */ -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define JIFFY_NS (1000000000 / HZ) -#define HALF_JIFFY_NS (1000000000 / HZ / 2) -#define HALF_JIFFY_US (1000000 / HZ / 2) -#define MS_TO_NS(TIME) ((TIME) << 20) -#define MS_TO_US(TIME) ((TIME) << 10) -#define NS_TO_MS(TIME) ((TIME) >> 20) -#define NS_TO_US(TIME) ((TIME) >> 10) - -#define RESCHED_US (100) /* Reschedule if less than this many μs left */ - -void print_scheduler_version(void) -{ - printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n"); -} - -/* - * This is the time all tasks within the same priority round robin. - * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. - * Tunable via /proc interface. - */ -#ifdef CONFIG_PCK_INTERACTIVE -int rr_interval __read_mostly = 3; -#else -int rr_interval __read_mostly = 6; -#endif - -/* Tunable to choose whether to prioritise latency or throughput, simple - * binary yes or no */ - -int sched_interactive __read_mostly = 1; - -/* - * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks - * are allowed to run five seconds as real time tasks. This is the total over - * all online cpus. - */ -#ifdef CONFIG_PCK_INTERACTIVE -int sched_iso_cpu __read_mostly = 25; -#else -int sched_iso_cpu __read_mostly = 70; -#endif - -/* - * The relative length of deadline for each priority(nice) level. - */ -static int prio_ratios[NICE_WIDTH] __read_mostly; - -/* - * The quota handed out to tasks of all priority levels when refilling their - * time_slice. 
- */ -static inline int timeslice(void) -{ - return MS_TO_US(rr_interval); -} - -/* - * The global runqueue data that all CPUs work off. Data is protected either - * by the global grq lock, or the discrete lock that precedes the data in this - * struct. - */ -struct global_rq { - raw_spinlock_t lock; - unsigned long nr_running; - unsigned long nr_uninterruptible; - unsigned long long nr_switches; - unsigned long qnr; /* queued not running */ -#ifdef CONFIG_SMP - cpumask_t cpu_idle_map; - bool idle_cpus; -#endif - int noc; /* num_online_cpus stored and updated when it changes */ - u64 niffies; /* Nanosecond jiffies */ - unsigned long last_jiffy; /* Last jiffy we updated niffies */ - - raw_spinlock_t iso_lock; - int iso_ticks; - bool iso_refractory; - - skiplist_node node; - skiplist *sl; -}; - -#ifdef CONFIG_SMP -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. - * - */ -struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_var_t rto_mask; - struct cpupri cpupri; -}; - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -#endif /* CONFIG_SMP */ - -/* There can be only one */ -static struct global_rq grq; - -static DEFINE_MUTEX(sched_hotcpu_mutex); - -/* cpus with isolated domains */ -cpumask_var_t cpu_isolated_map; - -DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SMP -struct rq *cpu_rq(int cpu) -{ - return &per_cpu(runqueues, (cpu)); -} -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * sched_domains_mutex serialises calls to init_sched_domains, - * detach_destroy_domains and partition_sched_domains. - */ -DEFINE_MUTEX(sched_domains_mutex); - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} -#else -struct rq *uprq; -#endif /* CONFIG_SMP */ - -static inline void update_rq_clock(struct rq *rq); - -/* - * Sanity check should sched_clock return bogus values. We make sure it does - * not appear to go backwards, and use jiffies to determine the maximum and - * minimum it could possibly have increased, and round down to the nearest - * jiffy when it falls outside this. - */ -static inline void niffy_diff(s64 *niff_diff, int jiff_diff) -{ - unsigned long min_diff, max_diff; - - if (jiff_diff > 1) - min_diff = JIFFIES_TO_NS(jiff_diff - 1); - else - min_diff = 1; - /* Round up to the nearest tick for maximum */ - max_diff = JIFFIES_TO_NS(jiff_diff + 1); - - if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff)) - *niff_diff = min_diff; -} - -#ifdef CONFIG_SMP -static inline int cpu_of(struct rq *rq) -{ - return rq->cpu; -} - -/* - * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the - * niffies value. 
Any CPU can update it by adding how much its clock has - * increased since it last updated niffies, minus any added niffies by other - * CPUs. - */ -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - /* old_clock is only updated when we are updating niffies */ - rq->old_clock = rq->clock; - ndiff -= grq.niffies - rq->last_niffy; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; - rq->last_niffy = grq.niffies; -} -#else /* CONFIG_SMP */ -static inline int cpu_of(struct rq *rq) -{ - return 0; -} - -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - rq->old_clock = rq->clock; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; -} -#endif - -#include "stats.h" - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif -#ifndef finish_arch_post_lock_switch -# define finish_arch_post_lock_switch() do { } while (0) -#endif - -/* - * All common locking functions performed on grq.lock. rq->clock is local to - * the CPU accessing it so it can be modified just with interrupts disabled - * when we're not updating niffies. - * Looking up task_rq must be done under grq.lock to be safe. - */ -static void update_rq_clock_task(struct rq *rq, s64 delta); - -static inline void update_rq_clock(struct rq *rq) -{ - s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - - if (unlikely(delta < 0)) - return; - rq->clock += delta; - update_rq_clock_task(rq, delta); -} - -static inline bool task_running(struct task_struct *p) -{ - return p->on_cpu; -} - -static inline void grq_lock(void) - __acquires(grq.lock) -{ - raw_spin_lock(&grq.lock); -} - -static inline void grq_unlock(void) - __releases(grq.lock) -{ - raw_spin_unlock(&grq.lock); -} - -static inline void grq_lock_irq(void) - __acquires(grq.lock) -{ - raw_spin_lock_irq(&grq.lock); -} - -static inline void time_lock_grq(struct rq *rq) -{ - grq_lock(); - update_clocks(rq); -} - -static inline void grq_unlock_irq(void) - __releases(grq.lock) -{ - raw_spin_unlock_irq(&grq.lock); -} - -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) -{ - raw_spin_lock_irqsave(&grq.lock, *flags); -} - -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) -{ - raw_spin_unlock_irqrestore(&grq.lock, *flags); -} - -static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) -{ - raw_spin_lock_irqsave(&p->pi_lock, *flags); - grq_lock(); - return task_rq(p); -} - -static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) -{ - struct rq *rq = task_grq_lock(p, flags); - - update_clocks(rq); - return rq; -} - -static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags) - __releases(p->pi_lock) -{ - grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -} - -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) -{ - local_irq_save(*flags); - time_lock_grq(rq); -} - -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is 
a valid case when another task releases the spinlock */ - grq.lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); - - grq_unlock_irq(); -} - -static inline bool deadline_before(u64 deadline, u64 time) -{ - return (deadline < time); -} - -static inline bool deadline_after(u64 deadline, u64 time) -{ - return (deadline > time); -} - -/* - * Deadline is "now" in niffies + (offset by priority). Setting the deadline - * is the key to everything. It distributes cpu fairly amongst tasks of the - * same nice value, it proportions cpu according to nice level, it means the - * task that last woke up the longest ago has the earliest deadline, thus - * ensuring that interactive tasks get low latency on wake up. The CPU - * proportion works out to the square of the virtual deadline difference, so - * this equation will give nice 19 3% CPU compared to nice 0. - */ -static inline u64 prio_deadline_diff(int user_prio) -{ - return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -} - -static inline u64 task_deadline_diff(struct task_struct *p) -{ - return prio_deadline_diff(TASK_USER_PRIO(p)); -} - -static inline u64 static_deadline_diff(int static_prio) -{ - return prio_deadline_diff(USER_PRIO(static_prio)); -} - -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - -static inline int ms_longest_deadline_diff(void) -{ - return NS_TO_MS(longest_deadline_diff()); -} - -/* - * A task that is not running or queued will not have a node set. - * A task that is queued but not running will have a node set. - * A task that is currently running will have ->on_cpu set but no node set. - */ -static inline bool task_queued(struct task_struct *p) -{ - return !skiplist_node_empty(&p->node); -} - -/* - * Removing from the global runqueue. Enter with grq locked. Deleting a task - * from the skip list is done via the stored node reference in the task struct - * and does not require a full look up. Thus it occurs in O(k) time where k - * is the "level" of the list the task was stored at - usually < 4, max 16. - */ -static void dequeue_task(struct task_struct *p) -{ - skiplist_delete(grq.sl, &p->node); - sched_info_dequeued(task_rq(p), p); -} - -#ifdef CONFIG_PREEMPT_RCU -static bool rcu_read_critical(struct task_struct *p) -{ - return p->rcu_read_unlock_special.b.blocked; -} -#else /* CONFIG_PREEMPT_RCU */ -#define rcu_read_critical(p) (false) -#endif /* CONFIG_PREEMPT_RCU */ - -/* - * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as - * an idle task, we ensure none of the following conditions are met. - */ -static bool idleprio_suitable(struct task_struct *p) -{ - return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && - !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); -} - -/* - * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check - * that the iso_refractory flag is not set. - */ -static bool isoprio_suitable(void) -{ - return !grq.iso_refractory; -} - -/* - * Adding to the global runqueue. Enter with grq locked. 
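For a concrete feel of the prio_deadline_diff() arithmetic above: the offsets grow roughly 10% per nice level, so nice 19's offset is about 1.1^19 ≈ 6x that of nice 0, and since CPU share falls off as roughly the square of the deadline gap, that yields the quoted ~3%. A minimal user-space sketch; the 10%-per-level seeding of prio_ratios mirrors the scheduler's boot-time init, while the harness itself is hypothetical:

#include <stdio.h>
#include <stdint.h>

#define NICE_WIDTH  40
#define MS_TO_NS(t) ((t) << 20)        /* same approximate shift as above */

static int prio_ratios[NICE_WIDTH];
static const int rr_interval = 6;      /* ms, the non-interactive default */

int main(void)
{
        int i;

        /* Boot-time seeding: nice -20 gets 128, each level ~10% more. */
        prio_ratios[0] = 128;
        for (i = 1; i < NICE_WIDTH; i++)
                prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

        /* prio_deadline_diff() as defined in the patch. */
        for (i = 20; i < NICE_WIDTH; i += 19) {
                uint64_t off = (uint64_t)prio_ratios[i] * rr_interval *
                               (MS_TO_NS(1) / 128);
                printf("nice %+3d -> deadline offset %.1f ms\n",
                       i - 20, off / 1e6);
        }
        return 0;
}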
- */ -static void enqueue_task(struct task_struct *p, struct rq *rq) -{ - unsigned int randseed; - u64 sl_id; - - if (!rt_task(p)) { - /* Check it hasn't gotten rt from PI */ - if ((idleprio_task(p) && idleprio_suitable(p)) || - (iso_task(p) && isoprio_suitable())) - p->prio = p->normal_prio; - else - p->prio = NORMAL_PRIO; - } - /* - * The sl_id key passed to the skiplist generates a sorted list. - * Realtime and sched iso tasks run FIFO so they only need be sorted - * according to priority. The skiplist will put tasks of the same - * key inserted later in FIFO order. Tasks of sched normal, batch - * and idleprio are sorted according to their deadlines. Idleprio - * tasks are offset by an impossibly large deadline value ensuring - * they get sorted into last positions, but still according to their - * own deadlines. This creates a "landscape" of skiplists running - * from priority 0 realtime in first place to the lowest priority - * idleprio tasks last. Skiplist insertion is an O(log n) process. - */ - if (p->prio <= ISO_PRIO) - sl_id = p->prio; - else { - sl_id = p->deadline; - if (idleprio_task(p)) { - /* Set it to cope with 4 left shifts with locality_diff */ - if (p->prio == IDLE_PRIO) - sl_id |= 0x00FF000000000000; - else - sl_id += longest_deadline_diff(); - } - } - /* - * Some architectures don't have better than microsecond resolution - * so mask out ~microseconds as the random seed for skiplist insertion. - */ - randseed = (grq.niffies >> 10) & 0xFFFFFFFF; - skiplist_insert(grq.sl, &p->node, sl_id, p, randseed); - sched_info_queued(rq, p); -} - -static inline void requeue_task(struct task_struct *p) -{ - sched_info_queued(task_rq(p), p); -} - -/* - * Returns the relative length of deadline all compared to the shortest - * deadline which is that of nice -20. - */ -static inline int task_prio_ratio(struct task_struct *p) -{ - return prio_ratios[TASK_USER_PRIO(p)]; -} - -/* - * task_timeslice - all tasks of all priorities get the exact same timeslice - * length. CPU distribution is handled by giving different deadlines to - * tasks of different priorities. Use 128 as the base value for fast shifts. - */ -static inline int task_timeslice(struct task_struct *p) -{ - return (rr_interval * task_prio_ratio(p) / 128); -} - -static void resched_task(struct task_struct *p); - -static inline void resched_curr(struct rq *rq) -{ - resched_task(rq->curr); -} - -/* - * qnr is the "queued but not running" count which is the total number of - * tasks on the global runqueue list waiting for cpu time but not actually - * currently running on a cpu. - */ -static inline void inc_qnr(void) -{ - grq.qnr++; -} - -static inline void dec_qnr(void) -{ - grq.qnr--; -} - -static inline int queued_notrunning(void) -{ - return grq.qnr; -} - -static unsigned long rq_load_avg(struct rq *rq) -{ - return rq->soft_affined * SCHED_CAPACITY_SCALE; -} - -#ifdef CONFIG_SMT_NICE -static const cpumask_t *thread_cpumask(int cpu); - -/* Find the best real time priority running on any SMT siblings of cpu and if - * none are running, the static priority of the best deadline task running. - * The lookups to the other runqueues is done lockless as the occasional wrong - * value would be harmless. 
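Compressed to its essentials, the sl_id construction in enqueue_task() above looks like the sketch below; kernel types, the PI-boost checks, and the longest_deadline_diff() offset for boosted idleprio tasks are omitted, and the demo_* names and priority constants are stand-ins:

#include <stdint.h>

/* Illustrative stand-ins for the patch's priority levels. */
enum { ISO_PRIO = 40, NORMAL_PRIO = 41, IDLE_PRIO = 42 };

struct demo_task {
        int      prio;       /* effective priority */
        uint64_t deadline;   /* virtual deadline in niffies */
};

/*
 * RT and ISO tasks sort purely by priority (FIFO within a level);
 * everything else sorts by deadline, with idleprio pushed to the far
 * end of the key space so it always sorts last, yet still in
 * deadline order among its peers.
 */
static uint64_t skiplist_key(const struct demo_task *p)
{
        uint64_t sl_id;

        if (p->prio <= ISO_PRIO)
                sl_id = p->prio;
        else {
                sl_id = p->deadline;
                if (p->prio == IDLE_PRIO)
                        sl_id |= 0x00FF000000000000ULL;
        }
        return sl_id;
}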
*/ -static int best_smt_bias(struct rq *this_rq) -{ - int other_cpu, best_bias = 0; - - for_each_cpu(other_cpu, &this_rq->thread_mask) { - struct rq *rq = cpu_rq(other_cpu); - - if (rq_idle(rq)) - continue; - if (unlikely(!rq->online)) - continue; - if (!rq->rq_mm) - continue; - if (likely(rq->rq_smt_bias > best_bias)) - best_bias = rq->rq_smt_bias; - } - return best_bias; -} - -static int task_prio_bias(struct task_struct *p) -{ - if (rt_task(p)) - return 1 << 30; - else if (task_running_iso(p)) - return 1 << 29; - else if (task_running_idle(p)) - return 0; - return MAX_PRIO - p->static_prio; -} - -static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) -{ - return true; -} - -static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; - -/* We've already decided p can run on CPU, now test if it shouldn't for SMT - * nice reasons. */ -static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) -{ - int best_bias, task_bias; - - /* Kernel threads always run */ - if (unlikely(!p->mm)) - return true; - if (rt_task(p)) - return true; - if (!idleprio_suitable(p)) - return true; - best_bias = best_smt_bias(this_rq); - /* The smt siblings are all idle or running IDLEPRIO */ - if (best_bias < 1) - return true; - task_bias = task_prio_bias(p); - if (task_bias < 1) - return false; - if (task_bias >= best_bias) - return true; - /* Dither 25% cpu of normal tasks regardless of nice difference */ - if (best_bias % 4 == 1) - return true; - /* Sorry, you lose */ - return false; -} -#else /* CONFIG_SMT_NICE */ -#define smt_schedule(p, this_rq) (true) -#endif /* CONFIG_SMT_NICE */ -#ifdef CONFIG_SMP -/* - * The cpu_idle_map stores a bitmap of all the CPUs currently idle to - * allow easy lookup of whether any suitable idle CPUs are available. - * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the - * idle_cpus variable than to do a full bitmask check when we are busy. - */ -static inline void set_cpuidle_map(int cpu) -{ - if (likely(cpu_online(cpu))) { - cpumask_set_cpu(cpu, &grq.cpu_idle_map); - grq.idle_cpus = true; - } -} - -static inline void clear_cpuidle_map(int cpu) -{ - cpumask_clear_cpu(cpu, &grq.cpu_idle_map); - if (cpumask_empty(&grq.cpu_idle_map)) - grq.idle_cpus = false; -} - -static bool suitable_idle_cpus(struct task_struct *p) -{ - if (!grq.idle_cpus) - return false; - return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); -} - -#define CPUIDLE_DIFF_THREAD (1) -#define CPUIDLE_DIFF_CORE (2) -#define CPUIDLE_CACHE_BUSY (4) -#define CPUIDLE_DIFF_CPU (8) -#define CPUIDLE_THREAD_BUSY (16) -#define CPUIDLE_DIFF_NODE (32) - -/* - * The best idle CPU is chosen according to the CPUIDLE ranking above where the - * lowest value would give the most suitable CPU to schedule p onto next. The - * order works out to be the following: - * - * Same thread, idle or busy cache, idle or busy threads - * Other core, same cache, idle or busy cache, idle threads. - * Same node, other CPU, idle cache, idle threads. - * Same node, other CPU, busy cache, idle threads. - * Other core, same cache, busy threads. - * Same node, other CPU, busy threads. - * Other node, other CPU, idle cache, idle threads. - * Other node, other CPU, busy cache, idle threads. - * Other node, other CPU, busy threads. 
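The CPUIDLE_* penalties above are powers of two, so the worst single property always outweighs every better property combined (32 > 1+2+4+8+16); that is what makes the OR-ed ranking reproduce the preference list exactly. A standalone check of one consequence, with the flag values copied from the patch and a hypothetical harness:

#include <assert.h>

#define CPUIDLE_DIFF_THREAD  (1)
#define CPUIDLE_DIFF_CORE    (2)
#define CPUIDLE_CACHE_BUSY   (4)
#define CPUIDLE_DIFF_CPU     (8)
#define CPUIDLE_THREAD_BUSY  (16)
#define CPUIDLE_DIFF_NODE    (32)

int main(void)
{
        /* A sibling thread with a busy cache and busy threads... */
        int same_core = CPUIDLE_DIFF_THREAD | CPUIDLE_CACHE_BUSY |
                        CPUIDLE_THREAD_BUSY;            /* = 21 */
        /* ...still ranks better than a CPU on another NUMA node. */
        int other_node = CPUIDLE_DIFF_NODE;             /* = 32 */

        assert(same_core < other_node);
        return 0;
}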
- */ -static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -{ - int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | - CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | - CPUIDLE_DIFF_THREAD; - int cpu_tmp; - - if (cpumask_test_cpu(best_cpu, tmpmask)) - goto out; - - for_each_cpu(cpu_tmp, tmpmask) { - int ranking, locality; - struct rq *tmp_rq; - - ranking = 0; - tmp_rq = cpu_rq(cpu_tmp); - - locality = rq->cpu_locality[cpu_tmp]; -#ifdef CONFIG_NUMA - if (locality > 3) - ranking |= CPUIDLE_DIFF_NODE; - else -#endif - if (locality > 2) - ranking |= CPUIDLE_DIFF_CPU; -#ifdef CONFIG_SCHED_MC - else if (locality == 2) - ranking |= CPUIDLE_DIFF_CORE; - else if (!(tmp_rq->cache_idle(tmp_rq))) - ranking |= CPUIDLE_CACHE_BUSY; -#endif -#ifdef CONFIG_SCHED_SMT - if (locality == 1) - ranking |= CPUIDLE_DIFF_THREAD; - if (!(tmp_rq->siblings_idle(tmp_rq))) - ranking |= CPUIDLE_THREAD_BUSY; -#endif - if (ranking < best_ranking) { - best_cpu = cpu_tmp; - best_ranking = ranking; - } - } -out: - return best_cpu; -} - -bool cpus_share_cache(int this_cpu, int that_cpu) -{ - struct rq *this_rq = cpu_rq(this_cpu); - - return (this_rq->cpu_locality[that_cpu] < 3); -} - -static bool resched_best_idle(struct task_struct *p) -{ - cpumask_t tmpmask; - struct rq *rq; - int best_cpu; - - cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); - best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask); - rq = cpu_rq(best_cpu); - if (!smt_schedule(p, rq)) - return false; - resched_curr(rq); - return true; -} - -static inline void resched_suitable_idle(struct task_struct *p) -{ - if (suitable_idle_cpus(p)) - resched_best_idle(p); -} - -static inline int locality_diff(int cpu, struct rq *rq) -{ - return rq->cpu_locality[cpu]; -} -#else /* CONFIG_SMP */ -static inline void set_cpuidle_map(int cpu) -{ -} - -static inline void clear_cpuidle_map(int cpu) -{ -} - -static inline bool suitable_idle_cpus(struct task_struct *p) -{ - return uprq->curr == uprq->idle; -} - -static inline void resched_suitable_idle(struct task_struct *p) -{ -} - -static inline int locality_diff(int cpu, struct rq *rq) -{ - return 0; -} -#endif /* CONFIG_SMP */ - -static inline int normal_prio(struct task_struct *p) -{ - if (has_rt_policy(p)) - return MAX_RT_PRIO - 1 - p->rt_priority; - if (idleprio_task(p)) - return IDLE_PRIO; - if (iso_task(p)) - return ISO_PRIO; - return NORMAL_PRIO; -} - -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks as it will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; -} - -/* - * Update the load average for feeding into cpu frequency governors. Use a - * rough estimate of a rolling average with ~ time constant of 32ms. - * 80/128 ~ 0.63. 
* 80 / 32768 / 128 == * 5 / 262144 - */ -static void update_load_avg(struct rq *rq) -{ - /* rq clock can go backwards so skip update if that happens */ - if (likely(rq->clock > rq->load_update)) { - unsigned long us_interval = (rq->clock - rq->load_update) >> 10; - long load; - - load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); - if (unlikely(load < 0)) - load = 0; - load += rq->soft_affined * rq_load_avg(rq) * us_interval * 5 / 262144; - rq->load_avg = load; - } - rq->load_update = rq->clock; -} - -/* - * activate_task - move a task to the runqueue. Enter with grq locked. - */ -static void activate_task(struct task_struct *p, struct rq *rq) -{ - update_clocks(rq); - - /* - * Sleep time is in units of nanosecs, so shift by 20 to get a - * milliseconds-range estimation of the amount of time that the task - * spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - if (p->state == TASK_UNINTERRUPTIBLE) - profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (rq->clock_task - p->last_ran) >> 20); - } - - p->prio = effective_prio(p); - if (task_contributes_to_load(p)) - grq.nr_uninterruptible--; - enqueue_task(p, rq); - rq->soft_affined++; - p->on_rq = 1; - grq.nr_running++; - inc_qnr(); - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); -} - -/* - * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. Enter with grq locked. - */ -static inline void deactivate_task(struct task_struct *p, struct rq *rq) -{ - if (task_contributes_to_load(p)) - grq.nr_uninterruptible++; - rq->soft_affined--; - p->on_rq = 0; - grq.nr_running--; - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); -} - -#ifdef CONFIG_SMP -void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_LOCKDEP - /* - * The caller should hold either p->pi_lock or grq lock, when changing - * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks. - * - * Furthermore, all task_rq users should acquire both locks, see - * task_grq_lock(). - */ - WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&grq.lock))); -#endif - if (task_cpu(p) == cpu) - return; - trace_sched_migrate_task(p, cpu); - perf_event_task_migrate(p); - - /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be - * successfully executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - - if (p->on_rq) { - struct rq *rq = task_rq(p); - - rq->soft_affined--; - update_load_avg(rq); - rq = cpu_rq(cpu); - rq->soft_affined++; - update_load_avg(rq); - } - task_thread_info(p)->cpu = cpu; -} -#endif /* CONFIG_SMP */ - -/* - * Move a task off the global queue and take it to a cpu for it will - * become the running task. - */ -static inline void take_task(int cpu, struct task_struct *p) -{ - set_task_cpu(p, cpu); - dequeue_task(p); - dec_qnr(); -} - -/* - * Returns a descheduling task to the grq runqueue unless it is being - * deactivated. - */ -static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate) -{ - if (deactivate) - deactivate_task(p, rq); - else { - inc_qnr(); - enqueue_task(p, rq); - } -} - -/* Enter with grq lock held. We know p is on the local cpu */ -static inline void __set_tsk_resched(struct task_struct *p) -{ - set_tsk_need_resched(p); - set_preempt_need_resched(); -} - -/* - * resched_task - mark a task 'to be rescheduled now'. 
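The 5/262144 factor is just 80/128 spread over 32768 us: each microsecond sheds load * 5/262144, so a full 32768 us interval sheds 80/128 ≈ 63% of the old value, matching one time constant of a ~32ms exponential (1 - 1/e ≈ 0.63). A user-space rendering of just the decay half of update_load_avg() above; the kernel then adds the current runqueue contribution weighted the same way:

#include <stdio.h>

/* Decay step of update_load_avg(), outside the kernel. */
static long decay_load(long load_avg, unsigned long us_interval)
{
        long load = load_avg - load_avg * (long)us_interval * 5 / 262144;

        return load < 0 ? 0 : load;
}

int main(void)
{
        /* One full 32768us interval sheds 62.5% of the old average. */
        printf("%ld\n", decay_load(1000, 32768));   /* prints 375 */
        return 0;
}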
- * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -void resched_task(struct task_struct *p) -{ - int cpu; - - lockdep_assert_held(&grq.lock); - - if (test_tsk_need_resched(p)) - return; - - set_tsk_need_resched(p); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) { - set_preempt_need_resched(); - return; - } - - smp_send_reschedule(cpu); -} - -/** - * task_curr - is this task currently executing on a CPU? - * @p: the task in question. - * - * Return: 1 if the task is currently executing. 0 otherwise. - */ -inline int task_curr(const struct task_struct *p) -{ - return cpu_curr(task_cpu(p)) == p; -} - -#ifdef CONFIG_SMP -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) -{ - unsigned long flags; - bool running, on_rq; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since this will return false - * if the runqueue has changed and p is actually now - * running somewhere else! - */ - while (task_running(p) && p == rq->curr) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the grq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_grq_lock(p, &flags); - trace_sched_wait_task(p); - running = task_running(p); - on_rq = p->on_rq; - ncsw = 0; - if (!match_state || p->state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(p, &flags); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. 
We're all done! - */ - break; - } - - return ncsw; -} - -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) - * - * NOTE: this function doesn't have to take the runqueue lock, - * because all it wants to ensure is that the remote task enters - * the kernel. If the IPI races and the task has been migrated - * to another CPU then no harm is done and the purpose has been - * achieved as well. - */ -void kick_process(struct task_struct *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} -EXPORT_SYMBOL_GPL(kick_process); -#endif - -/* - * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the - * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or - * between themselves, they cooperatively multitask. An idle rq scores as - * prio PRIO_LIMIT so it is always preempted. - */ -static inline bool -can_preempt(struct task_struct *p, int prio, u64 deadline) -{ - /* Better static priority RT task or better policy preemption */ - if (p->prio < prio) - return true; - if (p->prio > prio) - return false; - /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ - if (!deadline_before(p->deadline, deadline)) - return false; - return true; -} - -#ifdef CONFIG_SMP -#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -#ifdef CONFIG_HOTPLUG_CPU -/* - * Check to see if there is a task that is affined only to offline CPUs but - * still wants runtime. This happens to kernel threads during suspend/halt and - * disabling of CPUs. - */ -static inline bool online_cpus(struct task_struct *p) -{ - return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed))); -} -#else /* CONFIG_HOTPLUG_CPU */ -/* All available CPUs are always online without hotplug. */ -static inline bool online_cpus(struct task_struct *p) -{ - return true; -} -#endif - -/* - * Check to see if p can run on cpu, and if not, whether there are any online - * CPUs it can run on instead. - */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) - return true; - return false; -} - -static void try_preempt(struct task_struct *p, struct rq *this_rq) -{ - int i, this_entries = this_rq->soft_affined; - cpumask_t tmp; - - if (suitable_idle_cpus(p) && resched_best_idle(p)) - return; - - /* IDLEPRIO tasks never preempt anything but idle */ - if (p->policy == SCHED_IDLEPRIO) - return; - - cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); - - /* - * We iterate over CPUs in locality order using rq_order, finding the - * first one we can preempt if possible, thus staying closest in - * locality. - */ - for (i = 0; i < num_possible_cpus(); i++) { - struct rq *rq = this_rq->rq_order[i]; - - if (!cpumask_test_cpu(rq->cpu, &tmp)) - continue; - - if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries) - continue; - if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match thereby speeding up matching - * this task in earliest_deadline_task. 
- */ - set_task_cpu(p, rq->cpu); - resched_curr(rq); - return; - } - } -} - -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check); -#else /* CONFIG_SMP */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - return false; -} - -static void try_preempt(struct task_struct *p, struct rq *this_rq) -{ - if (p->policy == SCHED_IDLEPRIO) - return; - if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) - resched_curr(uprq); -} - -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) -{ - return set_cpus_allowed_ptr(p, new_mask); -} -#endif /* CONFIG_SMP */ - -static void -ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -{ -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); - -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - rcu_read_unlock(); - } - -#endif /* CONFIG_SMP */ - - schedstat_inc(rq, ttwu_count); -#endif /* CONFIG_SCHEDSTATS */ -} - -void wake_up_if_idle(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - rcu_read_lock(); - - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; - - grq_lock_irqsave(&flags); - if (likely(is_idle_task(rq->curr))) - smp_send_reschedule(cpu); - /* Else cpu is not idle, do nothing here */ - grq_unlock_irqrestore(&flags); - -out: - rcu_read_unlock(); -} - -#ifdef CONFIG_SMP -void scheduler_ipi(void) -{ - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); -} -#endif - -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync) -{ - activate_task(p, rq); - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption if there are no idle cpus, - * instead waiting for current to deschedule. - */ - if (!is_sync || suitable_idle_cpus(p)) - try_preempt(p, rq); -} - -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - bool success) -{ - trace_sched_wakeup(p); - p->state = TASK_RUNNING; - - /* - * If a worker is waking up, notify the workqueue. Note that on BFS, we - * don't really know what cpu it will be, so we fake it for - * wq_worker_waking_up :/ - */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); -} - -/* - * wake flags - */ -#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x4 /* internal use, task got migrated */ - -/*** - * try_to_wake_up - wake up a thread - * @p: the thread to be awakened - * @state: the mask of task states that can be woken - * @wake_flags: wake modifier flags (WF_*) - * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. - * - * Return: %true if @p was woken up, %false if it was already running, or @state didn't match @p's state.
- */ -static bool try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) -{ - bool success = false; - unsigned long flags; - struct rq *rq; - int cpu; - - /* - * If we are going to wake up a thread waiting for CONDITION we - * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. - */ - smp_mb__before_spinlock(); - - /* - * No need to do time_lock_grq as we only need to update the rq clock - * if we activate the task - */ - rq = task_grq_lock(p, &flags); - cpu = task_cpu(p); - - /* state is a volatile long, why? I don't understand */ - if (!((unsigned int)p->state & state)) - goto out_unlock; - - trace_sched_waking(p); - - if (task_queued(p) || task_running(p)) - goto out_running; - - ttwu_activate(p, rq, wake_flags & WF_SYNC); - success = true; - -out_running: - ttwu_post_activation(p, rq, success); -out_unlock: - task_grq_unlock(p, &flags); - - if (schedstat_enabled()) - ttwu_stat(p, cpu, wake_flags); - - return success; -} - -/** - * try_to_wake_up_local - try to wake up a local task with grq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that grq is locked and @p is not the current task. - * grq stays locked over invocation. - */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - bool success = false; - - lockdep_assert_held(&grq.lock); - - if (!(p->state & TASK_NORMAL)) - return; - - trace_sched_waking(p); - - if (!task_queued(p)) { - if (likely(!task_running(p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false); - if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); - success = true; - } - ttwu_post_activation(p, rq, success); -} - -/** - * wake_up_process - Wake up a specific process - * @p: The process to be woken up. - * - * Attempt to wake up the nominated process and move it to the set of runnable - * processes. - * - * Return: 1 if the process was woken up, 0 if it was already running. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -int wake_up_process(struct task_struct *p) -{ - return try_to_wake_up(p, TASK_NORMAL, 0); -} -EXPORT_SYMBOL(wake_up_process); - -int wake_up_state(struct task_struct *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - -static void time_slice_expired(struct task_struct *p); - -/* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. - */ -int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -{ -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); -#endif - /* - * The process state is set to the same value as that of the process - * executing do_fork() code. That is running. This guarantees that nobody will - * actually run it, and a signal or other external event cannot wake - * it up and insert it on the runqueue either. - */ - - /* Should be reset in fork.c but done here for ease of bfs patching */ - p->on_rq = - p->utime = - p->stime = - p->utimescaled = - p->stimescaled = - p->sched_time = - p->stime_pc = - p->utime_pc = 0; - skiplist_node_init(&p->node); - - /* - * We mark the process as NEW here.
This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_NEW; - - /* - * Revert to default priority/policy on fork if requested. - */ - if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { - p->policy = SCHED_NORMAL; - p->normal_prio = normal_prio(p); - } - - if (PRIO_TO_NICE(p->static_prio) < 0) { - p->static_prio = NICE_TO_PRIO(0); - p->normal_prio = p->static_prio; - } - - /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: - */ - p->sched_reset_on_fork = 0; - } - -#ifdef CONFIG_SCHED_INFO - if (unlikely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -#endif - p->on_cpu = false; - init_task_preempt_count(p); - return 0; -} - -#ifdef CONFIG_SCHEDSTATS - -DEFINE_STATIC_KEY_FALSE(sched_schedstats); -static bool __initdata __sched_schedstats = false; - -static void set_schedstats(bool enabled) -{ - if (enabled) - static_branch_enable(&sched_schedstats); - else - static_branch_disable(&sched_schedstats); -} - -void force_schedstat_enabled(void) -{ - if (!schedstat_enabled()) { - pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); - static_branch_enable(&sched_schedstats); - } -} - -static int __init setup_schedstats(char *str) -{ - int ret = 0; - if (!str) - goto out; - - /* - * This code is called before jump labels have been set up, so we can't - * change the static branch directly just yet. Instead set a temporary - * variable so init_schedstats() can do it later. - */ - if (!strcmp(str, "enable")) { - __sched_schedstats = true; - ret = 1; - } else if (!strcmp(str, "disable")) { - __sched_schedstats = false; - ret = 1; - } -out: - if (!ret) - pr_warn("Unable to parse schedstats=\n"); - - return ret; -} -__setup("schedstats=", setup_schedstats); - -static void __init init_schedstats(void) -{ - set_schedstats(__sched_schedstats); -} - -#ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - int err; - int state = static_branch_likely(&sched_schedstats); - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &state; - err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - if (write) - set_schedstats(state); - return err; -} -#endif /* CONFIG_PROC_SYSCTL */ -#else /* !CONFIG_SCHEDSTATS */ -static inline void init_schedstats(void) {} -#endif /* CONFIG_SCHEDSTATS */ - -/* - * wake_up_new_task - wake up a newly created task for the first time. - * - * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created context, then puts the task - * on the runqueue and wakes it. - */ -void wake_up_new_task(struct task_struct *p) -{ - struct task_struct *parent, *rq_curr; - struct rq *rq, *new_rq; - unsigned long flags; - - parent = p->parent; - rq = task_grq_lock(p, &flags); - if (unlikely(needs_other_cpu(p, task_cpu(p)))) - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); - rq_curr = rq->curr; - p->state = TASK_RUNNING; - - /* - * Reinit new task deadline as its creator deadline could have changed - * since call to dup_task_struct(). 
- */ - p->deadline = rq->rq_deadline; - - /* The new task might not be able to run on the same CPU as rq->curr */ - if (unlikely(needs_other_cpu(p, task_cpu(p)))) { - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); - new_rq = task_rq(p); - } else - new_rq = rq; - - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = rq_curr->normal_prio; - - activate_task(p, rq); - trace_sched_wakeup_new(p); - - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. If it's negative, it won't - * matter since that's the same as being 0. current's time_slice is - * actually in rq_time_slice when it's running, as is its last_ran - * value. rq->rq_deadline is only modified within schedule() so it - * is always equal to current->deadline. - */ - p->last_ran = rq->rq_last_ran; - if (likely(rq_curr->policy != SCHED_FIFO)) { - rq->rq_time_slice /= 2; - if (unlikely(rq->rq_time_slice < RESCHED_US)) { - /* - * Forking task has run out of timeslice. Reschedule it and - * start its child with a new time slice and deadline. The - * child will end up running first because its deadline will - * be slightly earlier. - */ - rq->rq_time_slice = 0; - __set_tsk_resched(rq_curr); - time_slice_expired(p); - if (suitable_idle_cpus(p)) - resched_best_idle(p); - else if (unlikely(rq != new_rq)) - try_preempt(p, new_rq); - } else { - p->time_slice = rq->rq_time_slice; - if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. - */ - __set_tsk_resched(rq_curr); - } else - try_preempt(p, new_rq); - } - } else { - time_slice_expired(p); - try_preempt(p, new_rq); - } - task_grq_unlock(p, &flags); -} - -#ifdef CONFIG_PREEMPT_NOTIFIERS - -static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; - -void preempt_notifier_inc(void) -{ - static_key_slow_inc(&preempt_notifier_key); -} -EXPORT_SYMBOL_GPL(preempt_notifier_inc); - -void preempt_notifier_dec(void) -{ - static_key_slow_dec(&preempt_notifier_key); -} -EXPORT_SYMBOL_GPL(preempt_notifier_dec); - -/** - * preempt_notifier_register - tell me when current is being preempted & rescheduled - * @notifier: notifier struct to register - */ -void preempt_notifier_register(struct preempt_notifier *notifier) -{ - if (!static_key_false(&preempt_notifier_key)) - WARN(1, "registering preempt_notifier while notifiers disabled\n"); - - hlist_add_head(&notifier->link, &current->preempt_notifiers); -} -EXPORT_SYMBOL_GPL(preempt_notifier_register); - -/** - * preempt_notifier_unregister - no longer interested in preemption notifications - * @notifier: notifier struct to unregister - * - * This is *not* safe to call from within a preemption notifier.
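A toy rendering of the timeslice split above: halving conserves the total time owed (a parent with 4400us left keeps 2200us and the child starts with 2200us), while a parent with under 2*RESCHED_US remaining is simply rescheduled and the child starts a fresh slice via time_slice_expired(). The names here are hypothetical:

#include <stdio.h>

#define RESCHED_US 100   /* reschedule threshold in us, from the patch */

/* fresh_slice stands in for time_slice_expired() handing out a quota. */
static void split_timeslice(int *parent_us, int *child_us, int fresh_slice)
{
        *parent_us /= 2;
        if (*parent_us < RESCHED_US) {
                *parent_us = 0;          /* parent rescheduled immediately */
                *child_us = fresh_slice; /* child gets a new slice/deadline */
        } else {
                *child_us = *parent_us;  /* halves: total owed unchanged */
        }
}

int main(void)
{
        int parent = 4400, child = 0;

        split_timeslice(&parent, &child, 6000);
        printf("parent=%dus child=%dus\n", parent, child); /* 2200 / 2200 */

        parent = 150;
        split_timeslice(&parent, &child, 6000);
        printf("parent=%dus child=%dus\n", parent, child); /* 0 / 6000 */
        return 0;
}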
- */ -void preempt_notifier_unregister(struct preempt_notifier *notifier) -{ - hlist_del(&notifier->link); -} -EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - -static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - struct preempt_notifier *notifier; - - hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) - notifier->ops->sched_in(notifier, raw_smp_processor_id()); -} - -static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - if (static_key_false(&preempt_notifier_key)) - __fire_sched_in_preempt_notifiers(curr); -} - -static void -__fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - struct preempt_notifier *notifier; - - hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) - notifier->ops->sched_out(notifier, next); -} - -static __always_inline void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - if (static_key_false(&preempt_notifier_key)) - __fire_sched_out_preempt_notifiers(curr, next); -} - -#else /* !CONFIG_PREEMPT_NOTIFIERS */ - -static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ -} - -static inline void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ -} - -#endif /* CONFIG_PREEMPT_NOTIFIERS */ - -/** - * prepare_task_switch - prepare to switch tasks - * @rq: the runqueue preparing to switch - * @next: the task we are going to switch to. - * - * This is called with the rq lock held and interrupts off. It must - * be paired with a subsequent finish_task_switch after the context - * switch. - * - * prepare_task_switch sets up locking and calls architecture specific - * hooks. - */ -static inline void -prepare_task_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - sched_info_switch(rq, prev, next); - perf_event_task_sched_out(prev, next); - fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); - prepare_arch_switch(next); -} - -/** - * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch - * @prev: the thread we just switched away from. - * - * finish_task_switch must be called after the context switch, paired - * with a prepare_task_switch call before the context switch. - * finish_task_switch will reconcile locking set up by prepare_task_switch, - * and do any other architecture-specific cleanup actions. - * - * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it - * with the lock held can cause deadlocks; see schedule() for - * details.) - * - * The context switch has flipped the stack from under us and restored the - * local variables which were saved when this task called schedule() in the - * past. prev == current is still correct but we need to recalculate this_rq - * because prev may have moved to another CPU. - */ -static struct rq *finish_task_switch(struct task_struct *prev) - __releases(grq.lock) -{ - struct rq *rq = this_rq(); - struct mm_struct *mm = rq->prev_mm; - long prev_state; - - /* - * The previous task will have left us with a preempt_count of 2 - * because it left us after: - * - * schedule() - * preempt_disable(); // 1 - * __schedule() - * raw_spin_lock_irq(&rq->lock) // 2 - * - * Also, see FORK_PREEMPT_COUNT.
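A minimal sketch of a client of the notifier machinery above, assuming the stock preempt_notifier_ops layout from include/linux/preempt.h; the demo_* names are invented for illustration:

#include <linux/preempt.h>
#include <linux/sched.h>

static void demo_sched_in(struct preempt_notifier *pn, int cpu)
{
        pr_info("scheduled back in on cpu %d\n", cpu);
}

static void demo_sched_out(struct preempt_notifier *pn,
                           struct task_struct *next)
{
        pr_info("preempted; next is %s\n", next->comm);
}

static struct preempt_notifier_ops demo_ops = {
        .sched_in  = demo_sched_in,
        .sched_out = demo_sched_out,
};

static struct preempt_notifier demo_notifier;

/* Watches *current*; must run in the task that wants notifications. */
static void demo_watch_current(void)
{
        preempt_notifier_inc();    /* enable the static key first, see WARN above */
        preempt_notifier_init(&demo_notifier, &demo_ops);
        preempt_notifier_register(&demo_notifier);
}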
- */ - if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, - "corrupted preempt_count: %s/%d/0x%x\n", - current->comm, current->pid, preempt_count())) - preempt_count_set(FORK_PREEMPT_COUNT); - - rq->prev_mm = NULL; - - /* - * A task struct has one reference for its use as "current". - * If a task dies, then it sets TASK_DEAD in tsk->state and calls - * schedule one last time. The schedule call will never return, and - * the scheduled task must drop that reference. - * - * We must observe prev->state before clearing prev->on_cpu (in - * finish_lock_switch), otherwise a concurrent wakeup can get prev - * running on another CPU and we could race with its RUNNING -> DEAD - * transition, resulting in a double drop. - */ - prev_state = prev->state; - vtime_task_switch(prev); - perf_event_task_sched_in(prev, current); - finish_lock_switch(rq, prev); - finish_arch_post_lock_switch(); - - fire_sched_in_preempt_notifiers(current); - if (mm) - mmdrop(mm); - if (unlikely(prev_state == TASK_DEAD)) { - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - put_task_struct(prev); - } - return rq; -} - -/** - * schedule_tail - first thing a freshly forked thread must call. - * @prev: the thread we just switched away from. - */ -asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(grq.lock) -{ - struct rq *rq; - - /* - * New tasks start with FORK_PREEMPT_COUNT, see there and - * finish_task_switch() for details. - * - * finish_task_switch() will drop rq->lock() and lower preempt_count - * and the preempt_enable() will end up enabling preemption (on - * PREEMPT_COUNT kernels). - */ - - rq = finish_task_switch(prev); - preempt_enable(); - - if (current->set_child_tid) - put_user(task_pid_vnr(current), current->set_child_tid); -} - -/* - * context_switch - switch to the new MM and the new thread's register state. - */ -static __always_inline struct rq * -context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - struct mm_struct *mm, *oldmm; - - prepare_task_switch(rq, prev, next); - - mm = next->mm; - oldmm = prev->active_mm; - /* - * For paravirt, this is coupled with an exit in switch_to to - * combine the page table reload and the switch backend into - * one hypercall. - */ - arch_start_context_switch(prev); - - if (!mm) { - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next); - } else - switch_mm_irqs_off(oldmm, mm, next); - - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; - } - /* - * The runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - - /* Here we just switch the register state and the stack. */ - switch_to(prev, next, prev); - barrier(); - - return finish_task_switch(prev); -} - -/* - * nr_running, nr_uninterruptible and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. All are - * measured without grabbing the grq lock but the occasional inaccurate result - * doesn't matter so long as it's positive.
- */ -unsigned long nr_running(void) -{ - long nr = grq.nr_running; - - if (unlikely(nr < 0)) - nr = 0; - return (unsigned long)nr; -} - -static unsigned long nr_uninterruptible(void) -{ - long nu = grq.nr_uninterruptible; - - if (unlikely(nu < 0)) - nu = 0; - return nu; -} - -/* - * Check if only the current task is running on the cpu. - * - * Caution: this function does not check that the caller has disabled - * preemption, thus the result might have a time-of-check-to-time-of-use - * race. The caller is responsible to use it correctly, for example: - * - * - from a non-preemptable section (of course) - * - * - from a thread that is bound to a single CPU - * - * - in a loop with very short iterations (e.g. a polling loop) - */ -bool single_task_running(void) -{ - if (cpu_rq(smp_processor_id())->soft_affined == 1) - return true; - else - return false; -} -EXPORT_SYMBOL(single_task_running); - -unsigned long long nr_context_switches(void) -{ - long long ns = grq.nr_switches; - - /* This is of course impossible */ - if (unlikely(ns < 0)) - ns = 1; - return (unsigned long long)ns; -} - -unsigned long nr_iowait(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); - - return sum; -} - -unsigned long nr_iowait_cpu(int cpu) -{ - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); -} - -unsigned long nr_active(void) -{ - return nr_running() + nr_uninterruptible(); -} - -/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we - * base it on the number of running or queued tasks with their ->rq pointer - * set to this cpu as being the CPU they're more likely to run on. */ -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -{ - struct rq *rq = this_rq(); - - *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->soft_affined; -} - -/* Variables and functions for calc_load */ -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - unsigned long newload; - - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; - - return newload / FIXED_1; -} - -/* - * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. - */ -void calc_global_load(unsigned long ticks) -{ - long active; - - if (time_before(jiffies, calc_load_update)) - return; - active = nr_active() * FIXED_1; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update = jiffies + LOAD_FREQ; -} - -DEFINE_PER_CPU(struct kernel_stat, kstat); -DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); - -EXPORT_PER_CPU_SYMBOL(kstat); -EXPORT_PER_CPU_SYMBOL(kernel_cpustat); - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. 
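Plugging in the kernel's fixed-point constants (FIXED_1 = 2048, i.e. 11 fraction bits, and EXP_1 = 1884 for the 1-minute series): one 5-second round of calc_load() above moves the average toward the instantaneous value by (2048-1884)/2048 ≈ 8%. A standalone check of the same formula:

#include <stdio.h>

/* Kernel fixed-point load-average constants (11-bit fraction). */
#define FIXED_1 (1 << 11)   /* 1.0 == 2048 */
#define EXP_1   1884        /* 1/exp(5s/1min) in FIXED_1 units */

/* Same formula as calc_load() above. */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        if (active >= load)
                newload += FIXED_1 - 1;   /* round up when load is rising */
        return newload / FIXED_1;
}

int main(void)
{
        /* avenrun of 0.50 with 3 tasks runnable at the sample point. */
        unsigned long avg = FIXED_1 / 2;

        avg = calc_load(avg, EXP_1, 3 * FIXED_1);
        printf("1-min load: %lu.%02lu\n",
               avg >> 11, ((avg & (FIXED_1 - 1)) * 100) >> 11);  /* ~0.70 */
        return 0;
}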
So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in another CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
-        sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
-        sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
-        __this_cpu_inc(irq_time_seq.sequence);
-        smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
-        smp_wmb();
-        __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-        u64 irq_time;
-        unsigned seq;
-
-        do {
-                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
-                irq_time = per_cpu(cpu_softirq_time, cpu) +
-                           per_cpu(cpu_hardirq_time, cpu);
-        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
-        return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-        return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void irqtime_account_irq(struct task_struct *curr)
-{
-        unsigned long flags;
-        s64 delta;
-        int cpu;
-
-        if (!sched_clock_irqtime)
-                return;
-
-        local_irq_save(flags);
-
-        cpu = smp_processor_id();
-        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-        __this_cpu_add(irq_start_time, delta);
-
-        irq_time_write_begin();
-        /*
-         * We do not account for softirq time from ksoftirqd here.
-         * We want to continue accounting softirq time to ksoftirqd thread
-         * in that case, so as not to confuse the scheduler with a special
-         * task that does not consume any time, but still wants to run.
-         */
-        if (hardirq_count())
-                __this_cpu_add(cpu_hardirq_time, delta);
-        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-                __this_cpu_add(cpu_softirq_time, delta);
-
-        irq_time_write_end();
-        local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
-        if (unlikely(steal > NSEC_PER_SEC))
-                return div_u64(steal, TICK_NSEC);
-
-        return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compiler should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-        s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-
-        /*
-         * Since irq_time is only updated on {soft,}irq_exit, we might run into
-         * this case when a previous update_rq_clock() happened inside a
-         * {soft,}irq region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight misattribution of {soft,}irq
- * time; a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
-        if (irq_delta > delta)
-                irq_delta = delta;
-
-        rq->prev_irq_time += irq_delta;
-        delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-        if (static_key_false((&paravirt_steal_rq_enabled))) {
-                s64 steal = paravirt_steal_clock(cpu_of(rq));
-
-                steal -= rq->prev_steal_time_rq;
-
-                if (unlikely(steal > delta))
-                        steal = delta;
-
-                rq->prev_steal_time_rq += steal;
-
-                delta -= steal;
-        }
-#endif
-
-        rq->clock_task += delta;
-}
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)        nsecs_to_jiffies(__nsecs)
-#endif
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static void irqtime_account_hi_si(void)
-{
-        u64 *cpustat = kcpustat_this_cpu->cpustat;
-        u64 latest_ns;
-
-        latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
-        if (latest_ns > cpustat[CPUTIME_IRQ])
-                cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
-
-        latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
-        if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
-                cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime        (0)
-
-static inline void irqtime_account_hi_si(void)
-{
-}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
-        if (static_key_false(&paravirt_steal_enabled)) {
-                u64 steal;
-                cputime_t steal_ct;
-
-                steal = paravirt_steal_clock(smp_processor_id());
-                steal -= this_rq()->prev_steal_time;
-
-                /*
-                 * cputime_t may be less precise than nsecs (eg: if it's
-                 * based on jiffies). Let's cast the result to cputime
-                 * granularity and account the rest on the next rounds.
-                 */
-                steal_ct = nsecs_to_cputime(steal);
-                this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
-
-                account_steal_time(steal_ct);
-                return steal_ct;
-        }
-#endif
-        return false;
-}
-
-/*
- * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
- * tasks (sum on group iteration) belonging to @tsk's group.
- */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
-        struct signal_struct *sig = tsk->signal;
-        cputime_t utime, stime;
-        struct task_struct *t;
-        unsigned int seq, nextseq;
-        unsigned long flags;
-
-        rcu_read_lock();
-        /* Attempt a lockless read on the first round. */
-        nextseq = 0;
-        do {
-                seq = nextseq;
-                flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
-                times->utime = sig->utime;
-                times->stime = sig->stime;
-                times->sum_exec_runtime = sig->sum_sched_runtime;
-
-                for_each_thread(tsk, t) {
-                        task_cputime(t, &utime, &stime);
-                        times->utime += utime;
-                        times->stime += stime;
-                        times->sum_exec_runtime += task_sched_runtime(t);
-                }
-                /* If lockless access failed, take the lock. */
-                nextseq = 1;
-        } while (need_seqretry(&sig->stats_lock, seq));
-        done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
-        rcu_read_unlock();
-}
-
-/*
- * On each tick, see what percentage of that tick was attributed to each
- * component and add the percentage to the _pc values. Once a _pc value has
- * accumulated one tick's worth, account for that.
This means the total - * percentage of load components will always be 128 (pseudo 100) per tick. - */ -static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (atomic_read(&rq->nr_iowait) > 0) { - rq->iowait_pc += pc; - if (rq->iowait_pc >= 128) { - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128; - rq->iowait_pc %= 128; - } - } else { - rq->idle_pc += pc; - if (rq->idle_pc >= 128) { - cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128; - rq->idle_pc %= 128; - } - } - acct_update_integrals(idle); -} - -static void -pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset, - unsigned long pc, unsigned long ns) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - - p->stime_pc += pc; - if (p->stime_pc >= 128) { - int jiffs = p->stime_pc / 128; - - p->stime_pc %= 128; - p->stime += (__force u64)cputime_one_jiffy * jiffs; - p->stimescaled += one_jiffy_scaled * jiffs; - account_group_system_time(p, cputime_one_jiffy * jiffs); - } - p->sched_time += ns; - account_group_exec_runtime(p, ns); - - if (hardirq_count() - hardirq_offset) { - rq->irq_pc += pc; - if (rq->irq_pc >= 128) { - cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128; - rq->irq_pc %= 128; - } - } else if (in_serving_softirq()) { - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; - } - } else { - rq->system_pc += pc; - if (rq->system_pc >= 128) { - cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128; - rq->system_pc %= 128; - } - } - acct_update_integrals(p); -} - -static void pc_user_time(struct rq *rq, struct task_struct *p, - unsigned long pc, unsigned long ns) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - - p->utime_pc += pc; - if (p->utime_pc >= 128) { - int jiffs = p->utime_pc / 128; - - p->utime_pc %= 128; - p->utime += (__force u64)cputime_one_jiffy * jiffs; - p->utimescaled += one_jiffy_scaled * jiffs; - account_group_user_time(p, cputime_one_jiffy * jiffs); - } - p->sched_time += ns; - account_group_exec_runtime(p, ns); - - if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - */ - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; - } - } - - if (task_nice(p) > 0 || idleprio_task(p)) { - rq->nice_pc += pc; - if (rq->nice_pc >= 128) { - cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128; - rq->nice_pc %= 128; - } - } else { - rq->user_pc += pc; - if (rq->user_pc >= 128) { - cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128; - rq->user_pc %= 128; - } - } - acct_update_integrals(p); -} - -/* - * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast - * shifts instead of 100 - */ -#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS) - -/* - * This is called on clock ticks. - * Bank in p->sched_time the ns elapsed since the last tick or switch. - * CPU scheduler quota accounting is also performed here in microseconds. 
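A standalone sketch of the 128-based pseudo-percentage bookkeeping above; HZ=250 (i.e. JIFFY_NS = 4,000,000) is assumed purely for illustration:

#include <stdio.h>

#define JIFFY_NS        4000000UL
#define NS_TO_PC(NS)    ((NS) * 128 / JIFFY_NS)

int main(void)
{
        unsigned long user_pc = 0, jiffies_banked = 0;

        /* eight 1ms bursts of user time: each is 32/128 of a tick */
        for (int i = 0; i < 8; i++) {
                user_pc += NS_TO_PC(1000000UL);
                if (user_pc >= 128) {           /* a full tick's worth */
                        jiffies_banked += user_pc / 128;
                        user_pc %= 128;
                }
        }
        /* prints: banked 2 jiffies, 0/128 left over */
        printf("banked %lu jiffies, %lu/128 left over\n",
               jiffies_banked, user_pc);
        return 0;
}
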
- */ -static void -update_cpu_clock_tick(struct rq *rq, struct task_struct *p) -{ - long account_ns = rq->clock_task - rq->rq_last_ran; - struct task_struct *idle = rq->idle; - unsigned long account_pc; - - if (unlikely(account_ns < 0) || steal_account_process_tick()) - goto ts_account; - - account_pc = NS_TO_PC(account_ns); - - /* Accurate tick timekeeping */ - if (user_mode(get_irq_regs())) - pc_user_time(rq, p, account_pc, account_ns); - else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) - pc_system_time(rq, p, HARDIRQ_OFFSET, - account_pc, account_ns); - else - pc_idle_time(rq, idle, account_pc); - - if (sched_clock_irqtime) - irqtime_account_hi_si(); - -ts_account: - /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } - - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; -} - -/* - * This is called on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. - * CPU scheduler quota accounting is also performed here in microseconds. - */ -static void -update_cpu_clock_switch(struct rq *rq, struct task_struct *p) -{ - long account_ns = rq->clock_task - rq->rq_last_ran; - struct task_struct *idle = rq->idle; - unsigned long account_pc; - - if (unlikely(account_ns < 0)) - goto ts_account; - - account_pc = NS_TO_PC(account_ns); - - /* Accurate subtick timekeeping */ - if (p != idle) { - pc_user_time(rq, p, account_pc, account_ns); - } - else - pc_idle_time(rq, idle, account_pc); - -ts_account: - /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } - - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; -} - -/* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_grq_lock() held. - */ -static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - /* - * Must be ->curr _and_ ->on_rq. If dequeued, we would - * project cycles that may never be accounted to this - * thread, breaking clock_gettime(). - */ - if (p == rq->curr && p->on_rq) { - update_clocks(rq); - ns = rq->clock_task - rq->rq_last_ran; - if (unlikely((s64)ns < 0)) - ns = 0; - } - - return ns; -} - -/* - * Return accounted runtime for the task. - * Return separately the current's pending runtime that have not been - * accounted yet. - * - */ -unsigned long long task_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns; - -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) - /* - * 64-bit doesn't need locks to atomically read a 64bit value. - * So we have a optimization chance when the task's delta_exec is 0. - * Reading ->on_cpu is racy, but this is ok. - * - * If we race with it leaving cpu, we'll take a lock. So we're correct. - * If we race with it entering cpu, unaccounted time is 0. This is - * indistinguishable from the read occurring a few cycles earlier. - * If we see ->on_cpu without ->on_rq, the task is leaving, and has - * been accounted, so we're correct here as well. 
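The "usecs to avoid overflow on 32bit" comment above is a headroom argument; a two-line check of the limits:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* a signed 32-bit slice counter overflows after ~2.1s of
         * nanoseconds but only after ~35 minutes of microseconds */
        printf("max ns in s32: %.1f s\n", INT32_MAX / 1e9);
        printf("max us in s32: %.1f min\n", INT32_MAX / 1e6 / 60);
        return 0;
}
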
- */ - if (!p->on_cpu || !p->on_rq) - return tsk_seruntime(p); -#endif - - rq = task_grq_lock(p, &flags); - ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(p, &flags); - - return ns; -} - -/* Compatibility crap */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ -} - -void account_idle_time(cputime_t cputime) -{ -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += (__force u64)cputime; - p->utimescaled += (__force u64)cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += (__force u64)cputime; - - /* Add guest time to cpustat. */ - if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64)cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64)cputime; - cpustat[CPUTIME_GUEST] += (__force u64)cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, cputime64_t *target_cputime64) -{ - /* Add system time to process. */ - p->stime += (__force u64)cputime; - p->stimescaled += (__force u64)cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - *target_cputime64 += (__force u64)cputime; - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * This is for guest only now. - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - account_guest_time(p, cputime, cputime_scaled); -} - -/* - * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64)cputime; -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -static void account_idle_times(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64)cputime; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - -void account_process_tick(struct task_struct *p, int user_tick) -{ -} - -/* - * Account multiple ticks of steal time. 
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
-        account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of idle ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-        account_idle_times(jiffies_to_cputime(ticks));
-}
-#endif
-
-static inline void grq_iso_lock(void)
-        __acquires(grq.iso_lock)
-{
-        raw_spin_lock(&grq.iso_lock);
-}
-
-static inline void grq_iso_unlock(void)
-        __releases(grq.iso_lock)
-{
-        raw_spin_unlock(&grq.iso_lock);
-}
-
-/*
- * Functions to test for when SCHED_ISO tasks have used their allocated
- * quota as real time scheduling and convert them back to SCHED_NORMAL.
- * Where possible, the data is tested lockless, to avoid grabbing iso_lock
- * because the occasional inaccurate result won't matter. However the
- * tick data is only ever modified under lock. iso_refractory is simply
- * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
- */
-static bool set_iso_refractory(void)
-{
-        grq.iso_refractory = true;
-        return grq.iso_refractory;
-}
-
-static bool clear_iso_refractory(void)
-{
-        grq.iso_refractory = false;
-        return grq.iso_refractory;
-}
-
-/*
- * Test if SCHED_ISO tasks have run longer than their allotted period as RT
- * tasks and set the refractory flag if necessary. There is 10% hysteresis
- * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
- * slow division.
- */
-static bool test_ret_isorefractory(struct rq *rq)
-{
-        if (likely(!grq.iso_refractory)) {
-                if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
-                        return set_iso_refractory();
-        } else {
-                if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
-                        return clear_iso_refractory();
-        }
-        return grq.iso_refractory;
-}
-
-static void iso_tick(void)
-{
-        grq_iso_lock();
-        grq.iso_ticks += 100;
-        grq_iso_unlock();
-}
-
-/* No SCHED_ISO task was running so decrease rq->iso_ticks */
-static inline void no_iso_tick(void)
-{
-        if (grq.iso_ticks) {
-                grq_iso_lock();
-                grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
-                if (unlikely(grq.iso_refractory && grq.iso_ticks <
-                    ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
-                        clear_iso_refractory();
-                grq_iso_unlock();
-        }
-}
-
-/* This manages tasks that have run out of timeslice during a scheduler_tick */
-static void task_running_tick(struct rq *rq)
-{
-        struct task_struct *p;
-
-        /*
-         * If a SCHED_ISO task is running we increment the iso_ticks. In
-         * order to prevent SCHED_ISO tasks from causing starvation in the
-         * presence of true RT tasks we account those as iso_ticks as well.
-         */
-        if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
-                if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
-                        iso_tick();
-        } else
-                no_iso_tick();
-
-        if (iso_queue(rq)) {
-                if (unlikely(test_ret_isorefractory(rq))) {
-                        if (rq_running_iso(rq)) {
-                                /*
-                                 * SCHED_ISO task is running as RT and limit
-                                 * has been hit. Force it to reschedule as
-                                 * SCHED_NORMAL by zeroing its time_slice
-                                 */
-                                rq->rq_time_slice = 0;
-                        }
-                }
-        }
-
-        /* SCHED_FIFO tasks never run out of timeslice. */
-        if (rq->rq_policy == SCHED_FIFO)
-                return;
-        /*
-         * Tasks that were scheduled in the first half of a tick are not
-         * allowed to run into the 2nd half of the next tick if they will
-         * run out of time slice in the interim. Otherwise, if they have
-         * less than RESCHED_US μs of time slice left they will be rescheduled.
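For concreteness, with the BFS-default sched_iso_cpu of 70 the refractory hysteresis above works out roughly as below; ISO_PERIOD's definition is outside this hunk, so a stand-in value is used:

#include <stdio.h>

int main(void)
{
        /* stand-in for ISO_PERIOD: about 5 seconds of ticks at HZ=250;
         * the real macro is defined elsewhere in bfs.c */
        unsigned long iso_period = 5 * 250;
        int sched_iso_cpu = 70;         /* default: 70% of CPU as ISO */

        unsigned long trip  = iso_period * sched_iso_cpu;
        unsigned long clear = iso_period * (sched_iso_cpu * 115 / 128);

        /* trips at the full quota, clears roughly 10% below it */
        printf("refractory above %lu iso_ticks, cleared below %lu\n",
               trip, clear);
        return 0;
}
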
- */ - if (rq->dither) { - if (rq->rq_time_slice > HALF_JIFFY_US) - return; - else - rq->rq_time_slice = 0; - } else if (rq->rq_time_slice >= RESCHED_US) - return; - - /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ - p = rq->curr; - - grq_lock(); - requeue_task(p); - resched_task(p); - grq_unlock(); -} - -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. The data modified is all - * local to struct rq so we don't need to grab grq lock. - */ -void scheduler_tick(void) -{ - int cpu __maybe_unused = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - sched_clock_tick(); - /* grq lock not grabbed, so only update rq clock */ - update_rq_clock(rq); - update_cpu_clock_tick(rq, rq->curr); - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); - if (!rq_idle(rq)) - task_running_tick(rq); - else - no_iso_tick(); - rq->last_tick = rq->clock; - perf_event_task_tick(); -} - -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) -/* - * If the value passed in is equal to the current preempt count - * then we just disabled preemption. Start timing the latency. - */ -static inline void preempt_latency_start(int val) -{ - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } -} - -void preempt_count_add(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; -#endif - __preempt_count_add(val); -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Spinlock count overflowing soon? - */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); -#endif - preempt_latency_start(val); -} -EXPORT_SYMBOL(preempt_count_add); -NOKPROBE_SYMBOL(preempt_count_add); - -/* - * If the value passed in equals to the current preempt count - * then we just enabled preemption. Stop timing the latency. - */ -static inline void preempt_latency_stop(int val) -{ - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -} - -void preempt_count_sub(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? - */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; -#endif - - preempt_latency_stop(val); - __preempt_count_sub(val); -} -EXPORT_SYMBOL(preempt_count_sub); -NOKPROBE_SYMBOL(preempt_count_sub); - -#else -static inline void preempt_latency_start(int val) { } -static inline void preempt_latency_stop(int val) { } -#endif - -/* - * The time_slice is only refilled when it is empty and that is when we set a - * new deadline. - */ -static void time_slice_expired(struct task_struct *p) -{ - p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); -#ifdef CONFIG_SMT_NICE - if (!p->mm) - p->smt_bias = 0; - else if (rt_task(p)) - p->smt_bias = 1 << 30; - else if (task_running_iso(p)) - p->smt_bias = 1 << 29; - else if (idleprio_task(p)) { - if (task_running_idle(p)) - p->smt_bias = 0; - else - p->smt_bias = 1; - } else if (--p->smt_bias < 1) - p->smt_bias = MAX_PRIO - p->static_prio; -#endif -} - -/* - * Timeslices below RESCHED_US are considered as good as expired as there's no - * point rescheduling when there's so little time left. 
SCHED_BATCH tasks
- * have been flagged as not latency sensitive and likely to be fully CPU
- * bound so every time they're rescheduled they have their time_slice
- * refilled, but get a new later deadline to have little effect on
- * SCHED_NORMAL tasks.
- */
-static inline void check_deadline(struct task_struct *p)
-{
-        if (p->time_slice < RESCHED_US || batch_task(p))
-                time_slice_expired(p);
-}
-
-#define BITOP_WORD(nr)        ((nr) / BITS_PER_LONG)
-
-/*
- * Scheduler queue bitmap specific find next bit.
- */
-static inline unsigned long
-next_sched_bit(const unsigned long *addr, unsigned long offset)
-{
-        const unsigned long *p;
-        unsigned long result;
-        unsigned long size;
-        unsigned long tmp;
-
-        size = PRIO_LIMIT;
-        if (offset >= size)
-                return size;
-
-        p = addr + BITOP_WORD(offset);
-        result = offset & ~(BITS_PER_LONG-1);
-        size -= result;
-        offset %= BITS_PER_LONG;
-        if (offset) {
-                tmp = *(p++);
-                tmp &= (~0UL << offset);
-                if (size < BITS_PER_LONG)
-                        goto found_first;
-                if (tmp)
-                        goto found_middle;
-                size -= BITS_PER_LONG;
-                result += BITS_PER_LONG;
-        }
-        while (size & ~(BITS_PER_LONG-1)) {
-                if ((tmp = *(p++)))
-                        goto found_middle;
-                result += BITS_PER_LONG;
-                size -= BITS_PER_LONG;
-        }
-        if (!size)
-                return result;
-        tmp = *p;
-
-found_first:
-        tmp &= (~0UL >> (BITS_PER_LONG - size));
-        if (tmp == 0UL)                /* Are any bits set? */
-                return result + size;  /* Nope. */
-found_middle:
-        return result + __ffs(tmp);
-}
-
-/*
- * Task selection with skiplists is a simple matter of picking off the first
- * task in the sorted list, an O(1) operation. The only time it takes longer
- * is if tasks do not have suitable affinity and then we iterate over entries
- * till we find the first that does. Worst case here is no tasks with suitable
- * affinity, taking O(n).
- */
-static inline struct
-task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
-{
-        skiplist_node *node = &grq.node;
-        struct task_struct *edt = idle;
-        u64 earliest_deadline = ~0ULL;
-
-        while ((node = node->next[0]) != &grq.node) {
-                struct task_struct *p = node->value;
-
-                /* Make sure affinity is ok */
-                if (needs_other_cpu(p, cpu))
-                        continue;
-
-                if (!smt_schedule(p, rq))
-                        continue;
-
-                if (!sched_interactive) {
-                        int tcpu;
-
-                        if ((tcpu = task_cpu(p)) != cpu) {
-                                u64 dl = p->deadline << locality_diff(tcpu, rq);
-
-                                if (!deadline_before(dl, earliest_deadline))
-                                        continue;
-                                earliest_deadline = dl;
-                                edt = p;
-                                /* We continue even though we've found the
-                                 * earliest deadline task, as the locality
-                                 * offset means there may be a better
-                                 * candidate after it.
*/ - continue; - } - } - /* We've encountered the best deadline local task */ - edt = p; - break; - } - if (likely(edt != idle)) - take_task(cpu, edt); - return edt; -} - -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) -{ - if (oops_in_progress) - return; - - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); - - debug_show_held_locks(prev); - print_modules(); - if (irqs_disabled()) - print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif - if (panic_on_warn) - panic("scheduling while atomic\n"); - - dump_stack(); - add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -} - -/* - * Various schedule()-time debugging checks and statistics: - */ -static inline void schedule_debug(struct task_struct *prev) -{ -#ifdef CONFIG_SCHED_STACK_END_CHECK - if (task_stack_end_corrupted(prev)) - panic("corrupted stack end detected inside scheduler\n"); -#endif - - if (unlikely(in_atomic_preempt_off())) { - __schedule_bug(prev); - preempt_count_set(PREEMPT_DISABLED); - } - rcu_sleep_check(); - - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - - schedstat_inc(this_rq(), sched_count); -} - -/* - * The currently running task's information is all stored in rq local data - * which is only modified by the local CPU, thereby allowing the data to be - * changed without grabbing the grq lock. - */ -static inline void set_rq_task(struct rq *rq, struct task_struct *p) -{ - rq->rq_time_slice = p->time_slice; - rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran = rq->clock_task; - rq->rq_policy = p->policy; - rq->rq_prio = p->prio; -#ifdef CONFIG_SMT_NICE - rq->rq_mm = p->mm; - rq->rq_smt_bias = p->smt_bias; -#endif -} - -static void reset_rq_task(struct rq *rq, struct task_struct *p) -{ - rq->rq_policy = p->policy; - rq->rq_prio = p->prio; -#ifdef CONFIG_SMT_NICE - rq->rq_smt_bias = p->smt_bias; -#endif -} - -#ifdef CONFIG_SMT_NICE -static void check_no_siblings(struct rq __maybe_unused *this_rq) {} -static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} -static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; -static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; - -/* Iterate over smt siblings when we've scheduled a process on cpu and decide - * whether they should continue running or be descheduled. */ -static void check_smt_siblings(struct rq *this_rq) -{ - int other_cpu; - - for_each_cpu(other_cpu, &this_rq->thread_mask) { - struct task_struct *p; - struct rq *rq; - - rq = cpu_rq(other_cpu); - if (rq_idle(rq)) - continue; - if (unlikely(!rq->online)) - continue; - p = rq->curr; - if (!smt_schedule(p, this_rq)) { - set_tsk_need_resched(p); - smp_send_reschedule(other_cpu); - } - } -} - -static void wake_smt_siblings(struct rq *this_rq) -{ - int other_cpu; - - if (!queued_notrunning()) - return; - - for_each_cpu(other_cpu, &this_rq->thread_mask) { - struct rq *rq; - - rq = cpu_rq(other_cpu); - if (unlikely(!rq->online)) - continue; - if (rq_idle(rq)) { - struct task_struct *p = rq->curr; - - set_tsk_need_resched(p); - smp_send_reschedule(other_cpu); - } - } -} -#else -static void check_siblings(struct rq __maybe_unused *this_rq) {} -static void wake_siblings(struct rq __maybe_unused *this_rq) {} -#endif - -/* - * schedule() is the main scheduler function. 
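The locality shift in earliest_deadline_task() above doubles a remote task's virtual deadline per step of cache distance, so a nearby task with a nominally later deadline can still win. A toy comparison (the locality_diff() values here are invented: 0 = same CPU, 2 = remote node):

#include <stdint.h>
#include <stdio.h>

struct fake_task {
        uint64_t deadline;      /* niffies when the slice expires */
        int locality;           /* invented locality_diff() result */
};

int main(void)
{
        /* a remote task with an earlier raw deadline... */
        struct fake_task remote = { .deadline = 1000, .locality = 2 };
        /* ...can lose to a local task with a later one */
        struct fake_task local  = { .deadline = 3000, .locality = 0 };

        uint64_t dl_remote = remote.deadline << remote.locality; /* 4000 */
        uint64_t dl_local  = local.deadline  << local.locality;  /* 3000 */

        printf("%s wins\n", dl_local < dl_remote ? "local" : "remote");
        return 0;
}
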
- * - * The main means of driving the scheduler and thus entering this function are: - * - * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. - * - * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return - * paths. For example, see arch/x86/entry_64.S. - * - * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). - * - * 3. Wakeups don't really cause entry into schedule(). They add a - * task to the run-queue and that's it. - * - * Now, if the new task added to the run-queue preempts the current - * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets - * called on the nearest possible occasion: - * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): - * - * - in syscall or exception context, at the next outmost - * preempt_enable(). (this might be as soon as the wake_up()'s - * spin_unlock()!) - * - * - in IRQ context, return from interrupt-handler to - * preemptible context - * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) - * then at the next: - * - * - cond_resched() call - * - explicit schedule() call - * - return from syscall or exception to user-space - * - return from interrupt-handler to user-space - * - * WARNING: must be called with preemption disabled! - */ -static void __sched notrace __schedule(bool preempt) -{ - struct task_struct *prev, *next, *idle; - unsigned long *switch_count; - bool deactivate = false; - struct rq *rq; - int cpu; - - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - prev = rq->curr; - - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. - */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - - schedule_debug(prev); - - local_irq_disable(); - rcu_note_context_switch(); - - /* - * Make sure that signal_pending_state()->signal_pending() below - * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). - */ - smp_mb__before_spinlock(); - grq_lock(); - - switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (unlikely(signal_pending_state(prev->state, prev))) { - prev->state = TASK_RUNNING; - } else { - deactivate = true; - prev->on_rq = 0; - - /* - * If a worker is going to sleep, notify and - * ask workqueue whether it wants to wake up a - * task to maintain concurrency. If so, wake - * up the task. 
- */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev); - if (to_wakeup) { - /* This shouldn't happen, but does */ - if (unlikely(to_wakeup == prev)) - deactivate = false; - else - try_to_wake_up_local(to_wakeup); - } - } - } - switch_count = &prev->nvcsw; - } - - update_clocks(rq); - update_cpu_clock_switch(rq, prev); - if (rq->clock - rq->last_tick > HALF_JIFFY_NS) - rq->dither = false; - else - rq->dither = true; - - clear_tsk_need_resched(prev); - clear_preempt_need_resched(); - - idle = rq->idle; - if (idle != prev) { - /* Update all the information stored on struct rq */ - prev->time_slice = rq->rq_time_slice; - prev->deadline = rq->rq_deadline; - check_deadline(prev); - prev->last_ran = rq->clock_task; - return_task(prev, rq, deactivate); - } - - if (unlikely(!queued_notrunning())) { - /* - * This CPU is now truly idle as opposed to when idle is - * scheduled as a high priority task in its own right. - */ - next = idle; - schedstat_inc(rq, sched_goidle); - set_cpuidle_map(cpu); - } else { - next = earliest_deadline_task(rq, cpu, idle); - if (likely(next->prio != PRIO_LIMIT)) - clear_cpuidle_map(cpu); - else - set_cpuidle_map(cpu); - } - - if (likely(prev != next)) { - /* - * Don't reschedule an idle task or deactivated tasks - */ - if (prev != idle && !deactivate) - resched_suitable_idle(prev); - set_rq_task(rq, next); - if (next != idle) - check_siblings(rq); - else - wake_siblings(rq); - grq.nr_switches++; - prev->on_cpu = false; - next->on_cpu = true; - rq->curr = next; - ++*switch_count; - - trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the grq */ - } else { - check_siblings(rq); - grq_unlock_irq(); - } -} - -static inline void sched_submit_work(struct task_struct *tsk) -{ - if (!tsk->state || tsk_is_pi_blocked(tsk) || - preempt_count() || - signal_pending_state(tsk->state, tsk)) - return; - - /* - * If we are going to sleep and we have plugged IO queued, - * make sure to submit it to avoid deadlocks. - */ - if (blk_needs_flush_plug(tsk)) - blk_schedule_flush_plug(tsk); -} - -asmlinkage __visible void __sched schedule(void) -{ - struct task_struct *tsk = current; - - sched_submit_work(tsk); - do { - preempt_disable(); - __schedule(false); - sched_preempt_enable_no_resched(); - } while (need_resched()); -} - -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_CONTEXT_TRACKING -asmlinkage __visible void __sched schedule_user(void) -{ - /* - * If we come here after a random call to set_need_resched(), - * or we have been woken up remotely but the IPI has not yet arrived, - * we haven't yet exited the RCU idle mode. Do it here manually until - * we find a better solution. - * - * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != IN_USER, but that will trigger - * too frequently to make sense yet. - */ - enum ctx_state prev_state = exception_enter(); - schedule(); - exception_exit(prev_state); -} -#endif - -/** - * schedule_preempt_disabled - called with preemption disabled - * - * Returns with preemption disabled. 
Note: preempt_count must be 1 - */ -void __sched schedule_preempt_disabled(void) -{ - sched_preempt_enable_no_resched(); - schedule(); - preempt_disable(); -} - -static void __sched notrace preempt_schedule_common(void) -{ - do { - /* - * Because the function tracer can trace preempt_count_sub() - * and it also uses preempt_enable/disable_notrace(), if - * NEED_RESCHED is set, the preempt_enable_notrace() called - * by the function tracer will call this function again and - * cause infinite recursion. - * - * Preemption must be disabled here before the function - * tracer can trace. Break up preempt_disable() into two - * calls. One to disable preemption without fear of being - * traced. The other to still record the preemption latency, - * which can also be traced by the function tracer. - */ - preempt_disable_notrace(); - preempt_latency_start(1); - __schedule(true); - preempt_latency_stop(1); - preempt_enable_no_resched_notrace(); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - } while (need_resched()); -} - -#ifdef CONFIG_PREEMPT -/* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage __visible void __sched notrace preempt_schedule(void) -{ - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (likely(!preemptible())) - return; - - preempt_schedule_common(); -} -NOKPROBE_SYMBOL(preempt_schedule); -EXPORT_SYMBOL(preempt_schedule); - -/** - * preempt_schedule_notrace - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - do { - /* - * Because the function tracer can trace preempt_count_sub() - * and it also uses preempt_enable/disable_notrace(), if - * NEED_RESCHED is set, the preempt_enable_notrace() called - * by the function tracer will call this function again and - * cause infinite recursion. - * - * Preemption must be disabled here before the function - * tracer can trace. Break up preempt_disable() into two - * calls. One to disable preemption without fear of being - * traced. The other to still record the preemption latency, - * which can also be traced by the function tracer. - */ - preempt_disable_notrace(); - preempt_latency_start(1); - /* - * Needs preempt disabled in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. 
- */ - prev_ctx = exception_enter(); - __schedule(true); - exception_exit(prev_ctx); - - preempt_latency_stop(1); - preempt_enable_no_resched_notrace(); - } while (need_resched()); -} -EXPORT_SYMBOL_GPL(preempt_schedule_notrace); - -#endif /* CONFIG_PREEMPT */ - -/* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. - */ -asmlinkage __visible void __sched preempt_schedule_irq(void) -{ - enum ctx_state prev_state; - - /* Catch callers which need to be fixed */ - BUG_ON(preempt_count() || !irqs_disabled()); - - prev_state = exception_enter(); - - do { - preempt_disable(); - local_irq_enable(); - __schedule(true); - local_irq_disable(); - sched_preempt_enable_no_resched(); - } while (need_resched()); - - exception_exit(prev_state); -} - -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, - void *key) -{ - return try_to_wake_up(curr->private, mode, wake_flags); -} -EXPORT_SYMBOL(default_wake_function); - -#ifdef CONFIG_RT_MUTEXES - -/* - * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) - * - * This function changes the 'effective' priority of a task. It does - * not touch ->normal_prio like __setscheduler(). - * - * Used by the rt_mutex code to implement priority inheritance - * logic. Call site only calls if the priority of the task changed. - */ -void rt_mutex_setprio(struct task_struct *p, int prio) -{ - unsigned long flags; - struct rq *rq; - int oldprio; - - BUG_ON(prio < 0 || prio > MAX_PRIO); - - rq = task_grq_lock(p, &flags); - - /* - * Idle task boosting is a nono in general. There is one - * exception, when PREEMPT_RT and NOHZ is active: - * - * The idle task calls get_next_timer_interrupt() and holds - * the timer wheel base->lock on the CPU and another CPU wants - * to access the timer (probably to cancel it). We can safely - * ignore the boosting request, as the idle CPU runs this code - * with interrupts disabled and will complete the lock - * protected section without being interrupted. So there is no - * real need to boost. - */ - if (unlikely(p == rq->idle)) { - WARN_ON(p != rq->curr); - WARN_ON(p->pi_blocked_on); - goto out_unlock; - } - - trace_sched_pi_setprio(p, prio); - oldprio = p->prio; - p->prio = prio; - if (task_running(p)){ - if (prio > oldprio) - resched_task(p); - } else if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); - if (prio < oldprio) - try_preempt(p, rq); - } -out_unlock: - task_grq_unlock(p, &flags); -} - -#endif - -/* - * Adjust the deadline for when the priority is to change, before it's - * changed. - */ -static inline void adjust_deadline(struct task_struct *p, int new_prio) -{ - p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); -} - -void set_user_nice(struct task_struct *p, long nice) -{ - int new_static, old_static; - unsigned long flags; - struct rq *rq; - - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) - return; - new_static = NICE_TO_PRIO(nice); - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. 
- */ - rq = time_task_grq_lock(p, &flags); - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL/SCHED_BATCH: - */ - if (has_rt_policy(p)) { - p->static_prio = new_static; - goto out_unlock; - } - - adjust_deadline(p, new_static); - old_static = p->static_prio; - p->static_prio = new_static; - p->prio = effective_prio(p); - - if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); - if (new_static < old_static) - try_preempt(p, rq); - } else if (task_running(p)) { - reset_rq_task(rq, p); - if (old_static < new_static) - resched_task(p); - } -out_unlock: - task_grq_unlock(p, &flags); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = nice_to_rlimit(nice); - - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || - capable(CAP_SYS_NICE)); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - - increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); - nice = task_nice(current) + increment; - - nice = clamp_val(nice, MIN_NICE, MAX_NICE); - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -100. Normal tasks are centered around 1, value goes - * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). - */ -int task_prio(const struct task_struct *p) -{ - int delta, prio = p->prio - MAX_RT_PRIO; - - /* rt tasks and iso tasks */ - if (prio <= 0) - goto out; - - /* Convert to ms to avoid overflows */ - delta = NS_TO_MS(p->deadline - grq.niffies); - delta = delta * 40 / ms_longest_deadline_diff(); - if (delta > 0 && delta <= 80) - prio += delta; - if (idleprio_task(p)) - prio += 40; -out: - return prio; -} - -/** - * idle_cpu - is a given cpu idle currently? - * @cpu: the processor in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int idle_cpu(int cpu) -{ - return cpu_curr(cpu) == cpu_rq(cpu)->idle; -} - -/** - * idle_task - return the idle task for a given cpu. - * @cpu: the processor in question. - * - * Return: The idle task for the cpu @cpu. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - * - * The task of @pid, if found. %NULL otherwise. - */ -static inline struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -/* Actually do priority change: must hold grq lock. 
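Roughly how task_prio() above maps a pending deadline onto the 40-step range shown in /proc; the ms_longest_deadline_diff() stand-in value below is invented:

#include <stdio.h>

#define MAX_RT_PRIO 100

static int ms_longest_deadline_diff(void) { return 1000; } /* stand-in */

static int fake_task_prio(int kernel_prio, int deadline_ms_away)
{
        int prio = kernel_prio - MAX_RT_PRIO;
        int delta;

        if (prio <= 0)          /* rt tasks and iso tasks */
                return prio;

        /* scale the remaining deadline onto 40 nice-like steps */
        delta = deadline_ms_away * 40 / ms_longest_deadline_diff();
        if (delta > 0 && delta <= 80)
                prio += delta;
        return prio;
}

int main(void)
{
        /* a SCHED_NORMAL task whose deadline is 500ms out: prints 21 */
        printf("%d\n", fake_task_prio(MAX_RT_PRIO + 1, 500));
        return 0;
}
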
*/ -static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, - int prio, bool keep_boost) -{ - int oldrtprio, oldprio; - - p->policy = policy; - oldrtprio = p->rt_priority; - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - oldprio = p->prio; - /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). - */ - if (keep_boost) { - /* - * Take priority boosted tasks into account. If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); - } else - p->prio = p->normal_prio; - - if (task_running(p)) { - reset_rq_task(rq, p); - resched_task(p); - } else if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); - if (p->prio < oldprio || p->rt_priority > oldrtprio) - try_preempt(p, rq); - } -} - -/* - * check the target process has a UID that matches the current process's - */ -static bool check_same_owner(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred; - bool match; - - rcu_read_lock(); - pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; -} - -static int -__sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user, bool pi) -{ - struct sched_param zero_param = { .sched_priority = 0 }; - unsigned long flags, rlim_rtprio = 0; - int retval, oldpolicy = -1; - int reset_on_fork; - struct rq *rq; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); - - if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { - unsigned long lflags; - - if (!lock_task_sighand(p, &lflags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &lflags); - if (rlim_rtprio) - goto recheck; - /* - * If the caller requested an RT policy without having the - * necessary rights, we downgrade the policy to SCHED_ISO. - * We also set the parameter to zero to pass the checks. - */ - policy = SCHED_ISO; - param = &zero_param; - } -recheck: - /* double check policy once rq lock held */ - if (policy < 0) { - reset_on_fork = p->sched_reset_on_fork; - policy = oldpolicy = p->policy; - } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; - - if (!SCHED_RANGE(policy)) - return -EINVAL; - } - - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and - * SCHED_BATCH is 0. 
- */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) - return -EINVAL; - if (is_rt_policy(policy) != (param->sched_priority != 0)) - return -EINVAL; - - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (is_rt_policy(policy)) { - unsigned long rlim_rtprio = - task_rlimit(p, RLIMIT_RTPRIO); - - /* can't set/change the rt policy */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; - } else { - switch (p->policy) { - /* - * Can only downgrade policies but not back to - * SCHED_NORMAL - */ - case SCHED_ISO: - if (policy == SCHED_ISO) - goto out; - if (policy == SCHED_NORMAL) - return -EPERM; - break; - case SCHED_BATCH: - if (policy == SCHED_BATCH) - goto out; - if (policy != SCHED_IDLEPRIO) - return -EPERM; - break; - case SCHED_IDLEPRIO: - if (policy == SCHED_IDLEPRIO) - goto out; - return -EPERM; - default: - break; - } - } - - /* can't change other user's priorities */ - if (!check_same_owner(p)) - return -EPERM; - - /* Normal users shall not reset the sched_reset_on_fork flag */ - if (p->sched_reset_on_fork && !reset_on_fork) - return -EPERM; - } - - if (user) { - retval = security_task_setscheduler(p); - if (retval) - return retval; - } - - /* - * make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: - * - * To be able to change p->policy safely, the grunqueue lock must be - * held. - */ - rq = task_grq_lock(p, &flags); - - /* - * Changing the policy of the stop threads its a very bad idea - */ - if (p == rq->stop) { - task_grq_unlock(p, &flags); - return -EINVAL; - } - - /* - * If not changing anything there's no need to proceed further: - */ - if (unlikely(policy == p->policy && (!is_rt_policy(policy) || - param->sched_priority == p->rt_priority))) { - - task_grq_unlock(p, &flags); - return 0; - } - - /* recheck policy now with rq lock held */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - task_grq_unlock(p, &flags); - goto recheck; - } - update_clocks(rq); - p->sched_reset_on_fork = reset_on_fork; - - __setscheduler(p, rq, policy, param->sched_priority, pi); - task_grq_unlock(p, &flags); - - if (pi) - rt_mutex_adjust_pi(p); -out: - return 0; -} - -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, true, true); -} - -EXPORT_SYMBOL_GPL(sched_setscheduler); - -int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -{ - const struct sched_param param = { .sched_priority = attr->sched_priority }; - int policy = attr->sched_policy; - - return __sched_setscheduler(p, policy, ¶m, true, true); -} -EXPORT_SYMBOL_GPL(sched_setattr); - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. 
- * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. - * - * Return: 0 on success. An error code otherwise. - */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, false, true); -} -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); - rcu_read_unlock(); - - return retval; -} - -/* - * Mimics kernel/events/core.c perf_copy_attr(). - */ -static int sched_copy_attr(struct sched_attr __user *uattr, - struct sched_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * XXX: do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? - */ - attr->sched_nice = clamp(attr->sched_nice, -20, 19); - - /* sched/core.c uses zero here but we already know ret is zero */ - return 0; - -err_size: - put_user(sizeof(*attr), &uattr->size); - return -E2BIG; -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * - * Return: 0 on success. An error code otherwise. - * @param: structure containing the new RT priority. - */ -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, - struct sched_param __user *param) -{ - /* negative values for policy are not valid */ - if (policy < 0) - return -EINVAL; - - return do_sched_setscheduler(pid, policy, param); -} - -/* - * sched_setparam() passes in -1 for its policy, to let the functions - * it calls know not to change it. - */ -#define SETPARAM_POLICY -1 - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. 
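From userspace this path is reached through the usual libc wrapper; note that, per __sched_setscheduler() above, an unprivileged RT request is quietly downgraded to SCHED_ISO rather than rejected. A minimal exercise:

#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* needs CAP_SYS_NICE or an RLIMIT_RTPRIO allowance, as the
         * permission checks above enforce; without them this kernel
         * falls back to SCHED_ISO (value 4 in the BFS headers) */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
                fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
        else
                printf("policy now %d\n", sched_getscheduler(0));
        return 0;
}
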
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
-        return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
-}
-
-/**
- * sys_sched_setattr - same as above, but with extended sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- */
-SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
-                unsigned int, flags)
-{
-        struct sched_attr attr;
-        struct task_struct *p;
-        int retval;
-
-        if (!uattr || pid < 0 || flags)
-                return -EINVAL;
-
-        retval = sched_copy_attr(uattr, &attr);
-        if (retval)
-                return retval;
-
-        if ((int)attr.sched_policy < 0)
-                return -EINVAL;
-
-        rcu_read_lock();
-        retval = -ESRCH;
-        p = find_process_by_pid(pid);
-        if (p != NULL)
-                retval = sched_setattr(p, &attr);
-        rcu_read_unlock();
-
-        return retval;
-}
-
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- *
- * Return: On success, the policy of the thread. Otherwise, a negative error
- * code.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-{
-        struct task_struct *p;
-        int retval = -EINVAL;
-
-        if (pid < 0)
-                goto out_nounlock;
-
-        retval = -ESRCH;
-        rcu_read_lock();
-        p = find_process_by_pid(pid);
-        if (p) {
-                retval = security_task_getscheduler(p);
-                if (!retval)
-                        retval = p->policy;
-        }
-        rcu_read_unlock();
-
-out_nounlock:
-        return retval;
-}
-
-/**
- * sys_sched_getparam - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- *
- * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
- * code.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
-        struct sched_param lp = { .sched_priority = 0 };
-        struct task_struct *p;
-        int retval = -EINVAL;
-
-        if (!param || pid < 0)
-                goto out_nounlock;
-
-        rcu_read_lock();
-        p = find_process_by_pid(pid);
-        retval = -ESRCH;
-        if (!p)
-                goto out_unlock;
-
-        retval = security_task_getscheduler(p);
-        if (retval)
-                goto out_unlock;
-
-        if (has_rt_policy(p))
-                lp.sched_priority = p->rt_priority;
-        rcu_read_unlock();
-
-        /*
-         * This one might sleep, we cannot do it with a spinlock held ...
-         */
-        retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
-out_nounlock:
-        return retval;
-
-out_unlock:
-        rcu_read_unlock();
-        return retval;
-}
-
-static int sched_read_attr(struct sched_attr __user *uattr,
-                           struct sched_attr *attr,
-                           unsigned int usize)
-{
-        int ret;
-
-        if (!access_ok(VERIFY_WRITE, uattr, usize))
-                return -EFAULT;
-
-        /*
-         * If we're handed a smaller struct than we know of,
-         * ensure all the unknown bits are 0 - i.e. old
-         * user-space does not get incomplete information.
-         */
-        if (usize < sizeof(*attr)) {
-                unsigned char *addr;
-                unsigned char *end;
-
-                addr = (void *)attr + usize;
-                end = (void *)attr + sizeof(*attr);
-
-                for (; addr < end; addr++) {
-                        if (*addr)
-                                return -EFBIG;
-                }
-
-                attr->size = usize;
-        }
-
-        ret = copy_to_user(uattr, attr, attr->size);
-        if (ret)
-                return -EFAULT;
-
-        /* sched/core.c uses zero here but we already know ret is zero */
-        return ret;
-}
-
-/**
- * sys_sched_getattr - similar to sched_getparam, but with sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
- * @flags: for future extension.
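sched_setattr()/sched_getattr() have no glibc wrappers, so the size handshake in sched_copy_attr() above is exercised via syscall(2). The struct below is hand-rolled to the SCHED_ATTR_SIZE_VER0 layout; note this scheduler's sched_setattr() only consumes sched_policy and sched_priority from it:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct my_sched_attr {          /* no uapi header assumed */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;  /* deadline fields: unused by this */
        uint64_t sched_deadline; /* scheduler, but part of VER0 */
        uint64_t sched_period;
};

int main(void)
{
        struct my_sched_attr attr = {
                .size = sizeof(attr),   /* 48 == SCHED_ATTR_SIZE_VER0 */
                .sched_policy = 0,      /* SCHED_NORMAL */
        };

        /* the kernel reads attr.size first and rejects anything it
         * can't reconcile, as sched_copy_attr() above shows */
        if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1)
                perror("sched_setattr");
        return 0;
}
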
- */ -SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size, unsigned int, flags) -{ - struct sched_attr attr = { - .size = sizeof(struct sched_attr), - }; - struct task_struct *p; - int retval; - - if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0 || flags) - return -EINVAL; - - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - attr.sched_policy = p->policy; - if (rt_task(p)) - attr.sched_priority = p->rt_priority; - else - attr.sched_nice = task_nice(p); - - rcu_read_unlock(); - - retval = sched_read_attr(uattr, &attr, size); - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; - int retval; - - get_online_cpus(); - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); - put_online_cpus(); - return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; - if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - goto out_unlock; - } - rcu_read_unlock(); - } - - retval = security_task_setscheduler(p); - if (retval) - goto out_unlock; - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); -again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_unlock: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); -out_put_task: - put_task_struct(p); - put_online_cpus(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) -{ - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - * - * Return: 0 on success. An error code otherwise. 
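
The userspace face of this pair is sched_setaffinity(2)/sched_getaffinity(2)
with the glibc cpu_set_t helpers. A short sketch, illustrative only:

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  int main(void)
  {
          cpu_set_t set;

          CPU_ZERO(&set);
          CPU_SET(0, &set);               /* pin this thread to CPU 0 */
          if (sched_setaffinity(0, sizeof(set), &set) == -1)
                  perror("sched_setaffinity");

          CPU_ZERO(&set);
          if (sched_getaffinity(0, sizeof(set), &set) == -1)
                  perror("sched_getaffinity");
          printf("eligible on %d CPU(s)\n", CPU_COUNT(&set));
          return 0;
  }

Passing sizeof(cpu_set_t) also satisfies the length rules checked in
sys_sched_getaffinity() further down: len must cover nr_cpu_ids bits and be a
multiple of sizeof(unsigned long).
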
- */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; -} - -long sched_getaffinity(pid_t pid, cpumask_t *mask) -{ - struct task_struct *p; - unsigned long flags; - int retval; - - get_online_cpus(); - rcu_read_lock(); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - grq_lock_irqsave(&flags); - cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); - grq_unlock_irqrestore(&flags); - -out_unlock: - rcu_read_unlock(); - put_online_cpus(); - - return retval; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - * - * Return: size of CPU mask copied to user_mask_ptr on success. An - * error code otherwise. - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(unsigned long)-1)) - return -EINVAL; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); - - if (copy_to_user(user_mask_ptr, mask, retlen)) - ret = -EFAULT; - else - ret = retlen; - } - free_cpumask_var(mask); - - return ret; -} - -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. It does this by - * scheduling away the current task. If it still has the earliest deadline - * it will be scheduled again as the next task. - * - * Return: 0. - */ -SYSCALL_DEFINE0(sched_yield) -{ - struct task_struct *p; - - p = current; - grq_lock_irq(); - schedstat_inc(task_rq(p), yld_count); - requeue_task(p); - - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); - sched_preempt_enable_no_resched(); - - schedule(); - - return 0; -} - -int __sched _cond_resched(void) -{ - if (should_resched(0)) { - preempt_schedule_common(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(_cond_resched); - -/* - * __cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). 
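
sys_sched_yield() above just requeues the caller; per its comment, the task
runs again immediately if it still has the earliest deadline. A trivial
userspace probe of its cost, illustrative only:

  #include <sched.h>
  #include <stdio.h>
  #include <time.h>

  int main(void)
  {
          struct timespec a, b;
          long n;

          clock_gettime(CLOCK_MONOTONIC, &a);
          for (n = 0; n < 100000; n++)
                  sched_yield();  /* requeue; reschedule if anyone else is due */
          clock_gettime(CLOCK_MONOTONIC, &b);

          printf("%ld yields in %.3f ms\n", n,
                 (b.tv_sec - a.tv_sec) * 1e3 +
                 (b.tv_nsec - a.tv_nsec) / 1e6);
          return 0;
  }

As the yield() comment further down stresses, this is a measurement toy, not
a synchronization primitive.
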
- */ -int __cond_resched_lock(spinlock_t *lock) -{ - int resched = should_resched(PREEMPT_LOCK_OFFSET); - int ret = 0; - - lockdep_assert_held(lock); - - if (spin_needbreak(lock) || resched) { - spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else - cpu_relax(); - ret = 1; - spin_lock(lock); - } - return ret; -} -EXPORT_SYMBOL(__cond_resched_lock); - -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - -/** - * yield - yield the current processor to other threads. - * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. - * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! - * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} -EXPORT_SYMBOL(yield); - -/** - * yield_to - yield the current processor to another thread in - * your thread group, or accelerate that thread toward the - * processor it's on. - * @p: target task - * @preempt: whether task preemption is allowed or not - * - * It's the caller's job to ensure that the target task struct - * can't go away on us before we can do any checks. - * - * Return: - * true (>0) if we indeed boosted the target task. - * false (0) if we failed to boost the target. - * -ESRCH if there's no task to yield to. - */ -int __sched yield_to(struct task_struct *p, bool preempt) -{ - struct rq *rq, *p_rq; - unsigned long flags; - int yielded = 0; - - rq = this_rq(); - grq_lock_irqsave(&flags); - if (task_running(p) || p->state) { - yielded = -ESRCH; - goto out_unlock; - } - - p_rq = task_rq(p); - yielded = 1; - if (p->deadline > rq->rq_deadline) - p->deadline = rq->rq_deadline; - p->time_slice += rq->rq_time_slice; - rq->rq_time_slice = 0; - if (p->time_slice > timeslice()) - p->time_slice = timeslice(); - if (preempt && rq != p_rq) - resched_curr(p_rq); -out_unlock: - grq_unlock_irqrestore(&flags); - - if (yielded > 0) - schedule(); - return yielded; -} -EXPORT_SYMBOL_GPL(yield_to); - -/* - * This task is about to go to sleep on IO. Increment rq->nr_iowait so - * that process accounting knows that this is a task in IO wait state. 
- * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) - */ - -long __sched io_schedule_timeout(long timeout) -{ - int old_iowait = current->in_iowait; - struct rq *rq; - long ret; - - current->in_iowait = 1; - blk_schedule_flush_plug(current); - - delayacct_blkio_start(); - rq = raw_rq(); - atomic_inc(&rq->nr_iowait); - ret = schedule_timeout(timeout); - current->in_iowait = old_iowait; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); - - return ret; -} -EXPORT_SYMBOL(io_schedule_timeout); - -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the maximum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_ISO: - case SCHED_IDLEPRIO: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the minimum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_ISO: - case SCHED_IDLEPRIO: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) -{ - struct task_struct *p; - unsigned int time_slice; - unsigned long flags; - int retval; - struct timespec t; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - grq_lock_irqsave(&flags); - time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); - - rcu_read_unlock(); - t = ns_to_timespec(time_slice); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - -void sched_show_task(struct task_struct *p) -{ - unsigned long free = 0; - int ppid; - unsigned long state = p->state; - - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - free = stack_not_used(p); -#endif - ppid = 0; - rcu_read_lock(); - if (pid_alive(p)) - ppid = task_pid_nr(rcu_dereference(p->real_parent)); - rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); - - print_worker_info(KERN_INFO, p); - show_stack(p, NULL); -} - -void show_state_filter(unsigned long state_filter) -{ - struct task_struct *g, *p; - -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif - rcu_read_lock(); - for_each_process_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take a lot of time: - * Also, reset softlockup watchdogs on all CPUs, because - * another CPU might be blocked waiting for us to process - * an IPI. - */ - touch_nmi_watchdog(); - touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) - sched_show_task(p); - } - - rcu_read_unlock(); - /* - * Only show locks if all tasks are dumped: - */ - if (!state_filter) - debug_show_all_locks(); -} - -void dump_cpu_task(int cpu) -{ - pr_info("Task dump for CPU %d:\n", cpu); - sched_show_task(cpu_curr(cpu)); -} - -#ifdef CONFIG_SMP -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -{ - cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); -} - -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - cpumask_copy(tsk_cpus_allowed(p), new_mask); - if (needs_other_cpu(p, task_cpu(p))) - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); -} -#endif - -/** - * init_idle - set up an idle thread for a given CPU - * @idle: task in question - * @cpu: cpu the idle task belongs to - * - * NOTE: this function does not set the idle thread's NEED_RESCHED - * flag, to make booting more robust. - */ -void init_idle(struct task_struct *idle, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&idle->pi_lock, flags); - time_lock_grq(rq); - idle->last_ran = rq->clock_task; - idle->state = TASK_RUNNING; - /* Setting prio to illegal value shouldn't matter when never queued */ - idle->prio = PRIO_LIMIT; - - kasan_unpoison_task_stack(idle); - -#ifdef CONFIG_SMP - /* - * It's possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialisation. - */ - set_cpus_allowed_common(idle, cpumask_of(cpu)); -#ifdef CONFIG_SMT_NICE - idle->smt_bias = 0; -#endif -#endif - set_rq_task(rq, idle); - - /* Silence PROVE_RCU */ - rcu_read_lock(); - set_task_cpu(idle, cpu); - rcu_read_unlock(); - - rq->curr = rq->idle = idle; - idle->on_cpu = 1; - grq_unlock(); - raw_spin_unlock_irqrestore(&idle->pi_lock, flags); - - /* Set the preempt count _outside_ the spinlocks! 
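
The one-letter state that sched_show_task() derives from p->state via
TASK_STATE_TO_CHAR_STR is the same letter userspace sees in field 3 of
/proc/<pid>/stat; a tiny reader, illustrative only:

  #include <stdio.h>

  int main(int argc, char **argv)
  {
          char path[64], comm[64], state;
          int pid;
          FILE *f;

          snprintf(path, sizeof(path), "/proc/%s/stat",
                   argc > 1 ? argv[1] : "self");
          f = fopen(path, "r");
          if (!f) {
                  perror(path);
                  return 1;
          }
          /* format: pid (comm) state ... */
          if (fscanf(f, "%d (%63[^)]) %c", &pid, comm, &state) == 3)
                  printf("%s: state %c\n", comm, state);
          fclose(f);
          return 0;
  }
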
*/ - init_idle_preempt_count(idle, cpu); - - ftrace_graph_init_idle_task(idle, cpu); -#ifdef CONFIG_SMP - sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif -} - -int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, - const struct cpumask __maybe_unused *trial) -{ - return 1; -} - -int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) -{ - int ret = 0; - - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. - */ - if (p->flags & PF_NO_SETAFFINITY) - ret = -EINVAL; - - return ret; -} - -void wake_q_add(struct wake_q_head *head, struct task_struct *task) -{ - struct wake_q_node *node = &task->wake_q; - - /* - * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the - * wakeup due to that. - * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). - */ - if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) - return; - - get_task_struct(task); - - /* - * The head is context local, there can be no concurrency. - */ - *head->lastp = node; - head->lastp = &node->next; -} - -void wake_up_q(struct wake_q_head *head) -{ - struct wake_q_node *node = head->first; - - while (node != WAKE_Q_TAIL) { - struct task_struct *task; - - task = container_of(node, struct task_struct, wake_q); - BUG_ON(!task); - /* task can safely be re-inserted now */ - node = node->next; - task->wake_q.next = NULL; - - /* - * wake_up_process() implies a wmb() to pair with the queueing - * in wake_q_add() so as not to miss wakeups. - */ - wake_up_process(task); - put_task_struct(task); - } -} - -void resched_cpu(int cpu) -{ - unsigned long flags; - - grq_lock_irqsave(&flags); - resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); -} - -#ifdef CONFIG_SMP -#ifdef CONFIG_NO_HZ_COMMON -void nohz_balance_enter_idle(int cpu) -{ -} - -void select_nohz_load_balancer(int stop_tick) -{ -} - -void set_cpu_sd_state_idle(void) {} -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu: The cpu whose lowest level of sched domain is to - * be returned. - * @flag: The flag to check for the lowest sched_domain - * for the given cpu. - * - * Returns the lowest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) - if (sd && (sd->flags & flag)) - break; - - return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu: The cpu whose domains we're iterating over. - * @sd: variable holding the value of the power_savings_sd - * for cpu. - * @flag: The flag to filter the sched_domains to be iterated. - * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. 
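
The cmpxchg() in wake_q_add() above is the whole trick: flip ->next from NULL
to a sentinel exactly once, so a task can only ever sit on one wake queue.
The same idea in portable C11 atomics, as a userspace sketch (not kernel
code):

  #include <stdatomic.h>
  #include <stdio.h>

  struct node {
          _Atomic(struct node *) next;
  };

  #define TAIL ((struct node *)1)         /* plays the role of WAKE_Q_TAIL */

  /* Returns 1 if this caller queued the node, 0 if it was already queued. */
  static int claim(struct node *n)
  {
          struct node *expect = NULL;

          /* Exactly one CAS can move next from NULL to the sentinel. */
          return atomic_compare_exchange_strong(&n->next, &expect, TAIL);
  }

  int main(void)
  {
          struct node n = { .next = NULL };

          printf("first claim: %d\n", claim(&n));   /* 1 */
          printf("second claim: %d\n", claim(&n));  /* 0 */
          return 0;
  }
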
- */
-#define for_each_flag_domain(cpu, sd, flag) \
- for (sd = lowest_flag_domain(cpu, flag); \
- (sd && (sd->flags & flag)); sd = sd->parent)
-
-#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-
-/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu. This is good for power-savings.
- *
- * We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be uptodate wrt jiffies etc).
- */
-int get_nohz_timer_target(void)
-{
- int i, cpu = smp_processor_id();
- struct sched_domain *sd;
-
- if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
- return cpu;
-
- rcu_read_lock();
- for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (cpu == i)
- continue;
-
- if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
- cpu = i;
- goto unlock;
- }
- }
- }
-
- if (!is_housekeeping_cpu(cpu))
- cpu = housekeeping_any_cpu();
-unlock:
- rcu_read_unlock();
- return cpu;
-}
-
-/*
- * When add_timer_on() enqueues a timer into the timer wheel of an
- * idle CPU then this timer might expire before the next timer event
- * which is scheduled to wake up that CPU. In case of a completely
- * idle system the next event might even be infinite time into the
- * future. wake_up_idle_cpu() ensures that the CPU is woken up and
- * leaves the inner idle loop so the newly added timer is taken into
- * account when the CPU goes back to idle and evaluates the timer
- * wheel for the next timer event.
- */
-void wake_up_idle_cpu(int cpu)
-{
- if (cpu == smp_processor_id())
- return;
-
- set_tsk_need_resched(cpu_rq(cpu)->idle);
- smp_send_reschedule(cpu);
-}
-
-void wake_up_nohz_cpu(int cpu)
-{
- wake_up_idle_cpu(cpu);
-}
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
-{
- const struct cpumask *cpu_valid_mask = cpu_active_mask;
- bool running_wrong = false;
- struct cpumask old_mask;
- bool queued = false;
- unsigned long flags;
- struct rq *rq;
- int ret = 0;
-
- rq = task_grq_lock(p, &flags);
-
- if (p->flags & PF_KTHREAD) {
- /*
- * Kernel threads are allowed on online && !active CPUs
- */
- cpu_valid_mask = cpu_online_mask;
- }
-
- /*
- * Must re-check here, to close a race against __kthread_bind(),
- * sched_setaffinity() is not guaranteed to observe the flag.
- */
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- goto out;
- }
-
- cpumask_copy(&old_mask, tsk_cpus_allowed(p));
- if (cpumask_equal(&old_mask, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- queued = task_queued(p);
-
- do_set_cpus_allowed(p, new_mask);
-
- if (p->flags & PF_KTHREAD) {
- /*
- * For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-cpu threads.
- */
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
- !cpumask_intersects(new_mask, cpu_active_mask) &&
- tsk_nr_cpus_allowed(p) != 1);
- }
-
- /* Can the task run on the task's current CPU?
If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; - - if (task_running(p)) { - /* Task is running on the wrong cpu now, reschedule it. */ - if (rq == this_rq()) { - set_tsk_need_resched(p); - running_wrong = true; - } else - resched_task(p); - } else - set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); - -out: - if (queued && !cpumask_subset(new_mask, &old_mask)) - try_preempt(p, rq); - if (running_wrong) - preempt_disable(); - task_grq_unlock(p, &flags); - - if (running_wrong) { - __schedule(true); - preempt_enable(); - } - - return ret; -} - -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - return __set_cpus_allowed_ptr(p, new_mask, false); -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -static bool sched_smp_initialized __read_mostly; - -#ifdef CONFIG_HOTPLUG_CPU -/* Run through task list and find tasks affined to the dead cpu, then remove - * that cpu from the list, enable cpu0 and set the zerobound flag. */ -static void bind_zero(int src_cpu) -{ - struct task_struct *p, *t; - int bound = 0; - - if (src_cpu == 0) - return; - - do_each_thread(t, p) { - if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) { - cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p)); - cpumask_set_cpu(0, tsk_cpus_allowed(p)); - p->zerobound = true; - bound++; - if (task_cpu(p) == src_cpu) { - set_task_cpu(p, 0); - if (task_running(p)) - resched_task(p); - } - } - } while_each_thread(t, p); - - if (bound) { - printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", - bound, src_cpu); - } -} - -/* Find processes with the zerobound flag and reenable their affinity for the - * CPU coming alive. */ -static void unbind_zero(int src_cpu) -{ - int unbound = 0, zerobound = 0; - struct task_struct *p, *t; - - if (src_cpu == 0) - return; - - do_each_thread(t, p) { - if (!p->mm) - p->zerobound = false; - if (p->zerobound) { - unbound++; - cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p)); - /* Once every CPU affinity has been re-enabled, remove - * the zerobound flag */ - if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) { - p->zerobound = false; - zerobound++; - } - } - } while_each_thread(t, p); - - if (unbound) { - printk(KERN_INFO "Added affinity for %d processes to cpu %d\n", - unbound, src_cpu); - } - if (zerobound) { - printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n", - zerobound); - } -} - -/* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. - */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) { - switch_mm_irqs_off(mm, &init_mm, current); - finish_arch_post_lock_switch(); - } - mmdrop(mm); -} -#else /* CONFIG_HOTPLUG_CPU */ -static void unbind_zero(int src_cpu) {} -#endif /* CONFIG_HOTPLUG_CPU */ - -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param stop_param = { .sched_priority = STOP_PRIO }; - struct sched_param start_param = { .sched_priority = 0 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. 
- */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling policy so that - * it can die in pieces. - */ - sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); - } -} - -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -#define CPU_LOAD_IDX_MAX 5 -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table 
*sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#endif /* CONFIG_SYSCTL */ - -static void set_rq_online(struct rq *rq) -{ - if (!rq->online) { - cpumask_set_cpu(cpu_of(rq), rq->rd->online); - rq->online = true; - } -} - -static void set_rq_offline(struct rq *rq) -{ - if (rq->online) { - int cpu = cpu_of(rq); - - cpumask_clear_cpu(cpu, rq->rd->online); - rq->online = false; - clear_cpuidle_map(cpu); - } -} - -static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ - -#ifdef CONFIG_SCHED_DEBUG - -static __read_mostly int sched_debug_enabled; - -static int __init sched_debug_setup(char *str) -{ - sched_debug_enabled = 1; - - return 0; -} -early_param("sched_debug", sched_debug_setup); - -static inline bool sched_debug(void) -{ - return sched_debug_enabled; -} - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - struct cpumask *groupmask) -{ - cpumask_clear(groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %*pbl level %s\n", - cpumask_pr_args(sched_domain_span(sd)), sd->name); - - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - - printk(KERN_CONT "\n"); - - if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && - !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - int level = 0; - - if (!sched_debug_enabled) - return; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } -} -#else /* !CONFIG_SCHED_DEBUG */ -# define sched_domain_debug(sd, cpu) do { } while (0) 
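
On kernels built with CONFIG_SCHED_DEBUG, the tables that
register_sched_domain_sysctl() above builds surface as
/proc/sys/kernel/sched_domain/cpuN/domainM/. A quick way to eyeball the
resulting tree from userspace, illustrative only:

  #include <dirent.h>
  #include <stdio.h>

  int main(void)
  {
          const char *base = "/proc/sys/kernel/sched_domain";
          struct dirent *e;
          DIR *d = opendir(base);

          if (!d) {
                  perror(base);   /* absent without CONFIG_SCHED_DEBUG */
                  return 1;
          }
          while ((e = readdir(d)))
                  if (e->d_name[0] != '.')
                          printf("%s/%s\n", base, e->d_name);  /* cpu0, cpu1, ... */
          closedir(d);
          return 0;
  }
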
-static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_AFFINE)) - return 0; - - return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) - return 0; - - if (~cflags & pflags) - return 0; - - return 1; -} - -static void free_rootdomain(struct rcu_head *rcu) -{ - struct root_domain *rd = container_of(rcu, struct root_domain, rcu); - - cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); - free_cpumask_var(rd->online); - free_cpumask_var(rd->span); - kfree(rd); -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - struct root_domain *old_rd = NULL; - unsigned long flags; - - grq_lock_irqsave(&flags); - - if (rq->rd) { - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - - /* - * If we dont want to free the old_rd yet then - * set old_rd to NULL to skip the freeing later - * in this function: - */ - if (!atomic_dec_and_test(&old_rd->refcount)) - old_rd = NULL; - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - - grq_unlock_irqrestore(&flags); - - if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); -} - -static int init_rootdomain(struct root_domain *rd) -{ - memset(rd, 0, sizeof(*rd)); - - if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto out; - if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) - goto free_span; - if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) - goto free_online; - - if (cpupri_init(&rd->cpupri) != 0) - goto free_rto_mask; - return 0; - -free_rto_mask: - free_cpumask_var(rd->rto_mask); -free_online: - free_cpumask_var(rd->online); -free_span: - free_cpumask_var(rd->span); -out: - return -ENOMEM; -} - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain); - - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - if (init_rootdomain(rd) != 0) { - kfree(rd); - return NULL; - } - - return rd; -} - -static void free_sched_domain(struct rcu_head *rcu) -{ - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - - kfree(sd); -} - -static void destroy_sched_domain(struct sched_domain *sd, int cpu) -{ - call_rcu(&sd->rcu, free_sched_domain); -} - -static void destroy_sched_domains(struct sched_domain *sd, int cpu) -{ - for (; sd; sd = sd->parent) - destroy_sched_domain(sd, cpu); -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. - */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. 
*/ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - /* - * Transfer SD_PREFER_SIBLING down in case of a - * degenerate parent; the spans match for this - * so the property transfers. - */ - if (parent->flags & SD_PREFER_SIBLING) - tmp->flags |= SD_PREFER_SIBLING; - destroy_sched_domain(parent, cpu); - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - tmp = sd; - sd = sd->parent; - destroy_sched_domain(tmp, cpu); - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); -} - -/* Setup the mask of cpus configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); - return 0; - } - return 1; -} - -__setup("isolcpus=", isolated_cpu_setup); - -struct s_data { - struct sched_domain ** __percpu sd; - struct root_domain *rd; -}; - -enum s_alloc { - sa_rootdomain, - sa_sd, - sa_sd_storage, - sa_none, -}; - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure in build_sched_domains() - */ - -static int default_relax_domain_level = -1; -int sched_domain_level_max; - -static int __init setup_relax_domain_level(char *str) -{ - if (kstrtoint(str, 0, &default_relax_domain_level)) - pr_warn("Unable to set relax_domain_level\n"); - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* turn off idle balance on this domain */ - sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* turn on idle balance on this domain */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } -} - -static void __sdt_free(const struct cpumask *cpu_map); -static int __sdt_alloc(const struct cpumask *cpu_map); - -static void __free_domain_allocs(struct s_data *d, enum s_alloc what, - const struct cpumask *cpu_map) -{ - switch (what) { - case sa_rootdomain: - if (!atomic_read(&d->rd->refcount)) - free_rootdomain(&d->rd->rcu); /* fall through */ - case sa_sd: - free_percpu(d->sd); /* fall through */ - case sa_sd_storage: - __sdt_free(cpu_map); /* fall through */ - case sa_none: - break; - } -} - -static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, - const struct cpumask *cpu_map) -{ - memset(d, 0, sizeof(*d)); - - if (__sdt_alloc(cpu_map)) - return sa_sd_storage; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) - return sa_sd_storage; - d->rd = alloc_rootdomain(); - if (!d->rd) - return sa_sd; - return sa_rootdomain; -} - -/* - * NULL the sd_data elements we've used to build the sched_domain - * structure so that the subsequent __free_domain_allocs() - * will not free the data we're using. 
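
The cpu_isolated_map filled in by the isolcpus= handler above can also be
checked from userspace: newer kernels (the sysfs file postdates this tree)
export it as /sys/devices/system/cpu/isolated. Sketch, illustrative only:

  #include <stdio.h>

  int main(void)
  {
          char buf[256];
          FILE *f = fopen("/sys/devices/system/cpu/isolated", "r");

          if (!f) {
                  perror("cpu/isolated");  /* not there on older kernels */
                  return 1;
          }
          if (fgets(buf, sizeof(buf), f))
                  printf("isolated CPUs: %s", buf);  /* e.g. "2-3\n", empty if none */
          fclose(f);
          return 0;
  }
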
- */ -static void claim_allocations(int cpu, struct sched_domain *sd) -{ - struct sd_data *sdd = sd->private; - - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; -} - -#ifdef CONFIG_NUMA -static int sched_domains_numa_levels; -static int *sched_domains_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; -#endif - -/* - * SD_flags allowed in topology descriptions. - * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * - * Odd one out: - * SD_ASYM_PACKING - describes SMT quirks - */ -#define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ - SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) - -static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) -{ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int sd_weight, sd_flags = 0; - -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); - - if (tl->sd_flags) - sd_flags = (*tl->sd_flags)(); - if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; - - *sd = (struct sched_domain){ - .min_interval = sd_weight, - .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, - - .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, - - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE - | 1*SD_BALANCE_EXEC - | 1*SD_BALANCE_FORK - | 0*SD_BALANCE_WAKE - | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUCAPACITY - | 0*SD_SHARE_PKG_RESOURCES - | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING - | 0*SD_NUMA - | sd_flags - , - - .last_balance = jiffies, - .balance_interval = sd_weight, - .smt_gain = 0, - .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, -#ifdef CONFIG_SCHED_DEBUG - .name = tl->name, -#endif - }; - - /* - * Convert topological properties into behaviour. - */ - - if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; - sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ - - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->imbalance_pct = 117; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - -#ifdef CONFIG_NUMA - } else if (sd->flags & SD_NUMA) { - sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; - - sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { - sd->flags &= ~(SD_BALANCE_EXEC | - SD_BALANCE_FORK | - SD_WAKE_AFFINE); - } - -#endif - } else { - sd->flags |= SD_PREFER_SIBLING; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; - } - - sd->private = &tl->data; - - return sd; -} - -/* - * Topology list, bottom-up. 
- */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = - default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) - -void set_sched_topology(struct sched_domain_topology_level *tl) -{ - sched_domain_topology = tl; -} - -#ifdef CONFIG_NUMA - -static const struct cpumask *sd_numa_mask(int cpu) -{ - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; -} - -static void sched_numa_warn(const char *str) -{ - static int done = false; - int i,j; - - if (done) - return; - - done = true; - - printk(KERN_WARNING "ERROR: %s\n\n", str); - - for (i = 0; i < nr_node_ids; i++) { - printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); - printk(KERN_CONT "\n"); - } - printk(KERN_WARNING "\n"); -} - -static bool find_numa_distance(int distance) -{ - int i; - - if (distance == node_distance(0, 0)) - return true; - - for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; - } - - return false; -} - -static void sched_init_numa(void) -{ - int next_distance, curr_distance = node_distance(0, 0); - struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the - * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. - */ - next_distance = curr_distance; - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); - } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; - } - - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; - } - /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). - * - * The sched_domains_numa_distance[] array includes the actual distance - * numbers. - */ - - /* - * Here, we should temporarily reset sched_domains_numa_levels to 0. - * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be - * dangerous when we use it to iterate array sched_domains_numa_masks[][] - * in other functions. - * - * We reset it to 'level' at the end of this function. 
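
The distance table that sched_init_numa() deduplicates is the same one
exposed per node under sysfs. A sketch that reads node 0's row and counts the
unique values (the "levels"), assuming a NUMA-enabled kernel, illustrative
only:

  #include <stdio.h>

  int main(void)
  {
          int d, seen[64], n = 0, i, j, uniq = 0;
          FILE *f = fopen("/sys/devices/system/node/node0/distance", "r");

          if (!f) {
                  perror("node0/distance");
                  return 1;
          }
          while (n < 64 && fscanf(f, "%d", &d) == 1)
                  seen[n++] = d;  /* one distance per node, 10 = local */
          fclose(f);

          for (i = 0; i < n; i++) {       /* count distinct distances */
                  for (j = 0; j < i; j++)
                          if (seen[j] == seen[i])
                                  break;
                  if (j == i)
                          uniq++;
          }
          printf("%d node(s), %d distance level(s)\n", n, uniq);
          return 0;
  }
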
- */ - sched_domains_numa_levels = 0; - - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); - if (!sched_domains_numa_masks) - return; - - /* - * Now for each level, construct a mask per node which contains all - * cpus of nodes that are that many hops away from us. - */ - for (i = 0; i < level; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) - return; - - for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!mask) - return; - - sched_domains_numa_masks[i][j] = mask; - - for_each_node(k) { - if (node_distance(j, k) > sched_domains_numa_distance[i]) - continue; - - cpumask_or(mask, mask, cpumask_of_node(k)); - } - } - } - - /* Compute default topology size */ - for (i = 0; sched_domain_topology[i].mask; i++); - - tl = kzalloc((i + level + 1) * - sizeof(struct sched_domain_topology_level), GFP_KERNEL); - if (!tl) - return; - - /* - * Copy the default topology bits.. - */ - for (i = 0; sched_domain_topology[i].mask; i++) - tl[i] = sched_domain_topology[i]; - - /* - * .. and append 'j' levels of NUMA goodness. - */ - for (j = 0; j < level; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; - } - - sched_domain_topology = tl; - - sched_domains_numa_levels = level; -} - -static void sched_domains_numa_masks_set(int cpu) -{ - int node = cpu_to_node(cpu); - int i, j; - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) { - if (node_distance(j, node) <= sched_domains_numa_distance[i]) - cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); - } - } -} - -static void sched_domains_numa_masks_clear(int cpu) -{ - int i, j; - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); - } -} - -#else -static inline void sched_init_numa(void) { } -static void sched_domains_numa_masks_set(unsigned int cpu) { } -static void sched_domains_numa_masks_clear(unsigned int cpu) { } -#endif /* CONFIG_NUMA */ - -static int __sdt_alloc(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - sdd->sd = alloc_percpu(struct sched_domain *); - if (!sdd->sd) - return -ENOMEM; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sd) - return -ENOMEM; - - *per_cpu_ptr(sdd->sd, j) = sd; - } - } - - return 0; -} - -static void __sdt_free(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - - if (sdd->sd) { - sd = *per_cpu_ptr(sdd->sd, j); - kfree(*per_cpu_ptr(sdd->sd, j)); - } - } - free_percpu(sdd->sd); - sdd->sd = NULL; - } -} - -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) -{ - struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; - - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - if (child) { - sd->level = child->level + 1; - sched_domain_level_max = max(sched_domain_level_max, sd->level); - 
child->parent = sd; - sd->child = child; - - if (!cpumask_subset(sched_domain_span(child), - sched_domain_span(sd))) { - pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG - pr_err(" the %s domain not a subset of the %s domain\n", - child->name, sd->name); -#endif - /* Fixup, ensure @sd has at least @child cpus. */ - cpumask_or(sched_domain_span(sd), - sched_domain_span(sd), - sched_domain_span(child)); - } - - } - set_domain_attribute(sd, attr); - - return sd; -} - -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) -{ - enum s_alloc alloc_state; - struct sched_domain *sd; - struct s_data d; - int i, ret = -ENOMEM; - - alloc_state = __visit_domain_allocation_hell(&d, cpu_map); - if (alloc_state != sa_rootdomain) - goto error; - - /* Set up domains for cpus specified by the cpu_map. */ - for_each_cpu(i, cpu_map) { - struct sched_domain_topology_level *tl; - - sd = NULL; - for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); - if (tl == sched_domain_topology) - *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; - } - } - - /* Calculate CPU capacity for physical packages and nodes */ - for (i = nr_cpumask_bits-1; i >= 0; i--) { - if (!cpumask_test_cpu(i, cpu_map)) - continue; - - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); - } - } - - /* Attach the domains */ - rcu_read_lock(); - for_each_cpu(i, cpu_map) { - sd = *per_cpu_ptr(d.sd, i); - cpu_attach_domain(sd, d.rd, i); - } - rcu_read_unlock(); - - ret = 0; -error: - __free_domain_allocs(&d, alloc_state, cpu_map); - return ret; -} - -static cpumask_var_t *doms_cur; /* current sched domains */ -static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - /* attribues of custom domains in 'doms_cur' */ - -/* - * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask) fails, then fallback to a single sched domain, - * as determined by the single cpumask fallback_doms. - */ -static cpumask_var_t fallback_doms; - -/* - * arch_update_cpu_topology lets virtualized architectures update the - * cpu core maps. It is supposed to return 1 if the topology changed - * or 0 if it stayed the same. - */ -int __weak arch_update_cpu_topology(void) -{ - return 0; -} - -cpumask_var_t *alloc_sched_domains(unsigned int ndoms) -{ - int i; - cpumask_var_t *doms; - - doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); - if (!doms) - return NULL; - for (i = 0; i < ndoms; i++) { - if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { - free_sched_domains(doms, i); - return NULL; - } - } - return doms; -} - -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) -{ - unsigned int i; - for (i = 0; i < ndoms; i++) - free_cpumask_var(doms[i]); - kfree(doms); -} - -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. 
- */ -static int init_sched_domains(const struct cpumask *cpu_map) -{ - int err; - - arch_update_cpu_topology(); - ndoms_cur = 1; - doms_cur = alloc_sched_domains(ndoms_cur); - if (!doms_cur) - doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); - - return err; -} - -/* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain - */ -static void detach_destroy_domains(const struct cpumask *cpu_map) -{ - int i; - - rcu_read_lock(); - for_each_cpu(i, cpu_map) - cpu_attach_domain(NULL, &def_root_domain, i); - rcu_read_unlock(); -} - -/* handle null as "default" */ -static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, - struct sched_domain_attr *new, int idx_new) -{ - struct sched_domain_attr tmp; - - /* fast path */ - if (!new && !cur) - return 1; - - tmp = SD_ATTR_INIT; - return !memcmp(cur ? (cur + idx_cur) : &tmp, - new ? (new + idx_new) : &tmp, - sizeof(struct sched_domain_attr)); -} - -/* - * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares - * doms_new[] to the current sched domain partitioning, doms_cur[]. - * It destroys each deleted domain and builds each new domain. - * - * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the - * current 'doms_cur' domains and in the new 'doms_new', we can leave - * it as it is. - * - * The passed in 'doms_new' should be allocated using - * alloc_sched_domains. This routine takes ownership of it and will - * free_sched_domains it when done with it. If the caller failed the - * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. - * - * If doms_new == NULL it will be replaced with cpu_online_mask. - * ndoms_new == 0 is a special case for destroying existing domains, - * and it will not create the default domain. - * - * Call with hotplug lock held - */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - int i, j, n; - int new_topology; - - mutex_lock(&sched_domains_mutex); - - /* always unregister in case we don't destroy any domains */ - unregister_sched_domain_sysctl(); - - /* Let architecture update cpu core mappings. */ - new_topology = arch_update_cpu_topology(); - - n = doms_new ? 
ndoms_new : 0; - - /* Destroy deleted domains */ - for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) - goto match1; - } - /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur[i]); -match1: - ; - } - - n = ndoms_cur; - if (doms_new == NULL) { - n = 0; - doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); - } - - /* Build new domains */ - for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) - goto match2; - } - /* no match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); -match2: - ; - } - - /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) - free_sched_domains(doms_cur, ndoms_cur); - kfree(dattr_cur); /* kfree(NULL) is safe */ - doms_cur = doms_new; - dattr_cur = dattr_new; - ndoms_cur = ndoms_new; - - register_sched_domain_sysctl(); - - mutex_unlock(&sched_domains_mutex); -} - -static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ - -/* - * Update cpusets according to cpu_active mask. If cpusets are - * disabled, cpuset_update_active_cpus() becomes a simple wrapper - * around partition_sched_domains(). - * - * If we come here as part of a suspend/resume, don't touch cpusets because we - * want to restore it back to its original state upon resume anyway. - */ -static void cpuset_cpu_active(void) -{ - if (cpuhp_tasks_frozen) { - /* - * num_cpus_frozen tracks how many CPUs are involved in suspend - * resume sequence. As long as this is not the last online - * operation in the resume sequence, just build a single sched - * domain, ignoring cpusets. - */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); - return; - } - /* - * This is the last CPU online operation. So fall through and - * restore the original sched domains by considering the - * cpuset configurations. - */ - } - - cpuset_update_active_cpus(true); -} - -static int cpuset_cpu_inactive(unsigned int cpu) -{ - if (!cpuhp_tasks_frozen) { - cpuset_update_active_cpus(false); - } else { - num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); - } - return 0; -} - -int sched_cpu_activate(unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - set_cpu_active(cpu, true); - - if (sched_smp_initialized) { - sched_domains_numa_masks_set(cpu); - cpuset_cpu_active(); - } - - /* - * Put the rq online, if not already. This happens: - * - * 1) In the early boot process, because we build the real domains - * after all cpus have been brought up. - * - * 2) At runtime, if cpuset_cpu_active() fails to rebuild the - * domains. - */ - grq_lock_irqsave(&flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); - } - unbind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); - - return 0; -} - -int sched_cpu_deactivate(unsigned int cpu) -{ - int ret; - - set_cpu_active(cpu, false); - /* - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU - * users of this state to go away such that all new such users will - * observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. 
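
sched_cpu_activate()/sched_cpu_deactivate() above run as part of the hotplug
state machine that writing /sys/devices/system/cpu/cpuN/online drives. A
root-only sketch that cycles CPU 1, illustrative only:

  #include <stdio.h>

  static int set_online(int cpu, int on)
  {
          char path[64];
          FILE *f;

          snprintf(path, sizeof(path),
                   "/sys/devices/system/cpu/cpu%d/online", cpu);
          f = fopen(path, "w");
          if (!f) {
                  perror(path);   /* needs root; cpu0 may lack the file */
                  return -1;
          }
          fprintf(f, "%d\n", on);
          fclose(f);
          return 0;
  }

  int main(void)
  {
          if (set_online(1, 0) == 0) {    /* offline CPU 1 ... */
                  puts("cpu1 offline");
                  set_online(1, 1);       /* ... and bring it back */
          }
          return 0;
  }
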
- * - * Do sync before park smpboot threads to take care the rcu boost case. - */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); - - if (!sched_smp_initialized) - return 0; - - ret = cpuset_cpu_inactive(cpu); - if (ret) { - set_cpu_active(cpu, true); - return ret; - } - sched_domains_numa_masks_clear(cpu); - return 0; -} - -int sched_cpu_starting(unsigned int __maybe_unused cpu) -{ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -int sched_cpu_dying(unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - grq_lock_irqsave(&flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - bind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); - - return 0; -} -#endif - -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) -/* - * Cheaper version of the below functions in case support for SMT and MC is - * compiled in but CPUs have no siblings. - */ -static bool sole_cpu_idle(struct rq *rq) -{ - return rq_idle(rq); -} -#endif -#ifdef CONFIG_SCHED_SMT -static const cpumask_t *thread_cpumask(int cpu) -{ - return topology_sibling_cpumask(cpu); -} -/* All this CPU's SMT siblings are idle */ -static bool siblings_cpu_idle(struct rq *rq) -{ - return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map); -} -#endif -#ifdef CONFIG_SCHED_MC -static const cpumask_t *core_cpumask(int cpu) -{ - return topology_core_cpumask(cpu); -} -/* All this CPU's shared cache siblings are idle */ -static bool cache_cpu_idle(struct rq *rq) -{ - return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map); -} -#endif - -enum sched_domain_level { - SD_LV_NONE = 0, - SD_LV_SIBLING, - SD_LV_MC, - SD_LV_BOOK, - SD_LV_CPU, - SD_LV_NODE, - SD_LV_ALLNODES, - SD_LV_MAX -}; - -void __init sched_init_smp(void) -{ - struct sched_domain *sd; - int cpu, other_cpu; -#ifdef CONFIG_SCHED_SMT - bool smt_threads = false; -#endif - struct rq *rq; - - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - - sched_init_numa(); - - /* - * There's no userspace yet to cause hotplug operations; hence all the - * cpu masks are stable and all blatant races in the below code cannot - * happen. - */ - mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); - mutex_unlock(&sched_domains_mutex); - - /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) - BUG(); - free_cpumask_var(non_isolated_cpus); - - mutex_lock(&sched_domains_mutex); - grq_lock_irq(); - /* - * Set up the relative cache distance of each online cpu from each - * other in a simple array for quick lookup. Locality is determined - * by the closest sched_domain that CPUs are separated by. CPUs with - * shared cache in SMT and MC are treated as local. Separate CPUs - * (within the same package or physically) within the same node are - * treated as not local. CPUs not even in the same domain (different - * nodes) are treated as very distant. 
- */ - for_each_online_cpu(cpu) { - rq = cpu_rq(cpu); - - /* First check if this cpu is in the same node */ - for_each_domain(cpu, sd) { - if (sd->level > SD_LV_MC) - continue; - /* Set locality to local node if not already found lower */ - for_each_cpu(other_cpu, sched_domain_span(sd)) { - if (rq->cpu_locality[other_cpu] > 3) - rq->cpu_locality[other_cpu] = 3; - } - } - - /* - * Each runqueue has its own function in case it doesn't have - * siblings of its own allowing mixed topologies. - */ -#ifdef CONFIG_SCHED_MC - for_each_cpu(other_cpu, core_cpumask(cpu)) { - if (rq->cpu_locality[other_cpu] > 2) - rq->cpu_locality[other_cpu] = 2; - } - if (cpumask_weight(core_cpumask(cpu)) > 1) { - cpumask_copy(&rq->core_mask, core_cpumask(cpu)); - cpumask_clear_cpu(cpu, &rq->core_mask); - rq->cache_idle = cache_cpu_idle; - } -#endif -#ifdef CONFIG_SCHED_SMT - for_each_cpu(other_cpu, thread_cpumask(cpu)) - rq->cpu_locality[other_cpu] = 1; - if (cpumask_weight(thread_cpumask(cpu)) > 1) { - cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); - cpumask_clear_cpu(cpu, &rq->thread_mask); - rq->siblings_idle = siblings_cpu_idle; - smt_threads = true; - } -#endif - } - for_each_possible_cpu(cpu) { - int total_cpus = 0, locality; - - rq = cpu_rq(cpu); - for (locality = 0; locality <= 4; locality++) { - for_each_possible_cpu(other_cpu) { - if (rq->cpu_locality[other_cpu] == locality) - rq->rq_order[total_cpus++] = cpu_rq(other_cpu); - } - } - } -#ifdef CONFIG_SMT_NICE - if (smt_threads) { - check_siblings = &check_smt_siblings; - wake_siblings = &wake_smt_siblings; - smt_schedule = &smt_should_schedule; - } -#endif - grq_unlock_irq(); - mutex_unlock(&sched_domains_mutex); - - for_each_online_cpu(cpu) { - rq = cpu_rq(cpu); - - for_each_online_cpu(other_cpu) { - if (other_cpu <= cpu) - continue; - printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); - } - } - sched_smp_initialized = true; -} -#else -void __init sched_init_smp(void) -{ -} -#endif /* CONFIG_SMP */ - -int in_sched_functions(unsigned long addr) -{ - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -#ifdef CONFIG_CGROUP_SCHED -/* task group related information */ -struct task_group { - struct cgroup_subsys_state css; - - struct rcu_head rcu; - struct list_head list; - - struct task_group *parent; - struct list_head siblings; - struct list_head children; -}; - -/* - * Default task group. - * Every task in system belongs to this group at bootup. 
- */ -struct task_group root_task_group; -LIST_HEAD(task_groups); - -/* Cacheline aligned slab cache for task_group */ -static struct kmem_cache *task_group_cache __read_mostly; -#endif /* CONFIG_CGROUP_SCHED */ - -void __init sched_init(void) -{ -#ifdef CONFIG_SMP - int cpu_ids; -#endif - int i; - struct rq *rq; - - prio_ratios[0] = 128; - for (i = 1 ; i < NICE_WIDTH ; i++) - prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - - raw_spin_lock_init(&grq.lock); - grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; - grq.niffies = 0; - grq.last_jiffy = jiffies; - raw_spin_lock_init(&grq.iso_lock); - grq.iso_ticks = 0; - grq.iso_refractory = false; - grq.noc = 1; - skiplist_init(&grq.node); - grq.sl = new_skiplist(&grq.node); - skiplist_node_init(&init_task.node); - -#ifdef CONFIG_SMP - init_defrootdomain(); - grq.qnr = grq.idle_cpus = 0; - cpumask_clear(&grq.cpu_idle_map); -#else - uprq = &per_cpu(runqueues, 0); -#endif - -#ifdef CONFIG_CGROUP_SCHED - task_group_cache = KMEM_CACHE(task_group, 0); - - list_add(&root_task_group.list, &task_groups); - INIT_LIST_HEAD(&root_task_group.children); - INIT_LIST_HEAD(&root_task_group.siblings); -#endif /* CONFIG_CGROUP_SCHED */ - for_each_possible_cpu(i) { - rq = cpu_rq(i); - rq->grq_lock = &grq.lock; - rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = - rq->iowait_pc = rq->idle_pc = 0; - rq->dither = false; -#ifdef CONFIG_SMP - rq->last_niffy = 0; - rq->sd = NULL; - rq->rd = NULL; - rq->online = false; - rq->cpu = i; - rq_attach_root(rq, &def_root_domain); -#endif - atomic_set(&rq->nr_iowait, 0); - } - -#ifdef CONFIG_SMP - cpu_ids = i; - /* - * Set the base locality for cpu cache distance calculation to - * "distant" (3). Make sure the distance from a CPU to itself is 0. - */ - for_each_possible_cpu(i) { - int j; - - rq = cpu_rq(i); -#ifdef CONFIG_SCHED_SMT - rq->siblings_idle = sole_cpu_idle; -#endif -#ifdef CONFIG_SCHED_MC - rq->cache_idle = sole_cpu_idle; -#endif - rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); - for_each_possible_cpu(j) { - if (i == j) - rq->cpu_locality[j] = 0; - else - rq->cpu_locality[j] = 4; - } - rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); - rq->rq_order[0] = rq; - for (j = 1; j < cpu_ids; j++) - rq->rq_order[j] = cpu_rq(j); - } -#endif - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - - /* - * The boot idle thread does lazy MMU switching as well: - */ - atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current); - - /* - * Make us the idle thread. Technically, schedule() should not be - * called from this thread, however somewhere below it might be, - * but because we are the idle thread, we just pick up running again - * when this runqueue becomes "idle". 
- */ - init_idle(current, smp_processor_id()); - -#ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); - idle_thread_set_boot_cpu(); -#endif /* SMP */ - - init_schedstats(); -} - -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = preempt_count() + rcu_preempt_depth(); - - return (nested == preempt_offset); -} - -void __might_sleep(const char *file, int line, int preempt_offset) -{ - /* - * Blocking primitives will set (and therefore destroy) current->state, - * since we will exit with TASK_RUNNING make sure we enter with it, - * otherwise we will destroy state. - */ - WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, - "do not call blocking ops when !TASK_RUNNING; " - "state=%lx set at [<%p>] %pS\n", - current->state, - (void *)current->task_state_change, - (void *)current->task_state_change); - - ___might_sleep(file, line, preempt_offset); -} -EXPORT_SYMBOL(__might_sleep); - -void ___might_sleep(const char *file, int line, int preempt_offset) -{ - static unsigned long prev_jiffy; /* ratelimiting */ - - rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) - return; - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); - - if (task_stack_end_corrupted(current)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); - - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif - dump_stack(); -} -EXPORT_SYMBOL(___might_sleep); -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -static inline void normalise_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; - - read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - /* - * Only normalize user tasks: - */ - if (p->flags & PF_KTHREAD) - continue; - - if (!rt_task(p) && !iso_task(p)) - continue; - - rq = task_grq_lock(p, &flags); - __setscheduler(p, rq, SCHED_NORMAL, 0, false); - task_grq_unlock(p, &flags); - } - read_unlock(&tasklist_lock); -} - -void normalize_rt_tasks(void) -{ - normalise_rt_tasks(); -} -#endif /* CONFIG_MAGIC_SYSRQ */ - -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -/* - * These functions are only useful for the IA64 MCA handling, or kdb. - * - * They can only be called when the whole system has been - * stopped - every CPU needs to be quiescent, and no scheduling - * activity can take place. Using them for anything else would - * be a serious bug, and as a result, they aren't even visible - * under any other configuration. - */ - -/** - * curr_task - return the current task for a given cpu. - * @cpu: the processor in question. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - * - * Return: The current task for @cpu. 
- */ -struct task_struct *curr_task(int cpu) -{ - return cpu_curr(cpu); -} - -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ - -#ifdef CONFIG_IA64 -/** - * set_curr_task - set the current task for a given cpu. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function - * must be called with all CPU's synchronised, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -void set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} -EXPORT_SYMBOL_GPL(task_cputime_adjusted); - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} - -void vtime_account_system_irqsafe(struct task_struct *tsk) -{ - unsigned long flags; - - local_irq_save(flags); - vtime_account_system(tsk); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_system(prev); - - vtime_account_user(prev); - arch_vtime_task_switch(prev); -} -#endif - -#else -/* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) { - u64 tmp = rtime; rtime = stime; stime = tmp; - } - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return (__force cputime_t) scaled; -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. - */ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, stime, utime, total; - - stime = curr->stime; - total = stime + curr->utime; - - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. 
- * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. - * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ - rtime = nsecs_to_cputime(curr->sum_exec_runtime); - - /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. - */ - if (prev->stime + prev->utime >= rtime) - goto out; - - if (total) { - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; - } else { - stime = rtime; - utime = 0; - } - - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); - -out: - *ut = prev->utime; - *st = prev->stime; -} - -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime = { - .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); - cputime_adjust(&cputime, &p->prev_cputime, ut, st); -} -EXPORT_SYMBOL_GPL(task_cputime_adjusted); - -/* - * Must be called with siglock held. - */ -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); -} -#endif - -void init_idle_bootup_task(struct task_struct *idle) -{} - -#ifdef CONFIG_SCHED_DEBUG -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{} - -void proc_sched_set_task(struct task_struct *p) -{} -#endif - -#ifdef CONFIG_SMP -#define SCHED_LOAD_SHIFT (10) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return SCHED_LOAD_SCALE; -} - -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) -{ - unsigned long weight = cpumask_weight(sched_domain_span(sd)); - unsigned long smt_gain = sd->smt_gain; - - smt_gain /= weight; - - return smt_gain; -} -#endif - -#ifdef CONFIG_CGROUP_SCHED -static void sched_free_group(struct task_group *tg) -{ - kmem_cache_free(task_group_cache, tg); -} - -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(struct task_group *parent) -{ - struct task_group *tg; - - tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); - if (!tg) - return ERR_PTR(-ENOMEM); - - return tg; -} - -void sched_online_group(struct task_group *tg, struct task_group *parent) -{ -} - -/* rcu callback to free various structures associated with a task group */ -static void sched_free_group_rcu(struct rcu_head *rhp) -{ - /* now it should be safe to free those cfs_rqs */ - sched_free_group(container_of(rhp, struct task_group, rcu)); -} - -void sched_destroy_group(struct task_group *tg) -{ - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, sched_free_group_rcu); -} - -void sched_offline_group(struct task_group *tg) -{ -} - -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? 
container_of(css, struct task_group, css) : NULL; -} - -static struct cgroup_subsys_state * -cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct task_group *parent = css_tg(parent_css); - struct task_group *tg; - - if (!parent) { - /* This is early initialization for the top cgroup */ - return &root_task_group.css; - } - - tg = sched_create_group(parent); - if (IS_ERR(tg)) - return ERR_PTR(-ENOMEM); - return &tg->css; -} - -static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); -} - -static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - /* - * Relies on the RCU grace period between css_released() and this. - */ - sched_free_group(tg); -} - -static void cpu_cgroup_fork(struct task_struct *task) -{ -} - -static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -{ - return 0; -} - -static void cpu_cgroup_attach(struct cgroup_taskset *tset) -{ -} - -static struct cftype cpu_files[] = { - { } /* terminate */ -}; - -struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = cpu_cgroup_css_alloc, - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .fork = cpu_cgroup_fork, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_files, - .early_init = true, -}; -#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h deleted file mode 100644 index 00a16ba0a..000000000 --- a/kernel/sched/bfs_sched.h +++ /dev/null @@ -1,224 +0,0 @@ -#include -#include -#include - -#ifndef BFS_SCHED_H -#define BFS_SCHED_H - -/* - * This is the main, per-CPU runqueue data structure. - * This data should only be modified by the local cpu. 
- */ -struct rq { - struct task_struct *curr, *idle, *stop; - struct mm_struct *prev_mm; - - /* Pointer to grq spinlock */ - raw_spinlock_t *grq_lock; - - /* Stored data about rq->curr to work outside grq lock */ - u64 rq_deadline; - unsigned int rq_policy; - int rq_time_slice; - u64 rq_last_ran; - int rq_prio; - int soft_affined; /* Running or queued tasks with this set as their rq */ - u64 load_update; /* When we last updated load */ - unsigned long load_avg; /* Rolling load average */ -#ifdef CONFIG_SMT_NICE - struct mm_struct *rq_mm; - int rq_smt_bias; /* Policy/nice level bias across smt siblings */ -#endif - /* Accurate timekeeping data */ - u64 timekeep_clock; - unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc, - iowait_pc, idle_pc; - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - int cpu; /* cpu of this runqueue */ - bool online; - - struct root_domain *rd; - struct sched_domain *sd; - int *cpu_locality; /* CPU relative cache distance */ - struct rq **rq_order; /* RQs ordered by relative cache distance */ -#ifdef CONFIG_SCHED_SMT - cpumask_t thread_mask; - bool (*siblings_idle)(struct rq *rq); - /* See if all smt siblings are idle */ -#endif /* CONFIG_SCHED_SMT */ -#ifdef CONFIG_SCHED_MC - cpumask_t core_mask; - bool (*cache_idle)(struct rq *rq); - /* See if all cache siblings are idle */ -#endif /* CONFIG_SCHED_MC */ - u64 last_niffy; /* Last time this RQ updated grq.niffies */ -#endif /* CONFIG_SMP */ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -#ifdef CONFIG_PARAVIRT - u64 prev_steal_time; -#endif /* CONFIG_PARAVIRT */ -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; -#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ - - u64 clock, old_clock, last_tick; - u64 clock_task; - bool dither; - -#ifdef CONFIG_SCHEDSTATS - - /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ - - /* sys_sched_yield() stats */ - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; -#endif /* CONFIG_SCHEDSTATS */ -#ifdef CONFIG_CPU_IDLE - /* Must be inspected within a rcu lock section */ - struct cpuidle_state *idle_state; -#endif -}; - -#ifdef CONFIG_SMP -struct rq *cpu_rq(int cpu); -#endif - -#ifndef CONFIG_SMP -extern struct rq *uprq; -#define cpu_rq(cpu) (uprq) -#define this_rq() (uprq) -#define raw_rq() (uprq) -#define task_rq(p) (uprq) -#define cpu_curr(cpu) ((uprq)->curr) -#else /* CONFIG_SMP */ -DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#define this_rq() this_cpu_ptr(&runqueues) -#define raw_rq() raw_cpu_ptr(&runqueues) -#endif /* CONFIG_SMP */ - -static inline u64 __rq_clock_broken(struct rq *rq) -{ - return READ_ONCE(rq->clock); -} - -static inline u64 rq_clock(struct rq *rq) -{ - lockdep_assert_held(rq->grq_lock); - return rq->clock; -} - -static inline u64 rq_clock_task(struct rq *rq) -{ - lockdep_assert_held(rq->grq_lock); - return rq->clock_task; -} - -extern struct mutex sched_domains_mutex; -extern struct static_key_false sched_schedstats; - -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. 
- * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. - */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) - -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -void register_sched_domain_sysctl(void); -void unregister_sched_domain_sysctl(void); -#else -static inline void register_sched_domain_sysctl(void) -{ -} -static inline void unregister_sched_domain_sysctl(void) -{ -} -#endif - -static inline void sched_ttwu_pending(void) { } - -static inline int task_on_rq_queued(struct task_struct *p) -{ - return p->on_rq; -} - -#ifdef CONFIG_SMP - -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); - -#endif - -#ifdef CONFIG_CPU_IDLE -static inline void idle_set_state(struct rq *rq, - struct cpuidle_state *idle_state) -{ - rq->idle_state = idle_state; -} - -static inline struct cpuidle_state *idle_get_state(struct rq *rq) -{ - WARN_ON(!rcu_read_lock_held()); - return rq->idle_state; -} -#else -static inline void idle_set_state(struct rq *rq, - struct cpuidle_state *idle_state) -{ -} - -static inline struct cpuidle_state *idle_get_state(struct rq *rq) -{ - return NULL; -} -#endif - -#ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); - -static inline void cpufreq_trigger(u64 time, unsigned long util) -{ - struct update_util_data *data; - - if (util > SCHED_CAPACITY_SCALE) - util = SCHED_CAPACITY_SCALE; - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); - if (data) - data->func(data, time, util, SCHED_CAPACITY_SCALE); -} -#else -static inline void cpufreq_trigger(u64 time, unsigned long util) -{ -} -#endif /* CONFIG_CPU_FREQ */ - -#ifdef arch_scale_freq_capacity -#ifndef arch_scale_freq_invariant -#define arch_scale_freq_invariant() (true) -#endif -#else /* arch_scale_freq_capacity */ -#define arch_scale_freq_invariant() (false) -#endif - -#endif /* BFS_SCHED_H */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 2ebd7b0c4..2a644b6be 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -9,8 +9,8 @@ * published by the Free Software Foundation. */ -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index eba226d7c..a2bf1ea69 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -16,8 +16,8 @@ #include #include -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1e855dcbd..060b76d85 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -14,8 +14,8 @@ #include -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 7466a0bb2..ba7b137c3 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -4,10 +4,10 @@ #include #include -#ifndef CONFIG_SCHED_BFS +#ifndef CONFIG_SCHED_MUQSS #include "sched.h" #else -#include "bfs_sched.h" +#include "MuQSS.h" #endif /* diff --git a/kernel/skip_list.c b/kernel/skip_list.c new file mode 100644 index 000000000..5c66067f2 --- /dev/null +++ b/kernel/skip_list.c @@ -0,0 +1,174 @@ +/* + Copyright (C) 2011,2016 Con Kolivas. 
+
+ Code based on example originally by William Pugh.
+
+Skip Lists are a probabilistic alternative to balanced trees, as
+described in the June 1990 issue of CACM and were invented by
+William Pugh in 1987.
+
+A couple of comments about this implementation:
+The routine randomLevel has been hard-coded to generate random
+levels using p=0.25. It can be easily changed.
+
+The insertion routine has been implemented so as to use the
+dirty hack described in the CACM paper: if a random level is
+generated that is more than the current maximum level, the
+current maximum level plus one is used instead.
+
+Levels start at zero and go up to MaxLevel (which is equal to
+MaxNumberOfLevels-1).
+
+The routines defined in this file are:
+
+init: defines slnode
+
+new_skiplist: returns a new, empty list
+
+randomLevel: Returns a random level based on a u64 random seed passed to it.
+In MuQSS, the "niffy" time is used for this purpose.
+
+insert(l, key, value): inserts the binding (key, value) into l. This operation
+occurs in O(log n) time.
+
+delnode(slnode, l, node): deletes any binding of key from the list l based on
+the actual node value. This operation occurs in O(k) time where k is the
+number of levels of the node in question (max 8). The original delete
+function occurred in O(log n) time and involved a search.
+
+MuQSS Notes: In this implementation of skiplists, there are bidirectional
+next/prev pointers and the insert function returns a pointer to the actual
+node where the value is stored. The key here is chosen by the scheduler so as
+to sort tasks according to the priority list requirements and is no longer
+used by the scheduler after insertion. The scheduler lookup, however, occurs
+in O(1) time because it is always the first item in the level 0 linked list.
+Since the task struct stores a copy of the node pointer upon skiplist_insert,
+it can also remove it much faster than the original implementation with the
+aid of prev<->next pointer manipulation and no searching.
+
+*/
+
+#include <linux/slab.h>
+#include <linux/skip_list.h>
+
+#define MaxNumberOfLevels 8
+#define MaxLevel (MaxNumberOfLevels - 1)
+
+void skiplist_init(skiplist_node *slnode)
+{
+	int i;
+
+	slnode->key = 0xFFFFFFFFFFFFFFFF;
+	slnode->level = 0;
+	slnode->value = NULL;
+	for (i = 0; i < MaxNumberOfLevels; i++)
+		slnode->next[i] = slnode->prev[i] = slnode;
+}
+
+skiplist *new_skiplist(skiplist_node *slnode)
+{
+	skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC);
+
+	BUG_ON(!l);
+	l->header = slnode;
+	return l;
+}
+
+void free_skiplist(skiplist *l)
+{
+	skiplist_node *p, *q;
+
+	p = l->header;
+	do {
+		q = p->next[0];
+		p->next[0]->prev[0] = q->prev[0];
+		skiplist_node_init(p);
+		p = q;
+	} while (p != l->header);
+	kfree(l);
+}
+
+void skiplist_node_init(skiplist_node *node)
+{
+	memset(node, 0, sizeof(skiplist_node));
+}
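The routines above are enough to stand up a list; a minimal usage sketch follows. It is illustrative only and rests on assumptions drawn from this patch rather than stated by it: keyType behaves as a 64-bit integer and valueType as a void pointer (suggested by the 0xFFFFFFFFFFFFFFFF and NULL sentinels written by skiplist_init()), and tasks embed their own skiplist_node (as the skiplist_node_init(&init_task.node) call in sched_init() implies). The demo_* names are invented for the example.

#include <linux/sched.h>
#include <linux/skip_list.h>

static skiplist_node demo_header;	/* sentinel node, never holds a task */
static skiplist *demo_list;

static void demo_skiplist(struct task_struct *p, u64 key, u64 niffies)
{
	struct task_struct *best;

	/* One-time setup, mirroring grq.node/grq.sl in sched_init(). */
	skiplist_init(&demo_header);
	demo_list = new_skiplist(&demo_header);

	/* Insert: the key fixes the task's position (lowest key first);
	 * the niffy clock doubles as the random level seed. */
	skiplist_node_init(&p->node);
	skiplist_insert(demo_list, &p->node, key, p, (unsigned int)niffies);

	/* O(1) lookup: the best entry is always first in level 0. */
	best = demo_list->header->next[0]->value;

	/* O(k) removal via the remembered node, no search needed. */
	skiplist_delete(demo_list, &p->node);

	(void)best;
}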
+
+/*
+ * Returns a pseudo-random number based on the randseed value by masking it
+ * down to the range 0-7. As many levels are not required when only a few
+ * values are on the list, we limit the height of the levels according to how
+ * many list entries there are, in a cheap manner. The height of the levels
+ * may have been higher while there were more entries queued previously, but
+ * as this code is used only by the scheduler, entries are short lived and
+ * will be torn down regularly.
+ *
+ * 00-03 entries - 1 level
+ * 04-07 entries - 2 levels
+ * 08-15 entries - 4 levels
+ * 16+  entries - up to 8 levels
+ */
+static inline unsigned int randomLevel(int entries, unsigned int randseed)
+{
+	unsigned int mask;
+
+	if (entries > 15)
+		mask = 0x7;
+	else if (entries > 7)
+		mask = 0x3;
+	else if (entries > 3)
+		mask = 0x1;
+	else
+		return 0;
+
+	return randseed & mask;
+}
+
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed)
+{
+	skiplist_node *update[MaxNumberOfLevels];
+	skiplist_node *p, *q;
+	int k = l->level;
+
+	p = l->header;
+	do {
+		while (q = p->next[k], q->key <= key)
+			p = q;
+		update[k] = p;
+	} while (--k >= 0);
+
+	k = randomLevel(++l->entries, randseed);
+	if (k > MaxLevel)
+		k = MaxLevel;
+	if (k > l->level) {
+		k = ++l->level;
+		update[k] = l->header;
+	}
+
+	node->level = k;
+	node->key = key;
+	node->value = value;
+	do {
+		p = update[k];
+		node->next[k] = p->next[k];
+		p->next[k] = node;
+		node->prev[k] = p;
+		node->next[k]->prev[k] = node;
+	} while (--k >= 0);
+}
+
+void skiplist_delete(skiplist *l, skiplist_node *node)
+{
+	int k, m = node->level;
+
+	for (k = 0; k <= m; k++) {
+		node->prev[k]->next[k] = node->next[k];
+		node->next[k]->prev[k] = node->prev[k];
+	}
+	skiplist_node_init(node);
+	if (m == l->level) {
+		while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0)
+			m--;
+		l->level = m;
+	}
+	l->entries--;
+}
diff --git a/kernel/skip_lists.c b/kernel/skip_lists.c
deleted file mode 100644
index 40b7ba240..000000000
--- a/kernel/skip_lists.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- Copyright (C) 2011,2016 Con Kolivas.
-
- Code based on example originally by William Pugh.
-
-Skip Lists are a probabilistic alternative to balanced trees, as
-described in the June 1990 issue of CACM and were invented by
-William Pugh in 1987.
-
-A couple of comments about this implementation:
-The routine randomLevel has been hard-coded to generate random
-levels using p=0.25. It can be easily changed.
-
-The insertion routine has been implemented so as to use the
-dirty hack described in the CACM paper: if a random level is
-generated that is more than the current maximum level, the
-current maximum level plus one is used instead.
-
-Levels start at zero and go up to MaxLevel (which is equal to
-MaxNumberOfLevels-1).
-
-The routines defined in this file are:
-
-init: defines slnode
-
-new_skiplist: returns a new, empty list
-
-randomLevel: Returns a random level based on a u64 random seed passed to it.
-In BFS, the "niffy" time is used for this purpose.
-
-insert(l,key, value): inserts the binding (key, value) into l. This operation
-occurs in O(log n) time.
-
-delnode(slnode, l, node): deletes any binding of key from the l based on the
-actual node value. This operation occurs in O(k) time where k is the
-number of levels of the node in question (max 16). The original delete
-function occurred in O(log n) time and involved a search.
-
-BFS Notes: In this implementation of skiplists, there are bidirectional
-next/prev pointers and the insert function returns a pointer to the actual
-node the value is stored. The key here is chosen by the scheduler so as to
-sort tasks according to the priority list requirements and is no longer used
-by the scheduler after insertion. The scheduler lookup, however, occurs in
-O(1) time because it is always the first item in the level 0 linked list.
-Since the task struct stores a copy of the node pointer upon skiplist_insert, -it can also remove it much faster than the original implementation with the -aid of prev<->next pointer manipulation and no searching. - -*/ - -#include -#include - -#define MaxNumberOfLevels 16 -#define MaxLevel (MaxNumberOfLevels - 1) - -void skiplist_init(skiplist_node *slnode) -{ - int i; - - slnode->key = 0xFFFFFFFFFFFFFFFF; - slnode->level = 0; - slnode->value = NULL; - for (i = 0; i < MaxNumberOfLevels; i++) - slnode->next[i] = slnode->prev[i] = slnode; -} - -skiplist *new_skiplist(skiplist_node *slnode) -{ - skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); - - BUG_ON(!l); - l->header = slnode; - return l; -} - -void free_skiplist(skiplist *l) -{ - skiplist_node *p, *q; - - p = l->header; - do { - q = p->next[0]; - p->next[0]->prev[0] = q->prev[0]; - skiplist_node_init(p); - p = q; - } while (p != l->header); - kfree(l); -} - -void skiplist_node_init(skiplist_node *node) -{ - memset(node, 0, sizeof(skiplist_node)); -} - -/* - * Returns a pseudo-random number based on the randseed value by masking out - * 0-15. As many levels are not required when only few values are on the list, - * we limit the height of the levels according to how many list entries there - * are in a cheap manner. The height of the levels may have been higher while - * there were more entries queued previously but as this code is used only by - * the scheduler, entries are short lived and will be torn down regularly. - * - * 00-03 entries - 1 level - * 04-07 entries - 2 levels - * 08-15 entries - 4 levels - * 15-31 entries - 7 levels - * 32+ entries - max(16) levels - */ -static inline unsigned int randomLevel(int entries, unsigned int randseed) -{ - unsigned int mask; - - if (entries > 31) - mask = 0xF; - else if (entries > 15) - mask = 0x7; - else if (entries > 7) - mask = 0x3; - else if (entries > 3) - mask = 0x1; - else - return 0; - - return randseed & mask; -} - -void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) -{ - skiplist_node *update[MaxNumberOfLevels]; - skiplist_node *p, *q; - int k = l->level; - - p = l->header; - do { - while (q = p->next[k], q->key <= key) - p = q; - update[k] = p; - } while (--k >= 0); - - k = randomLevel(++l->entries, randseed); - if (k > l->level) { - k = ++l->level; - update[k] = l->header; - } - - node->level = k; - node->key = key; - node->value = value; - do { - p = update[k]; - node->next[k] = p->next[k]; - p->next[k] = node; - node->prev[k] = p; - node->next[k]->prev[k] = node; - } while (--k >= 0); -} - -void skiplist_delete(skiplist *l, skiplist_node *node) -{ - int k, m = node->level; - - for (k = 0; k <= m; k++) { - node->prev[k]->next[k] = node->next[k]; - node->next[k]->prev[k] = node->prev[k]; - } - skiplist_node_init(node); - if (m == l->level) { - while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) - m--; - l->level = m; - } - l->entries--; -} diff --git a/kernel/smpboot.c b/kernel/smpboot.c index fc0d8270f..13bc43d1f 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) if (kthread_should_park()) { __set_current_state(TASK_RUNNING); + preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); ht->park(td->cpu); td->status = HP_THREAD_PARKED; } - preempt_enable(); kthread_parkme(); /* We might have been woken for stop */ continue; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 
ca8093ed7..4bd185938 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -127,7 +127,7 @@ static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int __read_mostly one_hundred = 100; static int __read_mostly one_thousand = 1000; -#ifdef CONFIG_SCHED_BFS +#ifdef CONFIG_SCHED_MUQSS extern int rr_interval; extern int sched_interactive; extern int sched_iso_cpu; @@ -269,7 +269,7 @@ static struct ctl_table sysctl_base_table[] = { { } }; -#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS) +#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ @@ -286,7 +286,7 @@ static int max_extfrag_threshold = 1000; #endif static struct ctl_table kern_table[] = { -#ifndef CONFIG_SCHED_BFS +#ifndef CONFIG_SCHED_MUQSS { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, @@ -455,7 +455,7 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif -#endif /* !CONFIG_SCHED_BFS */ +#endif /* !CONFIG_SCHED_MUQSS */ #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -1020,7 +1020,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SCHED_BFS +#ifdef CONFIG_SCHED_MUQSS { .procname = "rr_interval", .data = &rr_interval, diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 6931b6e3c..10e18d267 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -89,7 +89,7 @@ config NO_HZ_IDLE config NO_HZ_FULL bool "Full dynticks system (tickless)" # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_MUQSS # We need at least one periodic CPU for timekeeping depends on SMP depends on HAVE_CONTEXT_TRACKING diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 287cf721c..69ee53a67 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1039,8 +1039,8 @@ static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ static const struct sched_attr attr = { -#ifdef CONFIG_SCHED_BFS - /* No deadline on BFS, use RR */ +#ifdef CONFIG_SCHED_MUQSS + /* No deadline on MuQSS, use RR */ .sched_policy = SCHED_RR, #else .sched_policy = SCHED_DEADLINE, -- cgit v1.2.3-54-g00ecf
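The trace_selftest.c hunk above reflects that MuQSS, like BFS before it, implements no SCHED_DEADLINE class: callers that would request a deadline policy fall back to SCHED_RR, whose quantum is the rr_interval value exported by the sysctl hunk. A rough userspace sketch of that fallback (it assumes a kernel built with CONFIG_SCHED_MUQSS, takes rr_interval to be in milliseconds, and picks priority 50 arbitrarily):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };
	char buf[32];
	FILE *f;

	/* No SCHED_DEADLINE on MuQSS: request round-robin instead. */
	if (sched_setscheduler(0, SCHED_RR, &sp) != 0)
		perror("sched_setscheduler");

	/* The RR quantum is tunable via the sysctl added in this patch. */
	f = fopen("/proc/sys/kernel/rr_interval", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("rr_interval (ms): %s", buf);
		fclose(f);
	}
	return 0;
}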