diff options
Diffstat (limited to 'kernel')
192 files changed, 7059 insertions, 30427 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 08561f1ac..ebdb00432 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER def_bool y depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER -config ARCH_USE_QUEUE_RWLOCK +config ARCH_USE_QUEUED_SPINLOCKS bool -config QUEUE_RWLOCK - def_bool y if ARCH_USE_QUEUE_RWLOCK +config QUEUED_SPINLOCKS + def_bool y if ARCH_USE_QUEUED_SPINLOCKS + depends on SMP + +config ARCH_USE_QUEUED_RWLOCKS + bool + +config QUEUED_RWLOCKS + def_bool y if ARCH_USE_QUEUED_RWLOCKS depends on SMP diff --git a/kernel/Makefile b/kernel/Makefile index 60c302cfb..43c4c920f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -137,7 +137,7 @@ endif ifneq ($(wildcard $(obj)/.x509.list),) ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) -$(info X.509 certificate list changed) +$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)") $(shell rm $(obj)/.x509.list) endif endif diff --git a/kernel/audit.c b/kernel/audit.c index 1c13e4267..f9e606534 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1904,7 +1904,7 @@ EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_link_denied - report a link restriction denial - * @operation: specific link opreation + * @operation: specific link operation * @link: the path that triggered the restriction */ void audit_log_link_denied(const char *operation, struct path *link) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9fb9d1cb8..e85bdfd15 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -599,9 +599,7 @@ static int audit_filter_rules(struct task_struct *tsk, result = match_tree_refs(ctx, rule->tree); break; case AUDIT_LOGINUID: - result = 0; - if (ctx) - result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); + result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); @@ 
-1023,8 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. */ - if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { - WARN_ON(1); + if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { send_sig(SIGKILL, current, 0); return -1; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8a6616583..cb31229a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -14,12 +14,7 @@ #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/mm.h> - -struct bpf_array { - struct bpf_map map; - u32 elem_size; - char value[0] __aligned(8); -}; +#include <linux/filter.h> /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) @@ -154,3 +149,109 @@ static int __init register_array_map(void) return 0; } late_initcall(register_array_map); + +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + /* only bpf_prog file descriptors can be stored in prog_array map */ + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + return array_map_alloc(attr); +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + synchronize_rcu(); + + /* make sure it's empty */ + for (i = 0; i < array->map.max_entries; i++) + BUG_ON(array->prog[i] != NULL); + kvfree(array); +} + +static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* only called from syscall */ +static int prog_array_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog, *old_prog; + u32 index = *(u32 *)key, ufd; + + if (map_flags != BPF_ANY) + return -EINVAL; + + if (index >= array->map.max_entries) + return -E2BIG; + + ufd = *(u32 *)value; + prog = bpf_prog_get(ufd); + if (IS_ERR(prog)) + return 
PTR_ERR(prog); + + if (!bpf_prog_array_compatible(array, prog)) { + bpf_prog_put(prog); + return -EINVAL; + } + + old_prog = xchg(array->prog + index, prog); + if (old_prog) + bpf_prog_put_rcu(old_prog); + + return 0; +} + +static int prog_array_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *old_prog; + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return -E2BIG; + + old_prog = xchg(array->prog + index, NULL); + if (old_prog) { + bpf_prog_put_rcu(old_prog); + return 0; + } else { + return -ENOENT; + } +} + +/* decrement refcnt of all bpf_progs that are stored in this map */ +void bpf_prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + for (i = 0; i < array->map.max_entries; i++) + prog_array_map_delete_elem(map, &i); +} + +static const struct bpf_map_ops prog_array_ops = { + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = prog_array_map_lookup_elem, + .map_update_elem = prog_array_map_update_elem, + .map_delete_elem = prog_array_map_delete_elem, +}; + +static struct bpf_map_type_list prog_array_type __read_mostly = { + .ops = &prog_array_ops, + .type = BPF_MAP_TYPE_PROG_ARRAY, +}; + +static int __init register_prog_array_map(void) +{ + bpf_register_map_type(&prog_array_type); + return 0; +} +late_initcall(register_prog_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 54f0e7fcd..c5bedc82b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -26,9 +26,10 @@ #include <linux/vmalloc.h> #include <linux/random.h> #include <linux/moduleloader.h> -#include <asm/unaligned.h> #include <linux/bpf.h> +#include <asm/unaligned.h> + /* Registers */ #define BPF_R0 regs[BPF_REG_0] #define BPF_R1 regs[BPF_REG_1] @@ -62,6 +63,7 @@ void *bpf_internal_load_pointer_neg_helper(const 
struct sk_buff *skb, int k, uns ptr = skb_network_header(skb) + k - SKF_NET_OFF; else if (k >= SKF_LL_OFF) ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; @@ -244,6 +246,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, + [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, @@ -286,6 +289,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; + u32 tail_call_cnt = 0; void *ptr; int off; @@ -431,6 +435,30 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_TAIL_CALL: { + struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog; + u64 index = BPF_R3; + + if (unlikely(index >= array->map.max_entries)) + goto out; + + if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + goto out; + + tail_call_cnt++; + + prog = READ_ONCE(array->prog[index]); + if (unlikely(!prog)) + goto out; + + ARG1 = BPF_R1; + insn = prog->insnsi; + goto select_insn; +out: + CONT; + } /* JMP */ JMP_JA: insn += insn->off; @@ -615,25 +643,63 @@ load_byte: return 0; } -void __weak bpf_int_jit_compile(struct bpf_prog *prog) +bool bpf_prog_array_compatible(struct bpf_array *array, + const struct bpf_prog *fp) { + if (!array->owner_prog_type) { + /* There's no owner yet where we could check for + * compatibility. 
+ */ + array->owner_prog_type = fp->type; + array->owner_jited = fp->jited; + + return true; + } + + return array->owner_prog_type == fp->type && + array->owner_jited == fp->jited; +} + +static int bpf_check_tail_call(const struct bpf_prog *fp) +{ + struct bpf_prog_aux *aux = fp->aux; + int i; + + for (i = 0; i < aux->used_map_cnt; i++) { + struct bpf_map *map = aux->used_maps[i]; + struct bpf_array *array; + + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + continue; + + array = container_of(map, struct bpf_array, map); + if (!bpf_prog_array_compatible(array, fp)) + return -EINVAL; + } + + return 0; } /** - * bpf_prog_select_runtime - select execution runtime for BPF program + * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program * - * try to JIT internal BPF program, if JIT is not available select interpreter - * BPF program will be executed via BPF_PROG_RUN() macro + * Try to JIT eBPF program, if JIT is not available, use interpreter. + * The BPF program will be executed via BPF_PROG_RUN() macro. */ -void bpf_prog_select_runtime(struct bpf_prog *fp) +int bpf_prog_select_runtime(struct bpf_prog *fp) { fp->bpf_func = (void *) __bpf_prog_run; - /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); - /* Lock whole bpf_prog as read-only */ bpf_prog_lock_ro(fp); + + /* The tail call compatibility check can only be done at + * this late stage as we need to determine, if we deal + * with JITed or non JITed program concatenations and not + * all eBPF JITs might immediately support all features. 
+ */ + return bpf_check_tail_call(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -663,6 +729,29 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; +const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; +const struct bpf_func_proto bpf_get_current_comm_proto __weak; +const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) +{ + return NULL; +} + +/* Always built-in helper functions. */ +const struct bpf_func_proto bpf_tail_call_proto = { + .func = NULL, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ +void __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ +} /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bd7f5988e..1447ec094 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,6 +13,9 @@ #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> +#include <linux/ktime.h> +#include <linux/sched.h> +#include <linux/uidgid.h> /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -44,11 +47,11 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } const struct bpf_func_proto bpf_map_lookup_elem_proto = { - .func = bpf_map_lookup_elem, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_KEY, + .func = bpf_map_lookup_elem, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, }; static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) @@ -63,13 +66,13 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } const struct bpf_func_proto bpf_map_update_elem_proto = { - .func = bpf_map_update_elem, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_KEY, - .arg3_type = ARG_PTR_TO_MAP_VALUE, - .arg4_type = ARG_ANYTHING, + .func = bpf_map_update_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_PTR_TO_MAP_VALUE, + .arg4_type = ARG_ANYTHING, }; static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) @@ -83,11 +86,11 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } const struct bpf_func_proto bpf_map_delete_elem_proto = { - .func = bpf_map_delete_elem, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_KEY, + .func = 
bpf_map_delete_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, }; static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) @@ -111,3 +114,71 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + + if (!task) + return -EINVAL; + + return (u64) task->tgid << 32 | task->pid; +} + +const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { + .func = bpf_get_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + kuid_t uid; + kgid_t gid; + + if (!task) + return -EINVAL; + + current_uid_gid(&uid, &gid); + return (u64) from_kgid(&init_user_ns, gid) << 32 | + from_kuid(&init_user_ns, uid); +} + +const struct bpf_func_proto bpf_get_current_uid_gid_proto = { + .func = bpf_get_current_uid_gid, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + char *buf = (char *) (long) r1; + + if (!task) + return -EINVAL; + + memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + return 0; +} + +const struct bpf_func_proto bpf_get_current_comm_proto = { + .func = bpf_get_current_comm, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 
3bae6c591..a1b14d197 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp) { struct bpf_map *map = filp->private_data; + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + /* prog_array stores refcnt-ed bpf_prog pointers + * release them all when user space closes prog_array_fd + */ + bpf_prog_array_map_clear(map); + bpf_map_put(map); return 0; } @@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog) */ BUG_ON(!prog->aux->ops->get_func_proto); + if (insn->imm == BPF_FUNC_tail_call) { + /* mark bpf_tail_call as different opcode + * to avoid conditional branch in + * interpeter for every normal call + * and to prevent accidental JITing by + * JIT compiler that doesn't support + * bpf_tail_call yet + */ + insn->imm = 0; + insn->code |= BPF_X; + continue; + } + fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions @@ -413,6 +432,23 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +static void __prog_put_rcu(struct rcu_head *rcu) +{ + struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + + free_used_maps(aux); + bpf_prog_free(aux->prog); +} + +/* version of bpf_prog_put() that is called after a grace period */ +void bpf_prog_put_rcu(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + prog->aux->prog = prog; + call_rcu(&prog->aux->rcu, __prog_put_rcu); + } +} + void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { @@ -426,7 +462,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; - bpf_prog_put(prog); + bpf_prog_put_rcu(prog); return 0; } @@ -532,7 +568,9 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - 
bpf_prog_select_runtime(prog); + err = bpf_prog_select_runtime(prog); + if (err < 0) + goto free_used_maps; err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47dcd3aa6..039d866fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id) fn->ret_type, func_id); return -EINVAL; } + + if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && + func_id != BPF_FUNC_tail_call) + /* prog_array map type needs extra care: + * only allow to pass it into bpf_tail_call() for now. + * bpf_map_delete_elem() can be allowed in the future, + * while bpf_map_update_elem() must only be done via syscall + */ + return -EINVAL; + + if (func_id == BPF_FUNC_tail_call && + map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + /* don't allow any other map type to be passed into + * bpf_tail_call() + */ + return -EINVAL; + return 0; } @@ -1675,6 +1692,8 @@ static int do_check(struct verifier_env *env) } } else if (class == BPF_STX) { + enum bpf_reg_type dst_reg_type; + if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); if (err) @@ -1683,11 +1702,6 @@ static int do_check(struct verifier_env *env) continue; } - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_STX uses reserved fields\n"); - return -EINVAL; - } /* check src1 operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1697,6 +1711,8 @@ static int do_check(struct verifier_env *env) if (err) return err; + dst_reg_type = regs[insn->dst_reg].type; + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, @@ -1704,6 +1720,15 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (insn->imm == 0) { + insn->imm = dst_reg_type; + } else if (dst_reg_type != insn->imm && + (dst_reg_type == PTR_TO_CTX || 
+ insn->imm == PTR_TO_CTX)) { + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { @@ -1822,12 +1847,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0)) { + (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { verbose("BPF_LDX uses reserved fields\n"); return -EINVAL; } + if (BPF_CLASS(insn->code) == BPF_STX && + ((BPF_MODE(insn->code) != BPF_MEM && + BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1950,12 +1981,17 @@ static int convert_ctx_accesses(struct verifier_env *env) struct bpf_prog *new_prog; u32 cnt; int i; + enum bpf_access_type type; if (!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) + type = BPF_READ; + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) + type = BPF_WRITE; + else continue; if (insn->imm != PTR_TO_CTX) { @@ -1965,7 +2001,7 @@ static int convert_ctx_accesses(struct verifier_env *env) } cnt = env->prog->aux->ops-> - convert_ctx_access(insn->dst_reg, insn->src_reg, + convert_ctx_access(type, insn->dst_reg, insn->src_reg, insn->off, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e8a5491be..f89d9292e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,6 +46,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/rwsem.h> +#include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/sort.h> #include 
<linux/kmod.h> @@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); +struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + #define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ lockdep_is_held(&cgroup_mutex), \ @@ -156,7 +159,7 @@ static bool cgrp_dfl_root_visible; static bool cgroup_legacy_files_on_dfl; /* some controllers are not supported in the default hierarchy */ -static unsigned int cgrp_dfl_root_inhibit_ss_mask; +static unsigned long cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -175,18 +178,19 @@ static DEFINE_IDR(cgroup_hierarchy_idr); */ static u64 css_serial_nr_next = 1; -/* This flag indicates whether tasks in the fork and exit paths should - * check for fork/exit handlers to call. This avoids us having to do - * extra work in the fork/exit path if none of the subsystems need to - * be called. +/* + * These bitmask flags indicate whether tasks in the fork and exit paths have + * fork/exit handlers to call. This avoids us having to do extra work in the + * fork/exit path to check which subsystems have fork/exit callbacks. 
*/ -static int need_forkexit_callback __read_mostly; +static unsigned long have_fork_callback __read_mostly; +static unsigned long have_exit_callback __read_mostly; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned int ss_mask); + unsigned long ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, bool visible); @@ -261,7 +265,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * - * Similar to cgroup_css() but returns the effctive css, which is defined + * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. @@ -409,6 +413,24 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) +/** + * for_each_subsys_which - filter for_each_subsys with a bitmask + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ss_maskp: a pointer to the bitmask + * + * The block will only run for cases where the ssid-th bit (1 << ssid) of + * mask is set to 1. 
+ */ +#define for_each_subsys_which(ss, ssid, ss_maskp) \ + if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ + (ssid) = 0; \ + else \ + for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ + if (((ss) = cgroup_subsys[ssid]) && false) \ + break; \ + else + /* iterate across the hierarchies */ #define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) @@ -882,7 +904,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) static void cgroup_free_root(struct cgroup_root *root) { if (root) { - /* hierarhcy ID shoulid already have been released */ + /* hierarchy ID should already have been released */ WARN_ON_ONCE(root->hierarchy_id); idr_destroy(&root->cgroup_idr); @@ -998,7 +1020,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; @@ -1068,11 +1090,11 @@ static void cgroup_put(struct cgroup *cgrp) * @subtree_control is to be applied to @cgrp. The returned mask is always * a superset of @subtree_control and follows the usual hierarchy rules. 
*/ -static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, - unsigned int subtree_control) +static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, + unsigned long subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); - unsigned int cur_ss_mask = subtree_control; + unsigned long cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1082,11 +1104,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, return cur_ss_mask; while (true) { - unsigned int new_ss_mask = cur_ss_mask; + unsigned long new_ss_mask = cur_ss_mask; - for_each_subsys(ss, ssid) - if (cur_ss_mask & (1 << ssid)) - new_ss_mask |= ss->depends_on; + for_each_subsys_which(ss, ssid, &cur_ss_mask) + new_ss_mask |= ss->depends_on; /* * Mask out subsystems which aren't available. This can @@ -1200,7 +1221,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; int i; @@ -1215,18 +1236,16 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) } } -static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, + unsigned long ss_mask) { struct cgroup_subsys *ss; - unsigned int tmp_ss_mask; + unsigned long tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); - for_each_subsys(ss, ssid) { - if (!(ss_mask & (1 << ssid))) - continue; - + for_each_subsys_which(ss, ssid, &ss_mask) { /* if @ss has non-root csses attached to it, can't move */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; @@ -1253,7 +1272,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) * Just 
warn about it and continue. */ if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } @@ -1263,18 +1282,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - for_each_subsys(ss, ssid) - if (ss_mask & (1 << ssid)) - cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); + for_each_subsys_which(ss, ssid, &ss_mask) + cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - for_each_subsys(ss, ssid) { + for_each_subsys_which(ss, ssid, &ss_mask) { struct cgroup_root *src_root; struct cgroup_subsys_state *css; struct css_set *cset; - if (!(ss_mask & (1 << ssid))) - continue; - src_root = ss->root; css = cgroup_css(&src_root->cgrp, ss); @@ -1338,7 +1353,7 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned int subsys_mask; + unsigned long subsys_mask; unsigned int flags; char *release_agent; bool cpuset_clone_children; @@ -1351,7 +1366,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned int mask = -1U; + unsigned long mask = -1UL; struct cgroup_subsys *ss; int nr_opts = 0; int i; @@ -1495,7 +1510,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned int added_mask, removed_mask; + unsigned long added_mask, removed_mask; if (root == &cgrp_dfl_root) { pr_err("remount is not allowed\n"); @@ -1641,7 +1656,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int 
cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -2050,9 +2065,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, lockdep_assert_held(&css_set_rwsem); /* - * We are synchronized through threadgroup_lock() against PF_EXITING - * setting such that we can't race against cgroup_exit() changing the - * css_set to init_css_set and dropping the old one. + * We are synchronized through cgroup_threadgroup_rwsem against + * PF_EXITING setting such that we can't race against cgroup_exit() + * changing the css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); @@ -2109,10 +2124,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * - * This function may be called without holding threadgroup_lock even if the - * target is a process. Threads may be created and destroyed but as long - * as cgroup_mutex is not dropped, no new css_set can be put into play and - * the preloaded css_sets are guaranteed to cover all migrations. + * This function may be called without holding cgroup_threadgroup_rwsem + * even if the target is a process. Threads may be created and destroyed + * but as long as cgroup_mutex is not dropped, no new css_set can be put + * into play and the preloaded css_sets are guaranteed to cover all + * migrations. */ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, @@ -2215,7 +2231,7 @@ err: * @threadgroup: whether @leader points to the whole process or a single task * * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding threadgroup_lock of @leader. 
The + * process, the caller must be holding cgroup_threadgroup_rwsem. The * caller is also responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). @@ -2343,7 +2359,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and threadgroup_lock of @leader. + * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2374,6 +2390,47 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } +static int cgroup_procs_write_permission(struct task_struct *task, + struct cgroup *dst_cgrp, + struct kernfs_open_file *of) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + int ret = 0; + + /* + * even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + + if (!ret && cgroup_on_dfl(dst_cgrp)) { + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup *cgrp; + struct inode *inode; + + down_read(&css_set_rwsem); + cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + up_read(&css_set_rwsem); + + while (!cgroup_is_descendant(dst_cgrp, cgrp)) + cgrp = cgroup_parent(cgrp); + + ret = -ENOMEM; + inode = kernfs_get_inode(sb, cgrp->procs_kn); + if (inode) { + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + } + } + + put_cred(tcred); + return ret; +} + /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. 
Will lock @@ -2383,7 +2440,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; - const struct cred *cred = current_cred(), *tcred; struct cgroup *cgrp; pid_t pid; int ret; @@ -2395,29 +2451,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!cgrp) return -ENODEV; -retry_find_task: + percpu_down_write(&cgroup_threadgroup_rwsem); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { - rcu_read_unlock(); ret = -ESRCH; - goto out_unlock_cgroup; + goto out_unlock_rcu; } - /* - * even if we're attaching all tasks in the thread group, we - * only need to check permissions on one of them. - */ - tcred = __task_cred(tsk); - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - rcu_read_unlock(); - ret = -EACCES; - goto out_unlock_cgroup; - } - } else + } else { tsk = current; + } if (threadgroup) tsk = tsk->group_leader; @@ -2429,35 +2473,23 @@ retry_find_task: */ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; - rcu_read_unlock(); - goto out_unlock_cgroup; + goto out_unlock_rcu; } get_task_struct(tsk); rcu_read_unlock(); - threadgroup_lock(tsk); - if (threadgroup) { - if (!thread_group_leader(tsk)) { - /* - * a race with de_thread from another thread's exec() - * may strip us of our leadership, if this happens, - * there is no choice but to throw this task away and - * try again; this is - * "double-double-toil-and-trouble-check locking". 
- */ - threadgroup_unlock(tsk); - put_task_struct(tsk); - goto retry_find_task; - } - } - - ret = cgroup_attach_task(cgrp, tsk, threadgroup); - - threadgroup_unlock(tsk); + ret = cgroup_procs_write_permission(tsk, cgrp, of); + if (!ret) + ret = cgroup_attach_task(cgrp, tsk, threadgroup); put_task_struct(tsk); -out_unlock_cgroup: + goto out_unlock_threadgroup; + +out_unlock_rcu: + rcu_read_unlock(); +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2540,19 +2572,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; - for_each_subsys(ss, ssid) { - if (ss_mask & (1 << ssid)) { - if (printed) - seq_putc(seq, ' '); - seq_printf(seq, "%s", ss->name); - printed = true; - } + for_each_subsys_which(ss, ssid, &ss_mask) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; } if (printed) seq_putc(seq, '\n'); @@ -2604,6 +2634,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* look up all csses currently attached to @cgrp's subtree */ down_read(&css_set_rwsem); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { @@ -2659,17 +2691,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; - threadgroup_lock(task); - /* raced against de_thread() from another thread? 
*/ - if (!thread_group_leader(task)) { - threadgroup_unlock(task); - put_task_struct(task); - continue; - } - ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - threadgroup_unlock(task); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -2679,6 +2702,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) out_finish: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -2687,8 +2711,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - unsigned int enable = 0, disable = 0; - unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; + unsigned long enable = 0, disable = 0; + unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2700,11 +2724,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { + unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; + if (tok[0] == '\0') continue; - for_each_subsys(ss, ssid) { - if (ss->disabled || strcmp(tok + 1, ss->name) || - ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) + for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + if (ss->disabled || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { @@ -2791,10 +2816,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * still around. In such cases, wait till it's gone using * offline_waitq. 
*/ - for_each_subsys(ss, ssid) { - if (!(css_enable & (1 << ssid))) - continue; - + for_each_subsys_which(ss, ssid, &css_enable) { cgroup_for_each_live_child(child, cgrp) { DEFINE_WAIT(wait); @@ -3085,7 +3107,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->seq_show == cgroup_populated_show) + if (cft->write == cgroup_procs_write) + cgrp->procs_kn = kn; + else if (cft->seq_show == cgroup_populated_show) cgrp->populated_kn = kn; return 0; } @@ -4320,7 +4344,7 @@ static struct cftype cgroup_legacy_base_files[] = { * * On failure, no file is added. */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; int i, ret = 0; @@ -4929,7 +4953,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; - need_forkexit_callback |= ss->fork || ss->exit; + have_fork_callback |= (bool)ss->fork << ss->id; + have_exit_callback |= (bool)ss->exit << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -4987,6 +5012,7 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; + BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); @@ -5239,11 +5265,8 @@ void cgroup_post_fork(struct task_struct *child) * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. 
*/ - if (need_forkexit_callback) { - for_each_subsys(ss, i) - if (ss->fork) - ss->fork(child); - } + for_each_subsys_which(ss, i, &have_fork_callback) + ss->fork(child); } /** @@ -5287,16 +5310,12 @@ void cgroup_exit(struct task_struct *tsk) cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - if (need_forkexit_callback) { - /* see cgroup_post_fork() for details */ - for_each_subsys(ss, i) { - if (ss->exit) { - struct cgroup_subsys_state *old_css = cset->subsys[i]; - struct cgroup_subsys_state *css = task_css(tsk, i); + /* see cgroup_post_fork() for details */ + for_each_subsys_which(ss, i, &have_exit_callback) { + struct cgroup_subsys_state *old_css = cset->subsys[i]; + struct cgroup_subsys_state *css = task_css(tsk, i); - ss->exit(css, old_css, tsk); - } - } + ss->exit(css, old_css, tsk); } if (put_cset) diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config new file mode 100644 index 000000000..ff756221f --- /dev/null +++ b/kernel/configs/xen.config @@ -0,0 +1,48 @@ +# global stuff - these enable us to allow some +# of the not so generic stuff below for xen +CONFIG_PARAVIRT=y +CONFIG_NET=y +CONFIG_NET_CORE=y +CONFIG_NETDEVICES=y +CONFIG_BLOCK=y +CONFIG_WATCHDOG=y +CONFIG_TARGET_CORE=y +CONFIG_SCSI=y +CONFIG_FB=y +CONFIG_INPUT_MISC=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_TTY=y +# Technically not required but otherwise produces +# pretty useless systems starting from allnoconfig +# You want TCP/IP and ELF binaries right? 
+CONFIG_INET=y +CONFIG_BINFMT_ELF=y +# generic config +CONFIG_XEN=y +CONFIG_XEN_DOM0=y +# backend drivers +CONFIG_XEN_BACKEND=y +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_HVC_XEN=y +CONFIG_XEN_WDT=m +CONFIG_XEN_SCSI_BACKEND=m +# frontend drivers +CONFIG_XEN_FBDEV_FRONTEND=m +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_XEN_SCSI_FRONTEND=m +# others +CONFIG_XEN_BALLOON=y +CONFIG_XEN_SCRUB_PAGES=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GRANT_DEV_ALLOC=m +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PRIVCMD=m diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 72d59a1a6..0a495ab35 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -30,12 +30,23 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); -void context_tracking_cpu_set(int cpu) +static bool context_tracking_recursion_enter(void) { - if (!per_cpu(context_tracking.active, cpu)) { - per_cpu(context_tracking.active, cpu) = true; - static_key_slow_inc(&context_tracking_enabled); - } + int recursion; + + recursion = __this_cpu_inc_return(context_tracking.recursion); + if (recursion == 1) + return true; + + WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion); + __this_cpu_dec(context_tracking.recursion); + + return false; +} + +static void context_tracking_recursion_exit(void) +{ + __this_cpu_dec(context_tracking.recursion); } /** @@ -75,6 +86,9 @@ void context_tracking_enter(enum ctx_state state) WARN_ON_ONCE(!current->mm); local_irq_save(flags); + if (!context_tracking_recursion_enter()) + goto out_irq_restore; + if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { /* @@ -105,6 +119,8 
@@ void context_tracking_enter(enum ctx_state state) */ __this_cpu_write(context_tracking.state, state); } + context_tracking_recursion_exit(); +out_irq_restore: local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); @@ -139,6 +155,9 @@ void context_tracking_exit(enum ctx_state state) return; local_irq_save(flags); + if (!context_tracking_recursion_enter()) + goto out_irq_restore; + if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* @@ -153,6 +172,8 @@ void context_tracking_exit(enum ctx_state state) } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } + context_tracking_recursion_exit(); +out_irq_restore: local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); @@ -164,24 +185,26 @@ void context_tracking_user_exit(void) } NOKPROBE_SYMBOL(context_tracking_user_exit); -/** - * __context_tracking_task_switch - context switch the syscall callbacks - * @prev: the task that is being switched out - * @next: the task that is being switched in - * - * The context tracking uses the syscall slow path to implement its user-kernel - * boundaries probes on syscalls. This way it doesn't impact the syscall fast - * path on CPUs that don't do context tracking. - * - * But we need to clear the flag on the previous task because it may later - * migrate to some CPU that doesn't do the context tracking. As such the TIF - * flag may not be desired there. 
- */ -void __context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) +void __init context_tracking_cpu_set(int cpu) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); + static __initdata bool initialized = false; + + if (!per_cpu(context_tracking.active, cpu)) { + per_cpu(context_tracking.active, cpu) = true; + static_key_slow_inc(&context_tracking_enabled); + } + + if (initialized) + return; + + /* + * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork + * This assumes that init is the only task at this early boot stage. + */ + set_tsk_thread_flag(&init_task, TIF_NOHZ); + WARN_ON_ONCE(!tasklist_empty()); + + initialized = true; } #ifdef CONFIG_CONTEXT_TRACKING_FORCE diff --git a/kernel/cpu.c b/kernel/cpu.c index 94bbe4695..5644ec558 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -21,6 +21,7 @@ #include <linux/suspend.h> #include <linux/lockdep.h> #include <linux/tick.h> +#include <linux/irq.h> #include <trace/events/power.h> #include "smpboot.h" @@ -392,14 +393,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smpboot_park_threads(cpu); /* - * So now all preempt/rcu users must observe !cpu_active(). + * Prevent irq alloc/free while the dying cpu reorganizes the + * interrupt affinities. */ + irq_lock_sparse(); + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - smpboot_unpark_threads(cpu); cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + irq_unlock_sparse(); goto out_release; } BUG_ON(cpu_online(cpu)); @@ -416,6 +422,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). 
*/ per_cpu(cpu_dead_idle, cpu) = false; + /* Interrupts are moved away from the dying cpu, reenable alloc/free */ + irq_unlock_sparse(); + hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); @@ -463,6 +472,7 @@ static int smpboot_thread_call(struct notifier_block *nfb, switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: case CPU_ONLINE: smpboot_unpark_threads(cpu); break; @@ -479,7 +489,7 @@ static struct notifier_block smpboot_thread_notifier = { .priority = CPU_PRI_SMPBOOT, }; -void __cpuinit smpboot_thread_init(void) +void smpboot_thread_init(void) { register_cpu_notifier(&smpboot_thread_notifier); } @@ -519,6 +529,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); + if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ee14e3a35..f0acff0f6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ - update_nodemasks_hier(cs, &cs->mems_allowed); + update_nodemasks_hier(cs, &trialcs->mems_allowed); done: return retval; } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index d12807d40..ef90b04d7 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; - t3 = tsk_seruntime(tsk); + t3 = tsk->se.sum_exec_runtime; d->cpu_count += t1; diff --git a/kernel/events/core.c b/kernel/events/core.c index 0ceb38677..e6feb5114 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,7 +36,7 @@ #include <linux/kernel_stat.h> #include <linux/cgroup.h> #include <linux/perf_event.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/hw_breakpoint.h> 
#include <linux/mm_types.h> #include <linux/module.h> @@ -51,9 +51,11 @@ static struct workqueue_struct *perf_wq; +typedef int (*remote_function_f)(void *); + struct remote_function_call { struct task_struct *p; - int (*func)(void *info); + remote_function_f func; void *info; int ret; }; @@ -86,7 +88,7 @@ static void remote_function(void *data) * -EAGAIN - when the process moved away */ static int -task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, @@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) * * returns: @func return value or -ENXIO when the cpu is offline */ -static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, @@ -747,62 +749,31 @@ perf_cgroup_mark_enabled(struct perf_event *event, /* * function must be called with interrupts disbled */ -static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; - enum hrtimer_restart ret = HRTIMER_NORESTART; int rotations = 0; WARN_ON(!irqs_disabled()); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); - /* - * arm timer if needed - */ - if (rotations) { + raw_spin_lock(&cpuctx->hrtimer_lock); + if (rotations) hrtimer_forward_now(hr, cpuctx->hrtimer_interval); - ret = HRTIMER_RESTART; - } - - return ret; -} - -/* CPU is going down */ -void perf_cpu_hrtimer_cancel(int cpu) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - if (WARN_ON(cpu != smp_processor_id())) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - 
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - if (pmu->task_ctx_nr == perf_sw_context) - continue; - - hrtimer_cancel(&cpuctx->hrtimer); - } - - rcu_read_unlock(); + else + cpuctx->hrtimer_active = 0; + raw_spin_unlock(&cpuctx->hrtimer_lock); - local_irq_restore(flags); + return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } -static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) { - struct hrtimer *hr = &cpuctx->hrtimer; + struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; - int timer; + u64 interval; /* no multiplexing needed for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) @@ -812,31 +783,36 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) * check default is sane, if not set then force to * default interval (1/tick) */ - timer = pmu->hrtimer_interval_ms; - if (timer < 1) - timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + interval = pmu->hrtimer_interval_ms; + if (interval < 1) + interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - hr->function = perf_cpu_hrtimer_handler; + raw_spin_lock_init(&cpuctx->hrtimer_lock); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + timer->function = perf_mux_hrtimer_handler; } -static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) { - struct hrtimer *hr = &cpuctx->hrtimer; + struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; + unsigned long flags; /* not for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) - return; + return 0; - if (hrtimer_active(hr)) - return; + raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); + if 
(!cpuctx->hrtimer_active) { + cpuctx->hrtimer_active = 1; + hrtimer_forward_now(timer, cpuctx->hrtimer_interval); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + } + raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); - if (!hrtimer_callback_running(hr)) - __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, - 0, HRTIMER_MODE_REL_PINNED, 0); + return 0; } void perf_pmu_disable(struct pmu *pmu) @@ -1526,11 +1502,17 @@ static int __init perf_workqueue_init(void) core_initcall(perf_workqueue_init); +static inline int pmu_filter_match(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + return pmu->filter_match ? pmu->filter_match(event) : 1; +} + static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event); + && perf_cgroup_match(event) && pmu_filter_match(event); } static void @@ -1886,8 +1868,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - event->tstamp_running += tstamp - event->tstamp_stopped; - perf_set_shadow_time(event, ctx, tstamp); perf_log_itrace_start(event); @@ -1899,6 +1879,8 @@ event_sched_in(struct perf_event *event, goto out; } + event->tstamp_running += tstamp - event->tstamp_stopped; + if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -1935,7 +1917,7 @@ group_sched_in(struct perf_event *group_event, if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1982,7 +1964,7 @@ group_error: pmu->cancel_txn(pmu); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -2255,7 +2237,7 @@ static int __perf_event_enable(void *info) */ if (leader != event) { group_sched_out(leader, cpuctx, ctx); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); } if (leader->attr.pinned) { update_group_times(leader); @@ -3976,28 
+3958,21 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) -{ - struct perf_event_context *ctx = event->ctx; - int ret = 0, active; +struct period_event { + struct perf_event *event; u64 value; +}; - if (!is_sampling_event(event)) - return -EINVAL; - - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - - if (!value) - return -EINVAL; +static int __perf_event_period(void *info) +{ + struct period_event *pe = info; + struct perf_event *event = pe->event; + struct perf_event_context *ctx = event->ctx; + u64 value = pe->value; + bool active; - raw_spin_lock_irq(&ctx->lock); + raw_spin_lock(&ctx->lock); if (event->attr.freq) { - if (value > sysctl_perf_event_sample_rate) { - ret = -EINVAL; - goto unlock; - } - event->attr.sample_freq = value; } else { event->attr.sample_period = value; @@ -4016,11 +3991,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } + raw_spin_unlock(&ctx->lock); -unlock: + return 0; +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct period_event pe = { .event = event, }; + struct perf_event_context *ctx = event->ctx; + struct task_struct *task; + u64 value; + + if (!is_sampling_event(event)) + return -EINVAL; + + if (copy_from_user(&value, arg, sizeof(value))) + return -EFAULT; + + if (!value) + return -EINVAL; + + if (event->attr.freq && value > sysctl_perf_event_sample_rate) + return -EINVAL; + + task = ctx->task; + pe.value = value; + + if (!task) { + cpu_function_call(event->cpu, __perf_event_period, &pe); + return 0; + } + +retry: + if (!task_function_call(task, __perf_event_period, &pe)) + return 0; + + raw_spin_lock_irq(&ctx->lock); + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + task = ctx->task; + goto retry; + } + + __perf_event_period(&pe); 
raw_spin_unlock_irq(&ctx->lock); - return ret; + return 0; } static const struct file_operations perf_fops; @@ -4376,14 +4393,6 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_unlock(); } -static void rb_free_rcu(struct rcu_head *rcu_head) -{ - struct ring_buffer *rb; - - rb = container_of(rcu_head, struct ring_buffer, rcu_head); - rb_free(rb); -} - struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; @@ -4766,12 +4775,20 @@ static const struct file_operations perf_fops = { * to user-space before waking everybody up. */ +static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) +{ + /* only the parent has fasync state */ + if (event->parent) + event = event->parent; + return &event->fasync; +} + void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { - kill_fasync(&event->fasync, SIGIO, event->pending_kill); + kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } @@ -5381,9 +5398,9 @@ void perf_prepare_sample(struct perf_event_header *header, } } -static void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +void perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) { struct perf_output_handle handle; struct perf_event_header header; @@ -5812,7 +5829,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. 
*/ - name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); + name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; @@ -5975,6 +5992,39 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, } /* + * Lost/dropped samples logging + */ +void perf_log_lost_samples(struct perf_event *event, u64 lost) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + struct { + struct perf_event_header header; + u64 lost; + } lost_samples_event = { + .header = { + .type = PERF_RECORD_LOST_SAMPLES, + .misc = 0, + .size = sizeof(lost_samples_event), + }, + .lost = lost, + }; + + perf_event_header__init_id(&lost_samples_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + lost_samples_event.header.size); + if (ret) + return; + + perf_output_put(&handle, lost_samples_event); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +} + +/* * IRQ throttle logging */ @@ -6117,7 +6167,7 @@ static int __perf_event_overflow(struct perf_event *event, else perf_event_output(event, data, regs); - if (event->fasync && event->pending_kill) { + if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending); } @@ -6864,9 +6914,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) } else { period = max_t(u64, 10000, hwc->sample_period); } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), + HRTIMER_MODE_REL_PINNED); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) @@ -7167,6 +7216,8 @@ perf_event_mux_interval_ms_show(struct device *dev, return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); } +static DEFINE_MUTEX(mux_interval_mutex); + static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, @@ 
-7186,17 +7237,21 @@ perf_event_mux_interval_ms_store(struct device *dev, if (timer == pmu->hrtimer_interval_ms) return count; + mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - for_each_possible_cpu(cpu) { + get_online_cpus(); + for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - if (hrtimer_active(&cpuctx->hrtimer)) - hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + cpu_function_call(cpu, + (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } + put_online_cpus(); + mutex_unlock(&mux_interval_mutex); return count; } @@ -7301,7 +7356,7 @@ skip_type: lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; - __perf_cpu_hrtimer_init(cpuctx, cpu); + __perf_mux_hrtimer_init(cpuctx, cpu); cpuctx->unique_pmu = pmu; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 9f6ce9ba4..2bbad9c12 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,6 +11,7 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; + struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ @@ -55,6 +56,15 @@ struct ring_buffer { }; extern void rb_free(struct ring_buffer *rb); + +static inline void rb_free_rcu(struct rcu_head *rcu_head) +{ + struct ring_buffer *rb; + + rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb_free(rb); +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); @@ -72,15 +82,6 @@ static inline bool rb_has_aux(struct ring_buffer *rb) void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags); -extern void -perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - 
struct perf_event *event); -extern void -perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample); - extern struct page * perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 725c41608..c8aa3f75b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle, perf_output_get_handle(handle); do { - tail = ACCESS_ONCE(rb->user_page->data_tail); + tail = READ_ONCE_CTRL(rb->user_page->data_tail); offset = head = local_read(&rb->head); if (!rb->overwrite && unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) @@ -221,6 +221,8 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } +static void rb_irq_work(struct irq_work *work); + static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -241,6 +243,16 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); + init_irq_work(&rb->irq_work, rb_irq_work); +} + +static void ring_buffer_put_async(struct ring_buffer *rb) +{ + if (!atomic_dec_and_test(&rb->refcount)) + return; + + rb->rcu_head.next = (void *)rb; + irq_work_queue(&rb->irq_work); } /* @@ -319,7 +331,7 @@ err_put: rb_free_aux(rb); err: - ring_buffer_put(rb); + ring_buffer_put_async(rb); handle->event = NULL; return NULL; @@ -370,7 +382,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, local_set(&rb->aux_nest, 0); rb_free_aux(rb); - ring_buffer_put(rb); + ring_buffer_put_async(rb); } /* @@ -547,17 +559,30 @@ static void __rb_free_aux(struct ring_buffer *rb) rb->aux_priv = NULL; } - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); - 
kfree(rb->aux_pages); - rb->aux_nr_pages = 0; + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } } void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) + irq_work_queue(&rb->irq_work); +} + +static void rb_irq_work(struct irq_work *work) +{ + struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); + + if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); + + if (rb->rcu_head.next == (void *)rb) + call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/exit.c b/kernel/exit.c index 490a707c7..031325e9a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -135,7 +135,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); @@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - unmark_oom_victim(); + exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -711,10 +711,10 @@ void do_exit(long code) current->comm, task_pid_nr(current), preempt_count()); - acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index e37f372d3..d6dfe2c23 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -138,7 +138,7 @@ static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) { - return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node); + 
return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } static inline void free_task_struct(struct task_struct *tsk) @@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested) max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +/* Initialized by the architecture: */ +int arch_task_struct_size __read_mostly; +#endif + void __init fork_init(void) { #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR @@ -295,7 +300,7 @@ void __init fork_init(void) #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = - kmem_cache_create("task_struct", sizeof(struct task_struct), + kmem_cache_create("task_struct", arch_task_struct_size, ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif @@ -456,7 +461,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; - vma_get_file(tmp); + get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); i_mmap_lock_write(mapping); @@ -1091,10 +1096,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) { unsigned long cpu_limit; - /* Thread group counters. 
*/ - thread_group_cputime_init(sig); - - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputimer.running = 1; @@ -1144,10 +1146,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); -#ifdef CONFIG_CGROUPS - init_rwsem(&sig->group_rwsem); -#endif - sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1241,7 +1239,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, - int trace) + int trace, + unsigned long tls) { int retval; struct task_struct *p; @@ -1396,6 +1395,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif + + p->pagefault_disabled = 0; + #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; @@ -1447,7 +1449,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p); + retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); if (retval) goto bad_fork_cleanup_io; @@ -1659,7 +1661,7 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1674,11 +1676,12 @@ struct task_struct *fork_idle(int cpu) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. 
*/ -long do_fork(unsigned long clone_flags, +long _do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr) + int __user *child_tidptr, + unsigned long tls) { struct task_struct *p; int trace = 0; @@ -1703,7 +1706,7 @@ long do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace); + child_tidptr, NULL, trace, tls); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1744,20 +1747,34 @@ long do_fork(unsigned long clone_flags, return nr; } +#ifndef CONFIG_HAVE_COPY_THREAD_TLS +/* For compatibility with architectures that call do_fork directly rather than + * using the syscall entry points below. */ +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + return _do_fork(clone_flags, stack_start, stack_size, + parent_tidptr, child_tidptr, 0); +} +#endif + /* * Create a kernel thread. 
*/ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, - (unsigned long)arg, NULL, NULL); + return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, + (unsigned long)arg, NULL, NULL, 0); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return do_fork(SIGCHLD, 0, 0, NULL, NULL); + return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); #else /* can not support in nommu mode */ return -EINVAL; @@ -1768,8 +1785,8 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, - 0, NULL, NULL); + return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + 0, NULL, NULL, 0); } #endif @@ -1777,27 +1794,27 @@ SYSCALL_DEFINE0(vfork) #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, - int, tls_val, + unsigned long, tls, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #endif { - return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); + return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); } #endif diff --git a/kernel/futex.c b/kernel/futex.c index 2579e407f..c4a182f53 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q) /* * The hash bucket lock must 
be held when this is called. - * Afterwards, the futex_q must not be accessed. + * Afterwards, the futex_q must not be accessed. Callers + * must ensure to later call wake_up_q() for the actual + * wakeups to occur. */ -static void wake_futex(struct futex_q *q) +static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; @@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q) return; /* - * We set q->lock_ptr = NULL _before_ we wake up the task. If - * a non-futex wake up happens on another CPU then the task - * might exit and p would dereference a non-existing task - * struct. Prevent this by holding a reference on p across the - * wake up. + * Queue the task for later wakeup for after we've released + * the hb->lock. wake_q_add() grabs reference to p. */ - get_task_struct(p); - + wake_q_add(wake_q, p); __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as @@ -1117,16 +1115,16 @@ static void wake_futex(struct futex_q *q) */ smp_wmb(); q->lock_ptr = NULL; - - wake_up_state(p, TASK_NORMAL); - put_task_struct(p); } -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, + struct futex_hash_bucket *hb) { struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + WAKE_Q(wake_q); + bool deboost; int ret = 0; if (!pi_state) @@ -1178,7 +1176,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) raw_spin_unlock_irq(&new_owner->pi_lock); raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - rt_mutex_unlock(&pi_state->pi_mutex); + + deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + + /* + * First unlock HB so the waiter does not spin on it once he got woken + * up. Second wake up the waiter before the priority is adjusted. 
If we + * deboost first (and lose our higher priority), then the task might get + * scheduled away before the wake up can take place. + */ + spin_unlock(&hb->lock); + wake_up_q(&wake_q); + if (deboost) + rt_mutex_adjust_prio(current); return 0; } @@ -1217,6 +1227,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; int ret; + WAKE_Q(wake_q); if (!bitset) return -EINVAL; @@ -1244,13 +1255,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!(this->bitset & bitset)) continue; - wake_futex(this); + mark_wake_futex(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); + wake_up_q(&wake_q); out_put_key: put_futex_key(&key); out: @@ -1269,6 +1281,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; + WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); @@ -1320,7 +1333,7 @@ retry_private: ret = -EINVAL; goto out_unlock; } - wake_futex(this); + mark_wake_futex(&wake_q, this); if (++ret >= nr_wake) break; } @@ -1334,7 +1347,7 @@ retry_private: ret = -EINVAL; goto out_unlock; } - wake_futex(this); + mark_wake_futex(&wake_q, this); if (++op_ret >= nr_wake2) break; } @@ -1344,6 +1357,7 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); out_put_keys: put_futex_key(&key2); out_put_key1: @@ -1503,6 +1517,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; + WAKE_Q(wake_q); if (requeue_pi) { /* @@ -1679,7 +1694,7 @@ retry_private: * woken by futex_unlock_pi(). 
*/ if (++task_count <= nr_wake && !requeue_pi) { - wake_futex(this); + mark_wake_futex(&wake_q, this); continue; } @@ -1719,6 +1734,7 @@ retry_private: out_unlock: free_pi_state(pi_state); double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); hb_waiters_dec(hb2); /* @@ -2055,7 +2071,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, { /* * The task state is guaranteed to be set before another task can - * wake it. set_current_state() is implemented using set_mb() and + * wake it. set_current_state() is implemented using smp_store_mb() and * queue_me() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ @@ -2063,11 +2079,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, queue_me(q, hb); /* Arm the timer */ - if (timeout) { + if (timeout) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } /* * If we have been removed from the hash list, then another task @@ -2412,13 +2425,23 @@ retry: */ match = futex_top_waiter(hb, &key); if (match) { - ret = wake_futex_pi(uaddr, uval, match); + ret = wake_futex_pi(uaddr, uval, match, hb); + /* + * In case of success wake_futex_pi dropped the hash + * bucket lock. + */ + if (!ret) + goto out_putkey; /* * The atomic access to the futex value generated a * pagefault, so retry the user-access and the wakeup: */ if (ret == -EFAULT) goto pi_faulted; + /* + * wake_futex_pi has detected invalid state. Tell user + * space. 
+ */ goto out_unlock; } @@ -2439,6 +2462,7 @@ retry: out_unlock: spin_unlock(&hb->lock); +out_putkey: put_futex_key(&key); return ret; diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index a744098e4..7080ae1eb 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -92,6 +92,12 @@ void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_time_profile); +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 826ba9fb5..e25e92fb4 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 +#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#define GCOV_COUNTERS 10 +#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 #else #define GCOV_COUNTERS 8 diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index eb9a4ea39..ae216824e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -719,15 +719,9 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) } void -__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, - const char *name) +__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, + int is_chained, const char *name) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; - if (!handle) { handle = handle_bad_irq; } else { @@ -749,13 +743,13 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, * right away. */ if (WARN_ON(is_chained)) - goto out; + return; /* Try the parent */ irq_data = irq_data->parent_data; } #endif if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) - goto out; + return; } /* Uninstall? 
*/ @@ -774,12 +768,41 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_nothread(desc); irq_startup(desc, true); } -out: +} + +void +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); + + if (!desc) + return; + + __irq_do_set_handler(desc, handle, is_chained, name); irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(__irq_set_handler); void +irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, + void *data) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); + + if (!desc) + return; + + __irq_do_set_handler(desc, handle, 1, NULL); + desc->irq_data.handler_data = data; + + irq_put_desc_busunlock(desc, flags); +} +EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); + +void irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { @@ -876,6 +899,34 @@ void irq_cpu_offline(void) #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** + * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if + * NULL) + * @data: Pointer to interrupt specific data + */ +void irq_chip_enable_parent(struct irq_data *data) +{ + data = data->parent_data; + if (data->chip->irq_enable) + data->chip->irq_enable(data); + else + data->chip->irq_unmask(data); +} + +/** + * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if + * NULL) + * @data: Pointer to interrupt specific data + */ +void irq_chip_disable_parent(struct irq_data *data) +{ + data = data->parent_data; + if (data->chip->irq_disable) + data->chip->irq_disable(data); + else + data->chip->irq_mask(data); +} + +/** * irq_chip_ack_parent - Acknowledge the parent interrupt * @data: Pointer to interrupt specific data */ @@ -934,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data, } /** 
+ * irq_chip_set_type_parent - Set IRQ type on the parent interrupt + * @data: Pointer to interrupt specific data + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h + * + * Conditional, as the underlying parent chip might not implement it. + */ +int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) +{ + data = data->parent_data; + + if (data->chip->irq_set_type) + return data->chip->irq_set_type(data, type); + + return -ENOSYS; +} + +/** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware * @data: Pointer to interrupt specific data * @@ -946,6 +1014,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) if (data->chip && data->chip->irq_retrigger) return data->chip->irq_retrigger(data); + return 0; +} + +/** + * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt + * @data: Pointer to interrupt specific data + * @dest: The vcpu affinity information + */ +int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) +{ + data = data->parent_data; + if (data->chip->irq_set_vcpu_affinity) + return data->chip->irq_set_vcpu_affinity(data, vcpu_info); + return -ENOSYS; } diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 2feb6feca..326a67f24 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -42,6 +42,7 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, + .flags = IRQCHIP_SKIP_SET_WAKE, }; /* diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 61024e8ab..15b370daf 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -360,7 +360,7 @@ static struct lock_class_key irq_nested_lock_class; int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw_irq) { - struct irq_data *data = irq_get_irq_data(virq); + struct irq_data *data = irq_domain_get_irq_data(d, virq); struct irq_domain_chip_generic *dgc = d->gc; struct irq_chip_generic *gc; 
struct irq_chip_type *ct; @@ -405,8 +405,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, else data->mask = 1 << idx; - irq_set_chip_and_handler(virq, chip, ct->handler); - irq_set_chip_data(virq, gc); + irq_domain_set_info(d, virq, hw_irq, chip, gc, ct->handler, NULL, NULL); irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index df553b0af..61008b843 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -59,8 +59,6 @@ enum { #include "debug.h" #include "settings.h" -#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq); @@ -78,12 +76,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } -extern void irq_lock_sparse(void); -extern void irq_unlock_sparse(void); #else extern void irq_mark_irq(unsigned int irq); -static inline void irq_lock_sparse(void) { } -static inline void irq_unlock_sparse(void) { } #endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); @@ -170,27 +164,27 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) */ static inline void irqd_set_move_pending(struct irq_data *d) { - d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; + __irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING; } static inline void irqd_clr_move_pending(struct irq_data *d) { - d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; + __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; } static inline void irqd_clear(struct irq_data *d, unsigned int mask) { - d->state_use_accessors &= ~mask; + __irqd_to_state(d) &= ~mask; } static inline void irqd_set(struct irq_data *d, unsigned int mask) { - d->state_use_accessors |= mask; + __irqd_to_state(d) |= mask; } 
static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { - return d->state_use_accessors & mask; + return __irqd_to_state(d) & mask; } static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) @@ -199,6 +193,11 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d __this_cpu_inc(kstat.irqs_sum); } +static inline int irq_desc_get_node(struct irq_desc *desc) +{ + return irq_data_get_node(&desc->irq_data); +} + #ifdef CONFIG_PM_SLEEP bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 99793b9b6..4afc45761 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -59,16 +59,10 @@ static void desc_smp_init(struct irq_desc *desc, int node) #endif } -static inline int desc_node(struct irq_desc *desc) -{ - return desc->irq_data.node; -} - #else static inline int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node) { } -static inline int desc_node(struct irq_desc *desc) { return 0; } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, @@ -76,6 +70,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, { int cpu; + desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -299,7 +294,7 @@ static void free_desc(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc), NULL); + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); } @@ -619,7 +614,7 @@ unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); int cpu; - int sum = 0; + unsigned int sum = 0; if (!desc || 
!desc->kstat_irqs) return 0; @@ -639,7 +634,7 @@ unsigned int kstat_irqs(unsigned int irq) */ unsigned int kstat_irqs_usr(unsigned int irq) { - int sum; + unsigned int sum; irq_lock_sparse(); sum = kstat_irqs(irq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 7fac31105..8c3577fef 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -830,10 +830,12 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, { struct irq_data *irq_data; - irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node); + irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, + irq_data_get_node(child)); if (irq_data) { child->parent_data = irq_data; irq_data->irq = child->irq; + irq_data->common = child->common; irq_data->node = child->node; irq_data->domain = domain; } @@ -1232,6 +1234,27 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } +/** + * irq_domain_set_info - Set the complete data for a @virq in @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hardware interrupt number + * @chip: The associated interrupt chip + * @chip_data: The associated interrupt chip data + * @handler: The interrupt flow handler + * @handler_data: The interrupt flow handler data + * @handler_name: The interrupt handler name + */ +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name) +{ + irq_set_chip_and_handler_name(virq, chip, handler, handler_name); + irq_set_chip_data(virq, chip_data); + irq_set_handler_data(virq, handler_data); +} + static void irq_domain_check_hierarchy(struct irq_domain *domain) { } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e68932bb3..f9744853b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -256,6 +256,37 
@@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +/** + * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt + * @irq: interrupt number to set affinity + * @vcpu_info: vCPU specific data + * + * This function uses the vCPU specific data to set the vCPU + * affinity for an irq. The vCPU specific data is passed from + * outside, such as KVM. One example code path is as below: + * KVM -> IOMMU -> irq_set_vcpu_affinity(). + */ +int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + struct irq_data *data; + struct irq_chip *chip; + int ret = -ENOSYS; + + if (!desc) + return -EINVAL; + + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + ret = chip->irq_set_vcpu_affinity(data, vcpu_info); + irq_put_desc_unlock(desc, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); + static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = @@ -332,7 +363,7 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { struct cpumask *set = irq_default_affinity; - int node = desc->irq_data.node; + int node = irq_desc_get_node(desc); /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index ca3f4aaff..37ddb7bda 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,21 +7,21 @@ void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = idata->chip; + struct irq_chip *chip = desc->irq_data.chip; if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; + irqd_clr_move_pending(&desc->irq_data); + /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 
*/ - if (!irqd_can_balance(&desc->irq_data)) { + if (irqd_is_per_cpu(&desc->irq_data)) { WARN_ON(1); return; } - irqd_clr_move_pending(&desc->irq_data); - if (unlikely(cpumask_empty(desc->pending_mask))) return; @@ -52,6 +52,13 @@ void irq_move_irq(struct irq_data *idata) { bool masked; + /* + * Get top level irq_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled, + * and it should be optimized away when CONFIG_IRQ_DOMAIN_HIERARCHY is + * disabled. So we avoid an "#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY" here. + */ + idata = irq_desc_get_irq_data(irq_data_to_desc(idata)); + if (likely(!irqd_is_setaffinity_pending(idata))) return; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 474de5cb3..7bf1f1bbb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -124,7 +124,7 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } -static struct irq_domain_ops msi_domain_ops = { +static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 5204a6d1b..d22786a6d 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -123,6 +123,8 @@ void suspend_device_irqs(void) unsigned long flags; bool sync; + if (irq_settings_is_nested_thread(desc)) + continue; raw_spin_lock_irqsave(&desc->lock, flags); sync = suspend_device_irq(desc, irq); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -163,6 +165,8 @@ static void resume_irqs(bool want_early) if (!is_early && want_early) continue; + if (irq_settings_is_nested_thread(desc)) + continue; raw_spin_lock_irqsave(&desc->lock, flags); resume_irq(desc, irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index df2f4642d..0e97c142c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -241,7 +241,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); - seq_printf(m, 
"%d\n", desc->irq_data.node); + seq_printf(m, "%d\n", irq_desc_get_node(desc)); return 0; } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 9019f15de..52ebaca1b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -302,7 +302,7 @@ static int jump_label_add_module(struct module *mod) continue; key = iterk; - if (__module_address(iter->key) == mod) { + if (within_module(iter->key, mod)) { /* * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. */ @@ -339,7 +339,7 @@ static void jump_label_del_module(struct module *mod) key = (struct static_key *)(unsigned long)iter->key; - if (__module_address(iter->key) == mod) + if (within_module(iter->key, mod)) continue; prev = &key->next; @@ -443,14 +443,16 @@ static void jump_label_update(struct static_key *key, int enable) { struct jump_entry *stop = __stop___jump_table; struct jump_entry *entry = jump_label_get_entries(key); - #ifdef CONFIG_MODULES - struct module *mod = __module_address((unsigned long)key); + struct module *mod; __jump_label_mod_update(key, enable); + preempt_disable(); + mod = __module_address((unsigned long)key); if (mod) stop = mod->jump_entries + mod->num_jump_entries; + preempt_enable(); #endif /* if there are no users, entry can be NULL */ if (entry) diff --git a/kernel/kexec.c b/kernel/kexec.c index 7a36fdcca..a785c1015 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -84,6 +84,17 @@ struct resource crashk_low_res = { int kexec_should_crash(struct task_struct *p) { + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in do_exit() path, each of which + * corresponds to each of these 4 conditions. 
+ */ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) return 1; return 0; diff --git a/kernel/kthread.c b/kernel/kthread.c index c4237f12c..fdea0bee7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -97,6 +97,7 @@ bool kthread_should_park(void) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); } +EXPORT_SYMBOL_GPL(kthread_should_park); /** * kthread_freezable_should_stop - should this freezable kthread return now? @@ -171,6 +172,7 @@ void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } +EXPORT_SYMBOL_GPL(kthread_parkme); static int kthread(void *_create) { @@ -272,7 +274,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), - GFP_KERNEL | ___GFP_TOI_NOTRACK); + GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); @@ -411,6 +413,7 @@ void kthread_unpark(struct task_struct *k) if (kthread) __kthread_unpark(k, kthread); } +EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). @@ -441,6 +444,7 @@ int kthread_park(struct task_struct *k) } return ret; } +EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). 
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 9ec555732..c40ebcca0 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -128,7 +128,7 @@ static bool klp_is_patch_registered(struct klp_patch *patch) static bool klp_initialized(void) { - return klp_root_kobj; + return !!klp_root_kobj; } struct klp_find_arg { @@ -242,8 +242,9 @@ static int klp_find_verify_func_addr(struct klp_object *obj, int ret; #if defined(CONFIG_RANDOMIZE_BASE) - /* KASLR is enabled, disregard old_addr from user */ - func->old_addr = 0; + /* If KASLR has been enabled, adjust old_addr accordingly */ + if (kaslr_enabled() && func->old_addr) + func->old_addr += kaslr_offset(); #endif if (!func->old_addr || klp_is_module(obj)) @@ -430,7 +431,7 @@ static void klp_disable_object(struct klp_object *obj) { struct klp_func *func; - for (func = obj->funcs; func->old_name; func++) + klp_for_each_func(obj, func) if (func->state == KLP_ENABLED) klp_disable_func(func); @@ -448,7 +449,7 @@ static int klp_enable_object(struct klp_object *obj) if (WARN_ON(!klp_is_object_loaded(obj))) return -EINVAL; - for (func = obj->funcs; func->old_name; func++) { + klp_for_each_func(obj, func) { ret = klp_enable_func(func); if (ret) { klp_disable_object(obj); @@ -471,7 +472,7 @@ static int __klp_disable_patch(struct klp_patch *patch) pr_notice("disabling patch '%s'\n", patch->mod->name); - for (obj = patch->objs; obj->funcs; obj++) { + klp_for_each_object(patch, obj) { if (obj->state == KLP_ENABLED) klp_disable_object(obj); } @@ -531,7 +532,7 @@ static int __klp_enable_patch(struct klp_patch *patch) pr_notice("enabling patch '%s'\n", patch->mod->name); - for (obj = patch->objs; obj->funcs; obj++) { + klp_for_each_object(patch, obj) { if (!klp_is_object_loaded(obj)) continue; @@ -659,6 +660,15 @@ static struct kobj_type klp_ktype_patch = { .default_attrs = klp_patch_attrs, }; +static void klp_kobj_release_object(struct kobject *kobj) +{ +} + +static struct kobj_type klp_ktype_object = { + 
.release = klp_kobj_release_object, + .sysfs_ops = &kobj_sysfs_ops, +}; + static void klp_kobj_release_func(struct kobject *kobj) { } @@ -688,7 +698,7 @@ static void klp_free_object_loaded(struct klp_object *obj) obj->mod = NULL; - for (func = obj->funcs; func->old_name; func++) + klp_for_each_func(obj, func) func->old_addr = 0; } @@ -703,7 +713,7 @@ static void klp_free_objects_limited(struct klp_patch *patch, for (obj = patch->objs; obj->funcs && obj != limit; obj++) { klp_free_funcs_limited(obj, NULL); - kobject_put(obj->kobj); + kobject_put(&obj->kobj); } } @@ -721,7 +731,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->state = KLP_DISABLED; return kobject_init_and_add(&func->kobj, &klp_ktype_func, - obj->kobj, "%s", func->old_name); + &obj->kobj, "%s", func->old_name); } /* parts of the initialization that is done only when the object is loaded */ @@ -737,7 +747,7 @@ static int klp_init_object_loaded(struct klp_patch *patch, return ret; } - for (func = obj->funcs; func->old_name; func++) { + klp_for_each_func(obj, func) { ret = klp_find_verify_func_addr(obj, func); if (ret) return ret; @@ -761,11 +771,12 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) klp_find_object_module(obj); name = klp_is_module(obj) ? 
obj->name : "vmlinux"; - obj->kobj = kobject_create_and_add(name, &patch->kobj); - if (!obj->kobj) - return -ENOMEM; + ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, + &patch->kobj, "%s", name); + if (ret) + return ret; - for (func = obj->funcs; func->old_name; func++) { + klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); if (ret) goto free; @@ -781,7 +792,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) free: klp_free_funcs_limited(obj, func); - kobject_put(obj->kobj); + kobject_put(&obj->kobj); return ret; } @@ -802,7 +813,7 @@ static int klp_init_patch(struct klp_patch *patch) if (ret) goto unlock; - for (obj = patch->objs; obj->funcs; obj++) { + klp_for_each_object(patch, obj) { ret = klp_init_object(patch, obj); if (ret) goto free; @@ -891,7 +902,7 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); -static void klp_module_notify_coming(struct klp_patch *patch, +static int klp_module_notify_coming(struct klp_patch *patch, struct klp_object *obj) { struct module *pmod = patch->mod; @@ -899,22 +910,23 @@ static void klp_module_notify_coming(struct klp_patch *patch, int ret; ret = klp_init_object_loaded(patch, obj); - if (ret) - goto err; + if (ret) { + pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", + pmod->name, mod->name, ret); + return ret; + } if (patch->state == KLP_DISABLED) - return; + return 0; pr_notice("applying patch '%s' to loading module '%s'\n", pmod->name, mod->name); ret = klp_enable_object(obj); - if (!ret) - return; - -err: - pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", - pmod->name, mod->name, ret); + if (ret) + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + pmod->name, mod->name, ret); + return ret; } static void klp_module_notify_going(struct klp_patch *patch, @@ -938,6 +950,7 @@ disabled: static int klp_module_notify(struct notifier_block *nb, unsigned long action, void *data) { + int ret; struct 
module *mod = data; struct klp_patch *patch; struct klp_object *obj; @@ -957,13 +970,18 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action, mod->klp_alive = false; list_for_each_entry(patch, &klp_patches, list) { - for (obj = patch->objs; obj->funcs; obj++) { + klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; if (action == MODULE_STATE_COMING) { obj->mod = mod; - klp_module_notify_coming(patch, obj); + ret = klp_module_notify_coming(patch, obj); + if (ret) { + obj->mod = NULL; + pr_warn("patch '%s' is in an inconsistent state!\n", + patch->mod->name); + } } else /* MODULE_STATE_GOING */ klp_module_notify_going(patch, obj); @@ -981,7 +999,7 @@ static struct notifier_block klp_module_nb = { .priority = INT_MIN+1, /* called late but before ftrace notifier */ }; -static int klp_init(void) +static int __init klp_init(void) { int ret; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index de7a416cc..7dd5c9918 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o @@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o -obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o +obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c index 86ae2aebf..951cfcd10 100644 --- a/kernel/locking/lglock.c +++ b/kernel/locking/lglock.c @@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct 
lglock *lg, int cpu) } EXPORT_SYMBOL(lg_local_unlock_cpu); +void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) +{ + BUG_ON(cpu1 == cpu2); + + /* lock in cpu order, just like lg_global_lock */ + if (cpu2 < cpu1) + swap(cpu1, cpu2); + + preempt_disable(); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); + arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); + arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); +} + +void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) +{ + lock_release(&lg->lock_dep_map, 1, _RET_IP_); + arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); + arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); + preempt_enable(); +} + void lg_global_lock(struct lglock *lg) { int i; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index aaeae885d..8acfbf773 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->waittime_stamp = 0; hlock->holdtime_stamp = lockstat_clock(); #endif + hlock->pin_count = 0; if (check && !mark_irqflags(curr, hlock)) return 0; @@ -3260,26 +3261,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; } -/* - * Common debugging checks for both nested and non-nested unlock: - */ -static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (unlikely(!debug_locks)) - return 0; - /* - * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (curr->lockdep_depth <= 0) - return print_unlock_imbalance_bug(curr, lock, ip); - - return 1; -} - static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) { if (hlock->instance == lock) @@ -3376,31 +3357,35 @@ found_it: } /* - * Remove the lock to the list of currently held locks in a - * potentially non-nested (out of order) manner. 
This is a - * relatively rare operation, as all the unlock APIs default - * to nested mode (which uses lock_release()): + * Remove the lock to the list of currently held locks - this gets + * called on mutex_unlock()/spin_unlock*() (or on a failed + * mutex_lock_interruptible()). + * + * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int -lock_release_non_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) +__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { + struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; unsigned int depth; int i; - /* - * Check whether the lock exists in the current stack - * of held locks: - */ + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return 0; + if (DEBUG_LOCKS_WARN_ON(depth <= 0)) + return print_unlock_imbalance_bug(curr, lock, ip); + /* + * Check whether the lock exists in the current stack + * of held locks: + */ prev_hlock = NULL; for (i = depth-1; i >= 0; i--) { hlock = curr->held_locks + i; @@ -3419,6 +3404,8 @@ found_it: if (hlock->instance == lock) lock_release_holdtime(hlock); + WARN(hlock->pin_count, "releasing a pinned lock\n"); + if (hlock->references) { hlock->references--; if (hlock->references) { @@ -3456,91 +3443,66 @@ found_it: */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) return 0; + return 1; } -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. 
the current top of the lock-stack is unlocked) - */ -static int lock_release_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) +static int __lock_is_held(struct lockdep_map *lock) { - struct held_lock *hlock; - unsigned int depth; - - /* - * Pop off the top of the lock stack: - */ - depth = curr->lockdep_depth - 1; - hlock = curr->held_locks + depth; - - /* - * Is the unlock non-nested: - */ - if (hlock->instance != lock || hlock->references) - return lock_release_non_nested(curr, lock, ip); - curr->lockdep_depth--; - - /* - * No more locks, but somehow we've got hash left over, who left it? - */ - if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) - return 0; + struct task_struct *curr = current; + int i; - curr->curr_chain_key = hlock->prev_chain_key; + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; - lock_release_holdtime(hlock); + if (match_held_lock(hlock, lock)) + return 1; + } -#ifdef CONFIG_DEBUG_LOCKDEP - hlock->prev_chain_key = 0; - hlock->class_idx = 0; - hlock->acquire_ip = 0; - hlock->irq_context = 0; -#endif - return 1; + return 0; } -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. 
the current top of the lock-stack is unlocked) - */ -static void -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +static void __lock_pin_lock(struct lockdep_map *lock) { struct task_struct *curr = current; + int i; - if (!check_unlock(curr, lock, ip)) + if (unlikely(!debug_locks)) return; - if (nested) { - if (!lock_release_nested(curr, lock, ip)) - return; - } else { - if (!lock_release_non_nested(curr, lock, ip)) + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) { + hlock->pin_count++; return; + } } - check_chain_key(curr); + WARN(1, "pinning an unheld lock\n"); } -static int __lock_is_held(struct lockdep_map *lock) +static void __lock_unpin_lock(struct lockdep_map *lock) { struct task_struct *curr = current; int i; + if (unlikely(!debug_locks)) + return; + for (i = 0; i < curr->lockdep_depth; i++) { struct held_lock *hlock = curr->held_locks + i; - if (match_held_lock(hlock, lock)) - return 1; + if (match_held_lock(hlock, lock)) { + if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n")) + return; + + hlock->pin_count--; + return; + } } - return 0; + WARN(1, "unpinning an unheld lock\n"); } /* @@ -3639,7 +3601,8 @@ void lock_release(struct lockdep_map *lock, int nested, check_flags(flags); current->lockdep_recursion = 1; trace_lock_release(lock, ip); - __lock_release(lock, nested, ip); + if (__lock_release(lock, nested, ip)) + check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -3665,6 +3628,40 @@ int lock_is_held(struct lockdep_map *lock) } EXPORT_SYMBOL_GPL(lock_is_held); +void lock_pin_lock(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_pin_lock(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} 
+EXPORT_SYMBOL_GPL(lock_pin_lock); + +void lock_unpin_lock(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_unpin_lock(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_unpin_lock); + void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { current->lockdep_reclaim_gfp = gfp_mask; @@ -4067,8 +4064,7 @@ void __init lockdep_info(void) #ifdef CONFIG_DEBUG_LOCKDEP if (lockdep_init_error) { - printk("WARNING: lockdep init error! lock-%s was acquired" - "before lockdep_init\n", lock_init_error); + printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); printk("Call stack leading to lockdep invocation was:\n"); print_stack_trace(&lockdep_init_trace, 0); } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ec8cce259..32244186f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -122,12 +122,12 @@ static int torture_lock_busted_write_lock(void) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_us = 100; + const unsigned long longdelay_ms = 100; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_us))) - mdelay(longdelay_us); + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); #ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. 
*/ @@ -160,14 +160,14 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_us = 100; + const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_us))) - mdelay(longdelay_us); + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); @@ -309,7 +309,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) static void torture_rwlock_read_unlock_irq(void) __releases(torture_rwlock) { - write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); + read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); } static struct lock_torture_ops rw_lock_irq_ops = { diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 75e114bdf..fd91aaa45 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -17,6 +17,7 @@ struct mcs_spinlock { struct mcs_spinlock *next; int locked; /* 1 if lock acquired */ + int count; /* nesting count, see qspinlock.c */ }; #ifndef arch_mcs_spin_lock_contended diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index f956ede7f..6c5da4839 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -1,5 +1,5 @@ /* - * Queue read/write lock + * Queued read/write locks * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,6 +22,26 @@ #include <linux/hardirq.h> #include <asm/qrwlock.h> +/* + * This internal data structure is used for optimizing access to some of + * the subfields within the 
atomic_t cnts. + */ +struct __qrwlock { + union { + atomic_t cnts; + struct { +#ifdef __LITTLE_ENDIAN + u8 wmode; /* Writer mode */ + u8 rcnts[3]; /* Reader counts */ +#else + u8 rcnts[3]; /* Reader counts */ + u8 wmode; /* Writer mode */ +#endif + }; + }; + arch_spinlock_t lock; +}; + /** * rspin_until_writer_unlock - inc reader count & spin until writer is gone * @lock : Pointer to queue rwlock structure @@ -107,10 +127,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock) * or wait for a previous writer to go away. */ for (;;) { - cnts = atomic_read(&lock->cnts); - if (!(cnts & _QW_WMASK) && - (atomic_cmpxchg(&lock->cnts, cnts, - cnts | _QW_WAITING) == cnts)) + struct __qrwlock *l = (struct __qrwlock *)lock; + + if (!READ_ONCE(l->wmode) && + (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax_lowlatency(); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c new file mode 100644 index 000000000..38c49202d --- /dev/null +++ b/kernel/locking/qspinlock.c @@ -0,0 +1,473 @@ +/* + * Queued spinlock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. + * (C) Copyright 2013-2014 Red Hat, Inc. + * (C) Copyright 2015 Intel Corp. 
+ * + * Authors: Waiman Long <waiman.long@hp.com> + * Peter Zijlstra <peterz@infradead.org> + */ + +#ifndef _GEN_PV_LOCK_SLOWPATH + +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <asm/byteorder.h> +#include <asm/qspinlock.h> + +/* + * The basic principle of a queue-based spinlock can best be understood + * by studying a classic queue-based spinlock implementation called the + * MCS lock. The paper below provides a good description for this kind + * of lock. + * + * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf + * + * This queued spinlock implementation is based on the MCS lock, however to make + * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing + * API, we must modify it somehow. + * + * In particular; where the traditional MCS lock consists of a tail pointer + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to + * unlock the next pending (next->locked), we compress both these: {tail, + * next->locked} into a single u32 value. + * + * Since a spinlock disables recursion of its own context and there is a limit + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there + * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now + * we can encode the tail by combining the 2-bit nesting level with the cpu + * number. With one byte for the lock value and 3 bytes for the tail, only a + * 32-bit word is now needed. Even though we only need 1 bit for the lock, + * we extend it to a full byte to achieve better performance for architectures + * that support atomic byte write. + * + * We also change the first spinner to spin on the lock bit instead of its + * node; whereby avoiding the need to carry a node from lock to unlock, and + * preserving existing lock API. This also makes the unlock code simpler and + * faster. + * + * N.B. 
The current implementation only supports architectures that allow + * atomic operations on smaller 8-bit and 16-bit data types. + * + */ + +#include "mcs_spinlock.h" + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define MAX_NODES 8 +#else +#define MAX_NODES 4 +#endif + +/* + * Per-CPU queue node structures; we can never have more than 4 nested + * contexts: task, softirq, hardirq, nmi. + * + * Exactly fits one 64-byte cacheline on a 64-bit architecture. + * + * PV doubles the storage and uses the second cacheline for PV state. + */ +static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. + */ + +static inline u32 encode_tail(int cpu, int idx) +{ + u32 tail; + +#ifdef CONFIG_DEBUG_SPINLOCK + BUG_ON(idx > 3); +#endif + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline struct mcs_spinlock *decode_tail(u32 tail) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&mcs_nodes[idx], cpu); +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/* + * By using the whole 2nd least significant byte for the pending bit, we + * can allow better optimization of the lock acquisition for the pending + * bit holder. + * + * This internal structure is also used by the set_locked function which + * is not restricted to _Q_PENDING_BITS == 8. + */ +struct __qspinlock { + union { + atomic_t val; +#ifdef __LITTLE_ENDIAN + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; +#else + struct { + u16 tail; + u16 locked_pending; + }; + struct { + u8 reserved[2]; + u8 pending; + u8 locked; + }; +#endif + }; +}; + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending_set_locked - take ownership and clear the pending bit. 
+ * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + struct __qspinlock *l = (void *)lock; + + return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new, val = atomic_read(&lock->val); + + for (;;) { + new = (val & _Q_LOCKED_PENDING_MASK) | tail; + old = atomic_cmpxchg(&lock->val, val, new); + if (old == val) + break; + + val = old; + } + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + struct __qspinlock *l = (void 
*)lock; + + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); +} + + +/* + * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for + * all the PV callbacks. + */ + +static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } + +static __always_inline void __pv_wait_head(struct qspinlock *lock, + struct mcs_spinlock *node) { } + +#define pv_enabled() false + +#define pv_init_node __pv_init_node +#define pv_wait_node __pv_wait_node +#define pv_kick_node __pv_kick_node +#define pv_wait_head __pv_wait_head + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath +#endif + +#endif /* _GEN_PV_LOCK_SLOWPATH */ + +/** + * queued_spin_lock_slowpath - acquire the queued spinlock + * @lock: Pointer to queued spinlock structure + * @val: Current value of the queued spinlock 32-bit word + * + * (queue tail, pending bit, lock value) + * + * fast : slow : unlock + * : : + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) + * : | ^--------.------. 
/ : + * : v \ \ | : + * pending : (0,1,1) +--> (0,1,0) \ | : + * : | ^--' | | : + * : v | | : + * uncontended : (n,x,y) +--> (n,0,0) --' | : + * queue : | ^--' | : + * : v | : + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : + * queue : ^--' : + */ +void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + struct mcs_spinlock *prev, *next, *node; + u32 new, old, tail; + int idx; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + if (pv_enabled()) + goto queue; + + if (virt_queued_spin_lock(lock)) + return; + + /* + * wait for in-progress pending->locked hand-overs + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL) + cpu_relax(); + } + + /* + * trylock || pending + * + * 0,0,0 -> 0,0,1 ; trylock + * 0,0,1 -> 0,1,1 ; pending + */ + for (;;) { + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + new = _Q_LOCKED_VAL; + if (val == new) + new |= _Q_PENDING_VAL; + + old = atomic_cmpxchg(&lock->val, val, new); + if (old == val) + break; + + val = old; + } + + /* + * we won the trylock + */ + if (new == _Q_LOCKED_VAL) + return; + + /* + * we're pending, wait for the owner to go away. + * + * *,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all clear_pending_set_locked() + * implementations imply full barriers. + */ + while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) + cpu_relax(); + + /* + * take ownership and clear the pending bit. + * + * *,1,0 -> *,0,1 + */ + clear_pending_set_locked(lock); + return; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. 
+ */ +queue: + node = this_cpu_ptr(&mcs_nodes[0]); + idx = node->count++; + tail = encode_tail(smp_processor_id(), idx); + + node += idx; + node->locked = 0; + node->next = NULL; + pv_init_node(node); + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (queued_spin_trylock(lock)) + goto release; + + /* + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + prev = decode_tail(old); + WRITE_ONCE(prev->next, node); + + pv_wait_node(node); + arch_mcs_spin_lock_contended(&node->locked); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + * + */ + pv_wait_head(lock, node); + while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) + cpu_relax(); + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,0,0 -> *,0,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail), + * clear the tail code and grab the lock. Otherwise, we only need + * to grab the lock. + */ + for (;;) { + if (val != tail) { + set_locked(lock); + break; + } + old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); + if (old == val) + goto release; /* No contention */ + + val = old; + } + + /* + * contended path; wait for next, release. 
+ */ + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + + arch_mcs_spin_unlock_contended(&next->locked); + pv_kick_node(next); + +release: + /* + * release the node + */ + this_cpu_dec(mcs_nodes[0].count); +} +EXPORT_SYMBOL(queued_spin_lock_slowpath); + +/* + * Generate the paravirt code for queued_spin_unlock_slowpath(). + */ +#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#define _GEN_PV_LOCK_SLOWPATH + +#undef pv_enabled +#define pv_enabled() true + +#undef pv_init_node +#undef pv_wait_node +#undef pv_kick_node +#undef pv_wait_head + +#undef queued_spin_lock_slowpath +#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath + +#include "qspinlock_paravirt.h" +#include "qspinlock.c" + +#endif diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h new file mode 100644 index 000000000..df19ae4de --- /dev/null +++ b/kernel/locking/qspinlock_paravirt.h @@ -0,0 +1,334 @@ +#ifndef _GEN_PV_LOCK_SLOWPATH +#error "do not include this file" +#endif + +#include <linux/hash.h> +#include <linux/bootmem.h> +#include <linux/debug_locks.h> + +/* + * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead + * of spinning them. + * + * This relies on the architecture to provide two paravirt hypercalls: + * + * pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val + * pv_kick(cpu) -- wakes a suspended vcpu + * + * Using these we implement __pv_queued_spin_lock_slowpath() and + * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and + * native_queued_spin_unlock(). + */ + +#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) + +enum vcpu_state { + vcpu_running = 0, + vcpu_halted, +}; + +struct pv_node { + struct mcs_spinlock mcs; + struct mcs_spinlock __res[3]; + + int cpu; + u8 state; +}; + +/* + * Lock and MCS node addresses hash table for fast lookup + * + * Hashing is done on a per-cacheline basis to minimize the need to access + * more than one cacheline. 
+ * + * Dynamically allocate a hash table big enough to hold at least 4X the + * number of possible cpus in the system. Allocation is done on page + * granularity. So the minimum number of hash buckets should be at least + * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page. + * + * Since we should not be holding locks from NMI context (very rare indeed) the + * max load factor is 0.75, which is around the point where open addressing + * breaks down. + * + */ +struct pv_hash_entry { + struct qspinlock *lock; + struct pv_node *node; +}; + +#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry)) +#define PV_HE_MIN (PAGE_SIZE / sizeof(struct pv_hash_entry)) + +static struct pv_hash_entry *pv_lock_hash; +static unsigned int pv_lock_hash_bits __read_mostly; + +/* + * Allocate memory for the PV qspinlock hash buckets + * + * This function should be called from the paravirt spinlock initialization + * routine. + */ +void __init __pv_init_lock_hash(void) +{ + int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE); + + if (pv_hash_size < PV_HE_MIN) + pv_hash_size = PV_HE_MIN; + + /* + * Allocate space from bootmem which should be page-size aligned + * and hence cacheline aligned. 
+ */ + pv_lock_hash = alloc_large_system_hash("PV qspinlock", + sizeof(struct pv_hash_entry), + pv_hash_size, 0, HASH_EARLY, + &pv_lock_hash_bits, NULL, + pv_hash_size, pv_hash_size); +} + +#define for_each_hash_entry(he, offset, hash) \ + for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0; \ + offset < (1 << pv_lock_hash_bits); \ + offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)]) + +static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) +{ + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); + struct pv_hash_entry *he; + + for_each_hash_entry(he, offset, hash) { + if (!cmpxchg(&he->lock, NULL, lock)) { + WRITE_ONCE(he->node, node); + return &he->lock; + } + } + /* + * Hard assume there is a free entry for us. + * + * This is guaranteed by ensuring every blocked lock only ever consumes + * a single entry, and since we only have 4 nesting levels per CPU + * and allocated 4*nr_possible_cpus(), this must be so. + * + * The single entry is guaranteed by having the lock owner unhash + * before it releases. + */ + BUG(); +} + +static struct pv_node *pv_unhash(struct qspinlock *lock) +{ + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); + struct pv_hash_entry *he; + struct pv_node *node; + + for_each_hash_entry(he, offset, hash) { + if (READ_ONCE(he->lock) == lock) { + node = READ_ONCE(he->node); + WRITE_ONCE(he->lock, NULL); + return node; + } + } + /* + * Hard assume we'll find an entry. + * + * This guarantees a limited lookup time and is itself guaranteed by + * having the lock owner do the unhash -- IFF the unlock sees the + * SLOW flag, there MUST be a hash entry. + */ + BUG(); +} + +/* + * Initialize the PV part of the mcs_spinlock node. 
+ */ +static void pv_init_node(struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + + BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock)); + + pn->cpu = smp_processor_id(); + pn->state = vcpu_running; +} + +/* + * Wait for node->locked to become true, halt the vcpu after a short spin. + * pv_kick_node() is used to wake the vcpu again. + */ +static void pv_wait_node(struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + int loop; + + for (;;) { + for (loop = SPIN_THRESHOLD; loop; loop--) { + if (READ_ONCE(node->locked)) + return; + cpu_relax(); + } + + /* + * Order pn->state vs pn->locked thusly: + * + * [S] pn->state = vcpu_halted [S] next->locked = 1 + * MB MB + * [L] pn->locked [RmW] pn->state = vcpu_running + * + * Matches the xchg() from pv_kick_node(). + */ + smp_store_mb(pn->state, vcpu_halted); + + if (!READ_ONCE(node->locked)) + pv_wait(&pn->state, vcpu_halted); + + /* + * Reset the vCPU state to avoid unncessary CPU kicking + */ + WRITE_ONCE(pn->state, vcpu_running); + + /* + * If the locked flag is still not set after wakeup, it is a + * spurious wakeup and the vCPU should wait again. However, + * there is a pretty high overhead for CPU halting and kicking. + * So it is better to spin for a while in the hope that the + * MCS lock will be released soon. + */ + } + /* + * By now our node->locked should be 1 and our caller will not actually + * spin-wait for it. We do however rely on our caller to do a + * load-acquire for us. + */ +} + +/* + * Called after setting next->locked = 1, used to wake those stuck in + * pv_wait_node(). + */ +static void pv_kick_node(struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + + /* + * Note that because node->locked is already set, this actual + * mcs_spinlock entry could be re-used already. + * + * This should be fine however, kicking people for no reason is + * harmless. + * + * See the comment in pv_wait_node(). 
+ */ + if (xchg(&pn->state, vcpu_running) == vcpu_halted) + pv_kick(pn->cpu); +} + +/* + * Wait for l->locked to become clear; halt the vcpu after a short spin. + * __pv_queued_spin_unlock() will wake us. + */ +static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; + struct qspinlock **lp = NULL; + int loop; + + for (;;) { + for (loop = SPIN_THRESHOLD; loop; loop--) { + if (!READ_ONCE(l->locked)) + return; + cpu_relax(); + } + + WRITE_ONCE(pn->state, vcpu_halted); + if (!lp) { /* ONCE */ + lp = pv_hash(lock, pn); + /* + * lp must be set before setting _Q_SLOW_VAL + * + * [S] lp = lock [RmW] l = l->locked = 0 + * MB MB + * [S] l->locked = _Q_SLOW_VAL [L] lp + * + * Matches the cmpxchg() in __pv_queued_spin_unlock(). + */ + if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + /* + * The lock is free and _Q_SLOW_VAL has never + * been set. Therefore we need to unhash before + * getting the lock. + */ + WRITE_ONCE(*lp, NULL); + return; + } + } + pv_wait(&l->locked, _Q_SLOW_VAL); + + /* + * The unlocker should have freed the lock before kicking the + * CPU. So if the lock is still not free, it is a spurious + * wakeup and so the vCPU should wait again after spinning for + * a while. + */ + } + + /* + * Lock is unlocked now; the caller will acquire it without waiting. + * As with pv_wait_node() we rely on the caller to do a load-acquire + * for us. + */ +} + +/* + * PV version of the unlock function to be used in stead of + * queued_spin_unlock(). + */ +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + struct pv_node *node; + u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. 
+ */ + if (likely(lockval == _Q_LOCKED_VAL)) + return; + + if (unlikely(lockval != _Q_SLOW_VAL)) { + if (debug_locks_silent) + return; + WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val)); + return; + } + + /* + * Since the above failed to release, this must be the SLOW path. + * Therefore start by looking up the blocked node and unhashing it. + */ + node = pv_unhash(lock); + + /* + * Now that we have a reference to the (likely) blocked pv_node, + * release the lock. + */ + smp_store_release(&l->locked, 0); + + /* + * At this point the memory pointed at by lock can be freed/reused, + * however we can still use the pv_node to kick the CPU. + */ + if (READ_ONCE(node->state) == vcpu_halted) + pv_kick(node->cpu); +} +/* + * Include the architecture specific callee-save thunk of the + * __pv_queued_spin_unlock(). This thunk is put together with + * __pv_queued_spin_unlock() near the top of the file to make sure + * that the callee-save thunk and the real unlock function are close + * to each other sharing consecutive instruction cachelines. + */ +#include <asm/qspinlock_paravirt.h> + diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b025295f4..5674b0734 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) } /* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up + * We can speed up the acquire/release, if there's no debugging state to be + * set up. */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +#ifndef CONFIG_DEBUG_RT_MUTEXES # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { @@ -300,7 +300,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) * of task. 
We do not use the spin_xx_mutex() variants here as we are * outside of the debug path.) */ -static void rt_mutex_adjust_prio(struct task_struct *task) +void rt_mutex_adjust_prio(struct task_struct *task) { unsigned long flags; @@ -624,7 +624,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ prerequeue_top_waiter = rt_mutex_top_waiter(lock); - /* [7] Requeue the waiter in the lock waiter list. */ + /* [7] Requeue the waiter in the lock waiter tree. */ rt_mutex_dequeue(lock, waiter); waiter->prio = task->prio; rt_mutex_enqueue(lock, waiter); @@ -662,7 +662,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * The waiter became the new top (highest priority) * waiter on the lock. Replace the previous top waiter - * in the owner tasks pi waiters list with this waiter + * in the owner tasks pi waiters tree with this waiter * and adjust the priority of the owner. */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); @@ -673,7 +673,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * The waiter was the top waiter on the lock, but is * no longer the top prority waiter. Replace waiter in - * the owner tasks pi waiters list with the new top + * the owner tasks pi waiters tree with the new top * (highest priority) waiter and adjust the priority * of the owner. * The new top waiter is stored in @waiter so that @@ -747,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * * @lock: The lock to be acquired. 
* @task: The task which wants to acquire the lock - * @waiter: The waiter that is queued to the lock's wait list if the + * @waiter: The waiter that is queued to the lock's wait tree if the * callsite called task_blocked_on_lock(), otherwise NULL */ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, @@ -782,7 +782,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, /* * If @waiter != NULL, @task has already enqueued the waiter - * into @lock waiter list. If @waiter == NULL then this is a + * into @lock waiter tree. If @waiter == NULL then this is a * trylock attempt. */ if (waiter) { @@ -795,7 +795,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, /* * We can acquire the lock. Remove the waiter from the - * lock waiters list. + * lock waiters tree. */ rt_mutex_dequeue(lock, waiter); @@ -827,7 +827,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * No waiters. Take the lock without the * pi_lock dance.@task->pi_blocked_on is NULL * and we have no waiters to enqueue in @task - * pi waiters list. + * pi waiters tree. */ goto takeit; } @@ -844,7 +844,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, /* * Finish the lock acquisition. @task is the new owner. If * other waiters exist we have to insert the highest priority - * waiter into @task->pi_waiters list. + * waiter into @task->pi_waiters tree. */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); @@ -955,14 +955,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, } /* - * Wake up the next waiter on the lock. - * - * Remove the top waiter from the current tasks pi waiter list and - * wake it up. + * Remove the top waiter from the current tasks pi waiter tree and + * queue it up. * * Called with lock->wait_lock held. 
*/ -static void wakeup_next_waiter(struct rt_mutex *lock) +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, + struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; unsigned long flags; @@ -991,12 +990,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) raw_spin_unlock_irqrestore(&current->pi_lock, flags); - /* - * It's safe to dereference waiter as it cannot go away as - * long as we hold lock->wait_lock. The waiter task needs to - * acquire it in order to dequeue the waiter. - */ - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); } /* @@ -1182,11 +1176,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { + if (unlikely(timeout)) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); @@ -1253,10 +1244,11 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) } /* - * Slow path to release a rt-mutex: + * Slow path to release a rt-mutex. + * Return whether the current task needs to undo a potential priority boosting. */ -static void __sched -rt_mutex_slowunlock(struct rt_mutex *lock) +static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, + struct wake_q_head *wake_q) { raw_spin_lock(&lock->wait_lock); @@ -1298,7 +1290,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock) while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ if (unlock_rt_mutex_safe(lock) == true) - return; + return false; /* Relock the rtmutex and try again */ raw_spin_lock(&lock->wait_lock); } @@ -1306,13 +1298,15 @@ rt_mutex_slowunlock(struct rt_mutex *lock) /* * The wakeup next waiter path does not suffer from the above * race. See the comments there. + * + * Queue the next waiter for wakeup once we release the wait_lock.
*/ - wakeup_next_waiter(lock); + mark_wakeup_next_waiter(wake_q, lock); raw_spin_unlock(&lock->wait_lock); - /* Undo pi boosting if necessary: */ - rt_mutex_adjust_prio(current); + /* check PI boosting */ + return true; } /* @@ -1363,12 +1357,23 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, static inline void rt_mutex_fastunlock(struct rt_mutex *lock, - void (*slowfn)(struct rt_mutex *lock)) + bool (*slowfn)(struct rt_mutex *lock, + struct wake_q_head *wqh)) { - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + WAKE_Q(wake_q); + + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); - else - slowfn(lock); + + } else { + bool deboost = slowfn(lock, &wake_q); + + wake_up_q(&wake_q); + + /* Undo pi boosting if necessary: */ + if (deboost) + rt_mutex_adjust_prio(current); + } } /** @@ -1443,10 +1448,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); * * @lock: the rt_mutex to be locked * + * This function can only be called in thread context. It's safe to + * call it from atomic regions, but not from hard interrupt or soft + * interrupt context. + * * Returns 1 on success and 0 on contention */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { + if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) + return 0; + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); } EXPORT_SYMBOL_GPL(rt_mutex_trylock); @@ -1463,6 +1475,23 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** + * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock + * @lock: the rt_mutex to be unlocked + * + * Returns: true/false indicating whether priority adjustment is + * required or not. 
+ */ +bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { + rt_mutex_deadlock_account_unlock(current); + return false; + } + return rt_mutex_slowunlock(lock, wqh); +} + +/** * rt_mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed * diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 855212501..7844f8f0e 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -131,6 +131,9 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter); extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); +extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh); +extern void rt_mutex_adjust_prio(struct task_struct *task); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 3417d0172..0f189714e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -409,11 +409,24 @@ done: return taken; } +/* + * Return true if the rwsem has active spinner + */ +static inline bool rwsem_has_spinner(struct rw_semaphore *sem) +{ + return osq_is_locked(&sem->osq); +} + #else static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } + +static inline bool rwsem_has_spinner(struct rw_semaphore *sem) +{ + return false; +} #endif /* @@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + /* + * If a spinner is present, it is not necessary to do the wakeup. + * Try to do wakeup only if the trylock succeeds to minimize + * spinlock contention which may introduce too much delay in the + * unlock operation. 
+ * + * spinning writer up_write/up_read caller + * --------------- ----------------------- + * [S] osq_unlock() [L] osq + * MB RMB + * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) + * + * Here, it is important to make sure that there won't be a missed + * wakeup while the rwsem is free and the only spinning writer goes + * to sleep without taking the rwsem. Even when the spinning writer + * is just going to break out of the waiting loop, it will still do + * a trylock in rwsem_down_write_failed() before sleeping. IOW, if + * rwsem_has_spinner() is true, it will guarantee at least one + * trylock attempt on the rwsem later on. + */ + if (rwsem_has_spinner(sem)) { + /* + * The smp_rmb() here is to make sure that the spinner + * state is consulted before reading the wait_lock. + */ + smp_rmb(); + if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) + return sem; + goto locked; + } raw_spin_lock_irqsave(&sem->wait_lock, flags); +locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) diff --git a/kernel/module.c b/kernel/module.c index cfc9e843a..b86b7bf1b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,7 +18,7 @@ */ #include <linux/export.h> #include <linux/moduleloader.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/init.h> #include <linux/kallsyms.h> #include <linux/file.h> @@ -101,48 +101,201 @@ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); -#ifdef CONFIG_KGDB_KDB -struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ -#endif /* CONFIG_KGDB_KDB */ -#ifdef CONFIG_MODULE_SIG -#ifdef CONFIG_MODULE_SIG_FORCE -static bool sig_enforce = true; -#else -static bool sig_enforce = false; +#ifdef CONFIG_MODULES_TREE_LOOKUP + +/* + * Use a latched RB-tree for __module_address(); this allows us to use + * RCU-sched lookups of the address from any context. 
+ * + * Because modules have two address ranges: init and core, we need two + * latch_tree_nodes entries. Therefore we need the back-pointer from + * mod_tree_node. + * + * Because init ranges are short lived we mark them unlikely and have placed + * them outside the critical cacheline in struct module. + * + * This is conditional on PERF_EVENTS || TRACING because those can really hit + * __module_address() hard by doing a lot of stack unwinding; potentially from + * NMI context. + */ -static int param_set_bool_enable_only(const char *val, - const struct kernel_param *kp) +static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - int err; - bool test; - struct kernel_param dummy_kp = *kp; + struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); + struct module *mod = mtn->mod; - dummy_kp.arg = &test; + if (unlikely(mtn == &mod->mtn_init)) + return (unsigned long)mod->module_init; - err = param_set_bool(val, &dummy_kp); - if (err) - return err; + return (unsigned long)mod->module_core; +} + +static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) +{ + struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); + struct module *mod = mtn->mod; + + if (unlikely(mtn == &mod->mtn_init)) + return (unsigned long)mod->init_size; - /* Don't let them unset it once it's set! 
*/ - if (!test && sig_enforce) - return -EROFS; + return (unsigned long)mod->core_size; +} + +static __always_inline bool +mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) +{ + return __mod_tree_val(a) < __mod_tree_val(b); +} + +static __always_inline int +mod_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long start, end; + + start = __mod_tree_val(n); + if (val < start) + return -1; + + end = start + __mod_tree_size(n); + if (val >= end) + return 1; - if (test) - sig_enforce = true; return 0; } -static const struct kernel_param_ops param_ops_bool_enable_only = { - .flags = KERNEL_PARAM_OPS_FL_NOARG, - .set = param_set_bool_enable_only, - .get = param_get_bool, +static const struct latch_tree_ops mod_tree_ops = { + .less = mod_tree_less, + .comp = mod_tree_comp, }; -#define param_check_bool_enable_only param_check_bool +static struct mod_tree_root { + struct latch_tree_root root; + unsigned long addr_min; + unsigned long addr_max; +} mod_tree __cacheline_aligned = { + .addr_min = -1UL, +}; + +#define module_addr_min mod_tree.addr_min +#define module_addr_max mod_tree.addr_max + +static noinline void __mod_tree_insert(struct mod_tree_node *node) +{ + latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); +} + +static void __mod_tree_remove(struct mod_tree_node *node) +{ + latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); +} + +/* + * These modifications: insert, remove_init and remove; are serialized by the + * module_mutex. 
+ */ +static void mod_tree_insert(struct module *mod) +{ + mod->mtn_core.mod = mod; + mod->mtn_init.mod = mod; + + __mod_tree_insert(&mod->mtn_core); + if (mod->init_size) + __mod_tree_insert(&mod->mtn_init); +} + +static void mod_tree_remove_init(struct module *mod) +{ + if (mod->init_size) + __mod_tree_remove(&mod->mtn_init); +} + +static void mod_tree_remove(struct module *mod) +{ + __mod_tree_remove(&mod->mtn_core); + mod_tree_remove_init(mod); +} + +static struct module *mod_find(unsigned long addr) +{ + struct latch_tree_node *ltn; + + ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); + if (!ltn) + return NULL; + + return container_of(ltn, struct mod_tree_node, node)->mod; +} + +#else /* MODULES_TREE_LOOKUP */ + +static unsigned long module_addr_min = -1UL, module_addr_max = 0; + +static void mod_tree_insert(struct module *mod) { } +static void mod_tree_remove_init(struct module *mod) { } +static void mod_tree_remove(struct module *mod) { } + +static struct module *mod_find(unsigned long addr) +{ + struct module *mod; + + list_for_each_entry_rcu(mod, &modules, list) { + if (within_module(addr, mod)) + return mod; + } + + return NULL; +} + +#endif /* MODULES_TREE_LOOKUP */ + +/* + * Bounds of module text, for speeding up __module_address. + * Protected by module_mutex. 
+ */ +static void __mod_update_bounds(void *base, unsigned int size) +{ + unsigned long min = (unsigned long)base; + unsigned long max = min + size; + + if (min < module_addr_min) + module_addr_min = min; + if (max > module_addr_max) + module_addr_max = max; +} + +static void mod_update_bounds(struct module *mod) +{ + __mod_update_bounds(mod->module_core, mod->core_size); + if (mod->init_size) + __mod_update_bounds(mod->module_init, mod->init_size); +} + +#ifdef CONFIG_KGDB_KDB +struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ +#endif /* CONFIG_KGDB_KDB */ + +static void module_assert_mutex(void) +{ + lockdep_assert_held(&module_mutex); +} + +static void module_assert_mutex_or_preempt(void) +{ +#ifdef CONFIG_LOCKDEP + if (unlikely(!debug_locks)) + return; + + WARN_ON(!rcu_read_lock_sched_held() && + !lockdep_is_held(&module_mutex)); +#endif +} + +static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); +#ifndef CONFIG_MODULE_SIG_FORCE module_param(sig_enforce, bool_enable_only, 0644); #endif /* !CONFIG_MODULE_SIG_FORCE */ -#endif /* CONFIG_MODULE_SIG */ /* Block module loading/unloading? */ int modules_disabled = 0; @@ -153,10 +306,6 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address. - * Protected by module_mutex. */ -static unsigned long module_addr_min = -1UL, module_addr_max = 0; - int register_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&module_notify_list, nb); @@ -318,6 +467,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, #endif }; + module_assert_mutex_or_preempt(); + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) return true; @@ -451,12 +602,17 @@ const struct kernel_symbol *find_symbol(const char *name, } EXPORT_SYMBOL_GPL(find_symbol); -/* Search for module by name: must hold module_mutex. 
*/ +/* + * Search for module by name: must hold module_mutex (or preempt disabled + * for read-only access). + */ static struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; + module_assert_mutex_or_preempt(); + list_for_each_entry(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; @@ -468,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len, struct module *find_module(const char *name) { + module_assert_mutex(); return find_module_all(name, strlen(name), false); } EXPORT_SYMBOL_GPL(find_module); @@ -1169,11 +1326,17 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - /* Since this should be found in kernel (which can't be removed), - * no locking is necessary. */ + /* + * Since this should be found in kernel (which can't be removed), no + * locking is necessary -- use preempt_disable() to placate lockdep. + */ + preempt_disable(); if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, - &crc, true, false)) + &crc, true, false)) { + preempt_enable(); BUG(); + } + preempt_enable(); return check_version(sechdrs, versindex, VMLINUX_SYMBOL_STR(module_layout), mod, crc, NULL); @@ -1661,6 +1824,10 @@ static void mod_sysfs_fini(struct module *mod) mod_kobject_put(mod); } +static void init_param_lock(struct module *mod) +{ + mutex_init(&mod->param_lock); +} #else /* !CONFIG_SYSFS */ static int mod_sysfs_setup(struct module *mod, @@ -1683,6 +1850,9 @@ static void del_usage_links(struct module *mod) { } +static void init_param_lock(struct module *mod) +{ +} #endif /* CONFIG_SYSFS */ static void mod_sysfs_teardown(struct module *mod) @@ -1852,10 +2022,11 @@ static void free_module(struct module *mod) mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. 
*/ list_del_rcu(&mod->list); + mod_tree_remove(mod); /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); - /* Wait for RCU synchronizing before releasing mod->list and buglist. */ - synchronize_rcu(); + /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ + synchronize_sched(); mutex_unlock(&module_mutex); /* This may be NULL, but that's OK */ @@ -2384,22 +2555,6 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } -static void *module_alloc_update_bounds(unsigned long size) -{ - void *ret = module_alloc(size); - - if (ret) { - mutex_lock(&module_mutex); - /* Update module bounds. */ - if ((unsigned long)ret < module_addr_min) - module_addr_min = (unsigned long)ret; - if ((unsigned long)ret + size > module_addr_max) - module_addr_max = (unsigned long)ret + size; - mutex_unlock(&module_mutex); - } - return ret; -} - #ifdef CONFIG_DEBUG_KMEMLEAK static void kmemleak_load_module(const struct module *mod, const struct load_info *info) @@ -2805,7 +2960,7 @@ static int move_module(struct module *mod, struct load_info *info) void *ptr; /* Do the allocs. */ - ptr = module_alloc_update_bounds(mod->core_size); + ptr = module_alloc(mod->core_size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2819,7 +2974,7 @@ static int move_module(struct module *mod, struct load_info *info) mod->module_core = ptr; if (mod->init_size) { - ptr = module_alloc_update_bounds(mod->init_size); + ptr = module_alloc(mod->init_size); /* * The pointer to this block is stored in the module structure * which is inside the block. 
This block doesn't need to be @@ -3107,7 +3262,7 @@ static noinline int do_init_module(struct module *mod) * * http://thread.gmane.org/gmane.linux.kernel/1420814 */ - if (current->flags & PF_USED_ASYNC) + if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) async_synchronize_full(); mutex_lock(&module_mutex); @@ -3119,6 +3274,7 @@ static noinline int do_init_module(struct module *mod) mod->symtab = mod->core_symtab; mod->strtab = mod->core_strtab; #endif + mod_tree_remove_init(mod); unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); mod->module_init = NULL; @@ -3127,11 +3283,11 @@ static noinline int do_init_module(struct module *mod) mod->init_text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be - * walking this with preempt disabled. In all the failure paths, - * we call synchronize_rcu/synchronize_sched, but we don't want - * to slow down the success path, so use actual RCU here. + * walking this with preempt disabled. In all the failure paths, we + * call synchronize_sched(), but we don't want to slow down the success + * path, so use actual RCU here. 
*/ - call_rcu(&freeinit->rcu, do_free_init); + call_rcu_sched(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3188,7 +3344,9 @@ again: err = -EEXIST; goto out; } + mod_update_bounds(mod); list_add_rcu(&mod->list, &modules); + mod_tree_insert(mod); err = 0; out: @@ -3237,10 +3395,19 @@ out: return err; } -static int unknown_module_param_cb(char *param, char *val, const char *modname) +static int unknown_module_param_cb(char *param, char *val, const char *modname, + void *arg) { + struct module *mod = arg; + int ret; + + if (strcmp(param, "async_probe") == 0) { + mod->async_probe_requested = true; + return 0; + } + /* Check for magic 'dyndbg' arg */ - int ret = ddebug_dyndbg_module_param_cb(param, val, modname); + ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); return 0; @@ -3295,6 +3462,8 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto unlink_mod; + init_param_lock(mod); + /* Now we've got everything in the final locations, we can * find optional sections. */ err = find_module_sections(mod, info); @@ -3342,7 +3511,8 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, unknown_module_param_cb); + -32768, 32767, NULL, + unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); goto bug_cleanup; @@ -3391,9 +3561,10 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mod_tree_remove(mod); wake_up_all(&module_wq); - /* Wait for RCU synchronizing before releasing mod->list. */ - synchronize_rcu(); + /* Wait for RCU-sched synchronizing before releasing mod->list. 
*/ + synchronize_sched(); mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ @@ -3517,19 +3688,15 @@ const char *module_address_lookup(unsigned long addr, char **modname, char *namebuf) { - struct module *mod; const char *ret = NULL; + struct module *mod; preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - if (modname) - *modname = mod->name; - ret = get_ksymbol(mod, addr, size, offset); - break; - } + mod = __module_address(addr); + if (mod) { + if (modname) + *modname = mod->name; + ret = get_ksymbol(mod, addr, size, offset); } /* Make a copy in here where it's safe */ if (ret) { @@ -3537,6 +3704,7 @@ const char *module_address_lookup(unsigned long addr, ret = namebuf; } preempt_enable(); + return ret; } @@ -3660,6 +3828,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned int i; int ret; + module_assert_mutex(); + list_for_each_entry(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -3834,13 +4004,15 @@ struct module *__module_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry_rcu(mod, &modules, list) { + module_assert_mutex_or_preempt(); + + mod = mod_find(addr); + if (mod) { + BUG_ON(!within_module(addr, mod)); if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) - return mod; + mod = NULL; } - return NULL; + return mod; } EXPORT_SYMBOL_GPL(__module_address); diff --git a/kernel/panic.c b/kernel/panic.c index 8136ad76e..04e91ff75 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,7 +32,7 @@ static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -static bool crash_kexec_post_notifiers; +bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; int panic_timeout = 
CONFIG_PANIC_TIMEOUT; @@ -142,7 +142,8 @@ void panic(const char *fmt, ...) * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. */ - crash_kexec(NULL); + if (crash_kexec_post_notifiers) + crash_kexec(NULL); bust_spinlocks(0); diff --git a/kernel/params.c b/kernel/params.c index a22d6a759..b6554aa71 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -25,15 +25,34 @@ #include <linux/slab.h> #include <linux/ctype.h> -/* Protects all parameters, and incidentally kmalloced_param list. */ +#ifdef CONFIG_SYSFS +/* Protects all built-in parameters, modules use their own param_lock */ static DEFINE_MUTEX(param_lock); +/* Use the module's mutex, or if built-in use the built-in mutex */ +#ifdef CONFIG_MODULES +#define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : &param_lock) +#else +#define KPARAM_MUTEX(mod) (&param_lock) +#endif + +static inline void check_kparam_locked(struct module *mod) +{ + BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod))); +} +#else +static inline void check_kparam_locked(struct module *mod) +{ +} +#endif /* !CONFIG_SYSFS */ + /* This just allows us to keep track of which parameters are kmalloced. 
*/ struct kmalloced_param { struct list_head list; char val[]; }; static LIST_HEAD(kmalloced_params); +static DEFINE_SPINLOCK(kmalloced_params_lock); static void *kmalloc_parameter(unsigned int size) { @@ -43,7 +62,10 @@ static void *kmalloc_parameter(unsigned int size) if (!p) return NULL; + spin_lock(&kmalloced_params_lock); list_add(&p->list, &kmalloced_params); + spin_unlock(&kmalloced_params_lock); + return p->val; } @@ -52,6 +74,7 @@ static void maybe_kfree_parameter(void *param) { struct kmalloced_param *p; + spin_lock(&kmalloced_params_lock); list_for_each_entry(p, &kmalloced_params, list) { if (p->val == param) { list_del(&p->list); @@ -59,6 +82,7 @@ static void maybe_kfree_parameter(void *param) break; } } + spin_unlock(&kmalloced_params_lock); } static char dash2underscore(char c) @@ -100,8 +124,9 @@ static int parse_one(char *param, unsigned num_params, s16 min_level, s16 max_level, + void *arg, int (*handle_unknown)(char *param, char *val, - const char *doing)) + const char *doing, void *arg)) { unsigned int i; int err; @@ -118,17 +143,17 @@ static int parse_one(char *param, return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); - mutex_lock(&param_lock); + kernel_param_lock(params[i].mod); param_check_unsafe(&params[i]); err = params[i].ops->set(val, &params[i]); - mutex_unlock(&param_lock); + kernel_param_unlock(params[i].mod); return err; } } if (handle_unknown) { pr_debug("doing %s: %s='%s'\n", doing, param, val); - return handle_unknown(param, val, doing); + return handle_unknown(param, val, doing, arg); } pr_debug("Unknown argument '%s'\n", param); @@ -194,7 +219,9 @@ char *parse_args(const char *doing, unsigned num, s16 min_level, s16 max_level, - int (*unknown)(char *param, char *val, const char *doing)) + void *arg, + int (*unknown)(char *param, char *val, + const char *doing, void *arg)) { char *param, *val; @@ -214,7 +241,7 @@ char *parse_args(const char *doing, return args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, 
val, doing, params, num, - min_level, max_level, unknown); + min_level, max_level, arg, unknown); if (irq_was_disabled && !irqs_disabled()) pr_warn("%s: option '%s' enabled irq's!\n", doing, param); @@ -251,7 +278,7 @@ char *parse_args(const char *doing, return scnprintf(buffer, PAGE_SIZE, format, \ *((type *)kp->arg)); \ } \ - struct kernel_param_ops param_ops_##name = { \ + const struct kernel_param_ops param_ops_##name = { \ .set = param_set_##name, \ .get = param_get_##name, \ }; \ @@ -303,7 +330,7 @@ static void param_free_charp(void *arg) maybe_kfree_parameter(*((char **)arg)); } -struct kernel_param_ops param_ops_charp = { +const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, .get = param_get_charp, .free = param_free_charp, @@ -328,13 +355,44 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_bool); -struct kernel_param_ops param_ops_bool = { +const struct kernel_param_ops param_ops_bool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; EXPORT_SYMBOL(param_ops_bool); +int param_set_bool_enable_only(const char *val, const struct kernel_param *kp) +{ + int err = 0; + bool new_value; + bool orig_value = *(bool *)kp->arg; + struct kernel_param dummy_kp = *kp; + + dummy_kp.arg = &new_value; + + err = param_set_bool(val, &dummy_kp); + if (err) + return err; + + /* Don't let them unset it once it's set! */ + if (!new_value && orig_value) + return -EROFS; + + if (new_value) + err = param_set_bool(val, kp); + + return err; +} +EXPORT_SYMBOL_GPL(param_set_bool_enable_only); + +const struct kernel_param_ops param_ops_bool_enable_only = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = param_set_bool_enable_only, + .get = param_get_bool, +}; +EXPORT_SYMBOL_GPL(param_ops_bool_enable_only); + /* This one must be bool. 
*/ int param_set_invbool(const char *val, const struct kernel_param *kp) { @@ -356,7 +414,7 @@ int param_get_invbool(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_invbool); -struct kernel_param_ops param_ops_invbool = { +const struct kernel_param_ops param_ops_invbool = { .set = param_set_invbool, .get = param_get_invbool, }; @@ -364,12 +422,11 @@ EXPORT_SYMBOL(param_ops_invbool); int param_set_bint(const char *val, const struct kernel_param *kp) { - struct kernel_param boolkp; + /* Match bool exactly, by re-using it. */ + struct kernel_param boolkp = *kp; bool v; int ret; - /* Match bool exactly, by re-using it. */ - boolkp = *kp; boolkp.arg = &v; ret = param_set_bool(val, &boolkp); @@ -379,7 +436,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) } EXPORT_SYMBOL(param_set_bint); -struct kernel_param_ops param_ops_bint = { +const struct kernel_param_ops param_ops_bint = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, @@ -387,7 +444,8 @@ struct kernel_param_ops param_ops_bint = { EXPORT_SYMBOL(param_ops_bint); /* We break the rule and mangle the string. 
*/ -static int param_array(const char *name, +static int param_array(struct module *mod, + const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, @@ -418,7 +476,7 @@ static int param_array(const char *name, /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; - BUG_ON(!mutex_is_locked(&param_lock)); + check_kparam_locked(mod); ret = set(val, &kp); if (ret != 0) @@ -440,7 +498,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp) const struct kparam_array *arr = kp->arr; unsigned int temp_num; - return param_array(kp->name, val, 1, arr->max, arr->elem, + return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem, arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } @@ -449,14 +507,13 @@ static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; - struct kernel_param p; + struct kernel_param p = *kp; - p = *kp; for (i = off = 0; i < (arr->num ? 
*arr->num : arr->max); i++) { if (i) buffer[off++] = ','; p.arg = arr->elem + arr->elemsize * i; - BUG_ON(!mutex_is_locked(&param_lock)); + check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; @@ -476,7 +533,7 @@ static void param_array_free(void *arg) arr->ops->free(arr->elem + arr->elemsize * i); } -struct kernel_param_ops param_array_ops = { +const struct kernel_param_ops param_array_ops = { .set = param_array_set, .get = param_array_get, .free = param_array_free, @@ -504,7 +561,7 @@ int param_get_string(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_string); -struct kernel_param_ops param_ops_string = { +const struct kernel_param_ops param_ops_string = { .set = param_set_copystring, .get = param_get_string, }; @@ -539,9 +596,9 @@ static ssize_t param_attr_show(struct module_attribute *mattr, if (!attribute->param->ops->get) return -EPERM; - mutex_lock(&param_lock); + kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); - mutex_unlock(&param_lock); + kernel_param_unlock(mk->mod); if (count > 0) { strcat(buf, "\n"); ++count; @@ -551,7 +608,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr, /* sysfs always hands a nul-terminated string in buf. We rely on that. 
*/ static ssize_t param_attr_store(struct module_attribute *mattr, - struct module_kobject *km, + struct module_kobject *mk, const char *buf, size_t len) { int err; @@ -560,10 +617,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr, if (!attribute->param->ops->set) return -EPERM; - mutex_lock(&param_lock); + kernel_param_lock(mk->mod); param_check_unsafe(attribute->param); err = attribute->param->ops->set(buf, attribute->param); - mutex_unlock(&param_lock); + kernel_param_unlock(mk->mod); if (!err) return len; return err; @@ -577,17 +634,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr, #endif #ifdef CONFIG_SYSFS -void __kernel_param_lock(void) +void kernel_param_lock(struct module *mod) { - mutex_lock(&param_lock); + mutex_lock(KPARAM_MUTEX(mod)); } -EXPORT_SYMBOL(__kernel_param_lock); -void __kernel_param_unlock(void) +void kernel_param_unlock(struct module *mod) { - mutex_unlock(&param_lock); + mutex_unlock(KPARAM_MUTEX(mod)); } -EXPORT_SYMBOL(__kernel_param_unlock); + +EXPORT_SYMBOL(kernel_param_lock); +EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs @@ -853,6 +911,7 @@ static void __init version_sysfs_builtin(void) mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); + WARN_ON_ONCE(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 89a46f3ff..9e302315e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -91,284 +91,6 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. -menuconfig TOI_CORE - bool "Enhanced Hibernation (TuxOnIce)" - depends on HIBERNATION - default y - ---help--- - TuxOnIce is the 'new and improved' suspend support. - - See the TuxOnIce home page (tuxonice.net) - for FAQs, HOWTOs and other documentation. 
- - comment "Image Storage (you need at least one allocator)" - depends on TOI_CORE - - config TOI_FILE - bool "File Allocator" - depends on TOI_CORE - default y - ---help--- - This option enables support for storing an image in a - simple file. You might want this if your swap is - sometimes full enough that you don't have enough spare - space to store an image. - - config TOI_SWAP - bool "Swap Allocator" - depends on TOI_CORE && SWAP - default y - ---help--- - This option enables support for storing an image in your - swap space. - - comment "General Options" - depends on TOI_CORE - - config TOI_PRUNE - bool "Image pruning support" - depends on TOI_CORE && CRYPTO && BROKEN - default y - ---help--- - This option adds support for using cryptoapi hashing - algorithms to identify pages with the same content. We - then write a much smaller pointer to the first copy of - the data instead of a complete (perhaps compressed) - additional copy. - - You probably want this, so say Y here. - - comment "No image pruning support available without Cryptoapi support." - depends on TOI_CORE && !CRYPTO - - config TOI_CRYPTO - bool "Compression support" - depends on TOI_CORE && CRYPTO - default y - ---help--- - This option adds support for using cryptoapi compression - algorithms. Compression is particularly useful as it can - more than double your suspend and resume speed (depending - upon how well your image compresses). - - You probably want this, so say Y here. - - comment "No compression support available without Cryptoapi support." - depends on TOI_CORE && !CRYPTO - - config TOI_USERUI - bool "Userspace User Interface support" - depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE) - default y - ---help--- - This option enabled support for a userspace based user interface - to TuxOnIce, which allows you to have a nice display while suspending - and resuming, and also enables features such as pressing escape to - cancel a cycle or interactive debugging. 
- - config TOI_USERUI_DEFAULT_PATH - string "Default userui program location" - default "/usr/local/sbin/tuxoniceui_text" - depends on TOI_USERUI - ---help--- - This entry allows you to specify a default path to the userui binary. - - config TOI_DEFAULT_IMAGE_SIZE_LIMIT - int "Default image size limit" - range -2 65536 - default "-2" - depends on TOI_CORE - ---help--- - This entry allows you to specify a default image size limit. It can - be overridden at run-time using /sys/power/tuxonice/image_size_limit. - - config TOI_KEEP_IMAGE - bool "Allow Keep Image Mode" - depends on TOI_CORE - ---help--- - This option allows you to keep and image and reuse it. It is intended - __ONLY__ for use with systems where all filesystems are mounted read- - only (kiosks, for example). To use it, compile this option in and boot - normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend. - When you resume, the image will not be removed. You will be unable to turn - off swap partitions (assuming you are using the swap allocator), but future - suspends simply do a power-down. The image can be updated using the - kernel command line parameter suspend_act= to turn off the keep image - bit. Keep image mode is a little less user friendly on purpose - it - should not be used without thought! - - config TOI_INCREMENTAL - bool "Incremental Image Support" - depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE - default n - ---help--- - This option enables the work in progress toward using the dirty page - tracking to record changes to pages. It is hoped that - this will be an initial step toward implementing storing just - the differences between consecutive images, which will - increase the amount of storage needed for the image, but also - increase the speed at which writing an image occurs and - reduce the wear and tear on drives. 
- - At the moment, all that is implemented is the first step of keeping - an existing image and then comparing it to the contents in memory - (by setting /sys/power/tuxonice/verify_image to 1 and triggering a - (fake) resume) to see what the page change tracking should find to be - different. If you have verify_image set to 1, TuxOnIce will automatically - invalidate the old image when you next try to hibernate, so there's no - greater chance of disk corruption than normal. - - comment "No incremental image support available without Keep Image support." - depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT - - config TOI_REPLACE_SWSUSP - bool "Replace swsusp by default" - default y - depends on TOI_CORE - ---help--- - TuxOnIce can replace swsusp. This option makes that the default state, - requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want - to use the vanilla kernel functionality. Note that your initrd/ramfs will - need to do this before trying to resume, too. - With overriding swsusp enabled, echoing disk to /sys/power/state will - start a TuxOnIce cycle. If resume= doesn't specify an allocator and both - the swap and file allocators are compiled in, the swap allocator will be - used by default. - - config TOI_IGNORE_LATE_INITCALL - bool "Wait for initrd/ramfs to run, by default" - default n - depends on TOI_CORE - ---help--- - When booting, TuxOnIce can check for an image and start to resume prior - to any initrd/ramfs running (via a late initcall). - - If you don't have an initrd/ramfs, this is what you want to happen - - otherwise you won't be able to safely resume. You should set this option - to 'No'. - - If, however, you want your initrd/ramfs to run anyway before resuming, - you need to tell TuxOnIce to ignore that earlier opportunity to resume. - This can be done either by using this compile time option, or by - overriding this option with the boot-time parameter toi_initramfs_resume_only=1. 
- - Note that if TuxOnIce can't resume at the earlier opportunity, the - value of this option won't matter - the initramfs/initrd (if any) will - run anyway. - - menuconfig TOI_CLUSTER - bool "Cluster support" - default n - depends on TOI_CORE && NET && BROKEN - ---help--- - Support for linking multiple machines in a cluster so that they suspend - and resume together. - - config TOI_DEFAULT_CLUSTER_INTERFACE - string "Default cluster interface" - depends on TOI_CLUSTER - ---help--- - The default interface on which to communicate with other nodes in - the cluster. - - If no value is set here, cluster support will be disabled by default. - - config TOI_DEFAULT_CLUSTER_KEY - string "Default cluster key" - default "Default" - depends on TOI_CLUSTER - ---help--- - The default key used by this node. All nodes in the same cluster - have the same key. Multiple clusters may coexist on the same lan - by using different values for this key. - - config TOI_CLUSTER_IMAGE_TIMEOUT - int "Timeout when checking for image" - default 15 - depends on TOI_CLUSTER - ---help--- - Timeout (seconds) before continuing to boot when waiting to see - whether other nodes might have an image. Set to -1 to wait - indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue - booting sooner than this timeout. - - config TOI_CLUSTER_WAIT_UNTIL_NODES - int "Nodes without image before continuing" - default 0 - depends on TOI_CLUSTER - ---help--- - When booting and no image is found, we wait to see if other nodes - have an image before continuing to boot. This value lets us - continue after seeing a certain number of nodes without an image, - instead of continuing to wait for the timeout. Set to 0 to only - use the timeout. - - config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE - string "Default pre-hibernate script" - depends on TOI_CLUSTER - ---help--- - The default script to be called when starting to hibernate. 
- - config TOI_DEFAULT_CLUSTER_POST_HIBERNATE - string "Default post-hibernate script" - depends on TOI_CLUSTER - ---help--- - The default script to be called after resuming from hibernation. - - config TOI_DEFAULT_WAIT - int "Default waiting time for emergency boot messages" - default "25" - range -1 32768 - depends on TOI_CORE - help - TuxOnIce can display warnings very early in the process of resuming, - if (for example) it appears that you have booted a kernel that doesn't - match an image on disk. It can then give you the opportunity to either - continue booting that kernel, or reboot the machine. This option can be - used to control how long to wait in such circumstances. -1 means wait - forever. 0 means don't wait at all (do the default action, which will - generally be to continue booting and remove the image). Values of 1 or - more indicate a number of seconds (up to 255) to wait before doing the - default. - - config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE - int "Default extra pages allowance" - default "2000" - range 500 32768 - depends on TOI_CORE - help - This value controls the default for the allowance TuxOnIce makes for - drivers to allocate extra memory during the atomic copy. The default - value of 2000 will be okay in most cases. If you are using - DRI, the easiest way to find what value to use is to try to hibernate - and look at how many pages were actually needed in the sysfs entry - /sys/power/tuxonice/debug_info (first number on the last line), adding - a little extra because the value is not always the same. - - config TOI_CHECKSUM - bool "Checksum pageset2" - default n - depends on TOI_CORE - select CRYPTO - select CRYPTO_ALGAPI - select CRYPTO_MD4 - ---help--- - Adds support for checksumming pageset2 pages, to ensure you really get an - atomic copy. Since some filesystems (XFS especially) change metadata even - when there's no other activity, we need this to check for pages that have - been changed while we were saving the page cache. 
If your debugging output - always says no pages were resaved, you may be able to safely disable this - option. - -config TOI - bool - depends on TOI_CORE!=n - default y - -config TOI_ZRAM_SUPPORT - def_bool y - depends on TOI && ZRAM!=n - config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS diff --git a/kernel/power/Makefile b/kernel/power/Makefile index b8d7b68f7..cb880a14c 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,46 +1,13 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -tuxonice_core-y := tuxonice_modules.o - -obj-$(CONFIG_TOI) += tuxonice_builtin.o -obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o \ - tuxonice_copy_before_write.o - -tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o - -# Compile these in after allocation debugging, if used. - -tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \ - tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \ - tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \ - tuxonice_power_off.o tuxonice_atomic_copy.o - -tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o - -tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o - -obj-$(CONFIG_TOI_CORE) += tuxonice_core.o -obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o -obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o - -tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \ - tuxonice_bio_signature.o - -obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o -obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o -obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o - -obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o - obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ - block_io.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o obj-$(CONFIG_PM_AUTOSLEEP) += 
autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c deleted file mode 100644 index 9a58bc258..000000000 --- a/kernel/power/block_io.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * This file provides functions for block I/O operations on swap/file. - * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> - * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * This file is released under the GPLv2. - */ - -#include <linux/bio.h> -#include <linux/kernel.h> -#include <linux/pagemap.h> -#include <linux/swap.h> - -#include "power.h" - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, struct block_device *bdev, sector_t sector, - struct page *page, struct bio **bio_chain) -{ - const int bio_rw = rw | REQ_SYNC; - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", - (unsigned long long)sector); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, hib_resume_bdev, page_off * 
(PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7d3bc724..690f78f21 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -31,7 +31,7 @@ #include <linux/ktime.h> #include <trace/events/power.h> -#include "tuxonice.h" +#include "power.h" static int nocompress; @@ -39,7 +39,7 @@ static int noresume; static int nohibernate; static int resume_wait; static unsigned int resume_delay; -char resume_file[256] = CONFIG_PM_STD_PARTITION; +static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; __visible int in_suspend __nosavedata; @@ -123,7 +123,7 @@ static int hibernation_test(int level) { return 0; } * platform_begin - Call platform to start hibernation. * @platform_mode: Whether or not to use the platform driver. */ -int platform_begin(int platform_mode) +static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->begin() : 0; @@ -133,7 +133,7 @@ int platform_begin(int platform_mode) * platform_end - Call platform to finish transition to the working state. * @platform_mode: Whether or not to use the platform driver. 
*/ -void platform_end(int platform_mode) +static void platform_end(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->end(); @@ -147,7 +147,7 @@ void platform_end(int platform_mode) * if so configured, and return an error code if that fails. */ -int platform_pre_snapshot(int platform_mode) +static int platform_pre_snapshot(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_snapshot() : 0; @@ -162,7 +162,7 @@ int platform_pre_snapshot(int platform_mode) * * This routine is called on one CPU with interrupts disabled. */ -void platform_leave(int platform_mode) +static void platform_leave(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->leave(); @@ -177,7 +177,7 @@ void platform_leave(int platform_mode) * * This routine must be called after platform_prepare(). */ -void platform_finish(int platform_mode) +static void platform_finish(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->finish(); @@ -193,7 +193,7 @@ void platform_finish(int platform_mode) * If the restore fails after this function has been called, * platform_restore_cleanup() must be called. */ -int platform_pre_restore(int platform_mode) +static int platform_pre_restore(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_restore() : 0; @@ -210,7 +210,7 @@ int platform_pre_restore(int platform_mode) * function must be called too, regardless of the result of * platform_pre_restore(). */ -void platform_restore_cleanup(int platform_mode) +static void platform_restore_cleanup(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->restore_cleanup(); @@ -220,7 +220,7 @@ void platform_restore_cleanup(int platform_mode) * platform_recover - Recover from a failure to suspend devices. * @platform_mode: Whether or not to use the platform driver. 
*/ -void platform_recover(int platform_mode) +static void platform_recover(int platform_mode) { if (platform_mode && hibernation_ops && hibernation_ops->recover) hibernation_ops->recover(); @@ -552,7 +552,7 @@ int hibernation_platform_enter(void) error = disable_nonboot_cpus(); if (error) - goto Platform_finish; + goto Enable_cpus; local_irq_disable(); syscore_suspend(); @@ -568,6 +568,8 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); local_irq_enable(); + + Enable_cpus: enable_nonboot_cpus(); Platform_finish: @@ -646,9 +648,6 @@ int hibernate(void) { int error; - if (test_action_state(TOI_REPLACE_SWSUSP)) - return try_tuxonice_hibernate(); - if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); return -EPERM; @@ -738,19 +737,11 @@ int hibernate(void) * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ -int software_resume(void) +static int software_resume(void) { int error; unsigned int flags; - resume_attempted = 1; - - /* - * We can't know (until an image header - if any - is loaded), whether - * we did override swsusp. We therefore ensure that both are tried. - */ - try_tuxonice_resume(); - /* * If the user said "noresume".. bail out early. 
*/ @@ -1137,7 +1128,6 @@ static int __init hibernate_setup(char *str) static int __init noresume_setup(char *str) { noresume = 1; - set_toi_state(TOI_NORESUME_SPECIFIED); return 1; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 86e8157a4..63d395b5d 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -272,7 +272,7 @@ static inline void pm_print_times_init(void) { pm_print_times_enabled = !!initcall_debug; } -#else /* !CONFIG_PP_SLEEP_DEBUG */ +#else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 095ed9f03..caadb566e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -36,12 +36,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) return arch_hibernation_header_restore(info) ? "architecture specific data" : NULL; } -#else -extern char *check_image_kernel(struct swsusp_info *info); #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ -extern int init_header(struct swsusp_info *info); -extern char resume_file[256]; /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] 
@@ -81,8 +77,6 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } -extern struct pbe *restore_pblist; - /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ @@ -169,15 +163,6 @@ extern void swsusp_close(fmode_t); extern int swsusp_unmark(void); #endif -/* kernel/power/block_io.c */ -extern struct block_device *hib_resume_bdev; - -extern int hib_bio_read_page(pgoff_t page_off, void *addr, - struct bio **bio_chain); -extern int hib_bio_write_page(pgoff_t page_off, void *addr, - struct bio **bio_chain); -extern int hib_wait_on_bio_chain(struct bio **bio_chain); - struct timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); @@ -275,31 +260,6 @@ static inline void suspend_thaw_processes(void) } #endif -extern struct page *saveable_page(struct zone *z, unsigned long p); -#ifdef CONFIG_HIGHMEM -struct page *saveable_highmem_page(struct zone *z, unsigned long p); -#else -static -inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} -#endif - -#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe)) -extern struct list_head nosave_regions; - -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. 
- */ - -struct nosave_region { - struct list_head list; - unsigned long start_pfn; - unsigned long end_pfn; -}; - #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ @@ -326,10 +286,3 @@ extern int pm_wake_lock(const char *buf); extern int pm_wake_unlock(const char *buf); #endif /* !CONFIG_PM_WAKELOCKS */ - -#ifdef CONFIG_TOI -unsigned long toi_get_nonconflicting_page(void); -#define BM_END_OF_MAP (~0UL) -#else -#define toi_get_nonconflicting_page() (0) -#endif diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ba9d20ebc..5235dd4e1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -36,9 +36,6 @@ #include <asm/tlbflush.h> #include <asm/io.h> -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" #include "power.h" static int swsusp_page_is_free(struct page *); @@ -101,9 +98,6 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; - if (toi_running) - return (void *) toi_get_nonconflicting_page(); - res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) while (res && swsusp_page_is_free(virt_to_page(res))) { @@ -149,11 +143,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free) page = virt_to_page(addr); - if (toi_running) { - toi__free_page(29, page); - return; - } - swsusp_unset_page_forbidden(page); if (clear_nosave_free) swsusp_unset_page_free(page); @@ -313,15 +302,13 @@ struct bm_position { int node_bit; }; -#define BM_POSITION_SLOTS (NR_CPUS * 2) - struct memory_bitmap { struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects */ - struct bm_position cur[BM_POSITION_SLOTS]; /* most recently used bit position */ + struct bm_position cur; /* most recently used bit position */ }; /* Functions that operate on memory bitmaps */ @@ -486,39 +473,16 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, free_image_page(node->data, clear_nosave_free); } -void 
memory_bm_position_reset(struct memory_bitmap *bm) +static void memory_bm_position_reset(struct memory_bitmap *bm) { - int index; - - for (index = 0; index < BM_POSITION_SLOTS; index++) { - bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, list); - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, + bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; - } + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; } -static void memory_bm_clear_current(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); - -/** - * memory_bm_clear - * @param bm - The bitmap to clear - * - * Only run while single threaded - locking not needed - */ -void memory_bm_clear(struct memory_bitmap *bm) -{ - memory_bm_position_reset(bm); - - while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) { - memory_bm_clear_current(bm, 0); - } - - memory_bm_position_reset(bm); -} static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); struct mem_extent { @@ -631,8 +595,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } bm->p_list = ca.chain; - - memory_bm_position_reset(bm); + memory_bm_position_reset(bm); Exit: free_mem_extents(&mem_extents); return error; @@ -668,24 +631,14 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * It walks the radix tree to find the page which contains the bit for * pfn and returns the bit position in **addr and *bit_nr. 
*/ -int memory_bm_find_bit(struct memory_bitmap *bm, int index, - unsigned long pfn, void **addr, unsigned int *bit_nr) +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) { struct mem_zone_bm_rtree *curr, *zone; struct rtree_node *node; int i, block_nr; - if (!bm->cur[index].zone) { - // Reset - bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, - list); - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, - struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; - } - - zone = bm->cur[index].zone; + zone = bm->cur.zone; if (pfn >= zone->start_pfn && pfn < zone->end_pfn) goto zone_found; @@ -709,8 +662,8 @@ zone_found: * node for our pfn. */ - node = bm->cur[index].node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn) + node = bm->cur.node; + if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; node = zone->rtree; @@ -727,9 +680,9 @@ zone_found: node_found: /* Update last position */ - bm->cur[index].zone = zone; - bm->cur[index].node = node; - bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; /* Set return values */ *addr = node->data; @@ -738,66 +691,66 @@ node_found: return 0; } -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); set_bit(bit, addr); } -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn) +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = 
memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); return error; } -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); clear_bit(bit, addr); } -static void memory_bm_clear_current(struct memory_bitmap *bm, int index) +static void memory_bm_clear_current(struct memory_bitmap *bm) { int bit; - bit = max(bm->cur[index].node_bit - 1, 0); - clear_bit(bit, bm->cur[index].node->data); + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); } -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); return test_bit(bit, addr); } -static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn) +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; - return !memory_bm_find_bit(bm, index, pfn, &addr, &bit); + return !memory_bm_find_bit(bm, pfn, &addr, &bit); } /* @@ -810,25 +763,25 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned * * Returns true if there is a next node, false otherwise. 
*/ -static bool rtree_next_node(struct memory_bitmap *bm, int index) +static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur[index].node = list_entry(bm->cur[index].node->list.next, + bm->cur.node = list_entry(bm->cur.node->list.next, struct rtree_node, list); - if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) { - bm->cur[index].node_pfn += BM_BITS_PER_BLOCK; - bm->cur[index].node_bit = 0; + if (&bm->cur.node->list != &bm->cur.zone->leaves) { + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; touch_softlockup_watchdog(); return true; } /* No more nodes, goto next zone */ - bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next, + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur[index].zone->list != &bm->zones) { - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, + if (&bm->cur.zone->list != &bm->zones) { + bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; return true; } @@ -846,29 +799,38 @@ static bool rtree_next_node(struct memory_bitmap *bm, int index) * It is required to run memory_bm_position_reset() before the * first call to this function. 
*/ -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index) +static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { unsigned long bits, pfn, pages; int bit; - index += NR_CPUS; /* Iteration state is separated from get/set/test */ - do { - pages = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn; - bits = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK); - bit = find_next_bit(bm->cur[index].node->data, bits, - bm->cur[index].node_bit); + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); if (bit < bits) { - pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit; - bm->cur[index].node_bit = bit + 1; + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; return pfn; } - } while (rtree_next_node(bm, index)); + } while (rtree_next_node(bm)); return BM_END_OF_MAP; } -LIST_HEAD(nosave_regions); +/** + * This structure represents a range of page frames the contents of which + * should not be saved during the suspend. + */ + +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static LIST_HEAD(nosave_regions); /** * register_nosave_region - register a range of page frames the contents @@ -927,37 +889,37 @@ static struct memory_bitmap *free_pages_map; void swsusp_set_page_free(struct page *page) { if (free_pages_map) - memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page)); + memory_bm_set_bit(free_pages_map, page_to_pfn(page)); } static int swsusp_page_is_free(struct page *page) { return free_pages_map ? 
- memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0; + memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; } void swsusp_unset_page_free(struct page *page) { if (free_pages_map) - memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page)); + memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); } static void swsusp_set_page_forbidden(struct page *page) { if (forbidden_pages_map) - memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page)); + memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); } int swsusp_page_is_forbidden(struct page *page) { return forbidden_pages_map ? - memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0; + memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; } static void swsusp_unset_page_forbidden(struct page *page) { if (forbidden_pages_map) - memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page)); + memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); } /** @@ -988,7 +950,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) * touch the PFNs for which the error is * returned anyway. */ - mem_bm_set_bit_check(bm, 0, pfn); + mem_bm_set_bit_check(bm, pfn); } } } @@ -1116,7 +1078,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ -struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -1163,6 +1125,11 @@ static unsigned int count_highmem_pages(void) } return n; } +#else +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -1173,7 +1140,7 @@ static unsigned int count_highmem_pages(void) * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. 
*/ -struct page *saveable_page(struct zone *zone, unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -1311,15 +1278,15 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) - memory_bm_set_bit(orig_bm, 0, pfn); + memory_bm_set_bit(orig_bm, pfn); } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); for(;;) { - pfn = memory_bm_next_pfn(orig_bm, 0); + pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; - copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn); + copy_data_page(memory_bm_next_pfn(copy_bm), pfn); } } @@ -1365,8 +1332,8 @@ void swsusp_free(void) memory_bm_position_reset(free_pages_map); loop: - fr_pfn = memory_bm_next_pfn(free_pages_map, 0); - fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0); + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); /* * Find the next bit set in both bitmaps. 
This is guaranteed to @@ -1374,16 +1341,16 @@ loop: */ do { if (fb_pfn < fr_pfn) - fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); if (fr_pfn < fb_pfn) - fr_pfn = memory_bm_next_pfn(free_pages_map, 0); + fr_pfn = memory_bm_next_pfn(free_pages_map); } while (fb_pfn != fr_pfn); if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { struct page *page = pfn_to_page(fr_pfn); - memory_bm_clear_current(forbidden_pages_map, 0); - memory_bm_clear_current(free_pages_map, 0); + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); __free_page(page); goto loop; } @@ -1418,7 +1385,7 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) page = alloc_image_page(mask); if (!page) break; - memory_bm_set_bit(&copy_bm, 0, page_to_pfn(page)); + memory_bm_set_bit(&copy_bm, page_to_pfn(page)); if (PageHighMem(page)) alloc_highmem++; else @@ -1514,7 +1481,7 @@ static unsigned long free_unnecessary_pages(void) memory_bm_position_reset(&copy_bm); while (to_free_normal > 0 || to_free_highmem > 0) { - unsigned long pfn = memory_bm_next_pfn(&copy_bm, 0); + unsigned long pfn = memory_bm_next_pfn(&copy_bm); struct page *page = pfn_to_page(pfn); if (PageHighMem(page)) { @@ -1528,7 +1495,7 @@ static unsigned long free_unnecessary_pages(void) to_free_normal--; alloc_normal--; } - memory_bm_clear_bit(&copy_bm, 0, pfn); + memory_bm_clear_bit(&copy_bm, pfn); swsusp_unset_page_forbidden(page); swsusp_unset_page_free(page); __free_page(page); @@ -1813,7 +1780,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) struct page *page; page = alloc_image_page(__GFP_HIGHMEM); - memory_bm_set_bit(bm, 0, page_to_pfn(page)); + memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; } @@ -1856,7 +1823,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); if (!page) goto err_out; - memory_bm_set_bit(copy_bm, 0, 
page_to_pfn(page)); + memory_bm_set_bit(copy_bm, page_to_pfn(page)); } } @@ -1871,9 +1838,6 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - if (toi_running) - return toi_post_context_save(); - printk(KERN_INFO "PM: Creating hibernation image:\n"); drain_local_pages(NULL); @@ -1921,7 +1885,7 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -char *check_image_kernel(struct swsusp_info *info) +static char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -1942,7 +1906,7 @@ unsigned long snapshot_get_image_size(void) return nr_copy_pages + nr_meta_pages + 1; } -int init_header(struct swsusp_info *info) +static int init_header(struct swsusp_info *info) { memset(info, 0, sizeof(struct swsusp_info)); info->num_physpages = get_num_physpages(); @@ -1964,7 +1928,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - buf[j] = memory_bm_next_pfn(bm, 0); + buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; /* Save page key for data page (s390 only). 
*/ @@ -2015,7 +1979,7 @@ int snapshot_read_next(struct snapshot_handle *handle) } else { struct page *page; - page = pfn_to_page(memory_bm_next_pfn(&copy_bm, 0)); + page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); if (PageHighMem(page)) { /* Highmem pages are copied to the buffer, * because we can't return with a kmapped @@ -2057,7 +2021,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) /* Mark pages that correspond to the "original" pfns as "unsafe" */ memory_bm_position_reset(bm); do { - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { if (likely(pfn_valid(pfn))) swsusp_set_page_free(pfn_to_page(pfn)); @@ -2077,10 +2041,10 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) unsigned long pfn; memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src, 0); + pfn = memory_bm_next_pfn(src); while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, 0, pfn); - pfn = memory_bm_next_pfn(src, 0); + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); } } @@ -2131,8 +2095,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) /* Extract and buffer page key for data page (s390 only). 
*/ page_key_memorize(buf + j); - if (memory_bm_pfn_present(bm, 0, buf[j])) - memory_bm_set_bit(bm, 0, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); else return -EFAULT; } @@ -2175,12 +2139,12 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) unsigned int cnt = 0; memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); while (pfn != BM_END_OF_MAP) { if (PageHighMem(pfn_to_page(pfn))) cnt++; - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); } return cnt; } @@ -2225,7 +2189,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) page = alloc_page(__GFP_HIGHMEM); if (!swsusp_page_is_free(page)) { /* The page is "safe", set its bit the bitmap */ - memory_bm_set_bit(bm, 0, page_to_pfn(page)); + memory_bm_set_bit(bm, page_to_pfn(page)); safe_highmem_pages++; } /* Mark the page as allocated */ @@ -2283,7 +2247,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) /* Copy of the page will be stored in high memory */ kaddr = buffer; - tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0)); + tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); safe_highmem_pages--; last_highmem_page = tmp; pbe->copy_page = tmp; @@ -2454,7 +2418,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; struct page *page; - unsigned long pfn = memory_bm_next_pfn(bm, 0); + unsigned long pfn = memory_bm_next_pfn(bm); if (pfn == BM_END_OF_MAP) return ERR_PTR(-EFAULT); @@ -2641,82 +2605,3 @@ int restore_highmem(void) return 0; } #endif /* CONFIG_HIGHMEM */ - -struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map, - *pageset1_copy_map, *io_map, *page_resave_map, *compare_map; - -int resume_attempted; - -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) -{ - int result; - - 
memory_bm_position_reset(bm); - - do { - result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE); - - if (result) - return result; - } while (rtree_next_node(bm, 0)); - return 0; -} - -int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) -{ - int result; - - memory_bm_position_reset(bm); - - do { - result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE); - - if (result) - return result; - - } while (rtree_next_node(bm, 0)); - return 0; -} - -int memory_bm_space_needed(struct memory_bitmap *bm) -{ - unsigned long bytes = 0; - - memory_bm_position_reset(bm); - do { - bytes += PAGE_SIZE; - } while (rtree_next_node(bm, 0)); - return bytes; -} - -int toi_alloc_bitmap(struct memory_bitmap **bm) -{ - int error; - struct memory_bitmap *bm1; - - bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); - if (!bm1) - return -ENOMEM; - - error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); - if (error) { - printk("Error returned - %d.\n", error); - kfree(bm1); - return -ENOMEM; - } - - *bm = bm1; - return 0; -} - -void toi_free_bitmap(struct memory_bitmap **bm) -{ - if (!*bm) - return; - - memory_bm_free(*bm, 0); - kfree(*bm); - *bm = NULL; -} diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8d7a1ef72..53266b729 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -366,6 +366,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) trace_suspend_resume(TPS("machine_suspend"), state, false); events_check_enabled = false; + } else if (*wakeup) { + error = -EBUSY; } syscore_resume(); } @@ -468,7 +470,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for freeze state," + pr_warning("PM: Unsupported test mode for suspend to idle," "please choose 
none/freezer/devices/platform.\n"); return -EAGAIN; } @@ -488,7 +490,7 @@ static int enter_state(suspend_state_t state) printk("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); + pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); error = suspend_prepare(state); if (error) goto Unlock; @@ -497,7 +499,7 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pr_debug("PM: Entering %s sleep\n", pm_states[state]); + pr_debug("PM: Suspending system (%s)\n", pm_states[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 570aff817..2f30ca91e 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -212,7 +212,84 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -struct block_device *hib_resume_bdev; +static struct block_device *hib_resume_bdev; + +struct hib_bio_batch { + atomic_t count; + wait_queue_head_t wait; + int error; +}; + +static void hib_init_batch(struct hib_bio_batch *hb) +{ + atomic_set(&hb->count, 0); + init_waitqueue_head(&hb->wait); + hb->error = 0; +} + +static void hib_end_io(struct bio *bio, int error) +{ + struct hib_bio_batch *hb = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (!uptodate || error) { + printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); + + if (!error) + error = -EIO; + } + + if (bio_data_dir(bio) == WRITE) + put_page(page); + + if (error && !hb->error) + hb->error = error; + if (atomic_dec_and_test(&hb->count)) + wake_up(&hb->wait); + + bio_put(bio); +} + +static int hib_submit_io(int rw, pgoff_t page_off, void *addr, + 
struct hib_bio_batch *hb) +{ + struct page *page = virt_to_page(addr); + struct bio *bio; + int error = 0; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); + bio->bi_bdev = hib_resume_bdev; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)bio->bi_iter.bi_sector); + bio_put(bio); + return -EFAULT; + } + + if (hb) { + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(rw, bio); + } else { + error = submit_bio_wait(rw, bio); + bio_put(bio); + } + + return error; +} + +static int hib_wait_io(struct hib_bio_batch *hb) +{ + wait_event(hb->wait, atomic_read(&hb->count) == 0); + return hb->error; +} /* * Saving part @@ -222,7 +299,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -231,7 +308,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -271,10 +348,10 @@ static int swsusp_swap_check(void) * write_page - Write one page to given swap location. * @buf: Address we're writing. * @offset: Offset of the swap page we're writing to. 
- * @bio_chain: Link the next write BIO here + * @hb: bio completion batch */ -static int write_page(void *buf, sector_t offset, struct bio **bio_chain) +static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { void *src; int ret; @@ -282,13 +359,13 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (!offset) return -ENOSPC; - if (bio_chain) { + if (hb) { src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); } else { - ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; src = (void *)__get_free_page(__GFP_WAIT | @@ -298,14 +375,14 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) copy_page(src, buf); } else { WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ + hb = NULL; /* Go synchronous */ src = buf; } } } else { src = buf; } - return hib_bio_write_page(offset, src, bio_chain); + return hib_submit_io(WRITE_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -348,7 +425,7 @@ err_close: } static int swap_write_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) + struct hib_bio_batch *hb) { int error = 0; sector_t offset; @@ -356,7 +433,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!handle->cur) return -EINVAL; offset = alloc_swapdev_block(root_swap); - error = write_page(buf, offset, bio_chain); + error = write_page(buf, offset, hb); if (error) return error; handle->cur->entries[handle->k++] = offset; @@ -365,15 +442,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, bio_chain); + error = write_page(handle->cur, handle->cur_swap, hb); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; - if 
(bio_chain && low_free_pages() <= handle->reqd_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); + if (hb && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_io(hb); if (error) goto out; /* @@ -445,23 +522,24 @@ static int save_image(struct swap_map_handle *handle, int ret; int nr_pages; int err2; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; + hib_init_batch(&hb); + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) break; - ret = swap_write_page(handle, data_of(*snapshot), &bio); + ret = swap_write_page(handle, data_of(*snapshot), &hb); if (ret) break; if (!(nr_pages % m)) @@ -469,7 +547,7 @@ static int save_image(struct swap_map_handle *handle, nr_pages / m * 10); nr_pages++; } - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -580,7 +658,7 @@ static int save_image_lzo(struct swap_map_handle *handle, int ret = 0; int nr_pages; int err2; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; size_t off; @@ -589,6 +667,8 @@ static int save_image_lzo(struct swap_map_handle *handle, struct cmp_data *data = NULL; struct crc_data *crc = NULL; + hib_init_batch(&hb); + /* * We'll limit the number of threads for compression to limit memory * footprint. 
@@ -674,7 +754,6 @@ static int save_image_lzo(struct swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { @@ -748,7 +827,7 @@ static int save_image_lzo(struct swap_map_handle *handle, off += PAGE_SIZE) { memcpy(page, data[thr].cmp + off, PAGE_SIZE); - ret = swap_write_page(handle, page, &bio); + ret = swap_write_page(handle, page, &hb); if (ret) goto out_finish; } @@ -759,7 +838,7 @@ static int save_image_lzo(struct swap_map_handle *handle, } out_finish: - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -906,7 +985,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_bio_read_page(offset, tmp->map, NULL); + error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -919,7 +998,7 @@ static int get_swap_reader(struct swap_map_handle *handle, } static int swap_read_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) + struct hib_bio_batch *hb) { sector_t offset; int error; @@ -930,7 +1009,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_bio_read_page(offset, buf, bio_chain); + error = hib_submit_io(READ_SYNC, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -968,27 +1047,28 @@ static int load_image(struct swap_map_handle *handle, int ret = 0; ktime_t start; ktime_t stop; - struct bio *bio; + struct hib_bio_batch hb; int err2; unsigned nr_pages; + hib_init_batch(&hb); + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) break; - ret = swap_read_page(handle, data_of(*snapshot), &bio); + 
ret = swap_read_page(handle, data_of(*snapshot), &hb); if (ret) break; if (snapshot->sync_read) - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) break; if (!(nr_pages % m)) @@ -996,7 +1076,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages / m * 10); nr_pages++; } - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -1067,7 +1147,7 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned int m; int ret = 0; int eof = 0; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; unsigned nr_pages; @@ -1080,6 +1160,8 @@ static int load_image_lzo(struct swap_map_handle *handle, struct dec_data *data = NULL; struct crc_data *crc = NULL; + hib_init_batch(&hb); + /* * We'll limit the number of threads for decompression to limit memory * footprint. @@ -1190,7 +1272,6 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); ret = snapshot_write_next(snapshot); @@ -1199,7 +1280,7 @@ static int load_image_lzo(struct swap_map_handle *handle, for(;;) { for (i = 0; !eof && i < want; i++) { - ret = swap_read_page(handle, page[ring], &bio); + ret = swap_read_page(handle, page[ring], &hb); if (ret) { /* * On real read error, finish. On end of data, @@ -1226,7 +1307,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!asked) break; - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; @@ -1281,7 +1362,7 @@ static int load_image_lzo(struct swap_map_handle *handle, * Wait for more data while we are decompressing. 
*/ if (have < LZO_CMP_PAGES && asked) { - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; @@ -1430,7 +1511,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_bio_read_page(swsusp_resume_block, + error = hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -1438,7 +1519,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -1482,10 +1563,10 @@ int swsusp_unmark(void) { int error; - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h deleted file mode 100644 index 1aff98026..000000000 --- a/kernel/power/tuxonice.h +++ /dev/null @@ -1,260 +0,0 @@ -/* - * kernel/power/tuxonice.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations used throughout swsusp. 
- * - */ - -#ifndef KERNEL_POWER_TOI_H -#define KERNEL_POWER_TOI_H - -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/suspend.h> -#include <linux/fs.h> -#include <asm/setup.h> -#include "tuxonice_pageflags.h" -#include "power.h" - -#define TOI_CORE_VERSION "3.3" -#define TOI_HEADER_VERSION 3 -#define MY_BOOT_KERNEL_DATA_VERSION 4 - -struct toi_boot_kernel_data { - int version; - int size; - unsigned long toi_action; - unsigned long toi_debug_state; - u32 toi_default_console_level; - int toi_io_time[2][2]; - char toi_nosave_commandline[COMMAND_LINE_SIZE]; - unsigned long pages_used[33]; - unsigned long incremental_bytes_in; - unsigned long incremental_bytes_out; - unsigned long compress_bytes_in; - unsigned long compress_bytes_out; - unsigned long pruned_pages; -}; - -extern struct toi_boot_kernel_data toi_bkd; - -/* Location of book kernel data struct in kernel being resumed */ -extern unsigned long boot_kernel_data_buffer; - -/* == Action states == */ - -enum { - TOI_REBOOT, - TOI_PAUSE, - TOI_LOGALL, - TOI_CAN_CANCEL, - TOI_KEEP_IMAGE, - TOI_FREEZER_TEST, - TOI_SINGLESTEP, - TOI_PAUSE_NEAR_PAGESET_END, - TOI_TEST_FILTER_SPEED, - TOI_TEST_BIO, - TOI_NO_PAGESET2, - TOI_IGNORE_ROOTFS, - TOI_REPLACE_SWSUSP, - TOI_PAGESET2_FULL, - TOI_ABORT_ON_RESAVE_NEEDED, - TOI_NO_MULTITHREADED_IO, - TOI_NO_DIRECT_LOAD, /* Obsolete */ - TOI_LATE_CPU_HOTPLUG, /* Obsolete */ - TOI_GET_MAX_MEM_ALLOCD, - TOI_NO_FLUSHER_THREAD, - TOI_NO_PS2_IF_UNNEEDED, - TOI_POST_RESUME_BREAKPOINT, - TOI_NO_READAHEAD, - TOI_TRACE_DEBUG_ON, - TOI_INCREMENTAL_IMAGE, -}; - -extern unsigned long toi_bootflags_mask; - -#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action)) - -/* == Result states == */ - -enum { - TOI_ABORTED, - TOI_ABORT_REQUESTED, - TOI_NOSTORAGE_AVAILABLE, - TOI_INSUFFICIENT_STORAGE, - TOI_FREEZING_FAILED, - TOI_KEPT_IMAGE, - TOI_WOULD_EAT_MEMORY, - TOI_UNABLE_TO_FREE_ENOUGH_MEMORY, - TOI_PM_SEM, - TOI_DEVICE_REFUSED, - TOI_SYSDEV_REFUSED, - 
TOI_EXTRA_PAGES_ALLOW_TOO_SMALL, - TOI_UNABLE_TO_PREPARE_IMAGE, - TOI_FAILED_MODULE_INIT, - TOI_FAILED_MODULE_CLEANUP, - TOI_FAILED_IO, - TOI_OUT_OF_MEMORY, - TOI_IMAGE_ERROR, - TOI_PLATFORM_PREP_FAILED, - TOI_CPU_HOTPLUG_FAILED, - TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */ - TOI_RESAVE_NEEDED, - TOI_CANT_SUSPEND, - TOI_NOTIFIERS_PREPARE_FAILED, - TOI_PRE_SNAPSHOT_FAILED, - TOI_PRE_RESTORE_FAILED, - TOI_USERMODE_HELPERS_ERR, - TOI_CANT_USE_ALT_RESUME, - TOI_HEADER_TOO_BIG, - TOI_WAKEUP_EVENT, - TOI_SYSCORE_REFUSED, - TOI_DPM_PREPARE_FAILED, - TOI_DPM_SUSPEND_FAILED, - TOI_NUM_RESULT_STATES /* Used in printing debug info only */ -}; - -extern unsigned long toi_result; - -#define set_result_state(bit) (test_and_set_bit(bit, &toi_result)) -#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \ - test_and_set_bit(bit, &toi_result)) -#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result)) -#define test_result_state(bit) (test_bit(bit, &toi_result)) - -/* == Debug sections and levels == */ - -/* debugging levels. 
*/ -enum { - TOI_STATUS = 0, - TOI_ERROR = 2, - TOI_LOW, - TOI_MEDIUM, - TOI_HIGH, - TOI_VERBOSE, -}; - -enum { - TOI_ANY_SECTION, - TOI_EAT_MEMORY, - TOI_IO, - TOI_HEADER, - TOI_WRITER, - TOI_MEMORY, - TOI_PAGEDIR, - TOI_COMPRESS, - TOI_BIO, -}; - -#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) -#define clear_debug_state(bit) \ - (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) -#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) - -/* == Steps in hibernating == */ - -enum { - STEP_HIBERNATE_PREPARE_IMAGE, - STEP_HIBERNATE_SAVE_IMAGE, - STEP_HIBERNATE_POWERDOWN, - STEP_RESUME_CAN_RESUME, - STEP_RESUME_LOAD_PS1, - STEP_RESUME_DO_RESTORE, - STEP_RESUME_READ_PS2, - STEP_RESUME_GO, - STEP_RESUME_ALT_IMAGE, - STEP_CLEANUP, - STEP_QUIET_CLEANUP -}; - -/* == TuxOnIce states == - (see also include/linux/suspend.h) */ - -#define get_toi_state() (toi_state) -#define restore_toi_state(saved_state) \ - do { toi_state = saved_state; } while (0) - -/* == Module support == */ - -struct toi_core_fns { - int (*post_context_save)(void); - unsigned long (*get_nonconflicting_page)(void); - int (*try_hibernate)(void); - void (*try_resume)(void); -}; - -extern struct toi_core_fns *toi_core_fns; - -/* == All else == */ -#define KB(x) ((x) << (PAGE_SHIFT - 10)) -#define MB(x) ((x) >> (20 - PAGE_SHIFT)) - -extern int toi_start_anything(int toi_or_resume); -extern void toi_finish_anything(int toi_or_resume); - -extern int save_image_part1(void); -extern int toi_atomic_restore(void); - -extern int toi_try_hibernate(void); -extern void toi_try_resume(void); - -extern int __toi_post_context_save(void); - -extern unsigned int nr_hibernates; -extern char alt_resume_param[256]; - -extern void copyback_post(void); -extern int toi_hibernate(void); -extern unsigned long extra_pd1_pages_used; - -#define SECTOR_SIZE 512 - -extern void toi_early_boot_message(int can_erase_image, int default_answer, - char *warning_reason, ...); - -extern int 
do_check_can_resume(void); -extern int do_toi_step(int step); -extern int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug); - -extern char tuxonice_signature[9]; - -extern int toi_start_other_threads(void); -extern void toi_stop_other_threads(void); - -extern int toi_trace_index; -#define TOI_TRACE_DEBUG(PFN, DESC, ...) \ - do { \ - if (test_action_state(TOI_TRACE_DEBUG_ON)) { \ - printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \ - } \ - } while(0) - -#ifdef CONFIG_TOI_KEEP_IMAGE -#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE)) -#else -#define toi_keeping_image (0) -#endif - -#ifdef CONFIG_TOI_INCREMENTAL -extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose); -extern int toi_reset_dirtiness(int verbose); -extern void toi_cbw_write(void); -extern void toi_cbw_restore(void); -extern int toi_allocate_cbw_data(void); -extern void toi_free_cbw_data(void); -extern int toi_cbw_init(void); -extern void toi_mark_tasks_cbw(void); -#else -static inline int toi_reset_dirtiness(int verbose) { return 0; } -#define toi_cbw_write() do { } while(0) -#define toi_cbw_restore() do { } while(0) -#define toi_allocate_cbw_data() do { } while(0) -#define toi_free_cbw_data() do { } while(0) -static inline int toi_cbw_init(void) { return 0; } -#endif -#endif diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c deleted file mode 100644 index 5729240d8..000000000 --- a/kernel/power/tuxonice_alloc.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.c - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - */ - -#include <linux/export.h> -#include <linux/slab.h> -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -#define TOI_ALLOC_PATHS 41 - -static DEFINE_MUTEX(toi_alloc_mutex); - -static struct toi_module_ops toi_alloc_ops; - -static int toi_fail_num; - -static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], - toi_free_count[TOI_ALLOC_PATHS], - toi_test_count[TOI_ALLOC_PATHS], - toi_fail_count[TOI_ALLOC_PATHS]; -static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; -static int cur_allocd, max_allocd; - -static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { - "", /* 0 */ - "get_io_info_struct", - "extent", - "extent (loading chain)", - "userui channel", - "userui arg", /* 5 */ - "attention list metadata", - "extra pagedir memory metadata", - "bdev metadata", - "extra pagedir memory", - "header_locations_read", /* 10 */ - "bio queue", - "prepare_readahead", - "i/o buffer", - "writer buffer in bio_init", - "checksum buffer", /* 15 */ - "compression buffer", - "filewriter signature op", - "set resume param alloc1", - "set resume param alloc2", - "debugging info buffer", /* 20 */ - "check can resume buffer", - "write module config buffer", - "read module config buffer", - "write image header buffer", - "read pageset1 buffer", /* 25 */ - "get_have_image_data buffer", - "checksum page", - "worker rw loop", - "get nonconflicting page", - "ps1 load addresses", /* 30 */ - "remove swap image", - "swap image exists", - "swap parse sig location", - "sysfs kobj", - "swap mark resume attempted buffer", /* 35 */ - "cluster member", - "boot kernel data buffer", - "setting swap signature", - "block i/o bdev struct", - "copy before write", /* 40 */ -}; - -#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ - do { \ - BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ - \ - if (FAIL_NUM == toi_fail_num) { \ - atomic_inc(&toi_test_count[FAIL_NUM]); \ - toi_fail_num = 0; \ - return FAIL_VAL; \ - } \ - } while (0) - -static void 
alloc_update_stats(int fail_num, void *result, int size) -{ - if (!result) { - atomic_inc(&toi_fail_count[fail_num]); - return; - } - - atomic_inc(&toi_alloc_count[fail_num]); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - toi_cur_allocd[fail_num]++; - cur_allocd += size; - if (unlikely(cur_allocd > max_allocd)) { - int i; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - toi_max_allocd[i] = toi_cur_allocd[i]; - max_allocd = cur_allocd; - } - mutex_unlock(&toi_alloc_mutex); - } -} - -static void free_update_stats(int fail_num, int size) -{ - BUG_ON(fail_num >= TOI_ALLOC_PATHS); - atomic_inc(&toi_free_count[fail_num]); - if (unlikely(atomic_read(&toi_free_count[fail_num]) > - atomic_read(&toi_alloc_count[fail_num]))) - dump_stack(); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - cur_allocd -= size; - toi_cur_allocd[fail_num]--; - mutex_unlock(&toi_alloc_mutex); - } -} - -void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) -{ - void *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - result = kzalloc(size, flags); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, result, size); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order) -{ - unsigned long result; - - mask |= ___GFP_TOI_NOTRACK; - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - result = __get_free_pages(mask, order); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, - PAGE_SIZE << order); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -struct page *toi_alloc_page(int fail_num, gfp_t mask) -{ - struct page *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - mask |= ___GFP_TOI_NOTRACK; - result = alloc_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - 
if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) -{ - unsigned long result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - mask |= ___GFP_TOI_NOTRACK; - result = get_zeroed_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -void toi_kfree(int fail_num, const void *arg, int size) -{ - if (arg && toi_alloc_ops.enabled) - free_update_stats(fail_num, size); - - if (fail_num == toi_trace_allocs) - dump_stack(); - kfree(arg); -} - -void toi_free_page(int fail_num, unsigned long virt) -{ - if (virt && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - free_page(virt); -} - -void toi__free_page(int fail_num, struct page *page) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_page(page); -} - -void toi_free_pages(int fail_num, struct page *page, int order) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE << order); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_pages(page, order); -} - -void toi_alloc_print_debug_stats(void) -{ - int i, header_done = 0; - - if (!toi_alloc_ops.enabled) - return; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - if (atomic_read(&toi_alloc_count[i]) != - atomic_read(&toi_free_count[i])) { - if (!header_done) { - printk(KERN_INFO "Idx Allocs Frees Tests " - " Fails Max Description\n"); - header_done = 1; - } - - printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, - atomic_read(&toi_alloc_count[i]), - atomic_read(&toi_free_count[i]), - atomic_read(&toi_test_count[i]), - atomic_read(&toi_fail_count[i]), - toi_max_allocd[i], - toi_alloc_desc[i]); - } -} - -static int toi_alloc_initialise(int starting_cycle) -{ - int i; - - if 
(!starting_cycle) - return 0; - - if (toi_trace_allocs) - dump_stack(); - - for (i = 0; i < TOI_ALLOC_PATHS; i++) { - atomic_set(&toi_alloc_count[i], 0); - atomic_set(&toi_free_count[i], 0); - atomic_set(&toi_test_count[i], 0); - atomic_set(&toi_fail_count[i], 0); - toi_cur_allocd[i] = 0; - toi_max_allocd[i] = 0; - }; - - max_allocd = 0; - cur_allocd = 0; - return 0; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), - SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, - NULL), - SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, - TOI_GET_MAX_MEM_ALLOCD, 0), - SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, - NULL) -}; - -static struct toi_module_ops toi_alloc_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "allocation debugging", - .directory = "alloc", - .module = THIS_MODULE, - .early = 1, - .initialise = toi_alloc_initialise, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_alloc_init(void) -{ - int result = toi_register_module(&toi_alloc_ops); - return result; -} - -void toi_alloc_exit(void) -{ - toi_unregister_module(&toi_alloc_ops); -} diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h deleted file mode 100644 index 28c5af193..000000000 --- a/kernel/power/tuxonice_alloc.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.h - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - */ - -#include <linux/slab.h> -#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) -#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) - -#ifdef CONFIG_PM_DEBUG -extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); -extern void toi_kfree(int fail_num, const void *arg, int size); - -extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order); -#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) -extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); -extern void toi_free_page(int fail_num, unsigned long buf); -extern void toi__free_page(int fail_num, struct page *page); -extern void toi_free_pages(int fail_num, struct page *page, int order); -extern struct page *toi_alloc_page(int fail_num, gfp_t mask); -extern int toi_alloc_init(void); -extern void toi_alloc_exit(void); - -extern void toi_alloc_print_debug_stats(void); - -#else /* CONFIG_PM_DEBUG */ - -#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) -#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) - -#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) -#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) -#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) -#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) -#define toi__free_page(FAIL, PAGE) __free_page(PAGE) -#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) -#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) -static inline int toi_alloc_init(void) -{ - return 0; -} - -static inline void toi_alloc_exit(void) { } - -static inline void toi_alloc_print_debug_stats(void) { } - -#endif - -extern int toi_trace_allocs; diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c deleted file mode 100644 index 7b9886f54..000000000 --- a/kernel/power/tuxonice_atomic_copy.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.c - * - * 
Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/cpu.h> -#include <linux/freezer.h> -#include <linux/console.h> -#include <linux/syscore_ops.h> -#include <linux/ftrace.h> -#include <asm/suspend.h> -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_power_off.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" - -unsigned long extra_pd1_pages_used; - -/** - * free_pbe_list - free page backup entries used by the atomic copy code. - * @list: List to free. - * @highmem: Whether the list is in highmem. - * - * Normally, this function isn't used. If, however, we need to abort before - * doing the atomic copy, we use this to free the pbes previously allocated. 
- **/ -static void free_pbe_list(struct pbe **list, int highmem) -{ - while (*list) { - int i; - struct pbe *free_pbe, *next_page = NULL; - struct page *page; - - if (highmem) { - page = (struct page *) *list; - free_pbe = (struct pbe *) kmap(page); - } else { - page = virt_to_page(*list); - free_pbe = *list; - } - - for (i = 0; i < PBES_PER_PAGE; i++) { - if (!free_pbe) - break; - if (highmem) - toi__free_page(29, free_pbe->address); - else - toi_free_page(29, - (unsigned long) free_pbe->address); - free_pbe = free_pbe->next; - } - - if (highmem) { - if (free_pbe) - next_page = free_pbe; - kunmap(page); - } else { - if (free_pbe) - next_page = free_pbe; - } - - toi__free_page(29, page); - *list = (struct pbe *) next_page; - }; -} - -/** - * copyback_post - post atomic-restore actions - * - * After doing the atomic restore, we have a few more things to do: - * 1) We want to retain some values across the restore, so we now copy - * these from the nosave variables to the normal ones. - * 2) Set the status flags. - * 3) Resume devices. - * 4) Tell userui so it can redraw & restore settings. - * 5) Reread the page cache. - **/ -void copyback_post(void) -{ - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - toi_post_atomic_restore_modules(bkd); - - toi_cond_pause(1, "About to reload secondary pagedir."); - - if (read_pageset2(0)) - panic("Unable to successfully reread the page cache."); - - /* - * If the user wants to sleep again after resuming from full-off, - * it's most likely to be in order to suspend to ram, so we'll - * do this check after loading pageset2, to give them the fastest - * wakeup when they are ready to use the computer again. 
- */ - toi_check_resleep(); - - if (test_action_state(TOI_INCREMENTAL_IMAGE)) - toi_reset_dirtiness(1); -} - -/** - * toi_copy_pageset1 - do the atomic copy of pageset1 - * - * Make the atomic copy of pageset1. We can't use copy_page (as we once did) - * because we can't be sure what side effects it has. On my old Duron, with - * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt - * count at resume time 4 instead of 3. - * - * We don't want to call kmap_atomic unconditionally because it has the side - * effect of incrementing the preempt count, which will leave it one too high - * post resume (the page containing the preempt count will be copied after - * its incremented. This is essentially the same problem. - **/ -void toi_copy_pageset1(void) -{ - int i; - unsigned long source_index, dest_index; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - - for (i = 0; i < pagedir1.size; i++) { - unsigned long *origvirt, *copyvirt; - struct page *origpage, *copypage; - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1, - was_present1, was_present2; - - origpage = pfn_to_page(source_index); - copypage = pfn_to_page(dest_index); - - origvirt = PageHighMem(origpage) ? - kmap_atomic(origpage) : - page_address(origpage); - - copyvirt = PageHighMem(copypage) ? 
- kmap_atomic(copypage) : - page_address(copypage); - - was_present1 = kernel_page_present(origpage); - if (!was_present1) - kernel_map_pages(origpage, 1, 1); - - was_present2 = kernel_page_present(copypage); - if (!was_present2) - kernel_map_pages(copypage, 1, 1); - - while (loop >= 0) { - *(copyvirt + loop) = *(origvirt + loop); - loop--; - } - - if (!was_present1) - kernel_map_pages(origpage, 1, 0); - - if (!was_present2) - kernel_map_pages(copypage, 1, 0); - - if (PageHighMem(origpage)) - kunmap_atomic(origvirt); - - if (PageHighMem(copypage)) - kunmap_atomic(copyvirt); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - } -} - -/** - * __toi_post_context_save - steps after saving the cpu context - * - * Steps taken after saving the CPU state to make the actual - * atomic copy. - * - * Called from swsusp_save in snapshot.c via toi_post_context_save. - **/ -int __toi_post_context_save(void) -{ - unsigned long old_ps1_size = pagedir1.size; - - check_checksums(); - - free_checksum_pages(); - - toi_recalculate_image_contents(1); - - extra_pd1_pages_used = pagedir1.size > old_ps1_size ? - pagedir1.size - old_ps1_size : 0; - - if (extra_pd1_pages_used > extra_pd1_pages_allowance) { - printk(KERN_INFO "Pageset1 has grown by %lu pages. " - "extra_pages_allowance is currently only %lu.\n", - pagedir1.size - old_ps1_size, - extra_pd1_pages_allowance); - - /* - * Highlevel code will see this, clear the state and - * retry if we haven't already done so twice. - */ - if (any_to_free(1)) { - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - if (try_allocate_extra_memory()) { - printk(KERN_INFO "Failed to allocate the extra memory" - " needed. 
Restarting the process."); - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - printk(KERN_INFO "However it looks like there's enough" - " free ram and storage to handle this, so " - " continuing anyway."); - /* - * What if try_allocate_extra_memory above calls - * toi_allocate_extra_pagedir_memory and it allocs a new - * slab page via toi_kzalloc which should be in ps1? So... - */ - toi_recalculate_image_contents(1); - } - - if (!test_action_state(TOI_TEST_FILTER_SPEED) && - !test_action_state(TOI_TEST_BIO)) - toi_copy_pageset1(); - - return 0; -} - -/** - * toi_hibernate - high level code for doing the atomic copy - * - * High-level code which prepares to do the atomic copy. Loosely based - * on the swsusp version, but with the following twists: - * - We set toi_running so the swsusp code uses our code paths. - * - We give better feedback regarding what goes wrong if there is a - * problem. - * - We use an extra function to call the assembly, just in case this code - * is in a module (return address). - **/ -int toi_hibernate(void) -{ - int error; - - error = toi_lowlevel_builtin(); - - if (!error) { - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - /* - * The boot kernel's data may be larger (newer version) or - * smaller (older version) than ours. Copy the minimum - * of the two sizes, so that we don't overwrite valid values - * from pre-atomic copy. - */ - - memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, - min_t(int, sizeof(struct toi_boot_kernel_data), - bkd->size)); - } - - return error; -} - -/** - * toi_atomic_restore - prepare to do the atomic restore - * - * Get ready to do the atomic restore. This part gets us into the same - * state we are in prior to do calling do_toi_lowlevel while - * hibernating: hot-unplugging secondary cpus and freeze processes, - * before starting the thread that will do the restore. 
- **/ -int toi_atomic_restore(void) -{ - int error; - - toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore."); - - memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line, - strlen(saved_command_line)); - - toi_pre_atomic_restore_modules(&toi_bkd); - - if (add_boot_kernel_data_pbe()) - goto Failed; - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - if (toi_go_atomic(PMSG_QUIESCE, 0)) - goto Failed; - - /* We'll ignore saved state, but this gets preempt count (etc) right */ - save_processor_state(); - - error = swsusp_arch_resume(); - /* - * Code below is only ever reached in case of failure. Otherwise - * execution continues at place where swsusp_arch_suspend was called. - * - * We don't know whether it's safe to continue (this shouldn't happen), - * so lets err on the side of caution. - */ - BUG(); - -Failed: - free_pbe_list(&restore_pblist, 0); -#ifdef CONFIG_HIGHMEM - free_pbe_list(&restore_highmem_pblist, 1); -#endif - return 1; -} - -/** - * toi_go_atomic - do the actual atomic copy/restore - * @state: The state to use for dpm_suspend_start & power_down calls. - * @suspend_time: Whether we're suspending or resuming. 
- **/ -int toi_go_atomic(pm_message_t state, int suspend_time) -{ - if (suspend_time) { - if (platform_begin(1)) { - set_abort_result(TOI_PLATFORM_PREP_FAILED); - toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); - return 1; - } - - if (dpm_prepare(PMSG_FREEZE)) { - set_abort_result(TOI_DPM_PREPARE_FAILED); - dpm_complete(PMSG_RECOVER); - toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); - return 1; - } - } - - suspend_console(); - pm_restrict_gfp_mask(); - - if (suspend_time) { - if (dpm_suspend(state)) { - set_abort_result(TOI_DPM_SUSPEND_FAILED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); - return 1; - } - } else { - if (dpm_suspend_start(state)) { - set_abort_result(TOI_DPM_SUSPEND_FAILED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); - return 1; - } - } - - /* At this point, dpm_suspend_start() has been called, but *not* - * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. 
- */ - - if (dpm_suspend_end(state)) { - set_abort_result(TOI_DEVICE_REFUSED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); - return 1; - } - - if (suspend_time) { - if (platform_pre_snapshot(1)) - set_abort_result(TOI_PRE_SNAPSHOT_FAILED); - } else { - if (platform_pre_restore(1)) - set_abort_result(TOI_PRE_RESTORE_FAILED); - } - - if (test_result_state(TOI_ABORTED)) { - toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); - return 1; - } - - if (disable_nonboot_cpus()) { - set_abort_result(TOI_CPU_HOTPLUG_FAILED); - toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, - suspend_time, 1); - return 1; - } - - local_irq_disable(); - - if (syscore_suspend()) { - set_abort_result(TOI_SYSCORE_REFUSED); - toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); - return 1; - } - - if (suspend_time && pm_wakeup_pending()) { - set_abort_result(TOI_WAKEUP_EVENT); - toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); - return 1; - } - return 0; -} - -/** - * toi_end_atomic - post atomic copy/restore routines - * @stage: What step to start at. - * @suspend_time: Whether we're suspending or resuming. - * @error: Whether we're recovering from an error. - **/ -void toi_end_atomic(int stage, int suspend_time, int error) -{ - pm_message_t msg = suspend_time ? (error ? 
PMSG_RECOVER : PMSG_THAW) : - PMSG_RESTORE; - - switch (stage) { - case ATOMIC_ALL_STEPS: - if (!suspend_time) { - events_check_enabled = false; - } - platform_leave(1); - case ATOMIC_STEP_SYSCORE_RESUME: - syscore_resume(); - case ATOMIC_STEP_IRQS: - local_irq_enable(); - case ATOMIC_STEP_CPU_HOTPLUG: - enable_nonboot_cpus(); - case ATOMIC_STEP_PLATFORM_FINISH: - if (!suspend_time && error & 2) - platform_restore_cleanup(1); - else - platform_finish(1); - dpm_resume_start(msg); - case ATOMIC_STEP_DEVICE_RESUME: - if (suspend_time && (error & 2)) - platform_recover(1); - dpm_resume(msg); - if (!toi_in_suspend()) { - dpm_resume_end(PMSG_RECOVER); - } - if (error || !toi_in_suspend()) { - pm_restore_gfp_mask(); - } - resume_console(); - case ATOMIC_STEP_DPM_COMPLETE: - dpm_complete(msg); - case ATOMIC_STEP_PLATFORM_END: - platform_end(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); - } -} diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h deleted file mode 100644 index 2de0e3b49..000000000 --- a/kernel/power/tuxonice_atomic_copy.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.h - * - * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. 
- */ - -enum { - ATOMIC_ALL_STEPS, - ATOMIC_STEP_SYSCORE_RESUME, - ATOMIC_STEP_IRQS, - ATOMIC_STEP_CPU_HOTPLUG, - ATOMIC_STEP_PLATFORM_FINISH, - ATOMIC_STEP_DEVICE_RESUME, - ATOMIC_STEP_DPM_COMPLETE, - ATOMIC_STEP_PLATFORM_END, -}; - -int toi_go_atomic(pm_message_t state, int toi_time); -void toi_end_atomic(int stage, int toi_time, int error); - -extern void platform_recover(int platform_mode); diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h deleted file mode 100644 index 201e3cd47..000000000 --- a/kernel/power/tuxonice_bio.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * kernel/power/tuxonice_bio.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. - */ - -#include <linux/buffer_head.h> -#include "tuxonice_extent.h" - -void toi_put_extent_chain(struct hibernate_extent_chain *chain); -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end); - -struct hibernate_extent_saved_state { - int extent_num; - struct hibernate_extent *extent_ptr; - unsigned long offset; -}; - -struct toi_bdev_info { - struct toi_bdev_info *next; - struct hibernate_extent_chain blocks; - struct block_device *bdev; - struct toi_module_ops *allocator; - int allocator_index; - struct hibernate_extent_chain allocations; - char name[266]; /* "swap on " or "file " + up to 256 chars */ - - /* Saved in header */ - char uuid[17]; - dev_t dev_t; - int prio; - int bmap_shift; - int blocks_per_page; - unsigned long pages_used; - struct hibernate_extent_saved_state saved_state[4]; -}; - -struct toi_extent_iterate_state { - struct toi_bdev_info *current_chain; - int num_chains; - int saved_chain_number[4]; - struct toi_bdev_info *saved_chain_ptr[4]; -}; - -/* - * Our exported interface so the swapwriter and filewriter don't - * need these functions 
duplicated. - */ -struct toi_bio_ops { - int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, - struct page *page); - int (*register_storage)(struct toi_bdev_info *new); - void (*free_storage)(void); -}; - -struct toi_allocator_ops { - unsigned long (*toi_swap_storage_available) (void); -}; - -extern struct toi_bio_ops toi_bio_ops; - -extern char *toi_writer_buffer; -extern int toi_writer_buffer_posn; - -struct toi_bio_allocator_ops { - int (*register_storage) (void); - unsigned long (*storage_available)(void); - int (*allocate_storage) (struct toi_bdev_info *, unsigned long); - int (*bmap) (struct toi_bdev_info *); - void (*free_storage) (struct toi_bdev_info *); - unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used); -}; diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c deleted file mode 100644 index 364fae9db..000000000 --- a/kernel/power/tuxonice_bio_chains.c +++ /dev/null @@ -1,1126 +0,0 @@ -/* - * kernel/power/tuxonice_bio_devinfo.c - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - */ - -#include <linux/mm_types.h> -#include "tuxonice_bio.h" -#include "tuxonice_bio_internal.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" -#include "tuxonice_io.h" - -static struct toi_bdev_info *prio_chain_head; -static int num_chains; - -/* Pointer to current entry being loaded/saved. */ -struct toi_extent_iterate_state toi_writer_posn; - -#define metadata_size (sizeof(struct toi_bdev_info) - \ - offsetof(struct toi_bdev_info, uuid)) - -/* - * After section 0 (header) comes 2 => next_section[0] = 2 - */ -static int next_section[3] = { 2, 3, 1 }; - -/** - * dump_block_chains - print the contents of the bdev info array. 
- **/ -void dump_block_chains(void) -{ - int i = 0; - int j; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - struct hibernate_extent *this = cur_chain->blocks.first; - - printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); - - while (this) { - printk(KERN_CONT " [%lu-%lu]%s", this->start, - this->end, this->next ? "," : ""); - this = this->next; - } - - printk("\n"); - cur_chain = cur_chain->next; - i++; - } - - printk(KERN_DEBUG "Saved states:\n"); - for (i = 0; i < 4; i++) { - printk(KERN_DEBUG "Slot %d: Chain %d.\n", - i, toi_writer_posn.saved_chain_number[i]); - - cur_chain = prio_chain_head; - j = 0; - while (cur_chain) { - printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", - j, cur_chain->saved_state[i].extent_num, - cur_chain->saved_state[i].offset); - cur_chain = cur_chain->next; - j++; - } - printk(KERN_CONT "\n"); - } -} - -/** - * - **/ -static void toi_extent_chain_next(void) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain; - - if (!this->blocks.current_extent) - return; - - if (this->blocks.current_offset == this->blocks.current_extent->end) { - if (this->blocks.current_extent->next) { - this->blocks.current_extent = - this->blocks.current_extent->next; - this->blocks.current_offset = - this->blocks.current_extent->start; - } else { - this->blocks.current_extent = NULL; - this->blocks.current_offset = 0; - } - } else - this->blocks.current_offset++; -} - -/** - * - */ - -static struct toi_bdev_info *__find_next_chain_same_prio(void) -{ - struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; - struct toi_bdev_info *this = start_chain; - int orig_prio = this->prio; - - do { - this = this->next; - - if (!this) - this = prio_chain_head; - - /* Back on original chain? Use it again. 
*/ - if (this == start_chain) - return start_chain; - - } while (!this->blocks.current_extent || this->prio != orig_prio); - - return this; -} - -static void find_next_chain(void) -{ - struct toi_bdev_info *this; - - this = __find_next_chain_same_prio(); - - /* - * If we didn't get another chain of the same priority that we - * can use, look for the next priority. - */ - while (this && !this->blocks.current_extent) - this = this->next; - - toi_writer_posn.current_chain = this; -} - -/** - * toi_extent_state_next - go to the next extent - * @blocks: The number of values to progress. - * @stripe_mode: Whether to spread usage across all chains. - * - * Given a state, progress to the next valid entry. We may begin in an - * invalid state, as we do when invoked after extent_state_goto_start below. - * - * When using compression and expected_compression > 0, we let the image size - * be larger than storage, so we can validly run out of data to return. - **/ -static unsigned long toi_extent_state_next(int blocks, int current_stream) -{ - int i; - - if (!toi_writer_posn.current_chain) - return -ENOSPC; - - /* Assume chains always have lengths that are multiples of @blocks */ - for (i = 0; i < blocks; i++) - toi_extent_chain_next(); - - /* The header stream is not striped */ - if (current_stream || - !toi_writer_posn.current_chain->blocks.current_extent) - find_next_chain(); - - return toi_writer_posn.current_chain ? 
0 : -ENOSPC; -} - -static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) -{ - struct toi_bdev_info **prev_ptr; - struct toi_bdev_info *cur; - - /* Loop through the existing chain, finding where to insert it */ - prev_ptr = &prio_chain_head; - cur = prio_chain_head; - - while (cur && cur->prio >= this->prio) { - prev_ptr = &cur->next; - cur = cur->next; - } - - this->next = *prev_ptr; - *prev_ptr = this; - - this = prio_chain_head; - while (this) - this = this->next; - num_chains++; -} - -/** - * toi_extent_state_goto_start - reinitialize an extent chain iterator - * @state: Iterator to reinitialize - **/ -void toi_extent_state_goto_start(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current extent to %p.", this->blocks.first); - this->blocks.current_extent = this->blocks.first; - if (this->blocks.current_extent) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current offset to %lu.", - this->blocks.current_extent->start); - this->blocks.current_offset = - this->blocks.current_extent->start; - } - - this = this->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", - prio_chain_head); - toi_writer_posn.current_chain = prio_chain_head; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); -} - -/** - * toi_extent_state_save - save state of the iterator - * @state: Current state of the chain - * @saved_state: Iterator to populate - * - * Given a state and a struct hibernate_extent_state_store, save the current - * position in a format that can be used with relocated chains (at - * resume time). 
- **/ -void toi_extent_state_save(int slot) -{ - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent *extent; - struct hibernate_extent_saved_state *chain_state; - int i = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", - slot); - - if (!toi_writer_posn.current_chain) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " - "chain_num = -1."); - toi_writer_posn.saved_chain_number[slot] = -1; - return; - } - - while (cur_chain) { - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - chain_state->offset = cur_chain->blocks.current_offset; - - if (toi_writer_posn.current_chain == cur_chain) { - toi_writer_posn.saved_chain_number[slot] = i; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " - "we were on => chain_num is %d.", i); - } - - if (!cur_chain->blocks.current_extent) { - chain_state->extent_num = 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " - "for this chain => extent_num %d is 0.", - i); - cur_chain = cur_chain->next; - continue; - } - - extent = cur_chain->blocks.first; - chain_state->extent_num = 1; - - while (extent != cur_chain->blocks.current_extent) { - chain_state->extent_num++; - extent = extent->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, - chain_state->extent_num); - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Completed saving extent state slot %d.", slot); -} - -/** - * toi_extent_state_restore - restore the position saved by extent_state_save - * @state: State to populate - * @saved_state: Iterator saved to restore - **/ -void toi_extent_state_restore(int slot) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent_saved_state *chain_state; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "toi_extent_state_restore - slot %d.", slot); - - if 
(toi_writer_posn.saved_chain_number[slot] == -1) { - toi_writer_posn.current_chain = NULL; - return; - } - - while (cur_chain) { - int posn; - int j; - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - posn = chain_state->extent_num; - - cur_chain->blocks.current_extent = cur_chain->blocks.first; - cur_chain->blocks.current_offset = chain_state->offset; - - if (i == toi_writer_posn.saved_chain_number[slot]) { - toi_writer_posn.current_chain = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found current chain."); - } - - for (j = 0; j < 4; j++) - if (i == toi_writer_posn.saved_chain_number[j]) { - toi_writer_posn.saved_chain_ptr[j] = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found saved chain ptr %d (%p) (offset" - " %d).", j, cur_chain, - cur_chain->saved_state[j].offset); - } - - if (posn) { - while (--posn) - cur_chain->blocks.current_extent = - cur_chain->blocks.current_extent->next; - } else - cur_chain->blocks.current_extent = NULL; - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); -} - -/* - * Storage needed - * - * Returns amount of space in the image header required - * for the chain data. This ignores the links between - * pages, which we factor in when allocating the space. 
- */ -int toi_bio_devinfo_storage_needed(void) -{ - int result = sizeof(num_chains); - struct toi_bdev_info *chain = prio_chain_head; - - while (chain) { - result += metadata_size; - - /* Chain size */ - result += sizeof(int); - - /* Extents */ - result += (2 * sizeof(unsigned long) * - chain->blocks.num_extents); - - chain = chain->next; - } - - result += 4 * sizeof(int); - return result; -} - -static unsigned long chain_pages_used(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this = chain->blocks.first; - struct hibernate_extent_saved_state *state = &chain->saved_state[3]; - unsigned long size = 0; - int extent_idx = 1; - - if (!state->extent_num) { - if (!this) - return 0; - else - return chain->blocks.size; - } - - while (extent_idx < state->extent_num) { - size += (this->end - this->start + 1); - this = this->next; - extent_idx++; - } - - /* We didn't use the one we're sitting on, so don't count it */ - return size + state->offset - this->start; -} - -void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain) -{ - unsigned long used = chain_pages_used(chain); - - /* Free the storage */ - unsigned long first_freed = 0; - - if (chain->allocator->bio_allocator_ops->free_unused_storage) - first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used); - - printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed); - - /* Adjust / free the extents. */ - toi_put_extent_chain_from(&chain->blocks, first_freed); - - { - struct hibernate_extent *this = chain->blocks.first; - while (this) { - printk("Extent %lx-%lx.\n", this->start, this->end); - this = this->next; - } - } -} - -/** - * toi_serialise_extent_chain - write a chain in the image - * @chain: Chain to write. 
- **/ -static int toi_serialise_extent_chain(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this; - int ret; - int i = 1; - - chain->pages_used = chain_pages_used(chain); - - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", - chain->dev_t); - /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->uuid, metadata_size); - if (ret) - return ret; - - /* Num extents */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) - return ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - this = chain->blocks.first; - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); - ret = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) this, 2 * sizeof(this->start)); - if (ret) - return ret; - this = this->next; - i++; - } - - return ret; -} - -int toi_serialise_extent_chains(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - /* Write the number of chains */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", - num_chains); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, (char *) &num_chains, - sizeof(int)); - if (result) - return result; - - /* Then the chains themselves */ - while (this) { - result = toi_serialise_extent_chain(this); - if (result) - return result; - this = this->next; - } - - /* - * Finally, the chain we should be on at the start of each - * section. 
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -int toi_register_storage_chain(struct toi_bdev_info *new) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", - new); - toi_insert_chain_in_prio_list(new); - return 0; -} - -static void free_bdev_info(struct toi_bdev_info *chain) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); - toi_put_extent_chain(&chain->blocks); - - /* - * The allocator may need to do more than just free the chains - * (swap_free, for example). Don't call from boot kernel. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); - if (chain->allocator) - chain->allocator->bio_allocator_ops->free_storage(chain); - - /* - * Dropping out of reading atomic copy? Need to undo - * toi_open_by_devnum. 
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); - if (chain->bdev && !IS_ERR(chain->bdev) && - chain->bdev != resume_block_device && - chain->bdev != header_block_device && - test_toi_state(TOI_TRYING_TO_RESUME)) - toi_close_bdev(chain->bdev); - - /* Poison */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); - toi_kfree(39, chain, sizeof(*chain)); - - if (prio_chain_head == chain) - prio_chain_head = NULL; - - num_chains--; -} - -void free_all_bdev_info(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - struct toi_bdev_info *next = this->next; - free_bdev_info(this); - this = next; - } - - memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); - prio_chain_head = NULL; -} - -static void set_up_start_position(void) -{ - toi_writer_posn.current_chain = prio_chain_head; - go_next_page(0, 0); -} - -/** - * toi_load_extent_chain - read back a chain saved in the image - * @chain: Chain to load - * - * The linked list of extents is reconstructed from the disk. chain will point - * to the first entry. 
- **/ -int toi_load_extent_chain(int index, int *num_loaded) -{ - struct toi_bdev_info *chain = toi_kzalloc(39, - sizeof(struct toi_bdev_info), GFP_ATOMIC); - struct hibernate_extent *this, *last = NULL; - int i, ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); - /* Get dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->uuid, metadata_size); - - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_bkd.pages_used[index] = chain->pages_used; - - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - for (i = 0; i < chain->blocks.num_extents; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); - - this = toi_kzalloc(2, sizeof(struct hibernate_extent), - TOI_ATOMIC_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate a new extent.\n"); - free_bdev_info(chain); - return -ENOMEM; - } - this->next = NULL; - /* Get the next page */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - NULL, (char *) this, 2 * sizeof(this->start)); - if (ret) { - printk(KERN_INFO "Failed to read an extent.\n"); - toi_kfree(2, this, sizeof(struct hibernate_extent)); - free_bdev_info(chain); - return 1; - } - - if (last) - last->next = this; - else { - char b1[32], b2[32], b3[32]; - /* - * Open the bdev - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Chain dev_t is %s. Resume dev t is %s. 
Header" - " bdev_t is %s.\n", - format_dev_t(b1, chain->dev_t), - format_dev_t(b2, resume_dev_t), - format_dev_t(b3, toi_sig_data->header_dev_t)); - - if (chain->dev_t == resume_dev_t) - chain->bdev = resume_block_device; - else if (chain->dev_t == toi_sig_data->header_dev_t) - chain->bdev = header_block_device; - else { - chain->bdev = toi_open_bdev(chain->uuid, - chain->dev_t, 1); - if (IS_ERR(chain->bdev)) { - free_bdev_info(chain); - return -ENODEV; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " - "is %d and blocks per page is %d.", - chain->bmap_shift, - chain->blocks_per_page); - - chain->blocks.first = this; - - /* - * Couldn't do this earlier, but can't do - * goto_start now - we may have already used blocks - * in the first chain. - */ - chain->blocks.current_extent = this; - chain->blocks.current_offset = this->start; - - /* - * Can't wait until we've read the whole chain - * before we insert it in the list. We might need - * this chain to read the next page in the header - */ - toi_insert_chain_in_prio_list(chain); - } - - /* - * We have to wait until 2 extents are loaded before setting up - * properly because if the first extent has only one page, we - * will need to put the position on the second extent. Sounds - * obvious, but it wasn't! - */ - (*num_loaded)++; - if ((*num_loaded) == 2) - set_up_start_position(); - last = this; - } - - /* - * Shouldn't get empty chains, but it's not impossible. Link them in so - * they get freed properly later. 
- */ - if (!chain->blocks.num_extents) - toi_insert_chain_in_prio_list(chain); - - if (!chain->blocks.current_extent) { - chain->blocks.current_extent = chain->blocks.first; - if (chain->blocks.current_extent) - chain->blocks.current_offset = - chain->blocks.current_extent->start; - } - return 0; -} - -int toi_load_extent_chains(void) -{ - int result; - int to_load; - int i; - int extents_loaded = 0; - - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &to_load, - sizeof(int)); - if (result) - return result; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); - - for (i = 0; i < to_load; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", - i, to_load); - result = toi_load_extent_chain(i, &extents_loaded); - if (result) - return result; - } - - /* If we never got to a second extent, we still need to do this. */ - if (extents_loaded == 1) - set_up_start_position(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -static int toi_end_of_stream(int writing, int section_barrier) -{ - struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; - int compare_to = next_section[current_stream]; - struct toi_bdev_info *compare_chain = - toi_writer_posn.saved_chain_ptr[compare_to]; - int compare_offset = compare_chain ? 
- compare_chain->saved_state[compare_to].offset : 0; - - if (!section_barrier) - return 0; - - if (!cur_chain) - return 1; - - if (cur_chain == compare_chain && - cur_chain->blocks.current_offset == compare_offset) { - if (writing) { - if (!current_stream) { - debug_broken_header(); - return 1; - } - } else { - more_readahead = 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Reached the end of stream %d " - "(not an error).", current_stream); - return 1; - } - } - - return 0; -} - -/** - * go_next_page - skip blocks to the start of the next page - * @writing: Whether we're reading or writing the image. - * - * Go forward one page. - **/ -int go_next_page(int writing, int section_barrier) -{ - struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; - int max = cur_chain ? cur_chain->blocks_per_page : 1; - - /* Nope. Go foward a page - or maybe two. Don't stripe the header, - * so that bad fragmentation doesn't put the extent data containing - * the location of the second page out of the first header page. - */ - if (toi_extent_state_next(max, current_stream)) { - /* Don't complain if readahead falls off the end */ - if (writing && section_barrier) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. " - "Expected compression ratio too optimistic?"); - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to " - "read/write. (Not necessarily a fatal error."); - return -ENOSPC; - } - - return 0; -} - -int devices_of_same_priority(struct toi_bdev_info *this) -{ - struct toi_bdev_info *check = prio_chain_head; - int i = 0; - - while (check) { - if (check->prio == this->prio) - i++; - check = check->next; - } - - return i; -} - -/** - * toi_bio_rw_page - do i/o on the next disk page in the image - * @writing: Whether reading or writing. - * @page: Page to do i/o on. 
- * @is_readahead: Whether we're doing readahead - * @free_group: The group used in allocating the page - * - * Submit a page for reading or writing, possibly readahead. - * Pass the group used in allocating the page as well, as it should - * be freed on completion of the bio if we're writing the page. - **/ -int toi_bio_rw_page(int writing, struct page *page, - int is_readahead, int free_group) -{ - int result = toi_end_of_stream(writing, 1); - struct toi_bdev_info *dev_info = toi_writer_posn.current_chain; - - if (result) { - if (writing) - abort_hibernate(TOI_INSUFFICIENT_STORAGE, - "Insufficient storage for your image."); - else - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to " - "read/write another page when stream has " - "ended."); - return -ENOSPC; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "%s %lx:%ld", - writing ? "Write" : "Read", - dev_info->dev_t, dev_info->blocks.current_offset); - - result = toi_do_io(writing, dev_info->bdev, - dev_info->blocks.current_offset << dev_info->bmap_shift, - page, is_readahead, 0, free_group); - - /* Ignore the result here - will check end of stream if come in again */ - go_next_page(writing, 1); - - if (result) - printk(KERN_ERR "toi_do_io returned %d.\n", result); - return result; -} - -dev_t get_header_dev_t(void) -{ - return prio_chain_head->dev_t; -} - -struct block_device *get_header_bdev(void) -{ - return prio_chain_head->bdev; -} - -unsigned long get_headerblock(void) -{ - return prio_chain_head->blocks.first->start << - prio_chain_head->bmap_shift; -} - -int get_main_pool_phys_params(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - while (this) { - result = this->allocator->bio_allocator_ops->bmap(this); - if (result) - return result; - this = this->next; - } - - return 0; -} - -static int apply_header_reservation(void) -{ - int i; - - if (!header_pages_reserved) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "No header pages reserved at the moment."); - return 0; - } - - 
toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); - - /* Apply header space reservation */ - toi_extent_state_goto_start(); - - for (i = 0; i < header_pages_reserved; i++) - if (go_next_page(1, 0)) - return -ENOSPC; - - /* The end of header pages will be the start of pageset 2 */ - toi_extent_state_save(2); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Finished applying header reservation."); - return 0; -} - -static int toi_bio_register_storage(void) -{ - int result = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Registering storage from %s.", - this_module->name); - result = this_module->bio_allocator_ops->register_storage(); - if (result) - break; - } - - return result; -} - -void toi_bio_free_unused_storage(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_bio_free_unused_storage_chain(this); - this = this->next; - } -} - -int toi_bio_allocate_storage(unsigned long request) -{ - struct toi_bdev_info *chain = prio_chain_head; - unsigned long to_get = request; - unsigned long extra_pages, needed; - int no_free = 0; - - if (!chain) { - int result = toi_bio_register_storage(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Registering storage."); - if (result) - return 0; - chain = prio_chain_head; - if (!chain) { - printk("TuxOnIce: No storage was registered.\n"); - return 0; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Request is %lu pages.", request); - extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) - + sizeof(int)), PAGE_SIZE); - needed = request + extra_pages + header_pages_reserved; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " - "for header => %lu.", - extra_pages, header_pages_reserved, needed); - toi_message(TOI_BIO, 
TOI_VERBOSE, 0, "Already allocated %lu pages.", - raw_pages_allocd); - - to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); - - if (!to_get) - return apply_header_reservation(); - - while (to_get && chain) { - int num_group = devices_of_same_priority(chain); - int divisor = num_group - no_free; - int i; - unsigned long portion = DIV_ROUND_UP(to_get, divisor); - unsigned long got = 0; - unsigned long got_this_round = 0; - struct toi_bdev_info *top = chain; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Start of loop. To get is %lu. Divisor is %d.", - to_get, divisor); - no_free = 0; - - /* - * We're aiming to spread the allocated storage as evenly - * as possible, but we also want to get all the storage we - * can off this priority. - */ - for (i = 0; i < num_group; i++) { - struct toi_bio_allocator_ops *ops = - chain->allocator->bio_allocator_ops; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Asking for %lu pages from chain %p.", - portion, chain); - got = ops->allocate_storage(chain, portion); - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Got %lu pages from allocator %p.", - got, chain); - if (!got) - no_free++; - got_this_round += got; - chain = chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " - "total of %lu pages from %d allocators.", - got_this_round, divisor - no_free); - - raw_pages_allocd += got_this_round; - to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : - 0; - - /* - * If we got anything from chains of this priority and we - * still have storage to allocate, go over this priority - * again. - */ - if (got_this_round && to_get) - chain = top; - else - no_free = 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " - "get_main_pool_phys_params"); - /* Now let swap allocator bmap the pages */ - get_main_pool_phys_params(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. 
Reserving header."); - return apply_header_reservation(); -} - -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - cur_chain->pages_used = bkd->pages_used[i]; - cur_chain = cur_chain->next; - i++; - } -} - -int toi_bio_chains_debug_info(char *buffer, int size) -{ - /* Show what we actually used */ - struct toi_bdev_info *cur_chain = prio_chain_head; - int len = 0; - - while (cur_chain) { - len += scnprintf(buffer + len, size - len, " Used %lu pages " - "from %s.\n", cur_chain->pages_used, - cur_chain->name); - cur_chain = cur_chain->next; - } - - return len; -} - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain, - *cmp = prio_chain_head; - - ptr->save.chain = 1; - while (this != cmp) { - ptr->save.chain++; - cmp = cmp->next; - } - ptr->save.block = this->blocks.current_offset; - - /* Save the raw info internally for quicker access when updating pointers */ - ptr->bdev = this->bdev; - ptr->block = this->blocks.current_offset << this->bmap_shift; -} - -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - int i = ptr->save.chain - 1; - struct toi_bdev_info *this; - struct hibernate_extent *hib; - - /* Find chain by stored index */ - this = prio_chain_head; - while (i) { - this = this->next; - i--; - } - toi_writer_posn.current_chain = this; - - /* Restore block */ - this->blocks.current_offset = ptr->save.block; - - /* Find current offset from block number */ - hib = this->blocks.first; - - while (hib->start > ptr->save.block) { - hib = hib->next; - } - - this->blocks.last_touched = this->blocks.current_extent = hib; -} diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c deleted file mode 100644 index d18f2751c..000000000 --- a/kernel/power/tuxonice_bio_core.c +++ /dev/null @@ -1,1933 +0,0 @@ -/* - * 
kernel/power/tuxonice_bio.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains block io functions for TuxOnIce. These are - * used by the swapwriter and it is planned that they will also - * be used by the NFSwriter. - * - */ - -#include <linux/blkdev.h> -#include <linux/syscalls.h> -#include <linux/suspend.h> -#include <linux/ctype.h> -#include <linux/fs_uuid.h> -#include <linux/mount.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -#define MEMORY_ONLY 1 -#define THROTTLE_WAIT 2 - -/* #define MEASURE_MUTEX_CONTENTION */ -#ifndef MEASURE_MUTEX_CONTENTION -#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) -#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) -#else -unsigned long mutex_times[2][2][NR_CPUS]; -#define my_mutex_lock(index, the_lock) do { \ - int have_mutex; \ - have_mutex = mutex_trylock(the_lock); \ - if (!have_mutex) { \ - mutex_lock(the_lock); \ - mutex_times[index][0][smp_processor_id()]++; \ - } else { \ - mutex_times[index][1][smp_processor_id()]++; \ - } - -#define my_mutex_unlock(index, the_lock) \ - mutex_unlock(the_lock); \ -} while (0) -#endif - -static int page_idx, reset_idx; - -static int target_outstanding_io = 1024; -static int max_outstanding_writes, max_outstanding_reads; - -static struct page *bio_queue_head, *bio_queue_tail; -static atomic_t toi_bio_queue_size; -static DEFINE_SPINLOCK(bio_queue_lock); - -static int free_mem_throttle, throughput_throttle; -int more_readahead = 1; -static struct page *readahead_list_head, *readahead_list_tail; - -static struct page *waiting_on; - -static atomic_t toi_io_in_progress, toi_io_done; -static 
DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); - -int current_stream; -/* Not static, so that the allocators can setup and complete - * writing the header */ -char *toi_writer_buffer; -int toi_writer_buffer_posn; - -static DEFINE_MUTEX(toi_bio_mutex); -static DEFINE_MUTEX(toi_bio_readahead_mutex); - -static struct task_struct *toi_queue_flusher; -static int toi_bio_queue_flush_pages(int dedicated_thread); - -struct toi_module_ops toi_blockwriter_ops; - -struct toi_incremental_image_pointer toi_inc_ptr[2][2]; - -#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ - atomic_read(&toi_bio_queue_size)) - -unsigned long raw_pages_allocd, header_pages_reserved; - -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead); - -/** - * set_free_mem_throttle - set the point where we pause to avoid oom. - * - * Initially, this value is zero, but when we first fail to allocate memory, - * we set it (plus a buffer) and thereafter throttle i/o once that limit is - * reached. - **/ -static void set_free_mem_throttle(void) -{ - int new_throttle = nr_free_buffer_pages() + 256; - - if (new_throttle > free_mem_throttle) - free_mem_throttle = new_throttle; -} - -#define NUM_REASONS 7 -static atomic_t reasons[NUM_REASONS]; -static char *reason_name[NUM_REASONS] = { - "readahead not ready", - "bio allocation", - "synchronous I/O", - "toi_bio_get_new_page", - "memory low", - "readahead buffer allocation", - "throughput_throttle", -}; - -/* User Specified Parameters. */ -unsigned long resume_firstblock; -dev_t resume_dev_t; -struct block_device *resume_block_device; -static atomic_t resume_bdev_open_count; - -struct block_device *header_block_device; - -/** - * toi_open_bdev: Open a bdev at resume time. - * - * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t - * (the user can have resume= pointing at a swap partition/file that isn't - * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the - * header. 
It will be from a swap partition that was enabled when we hibernated, - * but we don't know it's real index until we read that first page. - * dev_t: The device major/minor. - * display_errs: Whether to try to do this quietly. - * - * We stored a dev_t in the image header. Open the matching device without - * requiring /dev/<whatever> in most cases and record the details needed - * to close it later and avoid duplicating work. - */ -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs) -{ - struct block_device *bdev; - dev_t device = default_device; - char buf[32]; - int retried = 0; - -retry: - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = 0; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (!device) { - device = default_device; - printk(KERN_DEBUG "Unable to resolve uuid. Falling back" - " to dev_t.\n"); - } else - printk(KERN_DEBUG "Resolved uuid to device %s.\n", - format_dev_t(buf, device)); - } - - if (!device) { - printk(KERN_ERR "TuxOnIce attempting to open a " - "blank dev_t!\n"); - dump_stack(); - return NULL; - } - bdev = toi_open_by_devnum(device); - - if (IS_ERR(bdev) || !bdev) { - if (!retried) { - retried = 1; - wait_for_device_probe(); - goto retry; - } - if (display_errs) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to get access to block device " - "\"%x\" (error %d).\n Maybe you need " - "to run mknod and/or lvmsetup in an " - "initrd/ramfs?", device, bdev); - return ERR_PTR(-EINVAL); - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "TuxOnIce got bdev %p for dev_t %x.", - bdev, device); - - return bdev; -} - -static void toi_bio_reserve_header_space(unsigned long request) -{ - header_pages_reserved = request; -} - -/** - * do_bio_wait - wait for some TuxOnIce I/O to complete - * @reason: The array index of the reason we're waiting. - * - * Wait for a particular page of I/O if we're after a particular page. 
- * If we're not after a particular page, wait instead for all in flight - * I/O to be completed or for us to have enough free memory to be able - * to submit more I/O. - * - * If we wait, we also update our statistics regarding why we waited. - **/ -static void do_bio_wait(int reason) -{ - struct page *was_waiting_on = waiting_on; - - /* On SMP, waiting_on can be reset, so we make a copy */ - if (was_waiting_on) { - wait_on_page_locked(was_waiting_on); - atomic_inc(&reasons[reason]); - } else { - atomic_inc(&reasons[reason]); - - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - nr_free_buffer_pages() > free_mem_throttle); - } -} - -/** - * throttle_if_needed - wait for I/O completion if throttle points are reached - * @flags: What to check and how to act. - * - * Check whether we need to wait for some I/O to complete. We always check - * whether we have enough memory available, but may also (depending upon - * @reason) check if the throughput throttle limit has been reached. - **/ -static int throttle_if_needed(int flags) -{ - int free_pages = nr_free_buffer_pages(); - - /* Getting low on memory and I/O is in progress? */ - while (unlikely(free_pages < free_mem_throttle) && - atomic_read(&toi_io_in_progress) && - !test_result_state(TOI_ABORTED)) { - if (!(flags & THROTTLE_WAIT)) - return -ENOMEM; - do_bio_wait(4); - free_pages = nr_free_buffer_pages(); - } - - while (!(flags & MEMORY_ONLY) && throughput_throttle && - TOTAL_OUTSTANDING_IO >= throughput_throttle && - !test_result_state(TOI_ABORTED)) { - int result = toi_bio_queue_flush_pages(0); - if (result) - return result; - atomic_inc(&reasons[6]); - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - TOTAL_OUTSTANDING_IO < throughput_throttle); - } - - return 0; -} - -/** - * update_throughput_throttle - update the raw throughput throttle - * @jif_index: The number of times this function has been called. 
- * - * This function is called four times per second by the core, and used to limit - * the amount of I/O we submit at once, spreading out our waiting through the - * whole job and letting userui get an opportunity to do its work. - * - * We don't start limiting I/O until 1/4s has gone so that we get a - * decent sample for our initial limit, and keep updating it because - * throughput may vary (on rotating media, eg) with our block number. - * - * We throttle to 1/10s worth of I/O. - **/ -static void update_throughput_throttle(int jif_index) -{ - int done = atomic_read(&toi_io_done); - throughput_throttle = done * 2 / 5 / jif_index; -} - -/** - * toi_finish_all_io - wait for all outstanding i/o to complete - * - * Flush any queued but unsubmitted I/O and wait for it all to complete. - **/ -static int toi_finish_all_io(void) -{ - int result = toi_bio_queue_flush_pages(0); - toi_bio_queue_flusher_should_finish = 1; - wake_up(&toi_io_queue_flusher); - wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO); - return result; -} - -/** - * toi_end_bio - bio completion function. - * @bio: bio that has completed. - * @err: Error value. Yes, like end_swap_bio_read, we ignore it. - * - * Function called by the block driver from interrupt context when I/O is - * completed. If we were writing the page, we want to free it and will have - * set bio->bi_private to the parameter we should use in telling the page - * allocation accounting code what the page was allocated for. If we're - * reading the page, it will be in the singly linked list made from - * page->private pointers. 
- **/ -static void toi_end_bio(struct bio *bio, int err) -{ - struct page *page = bio->bi_io_vec[0].bv_page; - - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); - - unlock_page(page); - bio_put(bio); - - if (waiting_on == page) - waiting_on = NULL; - - put_page(page); - - if (bio->bi_private) - toi__free_page((int) ((unsigned long) bio->bi_private) , page); - - bio_put(bio); - - atomic_dec(&toi_io_in_progress); - atomic_inc(&toi_io_done); - - wake_up(&num_in_progress_wait); -} - -/** - * submit - submit BIO request - * @writing: READ or WRITE. - * @dev: The block device we're using. - * @first_block: The first sector we're using. - * @page: The page being used for I/O. - * @free_group: If writing, the group that was used in allocating the page - * and which will be used in freeing the page from the completion - * routine. - * - * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the - * textbook - allocate and initialize the bio. If we're writing, make sure - * the page is marked as dirty. Then submit it and carry on." - * - * If we're just testing the speed of our own code, we fake having done all - * the hard work and all toi_end_bio immediately. - **/ -static int submit(int writing, struct block_device *dev, sector_t first_block, - struct page *page, int free_group) -{ - struct bio *bio = NULL; - int cur_outstanding_io, result; - - /* - * Shouldn't throttle if reading - can deadlock in the single - * threaded case as pages are only freed when we use the - * readahead. 
- */ - if (writing) { - result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); - if (result) - return result; - } - - while (!bio) { - bio = bio_alloc(TOI_ATOMIC_GFP, 1); - if (!bio) { - set_free_mem_throttle(); - do_bio_wait(1); - } - } - - bio->bi_bdev = dev; - bio->bi_iter.bi_sector = first_block; - bio->bi_private = (void *) ((unsigned long) free_group); - bio->bi_end_io = toi_end_bio; - bio->bi_flags |= (1 << BIO_TOI); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", - (unsigned long long) first_block); - bio_put(bio); - return -EFAULT; - } - - bio_get(bio); - - cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); - if (writing) { - if (cur_outstanding_io > max_outstanding_writes) - max_outstanding_writes = cur_outstanding_io; - } else { - if (cur_outstanding_io > max_outstanding_reads) - max_outstanding_reads = cur_outstanding_io; - } - - /* Still read the header! */ - if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { - /* Fake having done the hard work */ - set_bit(BIO_UPTODATE, &bio->bi_flags); - toi_end_bio(bio, 0); - } else - submit_bio(writing | REQ_SYNC, bio); - - return 0; -} - -/** - * toi_do_io: Prepare to do some i/o on a page and submit or batch it. - * - * @writing: Whether reading or writing. - * @bdev: The block device which we're using. - * @block0: The first sector we're reading or writing. - * @page: The page on which I/O is being done. - * @readahead_index: If doing readahead, the index (reset this flag when done). - * @syncio: Whether the i/o is being done synchronously. - * - * Prepare and start a read or write operation. - * - * Note that we always work with our own page. If writing, we might be given a - * compression buffer that will immediately be used to start compressing the - * next page. For reading, we do readahead and therefore don't know the final - * address where the data needs to go. 
- **/ -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group) -{ - page->private = 0; - - /* Do here so we don't race against toi_bio_get_next_page_read */ - lock_page(page); - - if (is_readahead) { - if (readahead_list_head) - readahead_list_tail->private = (unsigned long) page; - else - readahead_list_head = page; - - readahead_list_tail = page; - } - - /* Done before submitting to avoid races. */ - if (syncio) - waiting_on = page; - - /* Submit the page */ - get_page(page); - - if (submit(writing, bdev, block0, page, free_group)) - return -EFAULT; - - if (syncio) - do_bio_wait(2); - - return 0; -} - -/** - * toi_bdev_page_io - simpler interface to do directly i/o on a single page - * @writing: Whether reading or writing. - * @bdev: Block device on which we're operating. - * @pos: Sector at which page to read or write starts. - * @page: Page to be read/written. - * - * A simple interface to submit a page of I/O and wait for its completion. - * The caller must free the page used. - **/ -static int toi_bdev_page_io(int writing, struct block_device *bdev, - long pos, struct page *page) -{ - return toi_do_io(writing, bdev, pos, page, 0, 1, 0); -} - -/** - * toi_bio_memory_needed - report the amount of memory needed for block i/o - * - * We want to have at least enough memory so as to have target_outstanding_io - * or more transactions on the fly at once. If we can do more, fine. - **/ -static int toi_bio_memory_needed(void) -{ - return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + - sizeof(struct bio)); -} - -/** - * toi_bio_print_debug_stats - put out debugging info in the buffer provided - * @buffer: A buffer of size @size into which text should be placed. - * @size: The size of @buffer. - * - * Fill a buffer with debugging info. This is used for both our debug_info sysfs - * entry and for recording the same info in dmesg. 
- **/ -static int toi_bio_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - if (toiActiveAllocator != &toi_blockwriter_ops) { - len = scnprintf(buffer, size, - "- Block I/O inactive.\n"); - return len; - } - - len = scnprintf(buffer, size, "- Block I/O active.\n"); - - len += toi_bio_chains_debug_info(buffer + len, size - len); - - len += scnprintf(buffer + len, size - len, - "- Max outstanding reads %d. Max writes %d.\n", - max_outstanding_reads, max_outstanding_writes); - - len += scnprintf(buffer + len, size - len, - " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", - target_outstanding_io, - PAGE_SIZE, (unsigned int) sizeof(struct request), - (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); - -#ifdef MEASURE_MUTEX_CONTENTION - { - int i; - - len += scnprintf(buffer + len, size - len, - " Mutex contention while reading:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[0][0][i], mutex_times[0][1][i]); - - len += scnprintf(buffer + len, size - len, - " Mutex contention while writing:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[1][0][i], mutex_times[1][1][i]); - - } -#endif - - return len + scnprintf(buffer + len, size - len, - " Free mem throttle point reached %d.\n", free_mem_throttle); -} - -static int total_header_bytes; -static int unowned; - -void debug_broken_header(void) -{ - printk(KERN_DEBUG "Image header too big for size allocated!\n"); - print_toi_header_storage_for_modules(); - printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); - printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); - printk(KERN_DEBUG "Total unowned : %d.\n", unowned); - printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, - DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); - printk(KERN_DEBUG "Space needed now : %ld.\n", - 
get_header_storage_needed()); - dump_block_chains(); - abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); -} - -static int toi_bio_update_previous_inc_img_ptr(int stream) -{ - int result; - char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP); - struct page *page; - struct toi_incremental_image_pointer *prev, *this; - - prev = &toi_inc_ptr[stream][0]; - this = &toi_inc_ptr[stream][1]; - - if (!buffer) { - // We're at the start of writing a pageset. Memory should not be that scarce. - return -ENOMEM; - } - - page = virt_to_page(buffer); - result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0); - - if (result) - goto out; - - memcpy(buffer, (char *) this, sizeof(this->save)); - - result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12); - - // If the IO is successfully submitted (!result), the page will be freed - // asynchronously on completion. -out: - if (result) - toi__free_page(12, virt_to_page(buffer)); - return result; -} - -/** - * toi_rw_init_incremental - incremental image part of setting up to write new section - */ -static int toi_write_init_incremental(int stream) -{ - int result = 0; - - // Remember the location of this block so we can link to it. - toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Update the pointer at the start of the last pageset with the same stream number. - result = toi_bio_update_previous_inc_img_ptr(stream); - if (result) - return result; - - // Move the current to the previous slot. 
- memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1])); - - // Store a blank pointer at the start of this incremental pageset - memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1])); - result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - if (result) - return result; - - // Serialise extent chains if this is an incremental pageset - return toi_serialise_extent_chains(); -} - -/** - * toi_read_init_incremental - incremental image part of setting up to read new section - */ -static int toi_read_init_incremental(int stream) -{ - int result; - - // Set our position to the start of the next pageset - toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Read the start of the next incremental pageset (if any) - result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - - if (!result) - result = toi_load_extent_chains(); - - return result; -} - -/** - * toi_rw_init - prepare to read or write a stream in the image - * @writing: Whether reading or writing. - * @stream number: Section of the image being processed. - * - * Prepare to read or write a section ('stream') in the image. - **/ -static int toi_rw_init(int writing, int stream_number) -{ - if (stream_number) - toi_extent_state_restore(stream_number); - else - toi_extent_state_goto_start(); - - if (writing) { - reset_idx = 0; - if (!current_stream) - page_idx = 0; - } else { - reset_idx = 1; - } - - atomic_set(&toi_io_done, 0); - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - toi_writer_buffer_posn = writing ? 
0 : PAGE_SIZE; - - current_stream = stream_number; - - more_readahead = 1; - - if (test_result_state(TOI_KEPT_IMAGE)) { - int result; - - if (writing) { - result = toi_write_init_incremental(stream_number); - } else { - result = toi_read_init_incremental(stream_number); - } - - if (result) - return result; - } - - return toi_writer_buffer ? 0 : -ENOMEM; -} - -/** - * toi_bio_queue_write - queue a page for writing - * @full_buffer: Pointer to a page to be queued - * - * Add a page to the queue to be submitted. If we're the queue flusher, - * we'll do this once we've dropped toi_bio_mutex, so other threads can - * continue to submit I/O while we're on the slow path doing the actual - * submission. - **/ -static void toi_bio_queue_write(char **full_buffer) -{ - struct page *page = virt_to_page(*full_buffer); - unsigned long flags; - - *full_buffer = NULL; - page->private = 0; - - spin_lock_irqsave(&bio_queue_lock, flags); - if (!bio_queue_head) - bio_queue_head = page; - else - bio_queue_tail->private = (unsigned long) page; - - bio_queue_tail = page; - atomic_inc(&toi_bio_queue_size); - - spin_unlock_irqrestore(&bio_queue_lock, flags); - wake_up(&toi_io_queue_flusher); -} - -/** - * toi_rw_cleanup - Cleanup after i/o. - * @writing: Whether we were reading or writing. - * - * Flush all I/O and clean everything up after reading or writing a - * section of the image. 
- **/ -static int toi_rw_cleanup(int writing) -{ - int i, result = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup."); - if (writing) { - if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED)) - toi_bio_queue_write(&toi_writer_buffer); - - while (bio_queue_head && !result) - result = toi_bio_queue_flush_pages(0); - - if (result) - return result; - - if (current_stream == 2) - toi_extent_state_save(1); - else if (current_stream == 1) - toi_extent_state_save(3); - } - - result = toi_finish_all_io(); - - while (readahead_list_head) { - void *next = (void *) readahead_list_head->private; - toi__free_page(12, readahead_list_head); - readahead_list_head = next; - } - - readahead_list_tail = NULL; - - if (!current_stream) - return result; - - for (i = 0; i < NUM_REASONS; i++) { - if (!atomic_read(&reasons[i])) - continue; - printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n", - reason_name[i], atomic_read(&reasons[i])); - atomic_set(&reasons[i], 0); - } - - current_stream = 0; - return result; -} - -/** - * toi_start_one_readahead - start one page of readahead - * @dedicated_thread: Is this a thread dedicated to doing readahead? - * - * Start one new page of readahead. If this is being called by a thread - * whose only just is to submit readahead, don't quit because we failed - * to allocate a page. - **/ -static int toi_start_one_readahead(int dedicated_thread) -{ - char *buffer = NULL; - int oom = 0, result; - - result = throttle_if_needed(dedicated_thread ? 
THROTTLE_WAIT : 0); - if (result) { - printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result); - return result; - } - - mutex_lock(&toi_bio_readahead_mutex); - - while (!buffer) { - buffer = (char *) toi_get_zeroed_page(12, - TOI_ATOMIC_GFP); - if (!buffer) { - if (oom && !dedicated_thread) { - mutex_unlock(&toi_bio_readahead_mutex); - printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result); - return -ENOMEM; - } - - oom = 1; - set_free_mem_throttle(); - do_bio_wait(5); - } - } - - result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0); - if (result) { - printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result); - } - if (result == -ENOSPC) - toi__free_page(12, virt_to_page(buffer)); - mutex_unlock(&toi_bio_readahead_mutex); - if (result) { - if (result == -ENOSPC) - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Last readahead page submitted."); - else - printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n", - result); - } - return result; -} - -/** - * toi_start_new_readahead - start new readahead - * @dedicated_thread: Are we dedicated to this task? - * - * Start readahead of image pages. - * - * We can be called as a thread dedicated to this task (may be helpful on - * systems with lots of CPUs), in which case we don't exit until there's no - * more readahead. - * - * If this is not called by a dedicated thread, we top up our queue until - * there's no more readahead to submit, we've submitted the number given - * in target_outstanding_io or the number in progress exceeds the target - * outstanding I/O value. - * - * No mutex needed because this is only ever called by the first cpu. - **/ -static int toi_start_new_readahead(int dedicated_thread) -{ - int last_result, num_submitted = 0; - - /* Start a new readahead? 
*/ - if (!more_readahead) - return 0; - - do { - last_result = toi_start_one_readahead(dedicated_thread); - - if (last_result) { - if (last_result == -ENOMEM || last_result == -ENOSPC) - return 0; - - printk(KERN_DEBUG - "Begin read chunk returned %d.\n", - last_result); - } else - num_submitted++; - - } while (more_readahead && !last_result && - (dedicated_thread || - (num_submitted < target_outstanding_io && - atomic_read(&toi_io_in_progress) < target_outstanding_io))); - - return last_result; -} - -/** - * bio_io_flusher - start the dedicated I/O flushing routine - * @writing: Whether we're writing the image. - **/ -static int bio_io_flusher(int writing) -{ - - if (writing) - return toi_bio_queue_flush_pages(1); - else - return toi_start_new_readahead(1); -} - -/** - * toi_bio_get_next_page_read - read a disk page, perhaps with readahead - * @no_readahead: Whether we can use readahead - * - * Read a page from disk, submitting readahead and cleaning up finished i/o - * while we wait for the page we're after. - **/ -static int toi_bio_get_next_page_read(int no_readahead) -{ - char *virt; - struct page *old_readahead_list_head; - - /* - * When reading the second page of the header, we have to - * delay submitting the read until after we've gotten the - * extents out of the first page. - */ - if (unlikely(no_readahead)) { - int result = toi_start_one_readahead(0); - if (result) { - printk(KERN_EMERG "No readahead and toi_start_one_readahead " - "returned non-zero.\n"); - return -EIO; - } - } - - if (unlikely(!readahead_list_head)) { - /* - * If the last page finishes exactly on the page - * boundary, we will be called one extra time and - * have no data to return. In this case, we should - * not BUG(), like we used to! 
- */ - if (!more_readahead) { - printk(KERN_EMERG "No more readahead.\n"); - return -ENOSPC; - } - if (unlikely(toi_start_one_readahead(0))) { - printk(KERN_EMERG "No readahead and " - "toi_start_one_readahead returned non-zero.\n"); - return -EIO; - } - } - - if (PageLocked(readahead_list_head)) { - waiting_on = readahead_list_head; - do_bio_wait(0); - } - - virt = page_address(readahead_list_head); - memcpy(toi_writer_buffer, virt, PAGE_SIZE); - - mutex_lock(&toi_bio_readahead_mutex); - old_readahead_list_head = readahead_list_head; - readahead_list_head = (struct page *) readahead_list_head->private; - mutex_unlock(&toi_bio_readahead_mutex); - toi__free_page(12, old_readahead_list_head); - return 0; -} - -/** - * toi_bio_queue_flush_pages - flush the queue of pages queued for writing - * @dedicated_thread: Whether we're a dedicated thread - * - * Flush the queue of pages ready to be written to disk. - * - * If we're a dedicated thread, stay in here until told to leave, - * sleeping in wait_event. - * - * The first thread is normally the only one to come in here. Another - * thread can enter this routine too, though, via throttle_if_needed. - * Since that's the case, we must be careful to only have one thread - * doing this work at a time. Otherwise we have a race and could save - * pages out of order. - * - * If an error occurs, free all remaining pages without submitting them - * for I/O. 
- **/ - -int toi_bio_queue_flush_pages(int dedicated_thread) -{ - unsigned long flags; - int result = 0; - static DEFINE_MUTEX(busy); - - if (!mutex_trylock(&busy)) - return 0; - -top: - spin_lock_irqsave(&bio_queue_lock, flags); - while (bio_queue_head) { - struct page *page = bio_queue_head; - bio_queue_head = (struct page *) page->private; - if (bio_queue_tail == page) - bio_queue_tail = NULL; - atomic_dec(&toi_bio_queue_size); - spin_unlock_irqrestore(&bio_queue_lock, flags); - - /* Don't generate more error messages if already had one */ - if (!result) - result = toi_bio_rw_page(WRITE, page, 0, 11); - /* - * If writing the page failed, don't drop out. - * Flush the rest of the queue too. - */ - if (result) - toi__free_page(11 , page); - spin_lock_irqsave(&bio_queue_lock, flags); - } - spin_unlock_irqrestore(&bio_queue_lock, flags); - - if (dedicated_thread) { - wait_event(toi_io_queue_flusher, bio_queue_head || - toi_bio_queue_flusher_should_finish); - if (likely(!toi_bio_queue_flusher_should_finish)) - goto top; - toi_bio_queue_flusher_should_finish = 0; - } - - mutex_unlock(&busy); - return result; -} - -/** - * toi_bio_get_new_page - get a new page for I/O - * @full_buffer: Pointer to a page to allocate. - **/ -static int toi_bio_get_new_page(char **full_buffer) -{ - int result = throttle_if_needed(THROTTLE_WAIT); - if (result) - return result; - - while (!*full_buffer) { - *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); - if (!*full_buffer) { - set_free_mem_throttle(); - do_bio_wait(3); - } - } - - return 0; -} - -/** - * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O - * @writing: Bool - whether writing (or reading). - * @buffer: The start of the buffer to write or fill. - * @buffer_size: The size of the buffer to write or fill. - * @no_readahead: Don't try to start readhead (when getting extents). 
- **/ -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead) -{ - int bytes_left = buffer_size, result = 0; - - while (bytes_left) { - char *source_start = buffer + buffer_size - bytes_left; - char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; - int capacity = PAGE_SIZE - toi_writer_buffer_posn; - char *to = writing ? dest_start : source_start; - char *from = writing ? source_start : dest_start; - - if (bytes_left <= capacity) { - memcpy(to, from, bytes_left); - toi_writer_buffer_posn += bytes_left; - return 0; - } - - /* Complete this page and start a new one */ - memcpy(to, from, capacity); - bytes_left -= capacity; - - if (!writing) { - /* - * Perform actual I/O: - * read readahead_list_head into toi_writer_buffer - */ - int result = toi_bio_get_next_page_read(no_readahead); - if (result && bytes_left) { - printk("toi_bio_get_next_page_read " - "returned %d. Expecting to read %d bytes.\n", result, bytes_left); - return result; - } - } else { - toi_bio_queue_write(&toi_writer_buffer); - result = toi_bio_get_new_page(&toi_writer_buffer); - if (result) { - printk(KERN_ERR "toi_bio_get_new_page returned " - "%d.\n", result); - return result; - } - } - - toi_writer_buffer_posn = 0; - toi_cond_pause(0, NULL); - } - - return 0; -} - -/** - * toi_bio_read_page - read a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). - * - * Read a (possibly compressed) page from the image, into buffer_page, - * returning its pfn and the buffer size. - **/ -static int toi_bio_read_page(unsigned long *pfn, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int result = 0; - int this_idx; - char *buffer_virt = TOI_MAP(buf_type, buffer_page); - - /* - * Only call start_new_readahead if we don't have a dedicated thread - * and we're the queue flusher. 
- */ - if (current == toi_queue_flusher && more_readahead && - !test_action_state(TOI_NO_READAHEAD)) { - int result2 = toi_start_new_readahead(0); - if (result2) { - printk(KERN_DEBUG "Queue flusher and " - "toi_start_one_readahead returned non-zero.\n"); - result = -EIO; - goto out; - } - } - - my_mutex_lock(0, &toi_bio_mutex); - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - * We can validly find there's nothing to read in a multithreaded - * situation. - */ - if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) || - toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) || - toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) { - result = -ENODATA; - goto out_unlock; - } - - if (reset_idx) { - page_idx = this_idx; - reset_idx = 0; - } else { - page_idx++; - if (!this_idx) - result = -ENODATA; - else if (page_idx != this_idx) - printk(KERN_ERR "Got page index %d, expected %d.\n", - this_idx, page_idx); - } - -out_unlock: - my_mutex_unlock(0, &toi_bio_mutex); -out: - TOI_UNMAP(buf_type, buffer_page); - return result; -} - -/** - * toi_bio_write_page - write a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used. - * - * Write a (possibly compressed) page to the image from the buffer, together - * with it's index and buffer size. 
- **/ -static int toi_bio_write_page(unsigned long pfn, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - char *buffer_virt; - int result = 0, result2 = 0; - - if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) - return 0; - - my_mutex_lock(1, &toi_bio_mutex); - - if (test_result_state(TOI_ABORTED)) { - my_mutex_unlock(1, &toi_bio_mutex); - return 0; - } - - buffer_virt = TOI_MAP(buf_type, buffer_page); - page_idx++; - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - */ - if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) || - toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) || - toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) { - printk(KERN_DEBUG "toi_rw_buffer returned non-zero to " - "toi_bio_write_page.\n"); - result = -EIO; - } - - TOI_UNMAP(buf_type, buffer_page); - my_mutex_unlock(1, &toi_bio_mutex); - - if (current == toi_queue_flusher) - result2 = toi_bio_queue_flush_pages(0); - - return result ? result : result2; -} - -/** - * _toi_rw_header_chunk - read or write a portion of the image header - * @writing: Whether reading or writing. - * @owner: The module for which we're writing. - * Used for confirming that modules - * don't use more header space than they asked for. - * @buffer: Address of the data to write. - * @buffer_size: Size of the data buffer. - * @no_readahead: Don't try to start readhead (when getting extents). - * - * Perform PAGE_SIZE I/O. Start readahead if needed. 
- **/ -static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int buffer_size, int no_readahead) -{ - int result = 0; - - if (owner) { - owner->header_used += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: %s : %d bytes (%d/%d) from offset %d.", - owner->name, - buffer_size, owner->header_used, - owner->header_requested, - toi_writer_buffer_posn); - if (owner->header_used > owner->header_requested && writing) { - printk(KERN_EMERG "TuxOnIce module %s is using more " - "header space (%u) than it requested (%u).\n", - owner->name, - owner->header_used, - owner->header_requested); - return buffer_size; - } - } else { - unowned += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: (No owner): %d bytes (%d total so far) from " - "offset %d.", buffer_size, unowned, - toi_writer_buffer_posn); - } - - if (!writing && !no_readahead && more_readahead) { - result = toi_start_new_readahead(0); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " - "returned %d.", result); - } - - if (!result) { - result = toi_rw_buffer(writing, buffer, buffer_size, - no_readahead); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " - "%d.", result); - } - - total_header_bytes += buffer_size; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " - "%d.", result); - return result; -} - -static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -static int toi_rw_header_chunk_noreadahead(int writing, - struct toi_module_ops *owner, char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -/** - * toi_bio_storage_needed - get the amount of storage needed for my fns - **/ -static int toi_bio_storage_needed(void) -{ - return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); -} - -/** - * toi_bio_save_config_info - save block I/O config to 
image header - * @buf: PAGE_SIZE'd buffer into which data should be saved. - **/ -static int toi_bio_save_config_info(char *buf) -{ - int *ints = (int *) buf; - ints[0] = target_outstanding_io; - return sizeof(int); -} - -/** - * toi_bio_load_config_info - restore block I/O config - * @buf: Data to be reloaded. - * @size: Size of the buffer saved. - **/ -static void toi_bio_load_config_info(char *buf, int size) -{ - int *ints = (int *) buf; - target_outstanding_io = ints[0]; -} - -void close_resume_dev_t(int force) -{ - if (!resume_block_device) - return; - - if (force) - atomic_set(&resume_bdev_open_count, 0); - else - atomic_dec(&resume_bdev_open_count); - - if (!atomic_read(&resume_bdev_open_count)) { - toi_close_bdev(resume_block_device); - resume_block_device = NULL; - } -} - -int open_resume_dev_t(int force, int quiet) -{ - if (force) { - close_resume_dev_t(1); - atomic_set(&resume_bdev_open_count, 1); - } else - atomic_inc(&resume_bdev_open_count); - - if (resume_block_device) - return 0; - - resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); - if (IS_ERR(resume_block_device)) { - if (!quiet) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to open device %x, where" - " the header should be found.", - resume_dev_t); - resume_block_device = NULL; - atomic_set(&resume_bdev_open_count, 0); - return 1; - } - - return 0; -} - -/** - * toi_bio_initialise - initialise bio code at start of some action - * @starting_cycle: Whether starting a hibernation cycle, or just reading or - * writing a sysfs value. 
- **/ -static int toi_bio_initialise(int starting_cycle) -{ - int result; - - if (!starting_cycle || !resume_dev_t) - return 0; - - max_outstanding_writes = 0; - max_outstanding_reads = 0; - current_stream = 0; - toi_queue_flusher = current; -#ifdef MEASURE_MUTEX_CONTENTION - { - int i, j, k; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - for_each_online_cpu(k) - mutex_times[i][j][k] = 0; - } -#endif - result = open_resume_dev_t(0, 1); - - if (result) - return result; - - return get_signature_page(); -} - -static unsigned long raw_to_real(unsigned long raw) -{ - unsigned long extra; - - extra = (raw * (sizeof(unsigned long) + sizeof(int)) + - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); - - return raw > extra ? raw - extra : 0; -} - -static unsigned long toi_bio_storage_available(void) -{ - unsigned long sum = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " - "available from %s.", this_module->name); - sum += this_module->bio_allocator_ops->storage_available(); - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " - "pages (%d header pages).", sum, header_pages_reserved); - - return sum > header_pages_reserved ? - raw_to_real(sum - header_pages_reserved) : 0; - -} - -static unsigned long toi_bio_storage_allocated(void) -{ - return raw_pages_allocd > header_pages_reserved ? - raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; -} - -/* - * If we have read part of the image, we might have filled memory with - * data that should be zeroed out. 
- */ -static void toi_bio_noresume_reset(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); - toi_rw_cleanup(READ); - free_all_bdev_info(); -} - -/** - * toi_bio_cleanup - cleanup after some action - * @finishing_cycle: Whether completing a cycle. - **/ -static void toi_bio_cleanup(int finishing_cycle) -{ - if (!finishing_cycle) - return; - - if (toi_writer_buffer) { - toi_free_page(11, (unsigned long) toi_writer_buffer); - toi_writer_buffer = NULL; - } - - forget_signature_page(); - - if (header_block_device && toi_sig_data && - toi_sig_data->header_dev_t != resume_dev_t) - toi_close_bdev(header_block_device); - - header_block_device = NULL; - - close_resume_dev_t(0); -} - -static int toi_bio_write_header_init(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); - toi_rw_init(WRITE, 0); - toi_writer_buffer_posn = 0; - - /* Info needed to bootstrap goes at the start of the header. - * First we save the positions and devinfo, including the number - * of header pages. Then we save the structs containing data needed - * for reading the header pages back. - * Note that even if header pages take more than one page, when we - * read back the info, we will have restored the location of the - * next header page by the time we go to use it. - */ - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); - result = toi_serialise_extent_chains(); - - if (result) - return result; - - /* - * Signature page hasn't been modified at this point. Write it in - * the header so we can restore it later. 
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page."); - return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops, - (char *) toi_cur_sig_page, - PAGE_SIZE); -} - -static int toi_bio_write_header_cleanup(void) -{ - int result = 0; - - if (toi_writer_buffer_posn) - toi_bio_queue_write(&toi_writer_buffer); - - result = toi_finish_all_io(); - - unowned = 0; - total_header_bytes = 0; - - /* Set signature to save we have an image */ - if (!result) - result = toi_bio_mark_have_image(); - - return result; -} - -/* - * toi_bio_read_header_init() - * - * Description: - * 1. Attempt to read the device specified with resume=. - * 2. Check the contents of the swap header for our signature. - * 3. Warn, ignore, reset and/or continue as appropriate. - * 4. If continuing, read the toi_swap configuration section - * of the header and set up block device info so we can read - * the rest of the header & image. - * - * Returns: - * May not return if user choose to reboot at a warning. - * -EINVAL if cannot resume at this time. Booting should continue - * normally. - */ - -static int toi_bio_read_header_init(void) -{ - int result = 0; - char buf[32]; - - toi_writer_buffer_posn = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init"); - - if (!toi_sig_data) { - printk(KERN_INFO "toi_bio_read_header_init called when we " - "haven't verified there is an image!\n"); - return -EINVAL; - } - - /* - * If the header is not on the resume_swap_dev_t, get the resume device - * first. 
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.", - toi_sig_data->header_dev_t); - if (toi_sig_data->have_uuid) { - struct fs_info seek; - dev_t device; - - strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16); - seek.dev_t = toi_sig_data->header_dev_t; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (device) { - printk("Using dev_t %s, returned by blk_lookup_fs_info.\n", - format_dev_t(buf, device)); - toi_sig_data->header_dev_t = device; - } - } - if (toi_sig_data->header_dev_t != resume_dev_t) { - header_block_device = toi_open_bdev(NULL, - toi_sig_data->header_dev_t, 1); - - if (IS_ERR(header_block_device)) - return PTR_ERR(header_block_device); - } else - header_block_device = resume_block_device; - - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - more_readahead = 1; - - /* - * Read toi_swap configuration. - * Headerblock size taken into account already. - */ - result = toi_bio_ops.bdev_page_io(READ, header_block_device, - toi_sig_data->first_header_block, - virt_to_page((unsigned long) toi_writer_buffer)); - if (result) - return result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains."); - result = toi_load_extent_chains(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page."); - toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - if (!toi_orig_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the current" - " image signature.\n"); - return -ENOMEM; - } - - return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops, - (char *) toi_orig_sig_page, - PAGE_SIZE); -} - -static int toi_bio_read_header_cleanup(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup."); - return toi_rw_cleanup(READ); -} - -/* Works only for digits and letters, but small and fast */ -#define TOLOWER(x) ((x) | 0x20) - -/* - * UUID must be 32 chars long. It may have dashes, but nothing - * else. 
- */ -char *uuid_from_commandline(char *commandline) -{ - int low = 0; - char *result = NULL, *output, *ptr; - - if (strncmp(commandline, "UUID=", 5)) - return NULL; - - result = kzalloc(17, GFP_KERNEL); - if (!result) { - printk("Failed to kzalloc UUID text memory.\n"); - return NULL; - } - - ptr = commandline + 5; - output = result; - - while (*ptr && (output - result) < 16) { - if (isxdigit(*ptr)) { - int value = isdigit(*ptr) ? *ptr - '0' : - TOLOWER(*ptr) - 'a' + 10; - if (low) { - *output += value; - output++; - } else { - *output = value << 4; - } - low = !low; - } else if (*ptr != '-') - break; - ptr++; - } - - if ((output - result) < 16 || *ptr) { - printk(KERN_DEBUG "Found resume=UUID=, but the value looks " - "invalid.\n"); - kfree(result); - result = NULL; - } - - return result; -} - -#define retry_if_fails(command) \ -do { \ - command; \ - if (!resume_dev_t && !waited_for_device_probe) { \ - wait_for_device_probe(); \ - command; \ - waited_for_device_probe = 1; \ - } \ -} while(0) - -/** - * try_to_open_resume_device: Try to parse and open resume= - * - * Any "swap:" has been stripped away and we just have the path to deal with. - * We attempt to do name_to_dev_t, open and stat the file. Having opened the - * file, get the struct block_device * to match. 
- */ -static int try_to_open_resume_device(char *commandline, int quiet) -{ - struct kstat stat; - int error = 0; - char *uuid = uuid_from_commandline(commandline); - int waited_for_device_probe = 0; - - resume_dev_t = MKDEV(0, 0); - - if (!strlen(commandline)) - retry_if_fails(toi_bio_scan_for_image(quiet)); - - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = resume_dev_t; - seek.last_mount_size = 0; - retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); - kfree(uuid); - } - - if (!resume_dev_t) - retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); - - if (!resume_dev_t) { - struct file *file = filp_open(commandline, - O_RDONLY|O_LARGEFILE, 0); - - if (!IS_ERR(file) && file) { - vfs_getattr(&file->f_path, &stat); - filp_close(file, NULL); - } else - error = vfs_stat(commandline, &stat); - if (!error) - resume_dev_t = stat.rdev; - } - - if (!resume_dev_t) { - if (quiet) - return 1; - - if (test_toi_state(TOI_TRYING_TO_RESUME)) - toi_early_boot_message(1, toi_translate_err_default, - "Failed to translate \"%s\" into a device id.\n", - commandline); - else - printk("TuxOnIce: Can't translate \"%s\" into a device " - "id yet.\n", commandline); - return 1; - } - - return open_resume_dev_t(1, quiet); -} - -/* - * Parse Image Location - * - * Attempt to parse a resume= parameter. - * Swap Writer accepts: - * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] - * - * Where: - * DEVNAME is convertable to a dev_t by name_to_dev_t - * FIRSTBLOCK is the location of the first block in the swap file - * (specifying for a swap partition is nonsensical but not prohibited). - * Data is validated by attempting to read a swap header from the - * location given. Failure will result in toi_swap refusing to - * save an image, and a reboot with correct parameters will be - * necessary. 
- */ -static int toi_bio_parse_sig_location(char *commandline, - int only_allocator, int quiet) -{ - char *thischar, *devstart, *colon = NULL; - int signature_found, result = -EINVAL, temp_result = 0; - - if (strncmp(commandline, "swap:", 5) && - strncmp(commandline, "file:", 5)) { - /* - * Failing swap:, we'll take a simple resume=/dev/hda2, or a - * blank value (scan) but fall through to other allocators - * if /dev/ or UUID= isn't matched. - */ - if (strncmp(commandline, "/dev/", 5) && - strncmp(commandline, "UUID=", 5) && - strlen(commandline)) - return 1; - } else - commandline += 5; - - devstart = commandline; - thischar = commandline; - while ((*thischar != ':') && (*thischar != '@') && - ((thischar - commandline) < 250) && (*thischar)) - thischar++; - - if (*thischar == ':') { - colon = thischar; - *colon = 0; - thischar++; - } - - while ((thischar - commandline) < 250 && *thischar) - thischar++; - - if (colon) { - unsigned long block; - temp_result = kstrtoul(colon + 1, 0, &block); - if (!temp_result) - resume_firstblock = (int) block; - } else - resume_firstblock = 0; - - clear_toi_state(TOI_CAN_HIBERNATE); - clear_toi_state(TOI_CAN_RESUME); - - if (!temp_result) - temp_result = try_to_open_resume_device(devstart, quiet); - - if (colon) - *colon = ':'; - - /* No error if we only scanned */ - if (temp_result) - return strlen(commandline) ? -EINVAL : 1; - - signature_found = toi_bio_image_exists(quiet); - - if (signature_found != -1) { - result = 0; - /* - * TODO: If only file storage, CAN_HIBERNATE should only be - * set if file allocator's target is valid. 
- */ - set_toi_state(TOI_CAN_HIBERNATE); - set_toi_state(TOI_CAN_RESUME); - } else - if (!quiet) - printk(KERN_ERR "TuxOnIce: Block I/O: No " - "signature found at %s.\n", devstart); - - return result; -} - -static void toi_bio_release_storage(void) -{ - header_pages_reserved = 0; - raw_pages_allocd = 0; - - free_all_bdev_info(); -} - -/* toi_swap_remove_image - * - */ -static int toi_bio_remove_image(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); - - result = toi_bio_restore_original_signature(); - - /* - * We don't do a sanity check here: we want to restore the swap - * whatever version of kernel made the hibernate image. - * - * We need to write swap, but swap may not be enabled so - * we write the device directly - * - * If we don't have an current_signature_page, we didn't - * read an image header, so don't change anything. - */ - - toi_bio_release_storage(); - - return result; -} - -struct toi_bio_ops toi_bio_ops = { - .bdev_page_io = toi_bdev_page_io, - .register_storage = toi_register_storage_chain, - .free_storage = toi_bio_release_storage, -}; - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, - 0, 16384, 0, NULL), -}; - -struct toi_module_ops toi_blockwriter_ops = { - .type = WRITER_MODULE, - .name = "block i/o", - .directory = "block_io", - .module = THIS_MODULE, - .memory_needed = toi_bio_memory_needed, - .print_debug_info = toi_bio_print_debug_stats, - .storage_needed = toi_bio_storage_needed, - .save_config_info = toi_bio_save_config_info, - .load_config_info = toi_bio_load_config_info, - .initialise = toi_bio_initialise, - .cleanup = toi_bio_cleanup, - .post_atomic_restore = toi_bio_chains_post_atomic, - - .rw_init = toi_rw_init, - .rw_cleanup = toi_rw_cleanup, - .read_page = toi_bio_read_page, - .write_page = toi_bio_write_page, - .rw_header_chunk = toi_rw_header_chunk, - .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, - 
.io_flusher = bio_io_flusher, - .update_throughput_throttle = update_throughput_throttle, - .finish_all_io = toi_finish_all_io, - - .noresume_reset = toi_bio_noresume_reset, - .storage_available = toi_bio_storage_available, - .storage_allocated = toi_bio_storage_allocated, - .reserve_header_space = toi_bio_reserve_header_space, - .allocate_storage = toi_bio_allocate_storage, - .free_unused_storage = toi_bio_free_unused_storage, - .image_exists = toi_bio_image_exists, - .mark_resume_attempted = toi_bio_mark_resume_attempted, - .write_header_init = toi_bio_write_header_init, - .write_header_cleanup = toi_bio_write_header_cleanup, - .read_header_init = toi_bio_read_header_init, - .read_header_cleanup = toi_bio_read_header_cleanup, - .get_header_version = toi_bio_get_header_version, - .remove_image = toi_bio_remove_image, - .parse_sig_location = toi_bio_parse_sig_location, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/** - * toi_block_io_load - load time routine for block I/O module - * - * Register block i/o ops and sysfs entries. - **/ -static __init int toi_block_io_load(void) -{ - return toi_register_module(&toi_blockwriter_ops); -} - -late_initcall(toi_block_io_load); diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h deleted file mode 100644 index cf9211ed9..000000000 --- a/kernel/power/tuxonice_bio_internal.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * kernel/power/tuxonice_bio_internal.h - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
- */ - -/* Extent chains */ -void toi_extent_state_goto_start(void); -void toi_extent_state_save(int slot); -int go_next_page(int writing, int section_barrier); -void toi_extent_state_restore(int slot); -void free_all_bdev_info(void); -int devices_of_same_priority(struct toi_bdev_info *this); -int toi_register_storage_chain(struct toi_bdev_info *new); -int toi_serialise_extent_chains(void); -int toi_load_extent_chains(void); -int toi_bio_rw_page(int writing, struct page *page, int is_readahead, - int free_group); -int toi_bio_restore_original_signature(void); -int toi_bio_devinfo_storage_needed(void); -unsigned long get_headerblock(void); -dev_t get_header_dev_t(void); -struct block_device *get_header_bdev(void); -int toi_bio_allocate_storage(unsigned long request); -void toi_bio_free_unused_storage(void); - -/* Signature functions */ -#define HaveImage "HaveImage" -#define NoImage "TuxOnIce" -#define sig_size (sizeof(HaveImage)) - -struct sig_data { - char sig[sig_size]; - int have_image; - int resumed_before; - - char have_uuid; - char header_uuid[17]; - dev_t header_dev_t; - unsigned long first_header_block; - - /* Repeat the signature to be sure we have a header version */ - char sig2[sig_size]; - int header_version; -}; - -void forget_signature_page(void); -int toi_check_for_signature(void); -int toi_bio_image_exists(int quiet); -int get_signature_page(void); -int toi_bio_mark_resume_attempted(int); -extern char *toi_cur_sig_page; -extern char *toi_orig_sig_page; -int toi_bio_mark_have_image(void); -extern struct sig_data *toi_sig_data; -extern dev_t resume_dev_t; -extern struct block_device *resume_block_device; -extern struct block_device *header_block_device; -extern unsigned long resume_firstblock; - -struct block_device *open_bdev(dev_t device, int display_errs); -extern int current_stream; -extern int more_readahead; -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group); 
-int get_main_pool_phys_params(void); - -void toi_close_bdev(struct block_device *bdev); -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs); - -extern struct toi_module_ops toi_blockwriter_ops; -void dump_block_chains(void); -void debug_broken_header(void); -extern unsigned long raw_pages_allocd, header_pages_reserved; -int toi_bio_chains_debug_info(char *buffer, int size); -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); -int toi_bio_scan_for_image(int quiet); -int toi_bio_get_header_version(void); - -void close_resume_dev_t(int force); -int open_resume_dev_t(int force, int quiet); - -struct toi_incremental_image_pointer_saved_data { - unsigned long block; - int chain; -}; - -struct toi_incremental_image_pointer { - struct toi_incremental_image_pointer_saved_data save; - struct block_device *bdev; - unsigned long block; -}; - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr); -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr); diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c deleted file mode 100644 index ead874f8e..000000000 --- a/kernel/power/tuxonice_bio_signature.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * kernel/power/tuxonice_bio_signature.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. 
- * - */ - -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -struct sig_data *toi_sig_data; - -/* Struct of swap header pages */ - -struct old_sig_data { - dev_t device; - unsigned long sector; - int resume_attempted; - int orig_sig_type; -}; - -union diskpage { - union swap_header swh; /* swh.magic is the only member used */ - struct sig_data sig_data; - struct old_sig_data old_sig_data; -}; - -union p_diskpage { - union diskpage *pointer; - char *ptr; - unsigned long address; -}; - -char *toi_cur_sig_page; -char *toi_orig_sig_page; -int have_image; -int have_old_image; - -int get_signature_page(void) -{ - if (!toi_cur_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Allocating current signature page."); - toi_cur_sig_page = (char *) toi_get_zeroed_page(38, - TOI_ATOMIC_GFP); - if (!toi_cur_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the " - "current image signature.\n"); - return -ENOMEM; - } - - toi_sig_data = (struct sig_data *) toi_cur_sig_page; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," - " sector %d.", - resume_block_device->bd_dev, resume_firstblock); - - return toi_bio_ops.bdev_page_io(READ, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -void forget_signature_page(void) -{ - if (toi_cur_sig_page) { - toi_sig_data = NULL; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" - " (%p).", toi_cur_sig_page); - toi_free_page(38, (unsigned long) toi_cur_sig_page); - toi_cur_sig_page = NULL; - } - - if (toi_orig_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" - " (%p).", toi_orig_sig_page); - toi_free_page(38, (unsigned long) toi_orig_sig_page); - 
toi_orig_sig_page = NULL; - } -} - -/* - * We need to ensure we use the signature page that's currently on disk, - * so as to not remove the image header. Post-atomic-restore, the orig sig - * page will be empty, so we can use that as our method of knowing that we - * need to load the on-disk signature and not use the non-image sig in - * memory. (We're going to powerdown after writing the change, so it's safe. - */ -int toi_bio_mark_resume_attempted(int flag) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", - flag); - if (!toi_orig_sig_page) { - forget_signature_page(); - get_signature_page(); - } - toi_sig_data->resumed_before = flag; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int toi_bio_mark_have_image(void) -{ - int result = 0; - char buf[32]; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); - memcpy(toi_sig_data->sig, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->have_image = 1; - toi_sig_data->resumed_before = 0; - toi_sig_data->header_dev_t = get_header_dev_t(); - toi_sig_data->have_uuid = 0; - - fs_info = fs_info_from_block_dev(get_header_bdev()); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!result) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - toi_sig_data->have_uuid = 1; - } else - toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " - "dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - - toi_sig_data->first_header_block = get_headerblock(); - have_image = 1; - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. 
First block " - "is %d.", toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - - memcpy(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->header_version = TOI_HEADER_VERSION; - - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int remove_old_signature(void) -{ - union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; - char *orig_sig; - char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - int result; - struct block_device *header_bdev; - struct old_sig_data *old_sig_data = - &swap_header_page.pointer->old_sig_data; - - header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); - result = toi_bio_ops.bdev_page_io(READ, header_bdev, - old_sig_data->sector, virt_to_page(header_start)); - - if (result) - goto out; - - /* - * TODO: Get the original contents of the first bytes of the swap - * header page. - */ - if (!old_sig_data->orig_sig_type) - orig_sig = "SWAP-SPACE"; - else - orig_sig = "SWAPSPACE2"; - - memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); - memcpy(swap_header_page.ptr, header_start, 10); - - result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(swap_header_page.ptr)); - -out: - toi_close_bdev(header_bdev); - have_old_image = 0; - toi_free_page(38, (unsigned long) header_start); - return result; -} - -/* - * toi_bio_restore_original_signature - restore the original signature - * - * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. - * It will have the original signature page contents, stored in the image - * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain - * the contents that were loaded when we started the cycle. - */ -int toi_bio_restore_original_signature(void) -{ - char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; - - if (have_old_image) - return remove_old_signature(); - - if (!use) { - printk("toi_bio_restore_original_signature: No signature " - "page loaded.\n"); - return 0; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); - have_image = 0; - toi_sig_data->have_image = 0; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(use)); -} - -/* - * check_for_signature - See whether we have an image. - * - * Returns 0 if no image, 1 if there is one, -1 if indeterminate. - */ -int toi_check_for_signature(void) -{ - union p_diskpage swap_header_page; - int type; - const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; - const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; - char *swap_header; - - if (!toi_cur_sig_page) { - int result = get_signature_page(); - - if (result) - return result; - } - - /* - * Start by looking for the binary header. - */ - if (!memcmp(tuxonice_signature, toi_cur_sig_page, - sizeof(tuxonice_signature))) { - have_image = toi_sig_data->have_image; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " - "Have image is %d.", have_image); - if (have_image) - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " - "%x. First block is %d.", - toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - return toi_sig_data->have_image; - } - - /* - * Failing that, try old file allocator headers. - */ - - if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { - have_image = 1; - return 1; - } - - have_image = 0; - - if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) - return 0; - - /* - * Nope? How about swap? - */ - swap_header_page = (union p_diskpage) toi_cur_sig_page; - swap_header = swap_header_page.pointer->swh.magic.magic; - - /* Normal swapspace? */ - for (type = 0; type < 2; type++) - if (!memcmp(normal_sigs[type], swap_header, - strlen(normal_sigs[type]))) - return 0; - - /* Swsusp or uswsusp? 
*/ - for (type = 0; type < 3; type++) - if (!memcmp(swsusp_sigs[type], swap_header, - strlen(swsusp_sigs[type]))) - return 2; - - /* Old TuxOnIce version? */ - if (!memcmp(tuxonice_signature, swap_header, - sizeof(tuxonice_signature) - 1)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " - "signature."); - have_old_image = 1; - return 3; - } - - return -1; -} - -/* - * Image_exists - * - * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). - */ -int toi_bio_image_exists(int quiet) -{ - int result; - char *msg = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); - - if (!resume_dev_t) { - if (!quiet) - printk(KERN_INFO "Not even trying to read header " - "because resume_dev_t is not set.\n"); - return -1; - } - - if (open_resume_dev_t(0, quiet)) - return -1; - - result = toi_check_for_signature(); - - clear_toi_state(TOI_RESUMED_BEFORE); - if (toi_sig_data->resumed_before) - set_toi_state(TOI_RESUMED_BEFORE); - - if (quiet || result == -ENOMEM) - return result; - - if (result == -1) - msg = "TuxOnIce: Unable to find a signature." 
- " Could you have moved a swap file?\n"; - else if (!result) - msg = "TuxOnIce: No image found.\n"; - else if (result == 1) - msg = "TuxOnIce: Image found.\n"; - else if (result == 2) - msg = "TuxOnIce: uswsusp or swsusp image found.\n"; - else if (result == 3) - msg = "TuxOnIce: Old implementation's signature found.\n"; - - printk(KERN_INFO "%s", msg); - - return result; -} - -int toi_bio_scan_for_image(int quiet) -{ - struct block_device *bdev; - char default_name[255] = ""; - - if (!quiet) - printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " - "signature...\n"); - for (bdev = next_bdev_of_type(NULL, "swap"); bdev; - bdev = next_bdev_of_type(bdev, "swap")) { - int result; - char name[255] = ""; - sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - if (!quiet) - printk(KERN_DEBUG "- Trying %s.\n", name); - resume_block_device = bdev; - resume_dev_t = bdev->bd_dev; - - result = toi_check_for_signature(); - - resume_block_device = NULL; - resume_dev_t = MKDEV(0, 0); - - if (!default_name[0]) - strcpy(default_name, name); - - if (result == 1) { - /* Got one! */ - strcpy(resume_file, name); - next_bdev_of_type(bdev, NULL); - if (!quiet) - printk(KERN_DEBUG " ==> Image found on %s.\n", - resume_file); - return 1; - } - forget_signature_page(); - } - - if (!quiet) - printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); - strcpy(resume_file, default_name); - return 0; -} - -int toi_bio_get_header_version(void) -{ - return (memcmp(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature))) ? - 0 : toi_sig_data->header_version; - -} diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c deleted file mode 100644 index 0a6733ae0..000000000 --- a/kernel/power/tuxonice_builtin.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- */ -#include <linux/kernel.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/bio.h> -#include <linux/root_dev.h> -#include <linux/freezer.h> -#include <linux/reboot.h> -#include <linux/writeback.h> -#include <linux/tty.h> -#include <linux/crypto.h> -#include <linux/cpu.h> -#include <linux/ctype.h> -#include <linux/kthread.h> -#include "tuxonice_io.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_netlink.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_ui.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_power_off.h" -#include "tuxonice_alloc.h" - -unsigned long toi_bootflags_mask; - -/* - * Highmem related functions (x86 only). - */ - -#ifdef CONFIG_HIGHMEM - -/** - * copyback_high: Restore highmem pages. - * - * Highmem data and pbe lists are/can be stored in highmem. - * The format is slightly different to the lowmem pbe lists - * used for the assembly code: the last pbe in each page is - * a struct page * instead of struct pbe *, pointing to the - * next page where pbes are stored (or NULL if happens to be - * the end of the list). Since we don't want to generate - * unnecessary deltas against swsusp code, we use a cast - * instead of a union. 
- **/ - -static void copyback_high(void) -{ - struct page *pbe_page = (struct page *) restore_highmem_pblist; - struct pbe *this_pbe, *first_pbe; - unsigned long *origpage, *copypage; - int pbe_index = 1; - - if (!pbe_page) - return; - - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - - while (this_pbe) { - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; - - origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); - copypage = kmap_atomic((struct page *) this_pbe->address); - - while (loop >= 0) { - *(origpage + loop) = *(copypage + loop); - loop--; - } - - kunmap_atomic(origpage); - kunmap_atomic(copypage); - - if (!this_pbe->next) - break; - - if (pbe_index < PBES_PER_PAGE) { - this_pbe++; - pbe_index++; - } else { - pbe_page = (struct page *) this_pbe->next; - kunmap_atomic(first_pbe); - if (!pbe_page) - return; - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - pbe_index = 1; - } - } - kunmap_atomic(first_pbe); -} - -#else /* CONFIG_HIGHMEM */ -static void copyback_high(void) { } -#endif - -char toi_wait_for_keypress_dev_console(int timeout) -{ - int fd, this_timeout = 255, orig_kthread = 0; - char key = '\0'; - struct termios t, t_backup; - - /* We should be guaranteed /dev/console exists after populate_rootfs() - * in init/main.c. - */ - fd = sys_open("/dev/console", O_RDONLY, 0); - if (fd < 0) { - printk(KERN_INFO "Couldn't open /dev/console.\n"); - return key; - } - - if (sys_ioctl(fd, TCGETS, (long)&t) < 0) - goto out_close; - - memcpy(&t_backup, &t, sizeof(t)); - - t.c_lflag &= ~(ISIG|ICANON|ECHO); - t.c_cc[VMIN] = 0; - -new_timeout: - if (timeout > 0) { - this_timeout = timeout < 26 ? 
timeout : 25; - timeout -= this_timeout; - this_timeout *= 10; - } - - t.c_cc[VTIME] = this_timeout; - - if (sys_ioctl(fd, TCSETS, (long)&t) < 0) - goto out_restore; - - if (current->flags & PF_KTHREAD) { - orig_kthread = (current->flags & PF_KTHREAD); - current->flags &= ~PF_KTHREAD; - } - - while (1) { - if (sys_read(fd, &key, 1) <= 0) { - if (timeout) - goto new_timeout; - key = '\0'; - break; - } - key = tolower(key); - if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { - if (key == 'c') { - set_toi_state(TOI_CONTINUE_REQ); - break; - } else if (key == ' ') - break; - } else - break; - } - if (orig_kthread) { - current->flags |= PF_KTHREAD; - } - -out_restore: - sys_ioctl(fd, TCSETS, (long)&t_backup); -out_close: - sys_close(fd); - - return key; -} - -struct toi_boot_kernel_data toi_bkd __nosavedata - __attribute__((aligned(PAGE_SIZE))) = { - MY_BOOT_KERNEL_DATA_VERSION, - 0, -#ifdef CONFIG_TOI_REPLACE_SWSUSP - (1 << TOI_REPLACE_SWSUSP) | -#endif - (1 << TOI_NO_FLUSHER_THREAD) | - (1 << TOI_PAGESET2_FULL), -}; - -struct block_device *toi_open_by_devnum(dev_t dev) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); - return err ? ERR_PTR(err) : bdev; -} - -/** - * toi_close_bdev: Close a swap bdev. - * - * int: The swap entry number to close. 
- */ -void toi_close_bdev(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); -} - -int toi_wait = CONFIG_TOI_DEFAULT_WAIT; -struct toi_core_fns *toi_core_fns; -unsigned long toi_result; -struct pagedir pagedir1 = {1}; -struct toi_cbw **toi_first_cbw; -int toi_next_cbw; - -unsigned long toi_get_nonconflicting_page(void) -{ - return toi_core_fns->get_nonconflicting_page(); -} - -int toi_post_context_save(void) -{ - return toi_core_fns->post_context_save(); -} - -int try_tuxonice_hibernate(void) -{ - if (!toi_core_fns) - return -ENODEV; - - return toi_core_fns->try_hibernate(); -} - -static int num_resume_calls; -#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL -static int ignore_late_initcall = 1; -#else -static int ignore_late_initcall; -#endif - -int toi_translate_err_default = TOI_CONTINUE_REQ; - -void try_tuxonice_resume(void) -{ - if (!hibernation_available()) - return; - - /* Don't let it wrap around eventually */ - if (num_resume_calls < 2) - num_resume_calls++; - - if (num_resume_calls == 1 && ignore_late_initcall) { - printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); - return; - } - - if (toi_core_fns) - toi_core_fns->try_resume(); - else - printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); -} - -int toi_lowlevel_builtin(void) -{ - int error = 0; - - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "Error %d hibernating\n", error); - - /* Restore control flow appears here */ - if (!toi_in_hibernate) { - copyback_high(); - set_toi_state(TOI_NOW_RESUMING); - } - - restore_processor_state(); - return error; -} - -unsigned long toi_compress_bytes_in; -unsigned long toi_compress_bytes_out; - -int toi_in_suspend(void) -{ - return in_suspend; -} - -unsigned long toi_state = ((1 << TOI_BOOT_TIME) | - (1 << TOI_IGNORE_LOGLEVEL) | - (1 << TOI_IO_STOPPED)); - -/* The number of hibernates we have started (some may have been cancelled) */ -unsigned int nr_hibernates; -int toi_running; 
-__nosavedata int toi_in_hibernate; -__nosavedata struct pbe *restore_highmem_pblist; - -int toi_trace_allocs; - -void toi_read_lock_tasklist(void) -{ - read_lock(&tasklist_lock); -} - -void toi_read_unlock_tasklist(void) -{ - read_unlock(&tasklist_lock); -} - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -int (*toi_flag_zram_disks) (void); - -int toi_do_flag_zram_disks(void) -{ - return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0; -} - -#endif - -/* toi_generate_free_page_map - * - * Description: This routine generates a bitmap of free pages from the - * lists used by the memory manager. We then use the bitmap - * to quickly calculate which pages to save and in which - * pagesets. - */ -void toi_generate_free_page_map(void) -{ - int order, cpu, t; - unsigned long flags, i; - struct zone *zone; - struct list_head *curr; - unsigned long pfn; - struct page *page; - - for_each_populated_zone(zone) { - - if (!zone->spanned_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - - for (i = 0; i < zone->spanned_pages; i++) { - pfn = zone->zone_start_pfn + i; - - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - - ClearPageNosaveFree(page); - } - - for_each_migratetype_order(order, t) { - list_for_each(curr, - &zone->free_area[order].free_list[t]) { - unsigned long j; - - pfn = page_to_pfn(list_entry(curr, struct page, - lru)); - for (j = 0; j < (1UL << order); j++) - SetPageNosaveFree(pfn_to_page(pfn + j)); - } - } - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - struct page *page; - int t; - - for (t = 0; t < MIGRATE_PCPTYPES; t++) - list_for_each_entry(page, &pcp->lists[t], lru) - SetPageNosaveFree(page); - } - - spin_unlock_irqrestore(&zone->lock, flags); - } -} - -/* toi_size_of_free_region - * - * Description: Return the number of pages that are free, beginning with and - * including this one. 
- */ -int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn) -{ - unsigned long this_pfn = start_pfn, - end_pfn = zone_end_pfn(zone); - - while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) - this_pfn++; - - return this_pfn - start_pfn; -} - -static int __init toi_wait_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) { - if (value < -1 || value > 255) - printk(KERN_INFO "TuxOnIce_wait outside range -1 to " - "255.\n"); - else - toi_wait = value; - } - - return 1; -} -__setup("toi_wait", toi_wait_setup); - -static int __init toi_translate_retry_setup(char *str) -{ - toi_translate_err_default = 0; - return 1; -} -__setup("toi_translate_retry", toi_translate_retry_setup); - -static int __init toi_debug_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_LOGALL); - toi_bootflags_mask |= (1 << TOI_LOGALL); - toi_bkd.toi_debug_state = 255; - toi_bkd.toi_default_console_level = 7; - return 1; -} -__setup("toi_debug_setup", toi_debug_setup); - -static int __init toi_pause_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_PAUSE); - toi_bootflags_mask |= (1 << TOI_PAUSE); - return 1; -} -__setup("toi_pause", toi_pause_setup); - -#ifdef CONFIG_PM_DEBUG -static int __init toi_trace_allocs_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - toi_trace_allocs = value; - - return 1; -} -__setup("toi_trace_allocs", toi_trace_allocs_setup); -#endif - -static int __init toi_ignore_late_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - ignore_late_initcall = value; - - return 1; -} -__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); - -static int __init toi_force_no_multithreaded_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); - toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); - - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); - - return 1; 
-} -__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); - -#ifdef CONFIG_KGDB -static int __init toi_post_resume_breakpoint_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); - toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); - - return 1; -} -__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); -#endif - -static int __init toi_disable_readahead_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); - toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); - - return 1; -} -__setup("toi_no_readahead", toi_disable_readahead_setup); diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h deleted file mode 100644 index 9539818e0..000000000 --- a/kernel/power/tuxonice_builtin.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- */ -#include <asm/setup.h> - -extern struct toi_core_fns *toi_core_fns; -extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; -extern unsigned int nr_hibernates; -extern int toi_in_hibernate; - -extern __nosavedata struct pbe *restore_highmem_pblist; - -int toi_lowlevel_builtin(void); - -#ifdef CONFIG_HIGHMEM -extern __nosavedata struct zone_data *toi_nosave_zone_list; -extern __nosavedata unsigned long toi_nosave_max_pfn; -#endif - -extern unsigned long toi_get_nonconflicting_page(void); -extern int toi_post_context_save(void); - -extern char toi_wait_for_keypress_dev_console(int timeout); -extern struct block_device *toi_open_by_devnum(dev_t dev); -extern void toi_close_bdev(struct block_device *bdev); -extern int toi_wait; -extern int toi_translate_err_default; -extern int toi_force_no_multithreaded; -extern void toi_read_lock_tasklist(void); -extern void toi_read_unlock_tasklist(void); -extern int toi_in_suspend(void); -extern void toi_generate_free_page_map(void); -extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn); - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -extern int toi_do_flag_zram_disks(void); -#else -#define toi_do_flag_zram_disks() (0) -#endif diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c deleted file mode 100644 index 8952c0fec..000000000 --- a/kernel/power/tuxonice_checksum.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
- */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" - -static struct toi_module_ops toi_checksum_ops; - -/* Constant at the mo, but I might allow tuning later */ -static char toi_checksum_name[32] = "md4"; -/* Bytes per checksum */ -#define CHECKSUM_SIZE (16) - -#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) - -struct cpu_context { - struct crypto_hash *transform; - struct hash_desc desc; - struct scatterlist sg[2]; - char *buf; -}; - -static DEFINE_PER_CPU(struct cpu_context, contexts); -static int pages_allocated; -static unsigned long page_list; - -static int toi_num_resaved; - -static unsigned long this_checksum, next_page; -static int checksum_count; - -static inline int checksum_pages_needed(void) -{ - return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); -} - -/* ---- Local buffer management ---- */ - -/* - * toi_checksum_cleanup - * - * Frees memory allocated for our labours. - */ -static void toi_checksum_cleanup(int ending_cycle) -{ - int cpu; - - if (ending_cycle) { - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_hash(this->transform); - this->transform = NULL; - this->desc.tfm = NULL; - } - - if (this->buf) { - toi_free_page(27, (unsigned long) this->buf); - this->buf = NULL; - } - } - } -} - -/* - * toi_crypto_initialise - * - * Prepare to do some work by allocating buffers and transforms. - * Returns: Int: Zero. Even if we can't set up checksum, we still - * seek to hibernate. 
- */ -static int toi_checksum_initialise(int starting_cycle) -{ - int cpu; - - if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) - return 0; - - if (!*toi_checksum_name) { - printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - struct page *page; - - this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s checksum algorithm: %ld.\n", - toi_checksum_name, (long) this->transform); - this->transform = NULL; - return 1; - } - - this->desc.tfm = this->transform; - this->desc.flags = 0; - - page = toi_alloc_page(27, GFP_KERNEL); - if (!page) - return 1; - this->buf = page_address(page); - sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); - } - return 0; -} - -/* - * toi_checksum_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_checksum_print_debug_stats(char *buffer, int size) -{ - int len; - - if (!toi_checksum_ops.enabled) - return scnprintf(buffer, size, - "- Checksumming disabled.\n"); - - len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", - toi_checksum_name); - len += scnprintf(buffer + len, size - len, - " %d pages resaved in atomic copy.\n", toi_num_resaved); - return len; -} - -static int toi_checksum_memory_needed(void) -{ - return toi_checksum_ops.enabled ? - checksum_pages_needed() << PAGE_SHIFT : 0; -} - -static int toi_checksum_storage_needed(void) -{ - if (toi_checksum_ops.enabled) - return strlen(toi_checksum_name) + sizeof(int) + 1; - else - return 0; -} - -/* - * toi_checksum_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. 
- * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_checksum_save_config_info(char *buffer) -{ - int namelen = strlen(toi_checksum_name) + 1; - int total_len; - - *((unsigned int *) buffer) = namelen; - strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); - total_len = sizeof(unsigned int) + namelen; - return total_len; -} - -/* toi_checksum_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for dechecksuming the image at - * resume time. - */ -static void toi_checksum_load_config_info(char *buffer, int size) -{ - int namelen; - - namelen = *((unsigned int *) (buffer)); - strncpy(toi_checksum_name, buffer + sizeof(unsigned int), - namelen); - return; -} - -/* - * Free Checksum Memory - */ - -void free_checksum_pages(void) -{ - while (pages_allocated) { - unsigned long next = *((unsigned long *) page_list); - ClearPageNosave(virt_to_page(page_list)); - toi_free_page(15, (unsigned long) page_list); - page_list = next; - pages_allocated--; - } -} - -/* - * Allocate Checksum Memory - */ - -int allocate_checksum_pages(void) -{ - int pages_needed = checksum_pages_needed(); - - if (!toi_checksum_ops.enabled) - return 0; - - while (pages_allocated < pages_needed) { - unsigned long *new_page = - (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); - if (!new_page) { - printk(KERN_ERR "Unable to allocate checksum pages.\n"); - return -ENOMEM; - } - SetPageNosave(virt_to_page(new_page)); - (*new_page) = page_list; - page_list = (unsigned long) new_page; - pages_allocated++; - } - - next_page = (unsigned long) page_list; - checksum_count = 0; - - return 0; -} - -char *tuxonice_get_next_checksum(void) -{ - if (!toi_checksum_ops.enabled) - return NULL; - - if (checksum_count % CHECKSUMS_PER_PAGE) - this_checksum += CHECKSUM_SIZE; - else { - this_checksum = 
next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - checksum_count++; - return (char *) this_checksum; -} - -int tuxonice_calc_checksum(struct page *page, char *checksum_locn) -{ - char *pa; - int result, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!toi_checksum_ops.enabled) - return 0; - - pa = kmap(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap(page); - result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - checksum_locn); - if (result) - printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " - "returned %d.\n", result); - return result; -} -/* - * Calculate checksums - */ - -void check_checksums(void) -{ - int index = 0, cpu = smp_processor_id(); - char current_checksum[CHECKSUM_SIZE]; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - unsigned long pfn; - - if (!toi_checksum_ops.enabled) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); - return; - } - - next_page = (unsigned long) page_list; - - toi_num_resaved = 0; - this_checksum = 0; - - toi_trace_index++; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) { - int ret, resave_needed = false; - char *pa; - struct page *page = pfn_to_page(pfn); - - if (index < checksum_count) { - if (index % CHECKSUMS_PER_PAGE) { - this_checksum += CHECKSUM_SIZE; - } else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - /* Done when IRQs disabled so must be atomic */ - pa = kmap_atomic(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap_atomic(pa); - ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - current_checksum); - - if (ret) { - printk(KERN_INFO "Digest failed. 
Returned %d.\n", ret); - return; - } - - resave_needed = memcmp(current_checksum, (char *) this_checksum, - CHECKSUM_SIZE); - } else { - resave_needed = true; - } - - if (resave_needed) { - TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed); - SetPageResave(pfn_to_page(pfn)); - toi_num_resaved++; - if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) - set_abort_result(TOI_RESAVE_NEEDED); - } - - index++; - } - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, - NULL), - SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, - TOI_ABORT_ON_RESAVE_NEEDED, 0) -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_checksum_ops = { - .type = MISC_MODULE, - .name = "checksumming", - .directory = "checksum", - .module = THIS_MODULE, - .initialise = toi_checksum_initialise, - .cleanup = toi_checksum_cleanup, - .print_debug_info = toi_checksum_print_debug_stats, - .save_config_info = toi_checksum_save_config_info, - .load_config_info = toi_checksum_load_config_info, - .memory_needed = toi_checksum_memory_needed, - .storage_needed = toi_checksum_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -int toi_checksum_init(void) -{ - int result = toi_register_module(&toi_checksum_ops); - return result; -} - -void toi_checksum_exit(void) -{ - toi_unregister_module(&toi_checksum_ops); -} diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h deleted file mode 100644 index 7d6478a6a..000000000 --- a/kernel/power/tuxonice_checksum.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. - */ - -#if defined(CONFIG_TOI_CHECKSUM) -extern int toi_checksum_init(void); -extern void toi_checksum_exit(void); -void check_checksums(void); -int allocate_checksum_pages(void); -void free_checksum_pages(void); -char *tuxonice_get_next_checksum(void); -int tuxonice_calc_checksum(struct page *page, char *checksum_locn); -#else -static inline int toi_checksum_init(void) { return 0; } -static inline void toi_checksum_exit(void) { } -static inline void check_checksums(void) { }; -static inline int allocate_checksum_pages(void) { return 0; }; -static inline void free_checksum_pages(void) { }; -static inline char *tuxonice_get_next_checksum(void) { return NULL; }; -static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn) - { return 0; } -#endif - diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c deleted file mode 100644 index cfe3383ab..000000000 --- a/kernel/power/tuxonice_cluster.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines for cluster hibernation support. - * - * Based on ip autoconfiguration code in net/ipv4/ipconfig.c. - * - * How does it work? - * - * There is no 'master' node that tells everyone else what to do. All nodes - * send messages to the broadcast address/port, maintain a list of peers - * and figure out when to progress to the next step in hibernating or resuming. - * This makes us more fault tolerant when it comes to nodes coming and going - * (which may be more of an issue if we're hibernating when power supplies - * are being unreliable). - * - * At boot time, we start a ktuxonice thread that handles communication with - * other nodes. 
This node maintains a state machine that controls our progress - * through hibernating and resuming, keeping us in step with other nodes. Nodes - * are identified by their hw address. - * - * On startup, the node sends CLUSTER_PING on the configured interface's - * broadcast address, port $toi_cluster_port (see below) and begins to listen - * for other broadcast messages. CLUSTER_PING messages are repeated at - * intervals of 5 minutes, with a random offset to spread traffic out. - * - * A hibernation cycle is initiated from any node via - * - * echo > /sys/power/tuxonice/do_hibernate - * - * and (possibily) the hibernate script. At each step of the process, the node - * completes its work, and waits for all other nodes to signal completion of - * their work (or timeout) before progressing to the next step. - * - * Request/state Action before reply Possible reply Next state - * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP - * HIBERNATE|NACK INIT_0 - * - * PREP prepare_image PREP|ACK IMAGE_WRITE - * PREP|NACK INIT_0 - * ABORT RUNNING - * - * IO write image IO|ACK power off - * ABORT POST_RESUME - * - * (Boot time) check for image IMAGE|ACK RESUME_PREP - * (Note 1) - * IMAGE|NACK (Note 2) - * - * PREP prepare read image PREP|ACK IMAGE_READ - * PREP|NACK (As NACK_IMAGE) - * - * IO read image IO|ACK POST_RESUME - * - * POST_RESUME thaw, post-script RUNNING - * - * INIT_0 init 0 - * - * Other messages: - * - * - PING: Request for all other live nodes to send a PONG. Used at startup to - * announce presence, when a node is suspected dead and periodically, in case - * segments of the network are [un]plugged. - * - * - PONG: Response to a PING. - * - * - ABORT: Request to cancel writing an image. - * - * - BYE: Notification that this node is shutting down. - * - * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that - * nodes which are slower to start up can get state synchronised. 
If a node - * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send - * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it - * must invalidate its image (if any) and boot normally. - * - * Note 2: May occur when one node lost power or powered off while others - * hibernated. This node waits for others to complete resuming (ACK_READ) - * before completing its boot, so that it appears as a fail node restarting. - * - * If any node has an image, then it also has a list of nodes that hibernated - * in synchronisation with it. The node will wait for other nodes to appear - * or timeout before beginning its restoration. - * - * If a node has no image, it needs to wait, in case other nodes which do have - * an image are going to resume, but are taking longer to announce their - * presence. For this reason, the user can specify a timeout value and a number - * of nodes detected before we just continue. (We might want to assume in a - * cluster of, say, 15 nodes, if 8 others have booted without finding an image, - * the remaining nodes will too. This might help in situations where some nodes - * are much slower to boot, or more subject to hardware failures or such like). - */ - -#include <linux/suspend.h> -#include <linux/if.h> -#include <linux/rtnetlink.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/in.h> -#include <linux/if_arp.h> -#include <linux/kthread.h> -#include <linux/wait.h> -#include <linux/netdevice.h> -#include <net/ip.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" - -#if 1 -#define PRINTK(a, b...) do { printk(a, ##b); } while (0) -#else -#define PRINTK(a, b...) do { } while (0) -#endif - -static int loopback_mode; -static int num_local_nodes = 1; -#define MAX_LOCAL_NODES 8 -#define SADDR (loopback_mode ? 
b->sid : h->saddr) - -#define MYNAME "TuxOnIce Clustering" - -enum cluster_message { - MSG_ACK = 1, - MSG_NACK = 2, - MSG_PING = 4, - MSG_ABORT = 8, - MSG_BYE = 16, - MSG_HIBERNATE = 32, - MSG_IMAGE = 64, - MSG_IO = 128, - MSG_RUNNING = 256 -}; - -static char *str_message(int message) -{ - switch (message) { - case 4: - return "Ping"; - case 8: - return "Abort"; - case 9: - return "Abort acked"; - case 10: - return "Abort nacked"; - case 16: - return "Bye"; - case 17: - return "Bye acked"; - case 18: - return "Bye nacked"; - case 32: - return "Hibernate request"; - case 33: - return "Hibernate ack"; - case 34: - return "Hibernate nack"; - case 64: - return "Image exists?"; - case 65: - return "Image does exist"; - case 66: - return "No image here"; - case 128: - return "I/O"; - case 129: - return "I/O okay"; - case 130: - return "I/O failed"; - case 256: - return "Running"; - default: - printk(KERN_ERR "Unrecognised message %d.\n", message); - return "Unrecognised message (see dmesg)"; - } -} - -#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) -#define MSG_STATE_MASK (~MSG_ACK_MASK) - -struct node_info { - struct list_head member_list; - wait_queue_head_t member_events; - spinlock_t member_list_lock; - spinlock_t receive_lock; - int peer_count, ignored_peer_count; - struct toi_sysfs_data sysfs_data; - enum cluster_message current_message; -}; - -struct node_info node_array[MAX_LOCAL_NODES]; - -struct cluster_member { - __be32 addr; - enum cluster_message message; - struct list_head list; - int ignore; -}; - -#define toi_cluster_port_send 3501 -#define toi_cluster_port_recv 3502 - -static struct net_device *net_dev; -static struct toi_module_ops toi_cluster_ops; - -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -static struct packet_type toi_cluster_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = toi_recv, -}; - -struct toi_pkt { /* BOOTP packet format */ - struct iphdr iph; /* IP 
header */ - struct udphdr udph; /* UDP header */ - u8 htype; /* HW address type */ - u8 hlen; /* HW address length */ - __be32 xid; /* Transaction ID */ - __be16 secs; /* Seconds since we started */ - __be16 flags; /* Just what it says */ - u8 hw_addr[16]; /* Sender's HW address */ - u16 message; /* Message */ - unsigned long sid; /* Source ID for loopback testing */ -}; - -static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; - -static int added_pack; - -static int others_have_image; - -/* Key used to allow multiple clusters on the same lan */ -static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY; -static char pre_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; -static char post_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; - -/* List of cluster members */ -static unsigned long continue_delay = 5 * HZ; -static unsigned long cluster_message_timeout = 3 * HZ; - -/* === Membership list === */ - -static void print_member_info(int index) -{ - struct cluster_member *this; - - printk(KERN_INFO "==> Dumping node %d.\n", index); - - list_for_each_entry(this, &node_array[index].member_list, list) - printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", - NIPQUAD(this->addr), - str_message(this->message), - this->ignore ? 
"(Ignored)" : ""); - printk(KERN_INFO "== Done ==\n"); -} - -static struct cluster_member *__find_member(int index, __be32 addr) -{ - struct cluster_member *this; - - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->addr != addr) - continue; - - return this; - } - - return NULL; -} - -static void set_ignore(int index, __be32 addr, struct cluster_member *this) -{ - if (this->ignore) { - PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", - index, NIPQUAD(addr)); - return; - } - - PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", - index, NIPQUAD(addr)); - this->ignore = 1; - node_array[index].ignored_peer_count++; -} - -static int __add_update_member(int index, __be32 addr, int message) -{ - struct cluster_member *this; - - this = __find_member(index, addr); - if (this) { - if (this->message != message) { - this->message = message; - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - PRINTK("Node %d sees node %d.%d.%d.%d now sending " - "%s.\n", index, NIPQUAD(addr), - str_message(message)); - wake_up(&node_array[index].member_events); - } - return 0; - } - - this = (struct cluster_member *) toi_kzalloc(36, - sizeof(struct cluster_member), GFP_KERNEL); - - if (!this) - return -1; - - this->addr = addr; - this->message = message; - this->ignore = 0; - INIT_LIST_HEAD(&this->list); - - node_array[index].peer_count++; - - PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, - NIPQUAD(addr), str_message(message)); - - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - list_add_tail(&this->list, &node_array[index].member_list); - return 1; -} - -static int add_update_member(int index, __be32 addr, int message) -{ - int result; - unsigned long flags; - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - result = __add_update_member(index, addr, message); - 
spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - - print_member_info(index); - - wake_up(&node_array[index].member_events); - - return result; -} - -static void del_member(int index, __be32 addr) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - this = __find_member(index, addr); - - if (this) { - list_del_init(&this->list); - toi_kfree(36, this, sizeof(*this)); - node_array[index].peer_count--; - } - - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -/* === Message transmission === */ - -static void toi_send_if(int message, unsigned long my_id); - -/* - * Process received TOI packet. - */ -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct toi_pkt *b; - struct iphdr *h; - int len, result, index; - unsigned long addr, message, ack; - - /* Perform verifications before taking the lock. */ - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop; - - if (dev != net_dev) - goto drop; - - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - return NET_RX_DROP; - - if (!pskb_may_pull(skb, - sizeof(struct iphdr) + - sizeof(struct udphdr))) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) - goto drop; - - /* Fragments are not supported */ - if (h->frag_off & htons(IP_OFFSET | IP_MF)) { - if (net_ratelimit()) - printk(KERN_ERR "TuxOnIce: Ignoring fragmented " - "cluster message.\n"); - goto drop; - } - - if (skb->len < ntohs(h->tot_len)) - goto drop; - - if (ip_fast_csum((char *) h, h->ihl)) - goto drop; - - if (b->udph.source != htons(toi_cluster_port_send) || - b->udph.dest != htons(toi_cluster_port_recv)) - goto drop; - - if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) - goto drop; - - len = ntohs(b->udph.len) - sizeof(struct udphdr); - - /* Ok the front 
looks good, make sure we can get at the rest. */ - if (!pskb_may_pull(skb, skb->len)) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - addr = SADDR; - PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", - str_message(b->message), NIPQUAD(addr)); - - message = b->message & MSG_STATE_MASK; - ack = b->message & MSG_ACK_MASK; - - for (index = 0; index < num_local_nodes; index++) { - int new_message = node_array[index].current_message, - old_message = new_message; - - if (index == SADDR || !old_message) { - PRINTK("Ignoring node %d (offline or self).\n", index); - continue; - } - - /* One message at a time, please. */ - spin_lock(&node_array[index].receive_lock); - - result = add_update_member(index, SADDR, b->message); - if (result == -1) { - printk(KERN_INFO "Failed to add new cluster member " - NIPQUAD_FMT ".\n", - NIPQUAD(addr)); - goto drop_unlock; - } - - switch (b->message & MSG_STATE_MASK) { - case MSG_PING: - break; - case MSG_ABORT: - break; - case MSG_BYE: - break; - case MSG_HIBERNATE: - /* Can I hibernate? */ - new_message = MSG_HIBERNATE | - ((index & 1) ? MSG_NACK : MSG_ACK); - break; - case MSG_IMAGE: - /* Can I resume? */ - new_message = MSG_IMAGE | - ((index & 1) ? 
MSG_NACK : MSG_ACK); - if (new_message != old_message) - printk(KERN_ERR "Setting whether I can resume " - "to %d.\n", new_message); - break; - case MSG_IO: - new_message = MSG_IO | MSG_ACK; - break; - case MSG_RUNNING: - break; - default: - if (net_ratelimit()) - printk(KERN_ERR "Unrecognised TuxOnIce cluster" - " message %d from " NIPQUAD_FMT ".\n", - b->message, NIPQUAD(addr)); - }; - - if (old_message != new_message) { - node_array[index].current_message = new_message; - printk(KERN_INFO ">>> Sending new message for node " - "%d.\n", index); - toi_send_if(new_message, index); - } else if (!ack) { - printk(KERN_INFO ">>> Resending message for node %d.\n", - index); - toi_send_if(new_message, index); - } -drop_unlock: - spin_unlock(&node_array[index].receive_lock); - }; - -drop: - /* Throw the packet out. */ - kfree_skb(skb); - - return 0; -} - -/* - * Send cluster message to single interface. - */ -static void toi_send_if(int message, unsigned long my_id) -{ - struct sk_buff *skb; - struct toi_pkt *b; - int hh_len = LL_RESERVED_SPACE(net_dev); - struct iphdr *h; - - /* Allocate packet */ - skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); - if (!skb) - return; - skb_reserve(skb, hh_len); - b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); - memset(b, 0, sizeof(struct toi_pkt)); - - /* Construct IP header */ - skb_reset_network_header(skb); - h = ip_hdr(skb); - h->version = 4; - h->ihl = 5; - h->tot_len = htons(sizeof(struct toi_pkt)); - h->frag_off = htons(IP_DF); - h->ttl = 64; - h->protocol = IPPROTO_UDP; - h->daddr = htonl(INADDR_BROADCAST); - h->check = ip_fast_csum((unsigned char *) h, h->ihl); - - /* Construct UDP header */ - b->udph.source = htons(toi_cluster_port_send); - b->udph.dest = htons(toi_cluster_port_recv); - b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); - /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ - - /* Construct message */ - b->message = message; - b->sid = my_id; - 
b->htype = net_dev->type; /* can cause undefined behavior */ - b->hlen = net_dev->addr_len; - memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); - b->secs = htons(3); /* 3 seconds */ - - /* Chain packet down the line... */ - skb->dev = net_dev; - skb->protocol = htons(ETH_P_IP); - if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), - net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || - dev_queue_xmit(skb) < 0) - printk(KERN_INFO "E"); -} - -/* ========================================= */ - -/* kTOICluster */ - -static atomic_t num_cluster_threads; -static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); - -static int kTOICluster(void *data) -{ - unsigned long my_id; - - my_id = atomic_add_return(1, &num_cluster_threads) - 1; - node_array[my_id].current_message = (unsigned long) data; - - PRINTK("kTOICluster daemon %lu starting.\n", my_id); - - current->flags |= PF_NOFREEZE; - - while (node_array[my_id].current_message) { - toi_send_if(node_array[my_id].current_message, my_id); - sleep_on_timeout(&clusterd_events, - cluster_message_timeout); - PRINTK("Link state %lu is %d.\n", my_id, - node_array[my_id].current_message); - } - - toi_send_if(MSG_BYE, my_id); - atomic_dec(&num_cluster_threads); - wake_up(&clusterd_events); - - PRINTK("kTOICluster daemon %lu exiting.\n", my_id); - __set_current_state(TASK_RUNNING); - return 0; -} - -static void kill_clusterd(void) -{ - int i; - - for (i = 0; i < num_local_nodes; i++) { - if (node_array[i].current_message) { - PRINTK("Seeking to kill clusterd %d.\n", i); - node_array[i].current_message = 0; - } - } - wait_event(clusterd_events, - !atomic_read(&num_cluster_threads)); - PRINTK("All cluster daemons have exited.\n"); -} - -static int peers_not_in_message(int index, int message, int precise) -{ - struct cluster_member *this; - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) { - if 
(this->ignore) - continue; - - PRINTK("Peer %d.%d.%d.%d sending %s. " - "Seeking %s.\n", - NIPQUAD(this->addr), - str_message(this->message), str_message(message)); - if ((precise ? this->message : - this->message & MSG_STATE_MASK) != - message) - result++; - } - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - PRINTK("%d peers in sought message.\n", result); - return result; -} - -static void reset_ignored(int index) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) - this->ignore = 0; - node_array[index].ignored_peer_count = 0; - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -static int peers_in_message(int index, int message, int precise) -{ - return node_array[index].peer_count - - node_array[index].ignored_peer_count - - peers_not_in_message(index, message, precise); -} - -static int time_to_continue(int index, unsigned long start, int message) -{ - int first = peers_not_in_message(index, message, 0); - int second = peers_in_message(index, message, 1); - - PRINTK("First part returns %d, second returns %d.\n", first, second); - - if (!first && !second) { - PRINTK("All peers answered message %d.\n", - message); - return 1; - } - - if (time_after(jiffies, start + continue_delay)) { - PRINTK("Timeout reached.\n"); - return 1; - } - - PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, - start + continue_delay); - return 0; -} - -void toi_initiate_cluster_hibernate(void) -{ - int result; - unsigned long start; - - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - if (result) - return; - - toi_send_if(MSG_HIBERNATE, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_HIBERNATE)); - - if (test_action_state(TOI_FREEZER_TEST)) { - toi_send_if(MSG_ABORT, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - 
time_to_continue(0, start, MSG_RUNNING)); - - do_toi_step(STEP_QUIET_CLEANUP); - return; - } - - toi_send_if(MSG_IO, 0); - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - if (result) - return; - - /* This code runs at resume time too! */ - if (toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); -} - -/* toi_cluster_print_debug_stats - * - * Description: Print information to be recorded for debugging purposes into a - * buffer. - * Arguments: buffer: Pointer to a buffer into which the debug info will be - * printed. - * size: Size of the buffer. - * Returns: Number of characters written to the buffer. - */ -static int toi_cluster_print_debug_stats(char *buffer, int size) -{ - int len; - - if (strlen(toi_cluster_iface)) - len = scnprintf(buffer, size, - "- Cluster interface is '%s'.\n", - toi_cluster_iface); - else - len = scnprintf(buffer, size, - "- Cluster support is disabled.\n"); - return len; -} - -/* cluster_memory_needed - * - * Description: Tell the caller how much memory we need to operate during - * hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_cluster_memory_needed(void) -{ - return 0; -} - -static int toi_cluster_storage_needed(void) -{ - return 1 + strlen(toi_cluster_iface); -} - -/* toi_cluster_save_config_info - * - * Description: Save informaton needed when reloading the image at resume time. - * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. - * Returns: Number of bytes used for saving our data. - */ -static int toi_cluster_save_config_info(char *buffer) -{ - strcpy(buffer, toi_cluster_iface); - return strlen(toi_cluster_iface + 1); -} - -/* toi_cluster_load_config_info - * - * Description: Reload information needed for declustering the image at - * resume time. - * Arguments: Buffer: Pointer to the start of the data. - * Size: Number of bytes that were saved. 
- */ -static void toi_cluster_load_config_info(char *buffer, int size) -{ - strncpy(toi_cluster_iface, buffer, size); - return; -} - -static void cluster_startup(void) -{ - int have_image = do_check_can_resume(), i; - unsigned long start = jiffies, initial_message; - struct task_struct *p; - - initial_message = MSG_IMAGE; - - have_image = 1; - - for (i = 0; i < num_local_nodes; i++) { - PRINTK("Starting ktoiclusterd %d.\n", i); - p = kthread_create(kTOICluster, (void *) initial_message, - "ktoiclusterd/%d", i); - if (IS_ERR(p)) { - printk(KERN_ERR "Failed to start ktoiclusterd.\n"); - return; - } - - wake_up_process(p); - } - - /* Wait for delay or someone else sending first message */ - wait_event(node_array[0].member_events, time_to_continue(0, start, - MSG_IMAGE)); - - others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1); - - printk(KERN_INFO "Continuing. I %shave an image. Peers with image:" - " %d.\n", have_image ? "" : "don't ", others_have_image); - - if (have_image) { - int result; - - /* Start to resume */ - printk(KERN_INFO " === Starting to resume === \n"); - node_array[0].current_message = MSG_IO; - toi_send_if(MSG_IO, 0); - - /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */ - result = 0; - - if (!result) { - /* - * Atomic restore - we'll come back in the hibernation - * path. - */ - - /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */ - result = 0; - - /* do_toi_step(STEP_QUIET_CLEANUP); */ - } - - node_array[0].current_message |= MSG_NACK; - - /* For debugging - disable for real life? 
*/ - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_IO)); - } - - if (others_have_image) { - /* Wait for them to resume */ - printk(KERN_INFO "Waiting for other nodes to resume.\n"); - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_RUNNING)); - if (peers_not_in_message(0, MSG_RUNNING, 0)) - printk(KERN_INFO "Timed out while waiting for other " - "nodes to resume.\n"); - } - - /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE - * as appropriate. - * - * If we don't have an image: - * - Wait until someone else says they have one, or conditions are met - * for continuing to boot (n machines or t seconds). - * - If anyone has an image, wait for them to resume before continuing - * to boot. - * - * If we have an image: - * - Wait until conditions are met before continuing to resume (n - * machines or t seconds). Send RESUME_PREP and freeze processes. - * NACK_PREP if freezing fails (shouldn't) and follow logic for - * us having no image above. On success, wait for [N]ACK_PREP from - * other machines. Read image (including atomic restore) until done. - * Wait for ACK_READ from others (should never fail). Thaw processes - * and do post-resume. (The section after the atomic restore is done - * via the code for hibernating). - */ - - node_array[0].current_message = MSG_RUNNING; -} - -/* toi_cluster_open_iface - * - * Description: Prepare to use an interface. 
- */ - -static int toi_cluster_open_iface(void) -{ - struct net_device *dev; - - rtnl_lock(); - - for_each_netdev(&init_net, dev) { - if (/* dev == &init_net.loopback_dev || */ - strcmp(dev->name, toi_cluster_iface)) - continue; - - net_dev = dev; - break; - } - - rtnl_unlock(); - - if (!net_dev) { - printk(KERN_ERR MYNAME ": Device %s not found.\n", - toi_cluster_iface); - return -ENODEV; - } - - dev_add_pack(&toi_cluster_packet_type); - added_pack = 1; - - loopback_mode = (net_dev == init_net.loopback_dev); - num_local_nodes = loopback_mode ? 8 : 1; - - PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", - loopback_mode ? "on" : "off", num_local_nodes); - - cluster_startup(); - return 0; -} - -/* toi_cluster_close_iface - * - * Description: Stop using an interface. - */ - -static int toi_cluster_close_iface(void) -{ - kill_clusterd(); - if (added_pack) { - dev_remove_pack(&toi_cluster_packet_type); - added_pack = 0; - } - return 0; -} - -static void write_side_effect(void) -{ - if (toi_cluster_ops.enabled) { - toi_cluster_open_iface(); - set_toi_state(TOI_CLUSTER_MODE); - } else { - toi_cluster_close_iface(); - clear_toi_state(TOI_CLUSTER_MODE); - } -} - -static void node_write_side_effect(void) -{ -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, - NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, - write_side_effect), - SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), - SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, - 256, 0, NULL), - SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, - 256, 0, STRING), - SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, - 0) -}; - -/* - * Ops structure. 
- */ - -static struct toi_module_ops toi_cluster_ops = { - .type = FILTER_MODULE, - .name = "Cluster", - .directory = "cluster", - .module = THIS_MODULE, - .memory_needed = toi_cluster_memory_needed, - .print_debug_info = toi_cluster_print_debug_stats, - .save_config_info = toi_cluster_save_config_info, - .load_config_info = toi_cluster_load_config_info, - .storage_needed = toi_cluster_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -#ifdef MODULE -#define INIT static __init -#define EXIT static __exit -#else -#define INIT -#define EXIT -#endif - -INIT int toi_cluster_init(void) -{ - int temp = toi_register_module(&toi_cluster_ops), i; - struct kobject *kobj = toi_cluster_ops.dir_kobj; - - for (i = 0; i < MAX_LOCAL_NODES; i++) { - node_array[i].current_message = 0; - INIT_LIST_HEAD(&node_array[i].member_list); - init_waitqueue_head(&node_array[i].member_events); - spin_lock_init(&node_array[i].member_list_lock); - spin_lock_init(&node_array[i].receive_lock); - - /* Set up sysfs entry */ - node_array[i].sysfs_data.attr.name = toi_kzalloc(8, - sizeof(node_array[i].sysfs_data.attr.name), - GFP_KERNEL); - sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d", - i); - node_array[i].sysfs_data.attr.mode = SYSFS_RW; - node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER; - node_array[i].sysfs_data.flags = 0; - node_array[i].sysfs_data.data.integer.variable = - (int *) &node_array[i].current_message; - node_array[i].sysfs_data.data.integer.minimum = 0; - node_array[i].sysfs_data.data.integer.maximum = INT_MAX; - node_array[i].sysfs_data.write_side_effect = - node_write_side_effect; - toi_register_sysfs_file(kobj, &node_array[i].sysfs_data); - } - - toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0); - - if (toi_cluster_ops.enabled) - toi_cluster_open_iface(); - - return temp; -} - -EXIT void toi_cluster_exit(void) -{ - int i; - 
toi_cluster_close_iface(); - - for (i = 0; i < MAX_LOCAL_NODES; i++) - toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, - &node_array[i].sysfs_data); - toi_unregister_module(&toi_cluster_ops); -} - -static int __init toi_cluster_iface_setup(char *iface) -{ - toi_cluster_ops.enabled = (*iface && - strcmp(iface, "off")); - - if (toi_cluster_ops.enabled) - strncpy(toi_cluster_iface, iface, strlen(iface)); -} - -__setup("toi_cluster=", toi_cluster_iface_setup); diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h deleted file mode 100644 index 84356b304..000000000 --- a/kernel/power/tuxonice_cluster.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#ifdef CONFIG_TOI_CLUSTER -extern int toi_cluster_init(void); -extern void toi_cluster_exit(void); -extern void toi_initiate_cluster_hibernate(void); -#else -static inline int toi_cluster_init(void) { return 0; } -static inline void toi_cluster_exit(void) { } -static inline void toi_initiate_cluster_hibernate(void) { } -#endif - diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c deleted file mode 100644 index d118568b7..000000000 --- a/kernel/power/tuxonice_compress.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * kernel/power/compression.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data compression routines for TuxOnIce, - * using cryptoapi. 
- */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -static int toi_expected_compression; - -static struct toi_module_ops toi_compression_ops; -static struct toi_module_ops *next_driver; - -static char toi_compressor_name[32] = "lzo"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - u8 *page_buffer; - struct crypto_comp *transform; - unsigned int len; - u8 *buffer_start; - u8 *output_buffer; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. - */ -static int toi_compress_crypto_prepare(void) -{ - int cpu; - - if (!*toi_compressor_name) { - printk(KERN_INFO "TuxOnIce: Compression enabled but no " - "compressor name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s compression transform.\n", - toi_compressor_name); - this->transform = NULL; - return 1; - } - - this->page_buffer = - (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); - - if (!this->page_buffer) { - printk(KERN_ERR - "Failed to allocate a page buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - - this->output_buffer = - (char *) vmalloc_32(OUT_BUF_SIZE); - - if (!this->output_buffer) { - printk(KERN_ERR - "Failed to allocate a output buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - } - - return 0; -} - -static int toi_compress_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this 
= &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_comp(this->transform); - this->transform = NULL; - } - - if (this->page_buffer) - toi_free_page(16, (unsigned long) this->page_buffer); - - this->page_buffer = NULL; - - if (this->output_buffer) - vfree(this->output_buffer); - - this->output_buffer = NULL; - } - - return 0; -} - -/* - * toi_compress_init - */ - -static int toi_compress_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_compress_bytes_in = 0; - toi_compress_bytes_out = 0; - - next_driver = toi_get_next_filter(&toi_compression_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_compress_rw_init() - */ - -static int toi_compress_rw_init(int rw, int stream_number) -{ - if (toi_compress_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise compression " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "compressing the image.\n"); - toi_compression_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_compress_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be compressed. - * - * Returns: 0 on success. Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. 
- */ -static int toi_compress_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - - if (ctx->transform) { - - ctx->buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_comp_compress(ctx->transform, - ctx->buffer_start, buf_size, - ctx->output_buffer, &ctx->len); - - TOI_UNMAP(buf_type, buffer_page); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d bytes", - cpu, index, ctx->len); - - if (!ret && ctx->len < buf_size) { /* some compression */ - output_buffer = ctx->output_buffer; - output_len = ctx->len; - out_buf_type = TOI_VIRT; - } - - } - - mutex_lock(&stats_lock); - - toi_compress_bytes_in += buf_size; - toi_compress_bytes_out += output_len; - - mutex_unlock(&stats_lock); - - if (!ret) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_compress_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules and decompress it until the input buffer - * is filled. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_compress_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - unsigned int outlen = PAGE_SIZE; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->transform) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't decompress - * data that hasn't been read yet. 
- */ - - ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); - - buffer_start = kmap(buffer_page); - - /* Error or uncompressed data */ - if (ret || len == PAGE_SIZE) { - memcpy(buffer_start, ctx->page_buffer, len); - goto out; - } - - ret = crypto_comp_decompress( - ctx->transform, - ctx->page_buffer, - len, buffer_start, &outlen); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d=>%d (%d).", - cpu, *index, len, outlen, ret); - - if (ret) - abort_hibernate(TOI_FAILED_IO, - "Compress_read returned %d.\n", ret); - else if (outlen != PAGE_SIZE) { - abort_hibernate(TOI_FAILED_IO, - "Decompression yielded %d bytes instead of %ld.\n", - outlen, PAGE_SIZE); - printk(KERN_ERR "Decompression yielded %d bytes instead of " - "%ld.\n", outlen, PAGE_SIZE); - ret = -EIO; - *buf_size = outlen; - } -out: - TOI_UNMAP(buf_type, buffer_page); - return ret; -} - -/* - * toi_compress_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_compress_print_debug_stats(char *buffer, int size) -{ - unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, - pages_out = toi_compress_bytes_out >> PAGE_SHIFT; - int len; - - /* Output the compression ratio achieved. */ - if (*toi_compressor_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_compressor_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (pages_in) - len += scnprintf(buffer+len, size - len, " Compressed " - "%lu bytes into %lu (%ld percent compression).\n", - toi_compress_bytes_in, - toi_compress_bytes_out, - (pages_in - pages_out) * 100 / pages_in); - return len; -} - -/* - * toi_compress_compression_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. 
- * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_compress_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_compress_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_compressor_name) + 1; -} - -/* - * toi_compress_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_compress_save_config_info(char *buffer) -{ - int len = strlen(toi_compressor_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_compress_bytes_in; - offset += sizeof(unsigned long); - *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = toi_expected_compression; - offset += sizeof(int); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_compressor_name, len); - return offset + len; -} - -/* toi_compress_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for decompressing the image at - * resume time. 
- */ -static void toi_compress_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_compress_bytes_in = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); - offset += sizeof(unsigned long); - toi_expected_compression = *((int *) (buffer + offset)); - offset += sizeof(int); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_compressor_name, buffer + offset, len); -} - -static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->compress_bytes_in = toi_compress_bytes_in; - bkd->compress_bytes_out = toi_compress_bytes_out; -} - -static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_compress_bytes_in = bkd->compress_bytes_in; - toi_compress_bytes_out = bkd->compress_bytes_out; -} - -/* - * toi_expected_compression_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 if the module is disabled. Otherwise the value set by the - * user via our sysfs entry. - */ - -static int toi_compress_expected_ratio(void) -{ - if (!toi_compression_ops.enabled) - return 100; - else - return 100 - toi_expected_compression; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, - 0, 99, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), -}; - -/* - * Ops structure. 
- */ -static struct toi_module_ops toi_compression_ops = { - .type = FILTER_MODULE, - .name = "compression", - .directory = "compression", - .module = THIS_MODULE, - .initialise = toi_compress_init, - .memory_needed = toi_compress_memory_needed, - .print_debug_info = toi_compress_print_debug_stats, - .save_config_info = toi_compress_save_config_info, - .load_config_info = toi_compress_load_config_info, - .storage_needed = toi_compress_storage_needed, - .expected_compression = toi_compress_expected_ratio, - - .pre_atomic_restore = toi_compress_pre_atomic_restore, - .post_atomic_restore = toi_compress_post_atomic_restore, - - .rw_init = toi_compress_rw_init, - .rw_cleanup = toi_compress_rw_cleanup, - - .write_page = toi_compress_write_page, - .read_page = toi_compress_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_compress_load(void) -{ - return toi_register_module(&toi_compression_ops); -} - -late_initcall(toi_compress_load); diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c deleted file mode 100644 index dc02a4acf..000000000 --- a/kernel/power/tuxonice_copy_before_write.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * kernel/power/tuxonice_copy_before_write.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines (apart from the fault handling code) to deal with allocating memory - * for copying pages before they are modified, restoring the contents and getting - * the contents written to disk. 
- */ - -#include <linux/percpu-defs.h> -#include <linux/sched.h> -#include <linux/tuxonice.h> -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states); -#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw)) -#define toi_cbw_pool_size 100 - -static void _toi_free_cbw_data(struct toi_cbw_state *state) -{ - struct toi_cbw *page_ptr, *ptr, *next; - - page_ptr = ptr = state->first; - - while(ptr) { - next = ptr->next; - - if (ptr->virt) { - toi__free_page(40, virt_to_page(ptr->virt)); - } - if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) { - /* Must be on a new page - free the previous one. */ - toi__free_page(40, virt_to_page(page_ptr)); - page_ptr = ptr; - } - ptr = next; - } - - if (page_ptr) { - toi__free_page(40, virt_to_page(page_ptr)); - } - - state->first = state->next = state->last = NULL; - state->size = 0; -} - -void toi_free_cbw_data(void) -{ - int i; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - if (!state->first) - continue; - - state->enabled = 0; - - while (state->active) { - schedule(); - } - - _toi_free_cbw_data(state); - } -} - -static int _toi_allocate_cbw_data(struct toi_cbw_state *state) -{ - while(state->size < toi_cbw_pool_size) { - int i; - struct toi_cbw *ptr; - - ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL); - - if (!ptr) { - return -ENOMEM; - } - - if (!state->first) { - state->first = state->next = state->last = ptr; - } - - for (i = 0; i < CBWS_PER_PAGE; i++) { - struct toi_cbw *cbw = &ptr[i]; - - cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL); - if (!cbw->virt) { - state->size += i; - printk("Out of memory allocating CBW pages.\n"); - return -ENOMEM; - } - - if (cbw == state->first) - continue; - - state->last->next = cbw; - state->last = cbw; - } - - state->size += CBWS_PER_PAGE; - } - - state->enabled = 1; - - return 0; -} - - 
-int toi_allocate_cbw_data(void) -{ - int i, result; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - result = _toi_allocate_cbw_data(state); - - if (result) - return result; - } - - return 0; -} - -void toi_cbw_restore(void) -{ - if (!toi_keeping_image) - return; - -} - -void toi_cbw_write(void) -{ - if (!toi_keeping_image) - return; - -} - -/** - * toi_cbw_test_read - Test copy before write on one page - * - * Allocate copy before write buffers, then make one page only copy-before-write - * and attempt to write to it. We should then be able to retrieve the original - * version from the cbw buffer and the modified version from the page itself. - */ -static int toi_cbw_test_read(const char *buffer, int count) -{ - unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL); - char *original = "Original contents"; - char *modified = "Modified material"; - struct page *page = virt_to_page(virt); - int i, len = 0, found = 0, pfn = page_to_pfn(page); - - if (!page) { - printk("toi_cbw_test_read: Unable to allocate a page for testing.\n"); - return -ENOMEM; - } - - memcpy((char *) virt, original, strlen(original)); - - if (toi_allocate_cbw_data()) { - printk("toi_cbw_test_read: Unable to allocate cbw data.\n"); - return -ENOMEM; - } - - toi_reset_dirtiness_one(pfn, 0); - - SetPageTOI_CBW(page); - - memcpy((char *) virt, modified, strlen(modified)); - - if (strncmp((char *) virt, modified, strlen(modified))) { - len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n"); - } - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - struct toi_cbw *ptr = state->first, *last_ptr = ptr; - - if (!found) { - while (ptr) { - if (ptr->pfn == pfn) { - found = 1; - if (strncmp(ptr->virt, original, strlen(original))) { - len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n"); - } else { - len += sprintf((char *) buffer + len, "Test passed. 
Buffer changed and original contents preserved.\n"); - } - break; - } - - last_ptr = ptr; - ptr = ptr->next; - } - } - - if (!last_ptr) - len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i); - } - - if (!found) - len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n"); - - toi_free_cbw_data(); - - return len; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read, - NULL, SYSFS_NEEDS_SM_FOR_READ, NULL), -}; - -static struct toi_module_ops toi_cbw_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "copy_before_write debugging", - .directory = "cbw", - .module = THIS_MODULE, - .early = 1, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_cbw_init(void) -{ - int result = toi_register_module(&toi_cbw_ops); - return result; -} diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c deleted file mode 100644 index 3b558b220..000000000 --- a/kernel/power/tuxonice_extent.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * kernel/power/tuxonice_extent.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * These functions encapsulate the manipulation of storage metadata. - */ - -#include <linux/suspend.h> -#include "tuxonice_modules.h" -#include "tuxonice_extent.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" - -/** - * toi_get_extent - return a free extent - * - * May fail, returning NULL instead. 
- **/ -static struct hibernate_extent *toi_get_extent(void) -{ - return (struct hibernate_extent *) toi_kzalloc(2, - sizeof(struct hibernate_extent), TOI_ATOMIC_GFP); -} - -/** - * toi_put_extent_chain - free a chain of extents starting from value 'from' - * @chain: Chain to free. - * - * Note that 'from' is an extent value, and may be part way through an extent. - * In this case, the extent should be truncated (if necessary) and following - * extents freed. - **/ -void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from) -{ - struct hibernate_extent *this; - - this = chain->first; - - while (this) { - struct hibernate_extent *next = this->next; - - // Delete the whole extent? - if (this->start >= from) { - chain->size -= (this->end - this->start + 1); - if (chain->first == this) - chain->first = next; - if (chain->last_touched == this) - chain->last_touched = NULL; - if (chain->current_extent == this) - chain->current_extent = NULL; - toi_kfree(2, this, sizeof(*this)); - chain->num_extents--; - } else if (this->end >= from) { - // Delete part of the extent - chain->size -= (this->end - from + 1); - this->start = from; - } - this = next; - } -} - -/** - * toi_put_extent_chain - free a whole chain of extents - * @chain: Chain to free. - **/ -void toi_put_extent_chain(struct hibernate_extent_chain *chain) -{ - toi_put_extent_chain_from(chain, 0); -} - -/** - * toi_add_to_extent_chain - add an extent to an existing chain - * @chain: Chain to which the extend should be added - * @start: Start of the extent (first physical block) - * @end: End of the extent (last physical block) - * - * The chain information is updated if the insertion is successful. 
- **/ -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end) -{ - struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Adding extent %lu-%lu to chain %p.\n", start, end, chain); - - /* Find the right place in the chain */ - if (chain->last_touched && chain->last_touched->start < start) - cur_ext = chain->last_touched; - else if (chain->first && chain->first->start < start) - cur_ext = chain->first; - - if (cur_ext) { - while (cur_ext->next && cur_ext->next->start < start) - cur_ext = cur_ext->next; - - if (cur_ext->end == (start - 1)) { - struct hibernate_extent *next_ext = cur_ext->next; - cur_ext->end = end; - - /* Merge with the following one? */ - if (next_ext && cur_ext->end + 1 == next_ext->start) { - cur_ext->end = next_ext->end; - cur_ext->next = next_ext->next; - toi_kfree(2, next_ext, sizeof(*next_ext)); - chain->num_extents--; - } - - chain->last_touched = cur_ext; - chain->size += (end - start + 1); - - return 0; - } - } - - new_ext = toi_get_extent(); - if (!new_ext) { - printk(KERN_INFO "Error unable to append a new extent to the " - "chain.\n"); - return -ENOMEM; - } - - chain->num_extents++; - chain->size += (end - start + 1); - new_ext->start = start; - new_ext->end = end; - - chain->last_touched = new_ext; - - if (cur_ext) { - new_ext->next = cur_ext->next; - cur_ext->next = new_ext; - } else { - if (chain->first) - new_ext->next = chain->first; - chain->first = new_ext; - } - - return 0; -} diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h deleted file mode 100644 index cf1289efc..000000000 --- a/kernel/power/tuxonice_extent.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_extent.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations related to extents. 
Extents are - * TuxOnIce's method of storing some of the metadata for the image. - * See tuxonice_extent.c for more info. - * - */ - -#include "tuxonice_modules.h" - -#ifndef EXTENT_H -#define EXTENT_H - -struct hibernate_extent { - unsigned long start, end; - struct hibernate_extent *next; -}; - -struct hibernate_extent_chain { - unsigned long size; /* size of the chain ie sum (max-min+1) */ - int num_extents; - struct hibernate_extent *first, *last_touched; - struct hibernate_extent *current_extent; - unsigned long current_offset; -}; - -/* Simplify iterating through all the values in an extent chain */ -#define toi_extent_for_each(extent_chain, extentpointer, value) \ -if ((extent_chain)->first) \ - for ((extentpointer) = (extent_chain)->first, (value) = \ - (extentpointer)->start; \ - ((extentpointer) && ((extentpointer)->next || (value) <= \ - (extentpointer)->end)); \ - (((value) == (extentpointer)->end) ? \ - ((extentpointer) = (extentpointer)->next, (value) = \ - ((extentpointer) ? (extentpointer)->start : 0)) : \ - (value)++)) - -extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from); -#endif diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c deleted file mode 100644 index 607246051..000000000 --- a/kernel/power/tuxonice_file.c +++ /dev/null @@ -1,484 +0,0 @@ -/* - * kernel/power/tuxonice_file.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of a simple file as a - * backing store. It is based upon the swapallocator, and shares the - * same basic working. Here, though, we have nothing to do with - * swapspace, and only one device to worry about. 
- * - * The user can just - * - * echo TuxOnIce > /path/to/my_file - * - * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file - * - * and - * - * echo /path/to/my_file > /sys/power/tuxonice/file/target - * - * then put what they find in /sys/power/tuxonice/resume - * as their resume= parameter in lilo.conf (and rerun lilo if using it). - * - * Having done this, they're ready to hibernate and resume. - * - * TODO: - * - File resizing. - */ - -#include <linux/blkdev.h> -#include <linux/mount.h> -#include <linux/fs.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" - -#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) - -static struct toi_module_ops toi_fileops; - -static struct file *target_file; -static struct block_device *toi_file_target_bdev; -static unsigned long pages_available, pages_allocated; -static char toi_file_target[256]; -static struct inode *target_inode; -static int file_target_priority; -static int used_devt; -static int target_claim; -static dev_t toi_file_dev_t; -static int sig_page_index; - -/* For test_toi_file_target */ -static struct toi_bdev_info *file_chain; - -static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) -{ - int j; - sector_t last = 0; - - for (j = 0; j < dev_info->blocks_per_page; j++) { - sector_t this = bmap(target_inode, - page_num * dev_info->blocks_per_page + j); - - if (!this || (last && (last + 1) != this)) - break; - - last = this; - } - - return j == dev_info->blocks_per_page; -} - -static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) -{ - unsigned long result = 0; - struct block_device *bdev = dev_info->bdev; - int i; - - switch (target_inode->i_mode & S_IFMT) { - case S_IFSOCK: - case S_IFCHR: - case S_IFIFO: /* Socket, Char, Fifo */ - return -1; - 
case S_IFREG: /* Regular file: current size - holes + free - space on part */ - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { - if (has_contiguous_blocks(dev_info, i)) - result++; - } - break; - case S_IFBLK: /* Block device */ - if (!bdev->bd_disk) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "bdev->bd_disk null."); - return 0; - } - - result = (bdev->bd_part ? - bdev->bd_part->nr_sects : - get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); - } - - - return result; -} - -static int toi_file_register_storage(void) -{ - struct toi_bdev_info *devinfo; - int result = 0; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); - if (!strlen(toi_file_target)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " - "No target filename set."); - return 0; - } - - target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); - toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", - toi_file_target, target_file); - - if (IS_ERR(target_file) || !target_file) { - target_file = NULL; - toi_file_dev_t = name_to_dev_t(toi_file_target); - if (!toi_file_dev_t) { - struct kstat stat; - int error = vfs_stat(toi_file_target, &stat); - printk(KERN_INFO "Open file %s returned %p and " - "name_to_devt failed.\n", - toi_file_target, target_file); - if (error) { - printk(KERN_INFO "Stating the file also failed." 
- " Nothing more we can do.\n"); - return 0; - } else - toi_file_dev_t = stat.rdev; - } - - toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); - if (IS_ERR(toi_file_target_bdev)) { - printk(KERN_INFO "Got a dev_num (%lx) but failed to " - "open it.\n", - (unsigned long) toi_file_dev_t); - toi_file_target_bdev = NULL; - return 0; - } - used_devt = 1; - target_inode = toi_file_target_bdev->bd_inode; - } else - target_inode = target_file->f_mapping->host; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); - if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || - S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { - printk(KERN_INFO "File support works with regular files," - " character files and block devices.\n"); - /* Cleanup routine will undo the above */ - return 0; - } - - if (!used_devt) { - if (S_ISBLK(target_inode->i_mode)) { - toi_file_target_bdev = I_BDEV(target_inode); - if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | - FMODE_READ, NULL)) - target_claim = 1; - } else - toi_file_target_bdev = target_inode->i_sb->s_bdev; - if (!toi_file_target_bdev) { - printk(KERN_INFO "%s is not a valid file allocator " - "target.\n", toi_file_target); - return 0; - } - toi_file_dev_t = toi_file_target_bdev->bd_dev; - } - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); - return -ENOMEM; - } - - devinfo->bdev = toi_file_target_bdev; - devinfo->allocator = &toi_fileops; - devinfo->allocator_index = 0; - - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - /* Unlike swap code, only complain if fs_info_from_block_dev returned - * -ENOMEM. 
The 'file' might be a full partition, so might validly not - * have an identifiable type, UUID etc. - */ - if (result) - printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", - result); - devinfo->dev_t = toi_file_dev_t; - devinfo->prio = file_target_priority; - devinfo->bmap_shift = target_inode->i_blkbits - 9; - devinfo->blocks_per_page = - (1 << (PAGE_SHIFT - target_inode->i_blkbits)); - sprintf(devinfo->name, "file %s", toi_file_target); - file_chain = devinfo; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " - "shift is %d. Blocks per page %d.", - devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, - devinfo->blocks_per_page); - - /* Keep one aside for the signature */ - pages_available = get_usable_pages(devinfo) - 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " - "pages.", pages_available); - - toi_bio_ops.register_storage(devinfo); - return 0; -} - -static unsigned long toi_file_storage_available(void) -{ - return pages_available; -} - -static int toi_file_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long available = pages_available - pages_allocated; - unsigned long to_add = min(available, request); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " - "is %lu. Allocating %lu pages from file.", - pages_available, pages_allocated, to_add); - pages_allocated += to_add; - - return to_add; -} - -/** - * __populate_block_list - add an extent to the chain - * @min: Start of the extent (first physical block = sector) - * @max: End of the extent (last physical block = sector) - * - * If TOI_TEST_BIO is set, print a debug message, outputting the min and max - * fs block numbers. 
- **/ -static int __populate_block_list(struct toi_bdev_info *chain, int min, int max) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.", - min << chain->bmap_shift, - ((max + 1) << chain->bmap_shift) - 1); - - return toi_add_to_extent_chain(&chain->blocks, min, max); -} - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0; - unsigned long pages_mapped = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks."); - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - if (!target_is_normal_file()) { - result = (pages_available > 0) ? - __populate_block_list(chain, chain->blocks_per_page, - (pages_allocated + 1) * - chain->blocks_per_page - 1) : 0; - return result; - } - - /* - * FIXME: We are assuming the first page is contiguous. Is that - * assumption always right? - */ - - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) { - sector_t new_sector; - - if (!has_contiguous_blocks(chain, i)) - continue; - - if (!have_sig_page) { - have_sig_page = 1; - sig_page_index = i; - continue; - } - - pages_mapped++; - - /* Ignore first page - it has the header */ - if (pages_mapped == 1) - continue; - - new_sector = bmap(target_inode, (i * chain->blocks_per_page)); - - /* - * I'd love to be able to fill in holes and resize - * files, but not yet... 
- */ - - if (new_sector == extent_max + 1) - extent_max += chain->blocks_per_page; - else { - if (extent_min > -1) { - result = __populate_block_list(chain, - extent_min, extent_max); - if (result) - return result; - } - - extent_min = new_sector; - extent_max = extent_min + - chain->blocks_per_page - 1; - } - - if (pages_mapped == pages_allocated) - break; - } - - if (extent_min > -1) { - result = __populate_block_list(chain, extent_min, extent_max); - if (result) - return result; - } - - return 0; -} - -static void toi_file_free_storage(struct toi_bdev_info *chain) -{ - pages_allocated = 0; - file_chain = NULL; -} - -/** - * toi_file_print_debug_stats - print debug info - * @buffer: Buffer to data to populate - * @size: Size of the buffer - **/ -static int toi_file_print_debug_stats(char *buffer, int size) -{ - int len = scnprintf(buffer, size, "- File Allocator active.\n"); - - len += scnprintf(buffer+len, size-len, " Storage available for " - "image: %lu pages.\n", pages_available); - - return len; -} - -static void toi_file_cleanup(int finishing_cycle) -{ - if (toi_file_target_bdev) { - if (target_claim) { - blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ); - target_claim = 0; - } - - if (used_devt) { - blkdev_put(toi_file_target_bdev, - FMODE_READ | FMODE_NDELAY); - used_devt = 0; - } - toi_file_target_bdev = NULL; - target_inode = NULL; - } - - if (target_file) { - filp_close(target_file, NULL); - target_file = NULL; - } - - pages_available = 0; -} - -/** - * test_toi_file_target - sysfs callback for /sys/power/tuxonince/file/target - * - * Test wheter the target file is valid for hibernating. - **/ -static void test_toi_file_target(void) -{ - int result = toi_file_register_storage(); - sector_t sector; - char buf[50]; - struct fs_info *fs_info; - - if (result || !file_chain) - return; - - /* This doesn't mean we're in business. Is any storage available? 
*/ - if (!pages_available) - goto out; - - toi_file_allocate_storage(file_chain, 1); - result = get_main_pool_phys_params(file_chain); - if (result) - goto out; - - - sector = bmap(target_inode, sig_page_index * - file_chain->blocks_per_page) << file_chain->bmap_shift; - - /* Use the uuid, or the dev_t if that fails */ - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (!fs_info || IS_ERR(fs_info)) { - bdevname(toi_file_target_bdev, buf); - sprintf(resume_file, "/dev/%s:%llu", buf, - (unsigned long long) sector); - } else { - int i; - hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); - - /* Remove the spaces */ - for (i = 1; i < 16; i++) { - buf[2 * i] = buf[3 * i]; - buf[2 * i + 1] = buf[3 * i + 1]; - } - buf[32] = 0; - sprintf(resume_file, "UUID=%s:0x%llx", buf, - (unsigned long long) sector); - free_fs_info(fs_info); - } - - toi_attempt_to_parse_resume_device(0); -out: - toi_file_free_storage(file_chain); - toi_bio_ops.free_storage(); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, - SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), - SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), - SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, - 4096, 0, NULL), -}; - -static struct toi_bio_allocator_ops toi_bio_fileops = { - .register_storage = toi_file_register_storage, - .storage_available = toi_file_storage_available, - .allocate_storage = toi_file_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_file_free_storage, -}; - -static struct toi_module_ops toi_fileops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "file storage", - .directory = "file", - .module = THIS_MODULE, - .print_debug_info = toi_file_print_debug_stats, - .cleanup = toi_file_cleanup, - .bio_allocator_ops = &toi_bio_fileops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration 
---- */ -static __init int toi_file_load(void) -{ - return toi_register_module(&toi_fileops); -} - -late_initcall(toi_file_load); diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c deleted file mode 100644 index bdcd832f3..000000000 --- a/kernel/power/tuxonice_highlevel.c +++ /dev/null @@ -1,1413 +0,0 @@ -/* - * kernel/power/tuxonice_highlevel.c - */ -/** \mainpage TuxOnIce. - * - * TuxOnIce provides support for saving and restoring an image of - * system memory to an arbitrary storage device, either on the local computer, - * or across some network. The support is entirely OS based, so TuxOnIce - * works without requiring BIOS, APM or ACPI support. The vast majority of the - * code is also architecture independant, so it should be very easy to port - * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem - * and preemption. Initramfses and initrds are also supported. - * - * TuxOnIce uses a modular design, in which the method of storing the image is - * completely abstracted from the core code, as are transformations on the data - * such as compression and/or encryption (multiple 'modules' can be used to - * provide arbitrary combinations of functionality). The user interface is also - * modular, so that arbitrarily simple or complex interfaces can be used to - * provide anything from debugging information through to eye candy. - * - * \section Copyright - * - * TuxOnIce is released under the GPLv2. - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR> - * - * \section Credits - * - * Nigel would like to thank the following people for their work: - * - * Bernard Blackham <bernard@blackham.com.au><BR> - * Web page & Wiki administration, some coding. 
A person without whom - * TuxOnIce would not be where it is. - * - * Michael Frank <mhf@linuxmail.org><BR> - * Extensive testing and help with improving stability. I was constantly - * amazed by the quality and quantity of Michael's help. - * - * Pavel Machek <pavel@ucw.cz><BR> - * Modifications, defectiveness pointing, being with Gabor at the very - * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and - * 2.5.17. Even though Pavel and I disagree on the direction suspend to - * disk should take, I appreciate the valuable work he did in helping Gabor - * get the concept working. - * - * ..and of course the myriads of TuxOnIce users who have helped diagnose - * and fix bugs, made suggestions on how to improve the code, proofread - * documentation, and donated time and money. - * - * Thanks also to corporate sponsors: - * - * <B>Redhat.</B>Sometime employer from May 2006 (my fault, not Redhat's!). - * - * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who - * allowed him to work on TuxOnIce and PM related issues on company time. - * - * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct - * 2003 to Jan 2004. - * - * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing - * maintenance of SMP and Highmem support. - * - * <B>OSDL.</B> Provided access to various hardware configurations, make - * occasional small donations to the project. 
- */ - -#include <linux/suspend.h> -#include <linux/module.h> -#include <linux/freezer.h> -#include <generated/utsrelease.h> -#include <linux/cpu.h> -#include <linux/console.h> -#include <linux/writeback.h> -#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */ -#include <linux/bio.h> -#include <linux/kgdb.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_storage.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_cluster.h" - -/*! Pageset metadata. */ -struct pagedir pagedir2 = {2}; - -static mm_segment_t oldfs; -static DEFINE_MUTEX(tuxonice_in_use); -static int block_dump_save; - -int toi_trace_index; - -/* Binary signature if an image is present */ -char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; - -unsigned long boot_kernel_data_buffer; - -static char *result_strings[] = { - "Hibernation was aborted", - "The user requested that we cancel the hibernation", - "No storage was available", - "Insufficient storage was available", - "Freezing filesystems and/or tasks failed", - "A pre-existing image was used", - "We would free memory, but image size limit doesn't allow this", - "Unable to free enough memory to hibernate", - "Unable to obtain the Power Management Semaphore", - "A device suspend/resume returned an error", - "A system device suspend/resume returned an error", - "The extra pages allowance is too small", - "We were unable to successfully prepare an image", - "TuxOnIce module initialisation failed", - "TuxOnIce module cleanup failed", - "I/O errors were encountered", - "Ran out of memory", - "An error was encountered while reading the image", - "Platform preparation failed", - "CPU Hotplugging failed", - "Architecture specific preparation failed", - 
"Pages needed resaving, but we were told to abort if this happens", - "We can't hibernate at the moment (invalid resume= or filewriter " - "target?)", - "A hibernation preparation notifier chain member cancelled the " - "hibernation", - "Pre-snapshot preparation failed", - "Pre-restore preparation failed", - "Failed to disable usermode helpers", - "Can't resume from alternate image", - "Header reservation too small", - "Device Power Management Preparation failed", -}; - -/** - * toi_finish_anything - cleanup after doing anything - * @hibernate_or_resume: Whether finishing a cycle or attempt at - * resuming. - * - * This is our basic clean-up routine, matching start_anything below. We - * call cleanup routines, drop module references and restore process fs and - * cpus allowed masks, together with the global block_dump variable's value. - **/ -void toi_finish_anything(int hibernate_or_resume) -{ - toi_running = 0; - toi_cleanup_modules(hibernate_or_resume); - toi_put_modules(); - if (hibernate_or_resume) { - block_dump = block_dump_save; - set_cpus_allowed_ptr(current, cpu_all_mask); - toi_alloc_print_debug_stats(); - atomic_inc(&snapshot_device_available); - unlock_system_sleep(); - } - - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); -} - -/** - * toi_start_anything - basic initialisation for TuxOnIce - * @toi_or_resume: Whether starting a cycle or attempt at resuming. - * - * Our basic initialisation routine. Take references on modules, use the - * kernel segment, recheck resume= if no active allocator is set, initialise - * modules, save and reset block_dump and ensure we're running on CPU0. 
- **/ -int toi_start_anything(int hibernate_or_resume) -{ - mutex_lock(&tuxonice_in_use); - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - toi_trace_index = 0; - - if (hibernate_or_resume) { - lock_system_sleep(); - - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - goto snapshotdevice_unavailable; - } - - if (hibernate_or_resume == SYSFS_HIBERNATE) - toi_print_modules(); - - if (toi_get_modules()) { - printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); - goto prehibernate_err; - } - - if (hibernate_or_resume) { - block_dump_save = block_dump; - block_dump = 0; - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - } - - if (toi_initialise_modules_early(hibernate_or_resume)) - goto early_init_err; - - if (!toiActiveAllocator) - toi_attempt_to_parse_resume_device(!hibernate_or_resume); - - if (!toi_initialise_modules_late(hibernate_or_resume)) { - toi_running = 1; /* For the swsusp code we use :< */ - return 0; - } - - toi_cleanup_modules(hibernate_or_resume); -early_init_err: - if (hibernate_or_resume) { - block_dump_save = block_dump; - set_cpus_allowed_ptr(current, cpu_all_mask); - } - toi_put_modules(); -prehibernate_err: - if (hibernate_or_resume) - atomic_inc(&snapshot_device_available); -snapshotdevice_unavailable: - if (hibernate_or_resume) - mutex_unlock(&pm_mutex); - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); - return -EBUSY; -} - -/* - * Nosave page tracking. - * - * Here rather than in prepare_image because we want to do it once only at the - * start of a cycle. - */ - -/** - * mark_nosave_pages - set up our Nosave bitmap - * - * Build a bitmap of Nosave pages from the list. The bitmap allows faster - * use when preparing the image. 
- **/ -static void mark_nosave_pages(void) -{ - struct nosave_region *region; - - list_for_each_entry(region, &nosave_regions, list) { - unsigned long pfn; - - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - SetPageNosave(pfn_to_page(pfn)); - } - } -} - -/** - * allocate_bitmaps - allocate bitmaps used to record page states - * - * Allocate the bitmaps we use to record the various TuxOnIce related - * page states. - **/ -static int allocate_bitmaps(void) -{ - if (toi_alloc_bitmap(&pageset1_map) || - toi_alloc_bitmap(&pageset1_copy_map) || - toi_alloc_bitmap(&pageset2_map) || - toi_alloc_bitmap(&io_map) || - toi_alloc_bitmap(&nosave_map) || - toi_alloc_bitmap(&free_map) || - toi_alloc_bitmap(&compare_map) || - toi_alloc_bitmap(&page_resave_map)) - return 1; - - return 0; -} - -/** - * free_bitmaps - free the bitmaps used to record page states - * - * Free the bitmaps allocated above. It is not an error to call - * memory_bm_free on a bitmap that isn't currently allocated. - **/ -static void free_bitmaps(void) -{ - toi_free_bitmap(&pageset1_map); - toi_free_bitmap(&pageset1_copy_map); - toi_free_bitmap(&pageset2_map); - toi_free_bitmap(&io_map); - toi_free_bitmap(&nosave_map); - toi_free_bitmap(&free_map); - toi_free_bitmap(&compare_map); - toi_free_bitmap(&page_resave_map); -} - -/** - * io_MB_per_second - return the number of MB/s read or written - * @write: Whether to return the speed at which we wrote. - * - * Calculate the number of megabytes per second that were read or written. - **/ -static int io_MB_per_second(int write) -{ - return (toi_bkd.toi_io_time[write][1]) ? - MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / - toi_bkd.toi_io_time[write][1] : 0; -} - -#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \ - count - len - 1, ## a); } while (0) - -/** - * get_debug_info - fill a buffer with debugging information - * @buffer: The buffer to be filled. 
- * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_debug_info(const char *buffer, int count) -{ - int len = 0, i, first_result = 1; - - SNPRINTF("TuxOnIce debugging info:\n"); - SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); - SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); - SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__); - SNPRINTF("- Attempt number : %d\n", nr_hibernates); - SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", - toi_result, - toi_bkd.toi_action, - toi_bkd.toi_debug_state, - toi_bkd.toi_default_console_level, - image_size_limit, - toi_poweroff_method); - SNPRINTF("- Overall expected compression percentage: %d.\n", - 100 - toi_expected_compression_ratio()); - len += toi_print_module_debug_info(((char *) buffer) + len, - count - len - 1); - if (toi_bkd.toi_io_time[0][1]) { - if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { - SNPRINTF("- I/O speed: Write %ld KB/s", - (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld KB/s", - (KB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } else { - SNPRINTF("- I/O speed: Write %ld MB/s", - (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld MB/s", - (MB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } - SNPRINTF(".\n"); - } else - SNPRINTF("- No I/O speed stats available.\n"); - SNPRINTF("- Extra pages : %lu used/%lu.\n", - extra_pd1_pages_used, extra_pd1_pages_allowance); - - for (i = 0; i < TOI_NUM_RESULT_STATES; i++) - if (test_result_state(i)) { - SNPRINTF("%s: %s.\n", first_result ? 
- "- Result " : - " ", - result_strings[i]); - first_result = 0; - } - if (first_result) - SNPRINTF("- Result : %s.\n", nr_hibernates ? - "Succeeded" : - "No hibernation attempts so far"); - return len; -} - -#ifdef CONFIG_TOI_INCREMENTAL -/** - * get_toi_page_state - fill a buffer with page state information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_page_state(const char *buffer, int count) -{ - int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0; - int len = 0; - struct zone *zone; - int allocated_bitmaps = 0; - - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - - if (!free_map) { - BUG_ON(toi_alloc_bitmap(&free_map)); - allocated_bitmaps = 1; - } - - toi_generate_free_page_map(); - - for_each_populated_zone(zone) { - unsigned long loop; - - total += zone->spanned_pages; - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - /* - * If the page gets allocated, it will be need - * saving in an image. - * Don't bother with explicitly removing any - * RO protection applied below. - * We'll SetPageTOI_Dirty(page) if/when it - * gets allocated. 
- */ - free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageTOI_Untracked(page)) { - untracked++; - } else if (PageTOI_RO(page)) { - ro++; - } else if (PageTOI_Dirty(page)) { - dirty++; - } else { - printk("Page %ld state 'other'.\n", pfn); - other++; - } - } - } - - if (allocated_bitmaps) { - toi_free_bitmap(&free_map); - } - - set_cpus_allowed_ptr(current, cpu_all_mask); - - SNPRINTF("TuxOnIce page breakdown:\n"); - SNPRINTF("- Free : %d\n", free); - SNPRINTF("- Untracked : %d\n", untracked); - SNPRINTF("- Read only : %d\n", ro); - SNPRINTF("- Dirty : %d\n", dirty); - SNPRINTF("- Other : %d\n", other); - SNPRINTF("- Invalid : %d\n", invalid); - SNPRINTF("- Total : %d\n", total); - return len; -} -#endif - -/** - * do_cleanup - cleanup after attempting to hibernate or resume - * @get_debug_info: Whether to allocate and return debugging info. - * - * Cleanup after attempting to hibernate or resume, possibly getting - * debugging info as we do so. 
- **/ -static void do_cleanup(int get_debug_info, int restarting) -{ - int i = 0; - char *buffer = NULL; - - trap_non_toi_io = 0; - - if (get_debug_info) - toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); - - free_checksum_pages(); - - toi_cbw_restore(); - toi_free_cbw_data(); - - if (get_debug_info) - buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); - - if (buffer) - i = get_toi_debug_info(buffer, PAGE_SIZE); - - toi_free_extra_pagedir_memory(); - - pagedir1.size = 0; - pagedir2.size = 0; - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - if (boot_kernel_data_buffer) { - if (!test_toi_state(TOI_BOOT_KERNEL)) - toi_free_page(37, boot_kernel_data_buffer); - boot_kernel_data_buffer = 0; - } - - if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) { - unlock_device_hotplug(); - clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - } - - clear_toi_state(TOI_BOOT_KERNEL); - if (current->flags & PF_SUSPEND_TASK) - thaw_processes(); - - if (!restarting) - toi_stop_other_threads(); - - if (toi_keeping_image && - !test_result_state(TOI_ABORTED)) { - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - "TuxOnIce: Not invalidating the image due " - "to Keep Image or Incremental Image being enabled."); - set_result_state(TOI_KEPT_IMAGE); - - /* - * For an incremental image, free unused storage so - * swap (if any) can be used for normal system operation, - * if so desired. - */ - - toiActiveAllocator->free_unused_storage(); - } else - if (toiActiveAllocator) - toiActiveAllocator->remove_image(); - - free_bitmaps(); - usermodehelper_enable(); - - if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { - pm_notifier_call_chain(PM_POST_HIBERNATION); - clear_toi_state(TOI_NOTIFIERS_PREPARE); - } - - if (buffer && i) { - /* Printk can only handle 1023 bytes, including - * its level mangling. 
*/ - for (i = 0; i < 3; i++) - printk(KERN_ERR "%s", buffer + (1023 * i)); - toi_free_page(20, (unsigned long) buffer); - } - - if (!restarting) - toi_cleanup_console(); - - free_attention_list(); - - if (!restarting) - toi_deactivate_storage(0); - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * check_still_keeping_image - we kept an image; check whether to reuse it. - * - * We enter this routine when we have kept an image. If the user has said they - * want to still keep it, all we need to do is powerdown. If powering down - * means hibernating to ram and the power doesn't run out, we'll return 1. - * If we do power off properly or the battery runs out, we'll resume via the - * normal paths. - * - * If the user has said they want to remove the previously kept image, we - * remove it, and return 0. We'll then store a new image. - **/ -static int check_still_keeping_image(void) -{ - if (toi_keeping_image) { - if (!test_action_state(TOI_INCREMENTAL_IMAGE)) { - printk(KERN_INFO "Image already stored: powering down " - "immediately."); - do_toi_step(STEP_HIBERNATE_POWERDOWN); - return 1; - } - /** - * Incremental image - need to write new part. - * We detect that we're writing an incremental image by looking - * at test_result_state(TOI_KEPT_IMAGE) - **/ - return 0; - } - - printk(KERN_INFO "Invalidating previous image.\n"); - toiActiveAllocator->remove_image(); - - return 0; -} - -/** - * toi_init - prepare to hibernate to disk - * - * Initialise variables & data structures, in preparation for - * hibernating to disk. 
- **/ -static int toi_init(int restarting) -{ - int result, i, j; - - toi_result = 0; - - printk(KERN_INFO "Initiating a hibernation cycle.\n"); - - nr_hibernates++; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - toi_bkd.toi_io_time[i][j] = 0; - - if (!test_toi_state(TOI_CAN_HIBERNATE) || - allocate_bitmaps()) - return 1; - - mark_nosave_pages(); - - if (!restarting) - toi_prepare_console(); - - result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (result) { - set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); - return 1; - } - set_toi_state(TOI_NOTIFIERS_PREPARE); - - if (!restarting) { - printk(KERN_ERR "Starting other threads."); - toi_start_other_threads(); - } - - result = usermodehelper_disable(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to disable usermode " - "helpers\n"); - set_result_state(TOI_USERMODE_HELPERS_ERR); - return 1; - } - - boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); - if (!boot_kernel_data_buffer) { - printk(KERN_ERR "TuxOnIce: Failed to allocate " - "boot_kernel_data_buffer.\n"); - set_result_state(TOI_OUT_OF_MEMORY); - return 1; - } - - toi_allocate_cbw_data(); - - return 0; -} - -/** - * can_hibernate - perform basic 'Can we hibernate?' tests - * - * Perform basic tests that must pass if we're going to be able to hibernate: - * Can we get the pm_mutex? Is resume= valid (we need to know where to write - * the image header). - **/ -static int can_hibernate(void) -{ - if (!test_toi_state(TOI_CAN_HIBERNATE)) - toi_attempt_to_parse_resume_device(0); - - if (!test_toi_state(TOI_CAN_HIBERNATE)) { - printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" - "This may be because you haven't put something along " - "the lines of\n\nresume=swap:/dev/hda1\n\n" - "in lilo.conf or equivalent. 
(Where /dev/hda1 is your " - "swap partition).\n"); - set_abort_result(TOI_CANT_SUSPEND); - return 0; - } - - if (strlen(alt_resume_param)) { - attempt_to_parse_alt_resume_param(); - - if (!strlen(alt_resume_param)) { - printk(KERN_INFO "Alternate resume parameter now " - "invalid. Aborting.\n"); - set_abort_result(TOI_CANT_USE_ALT_RESUME); - return 0; - } - } - - return 1; -} - -/** - * do_post_image_write - having written an image, figure out what to do next - * - * After writing an image, we might load an alternate image or power down. - * Powering down might involve hibernating to ram, in which case we also - * need to handle reloading pageset2. - **/ -static int do_post_image_write(void) -{ - /* If switching images fails, do normal powerdown */ - if (alt_resume_param[0]) - do_toi_step(STEP_RESUME_ALT_IMAGE); - - toi_power_down(); - - barrier(); - mb(); - return 0; -} - -/** - * __save_image - do the hard work of saving the image - * - * High level routine for getting the image saved. The key assumptions made - * are that processes have been frozen and sufficient memory is available. - * - * We also exit through here at resume time, coming back from toi_hibernate - * after the atomic restore. This is the reason for the toi_in_hibernate - * test. 
- **/ -static int __save_image(void) -{ - int temp_result, did_copy = 0; - - toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - " - Final values: %d and %d.", - pagedir1.size, pagedir2.size); - - toi_cond_pause(1, "About to write pagedir2."); - - temp_result = write_pageset(&pagedir2); - - if (temp_result == -1 || test_result_state(TOI_ABORTED)) - return 1; - - toi_cond_pause(1, "About to copy pageset 1."); - - if (test_result_state(TOI_ABORTED)) - return 1; - - toi_deactivate_storage(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - toi_in_hibernate = 1; - - if (toi_go_atomic(PMSG_FREEZE, 1)) - goto Failed; - - temp_result = toi_hibernate(); - -#ifdef CONFIG_KGDB - if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) - kgdb_breakpoint(); -#endif - - if (!temp_result) - did_copy = 1; - - /* We return here at resume time too! */ - toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); - -Failed: - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - /* Resume time? */ - if (!toi_in_hibernate) { - copyback_post(); - return 0; - } - - /* Nope. Hibernating. So, see if we can save the image... */ - - if (temp_result || test_result_state(TOI_ABORTED)) { - if (did_copy) - goto abort_reloading_pagedir_two; - else - return 1; - } - - toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, - NULL); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write pageset1."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); - - temp_result = write_pageset(&pagedir1); - - /* We didn't overwrite any memory, so no reread needs to be done. 
*/ - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - return 1; - - if (temp_result == 1 || test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write header."); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - temp_result = write_image_header(); - - if (!temp_result && !test_result_state(TOI_ABORTED)) - return 0; - -abort_reloading_pagedir_two: - temp_result = read_pageset2(1); - - /* If that failed, we're sunk. Panic! */ - if (temp_result) - panic("Attempt to reload pagedir 2 while aborting " - "a hibernate failed."); - - return 1; -} - -static void map_ps2_pages(int enable) -{ - unsigned long pfn = 0; - - memory_bm_position_reset(pageset2_map); - pfn = memory_bm_next_pfn(pageset2_map, 0); - - while (pfn != BM_END_OF_MAP) { - struct page *page = pfn_to_page(pfn); - kernel_map_pages(page, 1, enable); - pfn = memory_bm_next_pfn(pageset2_map, 0); - } -} - -/** - * do_save_image - save the image and handle the result - * - * Save the prepared image. If we fail or we're in the path returning - * from the atomic restore, cleanup. - **/ -static int do_save_image(void) -{ - int result; - map_ps2_pages(0); - result = __save_image(); - map_ps2_pages(1); - return result; -} - -/** - * do_prepare_image - try to prepare an image - * - * Seek to initialise and prepare an image to be saved. On failure, - * cleanup. - **/ -static int do_prepare_image(void) -{ - int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - - if (!restarting && toi_activate_storage(0)) - return 1; - - /* - * If kept image and still keeping image and hibernating to RAM, (non - * incremental image case) we will return 1 after hibernating and - * resuming (provided the power doesn't run out. In that case, we skip - * directly to cleaning up and exiting. 
- */ - - if (!can_hibernate() || - (test_result_state(TOI_KEPT_IMAGE) && - check_still_keeping_image())) - return 1; - - if (toi_init(restarting) || toi_prepare_image() || - test_result_state(TOI_ABORTED)) - return 1; - - trap_non_toi_io = 1; - - return 0; -} - -/** - * do_check_can_resume - find out whether an image has been stored - * - * Read whether an image exists. We use the same routine as the - * image_exists sysfs entry, and just look to see whether the - * first character in the resulting buffer is a '1'. - **/ -int do_check_can_resume(void) -{ - int result = -1; - - if (toi_activate_storage(0)) - return -1; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(1); - - if (toiActiveAllocator) - result = toiActiveAllocator->image_exists(1); - - toi_deactivate_storage(0); - return result; -} - -/** - * do_load_atomic_copy - load the first part of an image, if it exists - * - * Check whether we have an image. If one exists, do sanity checking - * (possibly invalidating the image or even rebooting if the user - * requests that) before loading it into memory in preparation for the - * atomic restore. - * - * If and only if we have an image loaded and ready to restore, we return 1. - **/ -static int do_load_atomic_copy(void) -{ - int read_image_result = 0; - - if (sizeof(swp_entry_t) != sizeof(long)) { - printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size" - " of long. 
Please report this!\n"); - return 1; - } - - if (!resume_file[0]) - printk(KERN_WARNING "TuxOnIce: " - "You need to use a resume= command line parameter to " - "tell TuxOnIce where to look for an image.\n"); - - toi_activate_storage(0); - - if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) && - !toi_attempt_to_parse_resume_device(0)) { - /* - * Without a usable storage device we can do nothing - - * even if noresume is given - */ - - if (!toiNumAllocators) - printk(KERN_ALERT "TuxOnIce: " - "No storage allocators have been registered.\n"); - else - printk(KERN_ALERT "TuxOnIce: " - "Missing or invalid storage location " - "(resume= parameter). Please correct and " - "rerun lilo (or equivalent) before " - "hibernating.\n"); - toi_deactivate_storage(0); - return 1; - } - - if (allocate_bitmaps()) - return 1; - - read_image_result = read_pageset1(); /* non fatal error ignored */ - - if (test_toi_state(TOI_NORESUME_SPECIFIED)) - clear_toi_state(TOI_NORESUME_SPECIFIED); - - toi_deactivate_storage(0); - - if (read_image_result) - return 1; - - return 0; -} - -/** - * prepare_restore_load_alt_image - save & restore alt image variables - * - * Save and restore the pageset1 maps, when loading an alternate image. - **/ -static void prepare_restore_load_alt_image(int prepare) -{ - static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; - - if (prepare) { - pageset1_map_save = pageset1_map; - pageset1_map = NULL; - pageset1_copy_map_save = pageset1_copy_map; - pageset1_copy_map = NULL; - set_toi_state(TOI_LOADING_ALT_IMAGE); - toi_reset_alt_image_pageset2_pfn(); - } else { - toi_free_bitmap(&pageset1_map); - pageset1_map = pageset1_map_save; - toi_free_bitmap(&pageset1_copy_map); - pageset1_copy_map = pageset1_copy_map_save; - clear_toi_state(TOI_NOW_RESUMING); - clear_toi_state(TOI_LOADING_ALT_IMAGE); - } -} - -/** - * do_toi_step - perform a step in hibernating or resuming - * - * Perform a step in hibernating or resuming an image. 
This abstraction - * is in preparation for implementing cluster support, and perhaps replacing - * uswsusp too (haven't looked whether that's possible yet). - **/ -int do_toi_step(int step) -{ - switch (step) { - case STEP_HIBERNATE_PREPARE_IMAGE: - return do_prepare_image(); - case STEP_HIBERNATE_SAVE_IMAGE: - return do_save_image(); - case STEP_HIBERNATE_POWERDOWN: - return do_post_image_write(); - case STEP_RESUME_CAN_RESUME: - return do_check_can_resume(); - case STEP_RESUME_LOAD_PS1: - return do_load_atomic_copy(); - case STEP_RESUME_DO_RESTORE: - /* - * If we succeed, this doesn't return. - * Instead, we return from do_save_image() in the - * hibernated kernel. - */ - return toi_atomic_restore(); - case STEP_RESUME_ALT_IMAGE: - printk(KERN_INFO "Trying to resume alternate image.\n"); - toi_in_hibernate = 0; - save_restore_alt_param(SAVE, NOQUIET); - prepare_restore_load_alt_image(1); - if (!do_check_can_resume()) { - printk(KERN_INFO "Nothing to resume from.\n"); - goto out; - } - if (!do_load_atomic_copy()) - toi_atomic_restore(); - - printk(KERN_INFO "Failed to load image.\n"); -out: - prepare_restore_load_alt_image(0); - save_restore_alt_param(RESTORE, NOQUIET); - break; - case STEP_CLEANUP: - do_cleanup(1, 0); - break; - case STEP_QUIET_CLEANUP: - do_cleanup(0, 0); - break; - } - - return 0; -} - -/* -- Functions for kickstarting a hibernate or resume --- */ - -/** - * toi_try_resume - try to do the steps in resuming - * - * Check if we have an image and if so try to resume. Clear the status - * flags too. 
- **/ -void toi_try_resume(void) -{ - set_toi_state(TOI_TRYING_TO_RESUME); - resume_attempted = 1; - - current->flags |= PF_MEMALLOC; - toi_start_other_threads(); - - if (do_toi_step(STEP_RESUME_CAN_RESUME) && - !do_toi_step(STEP_RESUME_LOAD_PS1)) - do_toi_step(STEP_RESUME_DO_RESTORE); - - toi_stop_other_threads(); - do_cleanup(0, 0); - - current->flags &= ~PF_MEMALLOC; - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume - * - * Wrapper for when __toi_try_resume is called from swsusp resume path, - * rather than from echo > /sys/power/tuxonice/do_resume. - **/ -static void toi_sys_power_disk_try_resume(void) -{ - resume_attempted = 1; - - /* - * There's a comment in kernel/power/disk.c that indicates - * we should be able to use mutex_lock_nested below. That - * doesn't seem to cut it, though, so let's just turn lockdep - * off for now. - */ - lockdep_off(); - - if (toi_start_anything(SYSFS_RESUMING)) - goto out; - - toi_try_resume(); - - /* - * For initramfs, we have to clear the boot time - * flag after trying to resume - */ - clear_toi_state(TOI_BOOT_TIME); - - toi_finish_anything(SYSFS_RESUMING); -out: - lockdep_on(); -} - -/** - * toi_try_hibernate - try to start a hibernation cycle - * - * Start a hibernation cycle, coming in from either - * echo > /sys/power/tuxonice/do_suspend - * - * or - * - * echo disk > /sys/power/state - * - * In the later case, we come in without pm_sem taken; in the - * former, it has been taken. 
- **/ -int toi_try_hibernate(void) -{ - int result = 0, sys_power_disk = 0, retries = 0; - - if (!mutex_is_locked(&tuxonice_in_use)) { - /* Came in via /sys/power/disk */ - if (toi_start_anything(SYSFS_HIBERNATING)) - return -EBUSY; - sys_power_disk = 1; - } - - current->flags |= PF_MEMALLOC; - - if (test_toi_state(TOI_CLUSTER_MODE)) { - toi_initiate_cluster_hibernate(); - goto out; - } - -prepare: - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - - if (result) - goto out; - - if (test_action_state(TOI_FREEZER_TEST)) - goto out_restore_gfp_mask; - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - - if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) { - if (retries < 2) { - do_cleanup(0, 1); - retries++; - clear_result_state(TOI_ABORTED); - extra_pd1_pages_allowance = extra_pd1_pages_used + 500; - printk(KERN_INFO "Automatically adjusting the extra" - " pages allowance to %ld and restarting.\n", - extra_pd1_pages_allowance); - pm_restore_gfp_mask(); - goto prepare; - } - - printk(KERN_INFO "Adjusted extra pages allowance twice and " - "still couldn't hibernate successfully. Giving up."); - } - - /* This code runs at resume time too! */ - if (!result && toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); - -out_restore_gfp_mask: - pm_restore_gfp_mask(); -out: - do_cleanup(1, 0); - current->flags &= ~PF_MEMALLOC; - - if (sys_power_disk) - toi_finish_anything(SYSFS_HIBERNATING); - - return result; -} - -/* - * channel_no: If !0, -c <channel_no> is added to args (userui). 
- */ -int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug) -{ - int retval; - static char *envp[] = { - "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; - static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL - }; - char *channel = NULL; - int arg = 0, size; - char test_read[255]; - char *orig_posn = command; - - if (!strlen(orig_posn)) - return 1; - - if (channel_no) { - channel = toi_kzalloc(4, 6, GFP_KERNEL); - if (!channel) { - printk(KERN_INFO "Failed to allocate memory in " - "preparing to launch userspace program.\n"); - return 1; - } - } - - /* Up to 6 args supported */ - while (arg < 6) { - sscanf(orig_posn, "%s", test_read); - size = strlen(test_read); - if (!(size)) - break; - argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP); - strcpy(argv[arg], test_read); - orig_posn += size + 1; - *test_read = 0; - arg++; - } - - if (channel_no) { - sprintf(channel, "-c%d", channel_no); - argv[arg] = channel; - } else - arg--; - - if (debug) { - argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP); - strcpy(argv[arg], "--debug"); - } - - retval = call_usermodehelper(argv[0], argv, envp, wait); - - /* - * If the program reports an error, retval = 256. Don't complain - * about that here. - */ - if (retval && retval != 256) - printk(KERN_ERR "Failed to launch userspace program '%s': " - "Error %d\n", command, retval); - - { - int i; - for (i = 0; i < arg; i++) - if (argv[i] && argv[i] != channel) - toi_kfree(5, argv[i], sizeof(*argv[i])); - } - - toi_kfree(4, channel, sizeof(*channel)); - - return retval; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. 
- */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_LONG("extra_pages_allowance", SYSFS_RW, - &extra_pd1_pages_allowance, 0, LONG_MAX, 0), - SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, - image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), - SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_resume_device2), - SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_alt_resume_param), - SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, - NULL), - SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, - TOI_IGNORE_ROOTFS, 0), - SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, - INT_MAX, 0), - SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), - SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_MULTITHREADED_IO, 0), - SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_FLUSHER_THREAD, 0), - SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAGESET2_FULL, 0), - SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), - SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, - TOI_REPLACE_SWSUSP, 0), - SYSFS_STRING("resume_commandline", SYSFS_RW, - toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, - NULL), - SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), - SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, - TOI_FREEZER_TEST, 0), - SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), - SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, - TOI_TEST_FILTER_SPEED, 0), - SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PAGESET2, 0), - SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PS2_IF_UNNEEDED, 0), - SYSFS_STRING("binary_signature", SYSFS_READONLY, - tuxonice_signature, 9, 0, NULL), - SYSFS_INT("max_workers", SYSFS_RW, 
&toi_max_workers, 0, NR_CPUS, 0, - NULL), -#ifdef CONFIG_KGDB - SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, - TOI_POST_RESUME_BREAKPOINT, 0), -#endif - SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_READAHEAD, 0), - SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action, - TOI_TRACE_DEBUG_ON, 0), -#ifdef CONFIG_TOI_KEEP_IMAGE - SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, - 0), -#endif -#ifdef CONFIG_TOI_INCREMENTAL - SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0, - NULL), - SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action, - TOI_INCREMENTAL_IMAGE, 1), -#endif -}; - -static struct toi_core_fns my_fns = { - .get_nonconflicting_page = __toi_get_nonconflicting_page, - .post_context_save = __toi_post_context_save, - .try_hibernate = toi_try_hibernate, - .try_resume = toi_sys_power_disk_try_resume, -}; - -/** - * core_load - initialisation of TuxOnIce core - * - * Initialise the core, beginning with sysfs. Checksum and so on are part of - * the core, but have their own initialisation routines because they either - * aren't compiled in all the time or have their own subdirectories. 
- **/ -static __init int core_load(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION - " (http://tuxonice.net)\n"); - - if (!hibernation_available()) { - printk(KERN_INFO "TuxOnIce disabled due to request for hibernation" - " to be disabled in this kernel.\n"); - return 1; - } - - if (toi_sysfs_init()) - return 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - toi_core_fns = &my_fns; - - if (toi_alloc_init()) - return 1; - if (toi_checksum_init()) - return 1; - if (toi_usm_init()) - return 1; - if (toi_ui_init()) - return 1; - if (toi_poweroff_init()) - return 1; - if (toi_cluster_init()) - return 1; - if (toi_cbw_init()) - return 1; - - return 0; -} - -late_initcall(core_load); diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c deleted file mode 100644 index c5a09789e..000000000 --- a/kernel/power/tuxonice_incremental.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * kernel/power/tuxonice_incremental.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines related to storing incremental images - that - * is, retaining an image after an initial cycle and then storing incremental - * changes on subsequent hibernations. - * - * Based in part on on... - * - * Debug helper to dump the current kernel pagetables of the system - * so that we can see what the various memory ranges are set to. - * - * (C) Copyright 2008 Intel Corporation - * - * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ - -#include <linux/mm.h> -#include <linux/tuxonice.h> -#include <linux/sched.h> -#include <asm/pgtable.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/page.h> -#include "tuxonice_pageflags.h" -#include "tuxonice_builtin.h" -#include "power.h" - -int toi_do_incremental_initcall; - -extern void kdb_init(int level); -extern noinline void kgdb_breakpoint(void); - -#undef pr_debug -#if 0 -#define pr_debug(a, b...) do { printk(a, ##b); } while(0) -#else -#define pr_debug(a, b...) do { } while(0) -#endif - -/* Multipliers for offsets within the PTEs */ -#define PTE_LEVEL_MULT (PAGE_SIZE) -#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) -#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) - -/* - * This function gets called on a break in a continuous series - * of PTE entries; the next one is different so we need to - * print what we collected so far. - */ -static void note_page(void *addr) -{ - static struct page *lastpage; - struct page *page; - - page = virt_to_page(addr); - - if (page != lastpage) { - unsigned int level; - pte_t *pte = lookup_address((unsigned long) addr, &level); - struct page *pt_page2 = pte_page(*pte); - //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2)); - SetPageTOI_Untracked(pt_page2); - lastpage = page; - } -} - -static void walk_pte_level(pmd_t addr) -{ - int i; - pte_t *start; - - start = (pte_t *) pmd_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PTE; i++) { - note_page(start); - start++; - } -} - -#if PTRS_PER_PMD > 1 - -static void walk_pmd_level(pud_t addr) -{ - int i; - pmd_t *start; - - start = (pmd_t *) pud_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PMD; i++) { - if (!pmd_none(*start)) { - if (pmd_large(*start) || !pmd_present(*start)) - note_page(start); - else - walk_pte_level(*start); - } else - note_page(start); - start++; - } -} - -#else -#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a))) 
-#define pud_large(a) pmd_large(__pmd(pud_val(a))) -#define pud_none(a) pmd_none(__pmd(pud_val(a))) -#endif - -#if PTRS_PER_PUD > 1 - -static void walk_pud_level(pgd_t addr) -{ - int i; - pud_t *start; - - start = (pud_t *) pgd_page_vaddr(addr); - - for (i = 0; i < PTRS_PER_PUD; i++) { - if (!pud_none(*start)) { - if (pud_large(*start) || !pud_present(*start)) - note_page(start); - else - walk_pmd_level(*start); - } else - note_page(start); - - start++; - } -} - -#else -#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a))) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) -#endif - -/* - * Not static in the original at the time of writing, so needs renaming here. - */ -static void toi_ptdump_walk_pgd_level(pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_level4_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif - int i; - if (pgd) { - start = pgd; - } - - for (i = 0; i < PTRS_PER_PGD; i++) { - if (!pgd_none(*start)) { - if (pgd_large(*start) || !pgd_present(*start)) - note_page(start); - else - walk_pud_level(*start); - } else - note_page(start); - - start++; - } - - /* Flush out the last page */ - note_page(start); -} - -#ifdef CONFIG_PARAVIRT -extern struct pv_info pv_info; - -static void toi_set_paravirt_ops_untracked(void) { - int i; - - unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)), - pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end)); - //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end); - for (i = pvpfn; i <= pvpfn_end; i++) { - SetPageTOI_Untracked(pfn_to_page(i)); - } -} -#else -#define toi_set_paravirt_ops_untracked() { do { } while(0) } -#endif - -extern void toi_mark_per_cpus_pages_untracked(void); - -void toi_untrack_stack(unsigned long *stack) -{ - int i; - struct page *stack_page = virt_to_page(stack); - - for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) { - pr_debug("Untrack stack page %p.\n", 
page_address(stack_page + i)); - SetPageTOI_Untracked(stack_page + i); - } -} -void toi_untrack_process(struct task_struct *p) -{ - SetPageTOI_Untracked(virt_to_page(p)); - pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p))); - - toi_untrack_stack(p->stack); -} - -void toi_generate_untracked_map(void) -{ - struct task_struct *p, *t; - struct page *page; - pte_t *pte; - int i; - unsigned int level; - static int been_here = 0; - - if (been_here) - return; - - been_here = 1; - - /* Pagetable pages */ - toi_ptdump_walk_pgd_level(NULL); - - /* Printk buffer - not normally needed but can be helpful for debugging. */ - //toi_set_logbuf_untracked(); - - /* Paravirt ops */ - toi_set_paravirt_ops_untracked(); - - /* Task structs and stacks */ - for_each_process_thread(p, t) { - toi_untrack_process(p); - //toi_untrack_stack((unsigned long *) t->thread.sp); - } - - for (i = 0; i < NR_CPUS; i++) { - struct task_struct *idle = idle_task(i); - - if (idle) { - pr_debug("Untrack idle process for CPU %d.\n", i); - toi_untrack_process(idle); - } - - /* IRQ stack */ - pr_debug("Untrack IRQ stack for CPU %d.\n", i); - toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i)); - } - - /* Per CPU data */ - //pr_debug("Untracking per CPU variable pages.\n"); - toi_mark_per_cpus_pages_untracked(); - - /* Init stack - for bringing up secondary CPUs */ - page = virt_to_page(init_stack); - for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) { - SetPageTOI_Untracked(page + i); - } - - pte = lookup_address((unsigned long) &mmu_cr4_features, &level); - SetPageTOI_Untracked(pte_page(*pte)); - SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features)); -} - -/** - * toi_reset_dirtiness_one - */ - -void toi_reset_dirtiness_one(unsigned long pfn, int verbose) -{ - struct page *page = pfn_to_page(pfn); - - /** - * Don't worry about whether the Dirty flag is - * already set. If this is our first call, it - * won't be. 
- */ - - preempt_disable(); - - ClearPageTOI_Dirty(page); - SetPageTOI_RO(page); - if (verbose) - printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page)); - - set_memory_ro((unsigned long) page_address(page), 1); - - preempt_enable(); -} - -/** - * TuxOnIce's incremental image support works by marking all memory apart from - * the page tables read-only, then in the page-faults that result enabling - * writing if appropriate and flagging the page as dirty. Free pages are also - * marked as dirty and not protected so that if allocated, they will be included - * in the image without further processing. - * - * toi_reset_dirtiness is called when and image exists and incremental images are - * enabled, and each time we resume thereafter. It is not invoked on a fresh boot. - * - * This routine should be called from a single-cpu-running context to avoid races in setting - * page dirty/read only flags. - * - * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it! - * - * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c. - **/ - -int toi_reset_dirtiness(int verbose) -{ - struct zone *zone; - unsigned long loop; - int allocated_map = 0; - - toi_generate_untracked_map(); - - if (!free_map) { - if (!toi_alloc_bitmap(&free_map)) - return -ENOMEM; - allocated_map = 1; - } - - toi_generate_free_page_map(); - - pr_debug(KERN_EMERG "Reset dirtiness.\n"); - for_each_populated_zone(zone) { - // 64 bit only. No need to worry about highmem. 
- for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page) || !saveable_page(zone, pfn)) { - continue; - } - - if (PageTOI_Untracked(page)) { - continue; - } - - /** - * Do we need to (re)protect the page? - * If it is already protected (PageTOI_RO), there is - * nothing to do - skip the following. - * If it is marked as dirty (PageTOI_Dirty), it was - * either free and has been allocated or has been - * written to and marked dirty. Reset the dirty flag - * and (re)apply the protection. - */ - if (!PageTOI_RO(page)) { - toi_reset_dirtiness_one(pfn, verbose); - } - } - } - - pr_debug(KERN_EMERG "Done resetting dirtiness.\n"); - - if (allocated_map) { - toi_free_bitmap(&free_map); - } - return 0; -} - -static int toi_reset_dirtiness_initcall(void) -{ - if (toi_do_incremental_initcall) { - pr_info("TuxOnIce: Enabling dirty page tracking.\n"); - toi_reset_dirtiness(0); - } - return 1; -} -extern void toi_generate_untracked_map(void); - -// Leave early_initcall for pages to register untracked sections. 
-early_initcall(toi_reset_dirtiness_initcall); - -static int __init toi_incremental_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value) && value) - toi_do_incremental_initcall = value; - - return 1; -} -__setup("toi_incremental_initcall", toi_incremental_initcall_setup); diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c deleted file mode 100644 index 91b0c4fd0..000000000 --- a/kernel/power/tuxonice_io.c +++ /dev/null @@ -1,1932 +0,0 @@ -/* - * kernel/power/tuxonice_io.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. - * - */ - -#include <linux/suspend.h> -#include <linux/version.h> -#include <linux/utsname.h> -#include <linux/mount.h> -#include <linux/highmem.h> -#include <linux/kthread.h> -#include <linux/cpu.h> -#include <linux/fs_struct.h> -#include <linux/bio.h> -#include <linux/fs_uuid.h> -#include <linux/kmod.h> -#include <asm/tlbflush.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_storage.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_extent.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_builtin.h" -#include "tuxonice_checksum.h" -#include "tuxonice_alloc.h" -char alt_resume_param[256]; - -/* Version read from image header at resume */ -static int toi_image_header_version; - -#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ - if (likely(toi_image_header_version >= VERS)) \ - if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ - (char *) &VAR, sizeof(VAR))) { \ - abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ - ERR_ACT; \ - } \ -} while(0) \ 
- -/* Variables shared between threads and updated under the mutex */ -static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; -static int io_index, io_nextupdate, io_pc, io_pc_step; -static DEFINE_MUTEX(io_mutex); -static DEFINE_PER_CPU(struct page *, last_sought); -static DEFINE_PER_CPU(struct page *, last_high_page); -static DEFINE_PER_CPU(char *, checksum_locn); -static DEFINE_PER_CPU(struct pbe *, last_low_page); -static atomic_t io_count; -atomic_t toi_io_workers; - -static int using_flusher; - -DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); - -int toi_bio_queue_flusher_should_finish; - -int toi_max_workers; - -static char *image_version_error = "The image header version is newer than " \ - "this kernel supports."; - -struct toi_module_ops *first_filter; - -static atomic_t toi_num_other_threads; -static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); -enum toi_worker_commands { - TOI_IO_WORKER_STOP, - TOI_IO_WORKER_RUN, - TOI_IO_WORKER_EXIT -}; -static enum toi_worker_commands toi_worker_command; - -/** - * toi_attempt_to_parse_resume_device - determine if we can hibernate - * - * Can we hibernate, using the current resume= parameter? - **/ -int toi_attempt_to_parse_resume_device(int quiet) -{ - struct list_head *Allocator; - struct toi_module_ops *thisAllocator; - int result, returning = 0; - - if (toi_activate_storage(0)) - return 0; - - toiActiveAllocator = NULL; - clear_toi_state(TOI_RESUME_DEVICE_OK); - clear_toi_state(TOI_CAN_RESUME); - clear_result_state(TOI_ABORTED); - - if (!toiNumAllocators) { - if (!quiet) - printk(KERN_INFO "TuxOnIce: No storage allocators have " - "been registered. 
Hibernating will be " - "disabled.\n"); - goto cleanup; - } - - list_for_each(Allocator, &toiAllocators) { - thisAllocator = list_entry(Allocator, struct toi_module_ops, - type_list); - - /* - * Not sure why you'd want to disable an allocator, but - * we should honour the flag if we're providing it - */ - if (!thisAllocator->enabled) - continue; - - result = thisAllocator->parse_sig_location( - resume_file, (toiNumAllocators == 1), - quiet); - - switch (result) { - case -EINVAL: - /* For this allocator, but not a valid - * configuration. Error already printed. */ - goto cleanup; - - case 0: - /* For this allocator and valid. */ - toiActiveAllocator = thisAllocator; - - set_toi_state(TOI_RESUME_DEVICE_OK); - set_toi_state(TOI_CAN_RESUME); - returning = 1; - goto cleanup; - } - } - if (!quiet) - printk(KERN_INFO "TuxOnIce: No matching enabled allocator " - "found. Resuming disabled.\n"); -cleanup: - toi_deactivate_storage(0); - return returning; -} - -void attempt_to_parse_resume_device2(void) -{ - toi_prepare_usm(); - toi_attempt_to_parse_resume_device(0); - toi_cleanup_usm(); -} - -void save_restore_alt_param(int replace, int quiet) -{ - static char resume_param_save[255]; - static unsigned long toi_state_save; - - if (replace) { - toi_state_save = toi_state; - strcpy(resume_param_save, resume_file); - strcpy(resume_file, alt_resume_param); - } else { - strcpy(resume_file, resume_param_save); - toi_state = toi_state_save; - } - toi_attempt_to_parse_resume_device(quiet); -} - -void attempt_to_parse_alt_resume_param(void) -{ - int ok = 0; - - /* Temporarily set resume_param to the poweroff value */ - if (!strlen(alt_resume_param)) - return; - - printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); - save_restore_alt_param(SAVE, NOQUIET); - if (test_toi_state(TOI_CAN_RESUME)) - ok = 1; - - printk(KERN_INFO "=== Done ===\n"); - save_restore_alt_param(RESTORE, QUIET); - - /* If not ok, clear the string */ - if (ok) - return; - - printk(KERN_INFO "Can't resume from that 
location; clearing " - "alt_resume_param.\n"); - alt_resume_param[0] = '\0'; -} - -/** - * noresume_reset_modules - reset data structures in case of non resuming - * - * When we read the start of an image, modules (and especially the - * active allocator) might need to reset data structures if we - * decide to remove the image rather than resuming from it. - **/ -static void noresume_reset_modules(void) -{ - struct toi_module_ops *this_filter; - - list_for_each_entry(this_filter, &toi_filters, type_list) - if (this_filter->noresume_reset) - this_filter->noresume_reset(); - - if (toiActiveAllocator && toiActiveAllocator->noresume_reset) - toiActiveAllocator->noresume_reset(); -} - -/** - * fill_toi_header - fill the hibernate header structure - * @struct toi_header: Header data structure to be filled. - **/ -static int fill_toi_header(struct toi_header *sh) -{ - int i, error; - - error = init_header((struct swsusp_info *) sh); - if (error) - return error; - - sh->pagedir = pagedir1; - sh->pageset_2_size = pagedir2.size; - sh->param0 = toi_result; - sh->param1 = toi_bkd.toi_action; - sh->param2 = toi_bkd.toi_debug_state; - sh->param3 = toi_bkd.toi_default_console_level; - sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev; - for (i = 0; i < 4; i++) - sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2]; - sh->bkd = boot_kernel_data_buffer; - return 0; -} - -/** - * rw_init_modules - initialize modules - * @rw: Whether we are reading of writing an image. - * @which: Section of the image being processed. - * - * Iterate over modules, preparing the ones that will be used to read or write - * data. 
- **/ -static int rw_init_modules(int rw, int which) -{ - struct toi_module_ops *this_module; - /* Initialise page transformers */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_init && this_module->rw_init(rw, which)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialize the %s filter.", - this_module->name); - return 1; - } - } - - /* Initialise allocator */ - if (toiActiveAllocator->rw_init(rw, which)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise the allocator."); - return 1; - } - - /* Initialise other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_init && this_module->rw_init(rw, which)) { - set_abort_result(TOI_FAILED_MODULE_INIT); - printk(KERN_INFO "Setting aborted flag due to module " - "init failure.\n"); - return 1; - } - } - - return 0; -} - -/** - * rw_cleanup_modules - cleanup modules - * @rw: Whether we are reading of writing an image. - * - * Cleanup components after reading or writing a set of pages. - * Only the allocator may fail. 
- **/ -static int rw_cleanup_modules(int rw) -{ - struct toi_module_ops *this_module; - int result = 0; - - /* Cleanup other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - /* Flush data and cleanup */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - result |= toiActiveAllocator->rw_cleanup(rw); - - return result; -} - -static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) -{ - int index, min, max; - struct page *high_page = NULL, - **my_last_high_page = raw_cpu_ptr(&last_high_page), - **my_last_sought = raw_cpu_ptr(&last_sought); - struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page); - void *compare; - - if (is_high) { - if (*my_last_sought && *my_last_high_page && - *my_last_sought < orig_page) - high_page = *my_last_high_page; - else - high_page = (struct page *) restore_highmem_pblist; - this = (struct pbe *) kmap(high_page); - compare = orig_page; - } else { - if (*my_last_sought && *my_last_low_page && - *my_last_sought < orig_page) - this = *my_last_low_page; - else - this = restore_pblist; - compare = page_address(orig_page); - } - - *my_last_sought = orig_page; - - /* Locate page containing pbe */ - while (this[PBES_PER_PAGE - 1].next && - this[PBES_PER_PAGE - 1].orig_address < compare) { - if (is_high) { - struct page *next_high_page = (struct page *) - this[PBES_PER_PAGE - 1].next; - kunmap(high_page); - this = kmap(next_high_page); - high_page = next_high_page; - } else - this = this[PBES_PER_PAGE - 1].next; - } - - /* Do a binary search within the page */ - min = 0; - max = PBES_PER_PAGE; - index = PBES_PER_PAGE / 2; - while (max - min) { - if 
(!this[index].orig_address || - this[index].orig_address > compare) - max = index; - else if (this[index].orig_address == compare) { - if (is_high) { - struct page *page = this[index].address; - *my_last_high_page = high_page; - kunmap(high_page); - return page; - } - *my_last_low_page = this; - return virt_to_page(this[index].address); - } else - min = index; - index = ((max + min) / 2); - }; - - if (is_high) - kunmap(high_page); - - abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" - " orig page %p. This[min].orig_address=%p.\n", orig_page, - this[index].orig_address); - return NULL; -} - -/** - * write_next_page - write the next page in a pageset - * @data_pfn: The pfn where the next data to write is located. - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn number to write in the image (where the data belongs). - * - * Get the pfn of the next page to write, map the page if necessary and do the - * write. - **/ -static int write_next_page(unsigned long *data_pfn, int *my_io_index, - unsigned long *write_pfn) -{ - struct page *page; - char **my_checksum_locn = raw_cpu_ptr(&checksum_locn); - int result = 0, was_present; - - *data_pfn = memory_bm_next_pfn(io_map, 0); - - /* Another thread could have beaten us to it. 
*/ - if (*data_pfn == BM_END_OF_MAP) { - if (atomic_read(&io_count)) { - printk(KERN_INFO "Ran out of pfns but io_count is " - "still %d.\n", atomic_read(&io_count)); - BUG(); - } - mutex_unlock(&io_mutex); - return -ENODATA; - } - - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - memory_bm_clear_bit(io_map, 0, *data_pfn); - page = pfn_to_page(*data_pfn); - - was_present = kernel_page_present(page); - if (!was_present) - kernel_map_pages(page, 1, 1); - - if (io_pageset == 1) - *write_pfn = memory_bm_next_pfn(pageset1_map, 0); - else { - *write_pfn = *data_pfn; - *my_checksum_locn = tuxonice_get_next_checksum(); - } - - TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index); - - mutex_unlock(&io_mutex); - - if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) - return 1; - - result = first_filter->write_page(*write_pfn, TOI_PAGE, page, - PAGE_SIZE); - - if (!was_present) - kernel_map_pages(page, 1, 0); - - return result; -} - -/** - * read_next_page - read the next page in a pageset - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn in which the data belongs. - * - * Read a page of the image into our buffer. It can happen (here and in the - * write routine) that threads don't get run until after other CPUs have done - * all the work. This was the cause of the long standing issue with - * occasionally getting -ENODATA errors at the end of reading the image. We - * therefore need to check there's actually a page to read before trying to - * retrieve one. - **/ - -static int read_next_page(int *my_io_index, unsigned long *write_pfn, - struct page *buffer) -{ - unsigned int buf_size = PAGE_SIZE; - unsigned long left = atomic_read(&io_count); - - if (!left) - return -ENODATA; - - /* Start off assuming the page we read isn't resaved */ - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - mutex_unlock(&io_mutex); - - /* - * Are we aborting? 
If so, don't submit any more I/O as - * resetting the resume_attempted flag (from ui.c) will - * clear the bdev flags, making this thread oops. - */ - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - atomic_dec(&toi_io_workers); - if (!atomic_read(&toi_io_workers)) { - /* - * So we can be sure we'll have memory for - * marking that we haven't resumed. - */ - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - - /* - * See toi_bio_read_page in tuxonice_bio.c: - * read the next page in the image. - */ - return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); -} - -static void use_read_page(unsigned long write_pfn, struct page *buffer) -{ - struct page *final_page = pfn_to_page(write_pfn), - *copy_page = final_page; - char *virt, *buffer_virt; - int was_present, cpu = smp_processor_id(); - unsigned long idx = 0; - - if (io_pageset == 1 && (!pageset1_copy_map || - !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) { - int is_high = PageHighMem(final_page); - copy_page = copy_page_from_orig_page(is_high ? (void *) write_pfn : final_page, is_high); - } - - if (!memory_bm_test_bit(io_map, cpu, write_pfn)) { - int test = !memory_bm_test_bit(io_map, cpu, write_pfn); - toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test); - mutex_lock(&io_mutex); - idx = atomic_add_return(1, &io_count); - mutex_unlock(&io_mutex); - return; - } - - virt = kmap(copy_page); - buffer_virt = kmap(buffer); - was_present = kernel_page_present(copy_page); - if (!was_present) - kernel_map_pages(copy_page, 1, 1); - memcpy(virt, buffer_virt, PAGE_SIZE); - if (!was_present) - kernel_map_pages(copy_page, 1, 0); - kunmap(copy_page); - kunmap(buffer); - memory_bm_clear_bit(io_map, cpu, write_pfn); - TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset); -} - -static unsigned long status_update(int writing, unsigned long done, - unsigned long ticks) -{ - int cs_index = writing ? 
0 : 1; - unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; - unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); - unsigned long pgs_per_s, estimate = 0, pages_left; - - if (msec) { - pages_left = io_barmax - done; - pgs_per_s = 1000 * done / msec; - if (pgs_per_s) - estimate = DIV_ROUND_UP(pages_left, pgs_per_s); - } - - if (estimate && ticks > HZ / 2) - return toi_update_status(done, io_barmax, - " %d/%d MB (%lu sec left)", - MB(done+1), MB(io_barmax), estimate); - - return toi_update_status(done, io_barmax, " %d/%d MB", - MB(done+1), MB(io_barmax)); -} - -/** - * worker_rw_loop - main loop to read/write pages - * - * The main I/O loop for reading or writing pages. The io_map bitmap is used to - * track the pages to read/write. - * If we are reading, the pages are loaded to their final (mapped) pfn. - * Data is non zero iff this is a thread started via start_other_threads. - * In that case, we stay in here until told to quit. - **/ -static int worker_rw_loop(void *data) -{ - unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, - jif_index = 1, start_time = jiffies, thread_num; - int result = 0, my_io_index = 0, last_worker; - struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); - cpumask_var_t orig_mask; - - if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { - printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); - result = -ENOMEM; - goto out; - } - - cpumask_copy(orig_mask, tsk_cpus_allowed(current)); - - current->flags |= PF_NOFREEZE; - -top: - mutex_lock(&io_mutex); - thread_num = atomic_read(&toi_io_workers); - - cpumask_copy(tsk_cpus_allowed(current), orig_mask); - schedule(); - - atomic_inc(&toi_io_workers); - - while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && - !(io_write && test_result_state(TOI_ABORTED)) && - toi_worker_command == TOI_IO_WORKER_RUN) { - if (!thread_num && jiffies > next_jiffies) { - next_jiffies += HZ / 4; - if 
(toiActiveAllocator->update_throughput_throttle) - toiActiveAllocator->update_throughput_throttle( - jif_index); - jif_index++; - } - - /* - * What page to use? If reading, don't know yet which page's - * data will be read, so always use the buffer. If writing, - * use the copy (Pageset1) or original page (Pageset2), but - * always write the pfn of the original page. - */ - if (io_write) - result = write_next_page(&data_pfn, &my_io_index, - &write_pfn); - else /* Reading */ - result = read_next_page(&my_io_index, &write_pfn, - buffer); - - if (result) { - mutex_lock(&io_mutex); - /* Nothing to do? */ - if (result == -ENODATA) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Thread %d has no more work.", - smp_processor_id()); - break; - } - - io_result = result; - - if (io_write) { - printk(KERN_INFO "Write chunk returned %d.\n", - result); - abort_hibernate(TOI_FAILED_IO, - "Failed to write a chunk of the " - "image."); - break; - } - - if (io_pageset == 1) { - printk(KERN_ERR "\nBreaking out of I/O loop " - "because of result code %d.\n", result); - break; - } - panic("Read chunk returned (%d)", result); - } - - /* - * Discard reads of resaved pages while reading ps2 - * and unwanted pages while rereading ps2 when aborting. - */ - if (!io_write) { - if (!PageResave(pfn_to_page(write_pfn))) - use_read_page(write_pfn, buffer); - else { - mutex_lock(&io_mutex); - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Resaved %ld.", write_pfn); - atomic_inc(&io_count); - mutex_unlock(&io_mutex); - } - } - - if (!thread_num) { - if(my_io_index + io_base > io_nextupdate) - io_nextupdate = status_update(io_write, - my_io_index + io_base, - jiffies - start_time); - - if (my_io_index > io_pc) { - printk(KERN_CONT "...%d%%", 20 * io_pc_step); - io_pc_step++; - io_pc = io_finish_at * io_pc_step / 5; - } - } - - toi_cond_pause(0, NULL); - - /* - * Subtle: If there's less I/O still to be done than threads - * running, quit. This stops us doing I/O beyond the end of - * the image when reading. 
- * - * Possible race condition. Two threads could do the test at - * the same time; one should exit and one should continue. - * Therefore we take the mutex before comparing and exiting. - */ - - mutex_lock(&io_mutex); - } - - last_worker = atomic_dec_and_test(&toi_io_workers); - toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); - mutex_unlock(&io_mutex); - - if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { - /* Were we the last thread and we're using a flusher thread? */ - if (last_worker && using_flusher) { - toiActiveAllocator->finish_all_io(); - } - /* First, if we're doing I/O, wait for it to finish */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); - /* Then wait to be told what to do next */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); - if (toi_worker_command == TOI_IO_WORKER_RUN) - goto top; - } - - if (thread_num) - atomic_dec(&toi_num_other_threads); - -out: - toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); - toi__free_page(28, buffer); - free_cpumask_var(orig_mask); - - return result; -} - -int toi_start_other_threads(void) -{ - int cpu; - struct task_struct *p; - int to_start = (toi_max_workers ? 
toi_max_workers : num_online_cpus()) - 1; - unsigned long num_started = 0; - - if (test_action_state(TOI_NO_MULTITHREADED_IO)) - return 0; - - toi_worker_command = TOI_IO_WORKER_STOP; - - for_each_online_cpu(cpu) { - if (num_started == to_start) - break; - - if (cpu == smp_processor_id()) - continue; - - p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, - cpu_to_node(cpu), "ktoi_io/%d", cpu); - if (IS_ERR(p)) { - printk(KERN_ERR "ktoi_io for %i failed\n", cpu); - continue; - } - kthread_bind(p, cpu); - p->flags |= PF_MEMALLOC; - wake_up_process(p); - num_started++; - atomic_inc(&toi_num_other_threads); - } - - toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); - return num_started; -} - -void toi_stop_other_threads(void) -{ - toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); - toi_worker_command = TOI_IO_WORKER_EXIT; - wake_up(&toi_worker_wait_queue); -} - -/** - * do_rw_loop - main highlevel function for reading or writing pages - * - * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. 
- **/ -static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, - int base, int barmax, int pageset) -{ - int index = 0, cpu, result = 0, workers_started; - unsigned long pfn, next; - - first_filter = toi_get_next_filter(NULL); - - if (!finish_at) - return 0; - - io_write = write; - io_finish_at = finish_at; - io_base = base; - io_barmax = barmax; - io_pageset = pageset; - io_index = 0; - io_pc = io_finish_at / 5; - io_pc_step = 1; - io_result = 0; - io_nextupdate = base + 1; - toi_bio_queue_flusher_should_finish = 0; - - for_each_online_cpu(cpu) { - per_cpu(last_sought, cpu) = NULL; - per_cpu(last_low_page, cpu) = NULL; - per_cpu(last_high_page, cpu) = NULL; - } - - /* Ensure all bits clear */ - memory_bm_clear(io_map); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - - BUG_ON(next != BM_END_OF_MAP); - - /* Set the bits for the pages to write */ - memory_bm_position_reset(pageflags); - - pfn = memory_bm_next_pfn(pageflags, 0); - toi_trace_index++; - - while (pfn != BM_END_OF_MAP && index < finish_at) { - TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at); - memory_bm_set_bit(io_map, 0, pfn); - pfn = memory_bm_next_pfn(pageflags, 0); - index++; - } - - BUG_ON(next != BM_END_OF_MAP || index < finish_at); - - memory_bm_position_reset(io_map); - toi_trace_index++; - - atomic_set(&io_count, finish_at); - - memory_bm_position_reset(pageset1_map); - - mutex_lock(&io_mutex); - - clear_toi_state(TOI_IO_STOPPED); - - using_flusher = (atomic_read(&toi_num_other_threads) && - toiActiveAllocator->io_flusher && - !test_action_state(TOI_NO_FLUSHER_THREAD)); - - workers_started = atomic_read(&toi_num_other_threads); - - memory_bm_position_reset(io_map); - memory_bm_position_reset(pageset1_copy_map); - - toi_worker_command = TOI_IO_WORKER_RUN; - wake_up(&toi_worker_wait_queue); - - mutex_unlock(&io_mutex); - - if (using_flusher) - result = toiActiveAllocator->io_flusher(write); - else - 
worker_rw_loop(NULL); - - while (atomic_read(&toi_io_workers)) - schedule(); - - printk(KERN_CONT "\n"); - - toi_worker_command = TOI_IO_WORKER_STOP; - wake_up(&toi_worker_wait_queue); - - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - if (!atomic_read(&toi_io_workers)) { - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - set_toi_state(TOI_IO_STOPPED); - - if (!io_result && !result && !test_result_state(TOI_ABORTED)) { - unsigned long next; - - toi_update_status(io_base + io_finish_at, io_barmax, - " %d/%d MB ", - MB(io_base + io_finish_at), MB(io_barmax)); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - if (next != BM_END_OF_MAP) { - printk(KERN_INFO "Finished I/O loop but still work to " - "do?\nFinish at = %d. io_count = %d.\n", - finish_at, atomic_read(&io_count)); - printk(KERN_INFO "I/O bitmap still records work to do." - "%ld.\n", next); - BUG(); - do { - cpu_relax(); - } while (0); - } - } - - return io_result ? io_result : result; -} - -/** - * write_pageset - write a pageset to disk. - * @pagedir: Which pagedir to write. - * - * Returns: - * Zero on success or -1 on failure. - **/ -int write_pageset(struct pagedir *pagedir) -{ - int finish_at, base = 0; - int barmax = pagedir1.size + pagedir2.size; - long error = 0; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - /* - * Even if there is nothing to read or write, the allocator - * may need the init/cleanup for it's housekeeping. (eg: - * Pageset1 may start where pageset2 ends when writing). 
- */ - finish_at = pagedir->size; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Writing kernel & process data..."); - base = pagedir2.size; - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - pageflags = pageset1_map; - else - pageflags = pageset1_copy_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(WRITE, pagedir->id)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise modules for writing."); - error = 1; - } - - if (!error) - error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(WRITE) && !error) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after writing."); - error = 1; - } - - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[0][0] += finish_at, - toi_bkd.toi_io_time[0][1] += (end_time - start_time); - } - - return error; -} - -/** - * read_pageset - highlevel function to read a pageset from disk - * @pagedir: pageset to read - * @overwrittenpagesonly: Whether to read the whole pageset or - * only part of it. - * - * Returns: - * Zero on success or -1 on failure. 
- **/ -static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) -{ - int result = 0, base = 0; - int finish_at = pagedir->size; - int barmax = pagedir1.size + pagedir2.size; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Reading kernel & process data..."); - pageflags = pageset1_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); - if (overwrittenpagesonly) { - barmax = min(pagedir1.size, pagedir2.size); - finish_at = min(pagedir1.size, pagedir2.size); - } else - base = pagedir1.size; - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(READ, pagedir->id)) { - toiActiveAllocator->remove_image(); - result = 1; - } else - result = do_rw_loop(READ, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(READ) && !result) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after reading."); - result = 1; - } - - /* Statistics */ - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[1][0] += finish_at, - toi_bkd.toi_io_time[1][1] += (end_time - start_time); - } - - return result; -} - -/** - * write_module_configs - store the modules configuration - * - * The configuration for each module is stored in the image header. - * Returns: Int - * Zero on success, Error value otherwise. - **/ -static int write_module_configs(void) -{ - struct toi_module_ops *this_module; - char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); - int len, index = 1; - struct toi_module_header toi_module_header; - - if (!buffer) { - printk(KERN_INFO "Failed to allocate a buffer for saving " - "module configuration info.\n"); - return -ENOMEM; - } - - /* - * We have to know which data goes with which module, so we at - * least write a length of zero for a module. 
Note that we are - * also assuming every module's config data takes <= PAGE_SIZE. - */ - - /* For each module (in registration order) */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->storage_needed || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - - /* Get the data from the module */ - len = 0; - if (this_module->save_config_info) - len = this_module->save_config_info(buffer); - - /* Save the details of the module */ - toi_module_header.enabled = this_module->enabled; - toi_module_header.type = this_module->type; - toi_module_header.index = index++; - strncpy(toi_module_header.name, this_module->name, - sizeof(toi_module_header.name)); - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - /* Save the size of the data and any data returned */ - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &len, sizeof(int)); - if (len) - toiActiveAllocator->rw_header_chunk( - WRITE, this_module, buffer, len); - } - - /* Write a blank header to terminate the list */ - toi_module_header.name[0] = '\0'; - toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_module_header, sizeof(toi_module_header)); - - toi_free_page(22, (unsigned long) buffer); - return 0; -} - -/** - * read_one_module_config - read and configure one module - * - * Read the configuration for one module, and configure the module - * to match if it is loaded. - * - * Returns: Int - * Zero on success, Error value otherwise. 
- **/ -static int read_one_module_config(struct toi_module_header *header) -{ - struct toi_module_ops *this_module; - int result, len; - char *buffer; - - /* Find the module */ - this_module = toi_find_module_given_name(header->name); - - if (!this_module) { - if (header->enabled) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "It looks like we need module %s for reading " - "the image but it hasn't been registered.\n", - header->name); - if (!(test_toi_state(TOI_CONTINUE_REQ))) - return -EINVAL; - } else - printk(KERN_INFO "Module %s configuration data found, " - "but the module hasn't registered. Looks like " - "it was disabled, so we're ignoring its data.", - header->name); - } - - /* Get the length of the data (if any) */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, - sizeof(int)); - if (result) { - printk(KERN_ERR "Failed to read the length of the module %s's" - " configuration data.\n", - header->name); - return -EINVAL; - } - - /* Read any data and pass to the module (if we found one) */ - if (!len) - return 0; - - buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); - - if (!buffer) { - printk(KERN_ERR "Failed to allocate a buffer for reloading " - "module configuration info.\n"); - return -ENOMEM; - } - - toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); - - if (!this_module) - goto out; - - if (!this_module->save_config_info) - printk(KERN_ERR "Huh? Module %s appears to have a " - "save_config_info, but not a load_config_info " - "function!\n", this_module->name); - else - this_module->load_config_info(buffer, len); - - /* - * Now move this module to the tail of its lists. This will put it in - * order. Any new modules will end up at the top of the lists. They - * should have been set to disabled when loaded (people will - * normally not edit an initrd to load a new module and then hibernate - * without using it!). 
- */ - - toi_move_module_tail(this_module); - - this_module->enabled = header->enabled; - -out: - toi_free_page(23, (unsigned long) buffer); - return 0; -} - -/** - * read_module_configs - reload module configurations from the image header. - * - * Returns: Int - * Zero on success or an error code. - **/ -static int read_module_configs(void) -{ - int result = 0; - struct toi_module_header toi_module_header; - struct toi_module_ops *this_module; - - /* All modules are initially disabled. That way, if we have a module - * loaded now that wasn't loaded when we hibernated, it won't be used - * in trying to read the data. - */ - list_for_each_entry(this_module, &toi_modules, module_list) - this_module->enabled = 0; - - /* Get the first module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - if (result) { - printk(KERN_ERR "Failed to read the next module header.\n"); - return -EINVAL; - } - - /* For each module (in registration order) */ - while (toi_module_header.name[0]) { - result = read_one_module_config(&toi_module_header); - - if (result) - return -EINVAL; - - /* Get the next module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - if (result) { - printk(KERN_ERR "Failed to read the next module " - "header.\n"); - return -EINVAL; - } - } - - return 0; -} - -static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) -{ - return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 
0 : 1; -} - -int fs_info_space_needed(void) -{ - const struct super_block *sb; - int result = sizeof(int); - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - result += 16 + sizeof(dev_t) + sizeof(int) + - fs->last_mount_size; - free_fs_info(fs); - } - return result; -} - -static int fs_info_num_to_save(void) -{ - const struct super_block *sb; - int to_save = 0; - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - to_save++; - free_fs_info(fs); - } - - return to_save; -} - -static int fs_info_save(void) -{ - const struct super_block *sb; - int to_save = fs_info_num_to_save(); - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" - " to save."); - return -EIO; - } - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) { - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - &fs->uuid[0], 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->dev_t, sizeof(dev_t))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write dev_t."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->last_mount_size, sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write last mount length."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - fs->last_mount, fs->last_mount_size)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - } - free_fs_info(fs); - } - 
return 0; -} - -static int fs_info_load_and_check_one(void) -{ - char uuid[16], *last_mount; - int result = 0, ln; - dev_t dev_t; - struct block_device *dev; - struct fs_info *fs_info, seek; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); - return -EIO; - } - - read_if_version(3, dev_t, "uuid dev_t field", return -EIO); - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount size."); - return -EIO; - } - - last_mount = kzalloc(ln, GFP_KERNEL); - - if (!last_mount) - return -ENOMEM; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount timestamp."); - result = -EIO; - goto out_lmt; - } - - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = dev_t; - seek.last_mount_size = ln; - seek.last_mount = last_mount; - dev_t = blk_lookup_fs_info(&seek); - if (!dev_t) - goto out_lmt; - - dev = toi_open_by_devnum(dev_t); - - fs_info = fs_info_from_block_dev(dev); - if (fs_info && !IS_ERR(fs_info)) { - if (ln != fs_info->last_mount_size) { - printk(KERN_EMERG "Found matching uuid but last mount " - "time lengths differ?! 
" - "(%d vs %d).\n", ln, - fs_info->last_mount_size); - result = -EINVAL; - } else { - char buf[BDEVNAME_SIZE]; - result = !!memcmp(fs_info->last_mount, last_mount, ln); - if (result) - printk(KERN_EMERG "Last mount time for %s has " - "changed!\n", bdevname(dev, buf)); - } - } - toi_close_bdev(dev); - free_fs_info(fs_info); -out_lmt: - kfree(last_mount); - return result; -} - -static int fs_info_load_and_check(void) -{ - int to_do, result = 0; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " - "to load."); - return -EIO; - } - - while(to_do--) - result |= fs_info_load_and_check_one(); - - return result; -} - -/** - * write_image_header - write the image header after write the image proper - * - * Returns: Int - * Zero on success, error value otherwise. - **/ -int write_image_header(void) -{ - int ret; - int total = pagedir1.size + pagedir2.size+2; - char *header_buffer = NULL; - - /* Now prepare to write the header */ - ret = toiActiveAllocator->write_header_init(); - if (ret) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Active allocator's write_header_init" - " function failed."); - goto write_image_header_abort; - } - - /* Get a buffer */ - header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP); - if (!header_buffer) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Out of memory when trying to get page for header!"); - goto write_image_header_abort; - } - - /* Write hibernate header */ - if (fill_toi_header((struct toi_header *) header_buffer)) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to fill header information!"); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - header_buffer, sizeof(struct toi_header))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to write header info."); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_max_workers, 
sizeof(toi_max_workers))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to number of workers to use."); - goto write_image_header_abort; - } - - /* Write filesystem info */ - if (fs_info_save()) - goto write_image_header_abort; - - /* Write module configurations */ - ret = write_module_configs(); - if (ret) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write module configs."); - goto write_image_header_abort; - } - - if (memory_bm_write(pageset1_map, - toiActiveAllocator->rw_header_chunk)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write bitmaps."); - goto write_image_header_abort; - } - - /* Flush data and let allocator cleanup */ - if (toiActiveAllocator->write_header_cleanup()) { - abort_hibernate(TOI_FAILED_IO, - "Failed to cleanup writing header."); - goto write_image_header_abort_no_cleanup; - } - - if (test_result_state(TOI_ABORTED)) - goto write_image_header_abort_no_cleanup; - - toi_update_status(total, total, NULL); - -out: - if (header_buffer) - toi_free_page(24, (unsigned long) header_buffer); - return ret; - -write_image_header_abort: - toiActiveAllocator->write_header_cleanup(); -write_image_header_abort_no_cleanup: - ret = -1; - goto out; -} - -/** - * sanity_check - check the header - * @sh: the header which was saved at hibernate time. - * - * Perform a few checks, seeking to ensure that the kernel being - * booted matches the one hibernated. They need to match so we can - * be _sure_ things will work. It is not absolutely impossible for - * resuming from a different kernel to work, just not assured. 
- **/ -static char *sanity_check(struct toi_header *sh) -{ - char *reason = check_image_kernel((struct swsusp_info *) sh); - - if (reason) - return reason; - - if (!test_action_state(TOI_IGNORE_ROOTFS)) { - const struct super_block *sb; - list_for_each_entry(sb, &super_blocks, s_list) { - if ((!(sb->s_flags & MS_RDONLY)) && - (sb->s_type->fs_flags & FS_REQUIRES_DEV)) - return "Device backed fs has been mounted " - "rw prior to resume or initrd/ramfs " - "is mounted rw."; - } - } - - return NULL; -} - -static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); - -#define FREEZE_IN_PROGRESS (~0) - -static int freeze_result; - -static void do_freeze(struct work_struct *dummy) -{ - freeze_result = freeze_processes(); - wake_up(&freeze_wait); - trap_non_toi_io = 1; -} - -static DECLARE_WORK(freeze_work, do_freeze); - -/** - * __read_pageset1 - test for the existence of an image and attempt to load it - * - * Returns: Int - * Zero if image found and pageset1 successfully loaded. - * Error if no image found or loaded. - **/ -static int __read_pageset1(void) -{ - int i, result = 0; - char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), - *sanity_error = NULL; - struct toi_header *toi_header; - - if (!header_buffer) { - printk(KERN_INFO "Unable to allocate a page for reading the " - "signature.\n"); - return -ENOMEM; - } - - /* Check for an image */ - result = toiActiveAllocator->image_exists(1); - if (result == 3) { - result = -ENODATA; - toi_early_boot_message(1, 0, "The signature from an older " - "version of TuxOnIce has been detected."); - goto out_remove_image; - } - - if (result != 1) { - result = -ENODATA; - noresume_reset_modules(); - printk(KERN_INFO "TuxOnIce: No image found.\n"); - goto out; - } - - /* - * Prepare the active allocator for reading the image header. The - * activate allocator might read its own configuration. 
- * - * NB: This call may never return because there might be a signature - * for a different image such that we warn the user and they choose - * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the - * location of the image might be unavailable if it was stored on a - * network connection). - */ - - result = toiActiveAllocator->read_header_init(); - if (result) { - printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " - "image header.\n"); - goto out_remove_image; - } - - /* Check for noresume command line option */ - if (test_toi_state(TOI_NORESUME_SPECIFIED)) { - printk(KERN_INFO "TuxOnIce: Noresume on command line. Removed " - "image.\n"); - goto out_remove_image; - } - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) { - toi_early_boot_message(1, 0, NULL); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Tried to resume before: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - clear_toi_state(TOI_CONTINUE_REQ); - - toi_image_header_version = toiActiveAllocator->get_header_version(); - - if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { - toi_early_boot_message(1, 0, image_version_error); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Header version too new: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - /* Read hibernate header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - header_buffer, sizeof(struct toi_header)); - if (result < 0) { - printk(KERN_ERR "TuxOnIce: Failed to read the image " - "signature.\n"); - goto out_remove_image; - } - - toi_header = (struct toi_header *) header_buffer; - - /* - * NB: This call may also result in a reboot rather than returning. 
- */ - - sanity_error = sanity_check(toi_header); - if (sanity_error) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - sanity_error); - printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); - goto out_remove_image; - } - - /* - * We have an image and it looks like it will load okay. - * - * Get metadata from header. Don't override commandline parameters. - * - * We don't need to save the image size limit because it's not used - * during resume and will be restored with the image anyway. - */ - - memcpy((char *) &pagedir1, - (char *) &toi_header->pagedir, sizeof(pagedir1)); - toi_result = toi_header->param0; - if (!toi_bkd.toi_debug_state) { - toi_bkd.toi_action = - (toi_header->param1 & ~toi_bootflags_mask) | - (toi_bkd.toi_action & toi_bootflags_mask); - toi_bkd.toi_debug_state = toi_header->param2; - toi_bkd.toi_default_console_level = toi_header->param3; - } - clear_toi_state(TOI_IGNORE_LOGLEVEL); - pagedir2.size = toi_header->pageset_2_size; - for (i = 0; i < 4; i++) - toi_bkd.toi_io_time[i/2][i%2] = - toi_header->io_time[i/2][i%2]; - - set_toi_state(TOI_BOOT_KERNEL); - boot_kernel_data_buffer = toi_header->bkd; - - read_if_version(1, toi_max_workers, "TuxOnIce max workers", - goto out_remove_image); - - /* Read filesystem info */ - if (fs_info_load_and_check()) { - printk(KERN_EMERG "TuxOnIce: File system mount time checks " - "failed. 
Refusing to corrupt your filesystems!\n"); - goto out_remove_image; - } - - /* Read module configurations */ - result = read_module_configs(); - if (result) { - pagedir1.size = 0; - pagedir2.size = 0; - printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " - "configurations.\n"); - clear_action_state(TOI_KEEP_IMAGE); - goto out_remove_image; - } - - toi_prepare_console(); - - set_toi_state(TOI_NOW_RESUMING); - - result = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (result) - goto out_notifier_call_chain;; - - if (usermodehelper_disable()) - goto out_enable_usermodehelper; - - current->flags |= PF_NOFREEZE; - freeze_result = FREEZE_IN_PROGRESS; - - schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); - - toi_cond_pause(1, "About to read original pageset1 locations."); - - /* - * See _toi_rw_header_chunk in tuxonice_bio.c: - * Initialize pageset1_map by reading the map from the image. - */ - if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk)) - goto out_thaw; - - /* - * See toi_rw_cleanup in tuxonice_bio.c: - * Clean up after reading the header. - */ - result = toiActiveAllocator->read_header_cleanup(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the " - "image header.\n"); - goto out_thaw; - } - - toi_cond_pause(1, "About to read pagedir."); - - /* - * Get the addresses of pages into which we will load the kernel to - * be copied back and check if they conflict with the ones we are using. 
- */ - if (toi_get_pageset1_load_addresses()) { - printk(KERN_INFO "TuxOnIce: Failed to get load addresses for " - "pageset1.\n"); - goto out_thaw; - } - - /* Read the original kernel back */ - toi_cond_pause(1, "About to read pageset 1."); - - /* Given the pagemap, read back the data from disk */ - if (read_pageset(&pagedir1, 0)) { - toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1."); - result = -EIO; - goto out_thaw; - } - - toi_cond_pause(1, "About to restore original kernel."); - result = 0; - - if (!toi_keeping_image && - toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(1); - - wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); -out: - current->flags &= ~PF_NOFREEZE; - toi_free_page(25, (unsigned long) header_buffer); - return result; - -out_thaw: - wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); - trap_non_toi_io = 0; - thaw_processes(); -out_enable_usermodehelper: - usermodehelper_enable(); -out_notifier_call_chain: - pm_notifier_call_chain(PM_POST_RESTORE); - toi_cleanup_console(); -out_remove_image: - result = -EINVAL; - if (!toi_keeping_image) - toiActiveAllocator->remove_image(); - toiActiveAllocator->read_header_cleanup(); - noresume_reset_modules(); - goto out; -} - -/** - * read_pageset1 - highlevel function to read the saved pages - * - * Attempt to read the header and pageset1 of a hibernate image. - * Handle the outcome, complaining where appropriate. 
- **/ -int read_pageset1(void) -{ - int error; - - error = __read_pageset1(); - - if (error && error != -ENODATA && error != -EINVAL && - !test_result_state(TOI_ABORTED)) - abort_hibernate(TOI_IMAGE_ERROR, - "TuxOnIce: Error %d resuming\n", error); - - return error; -} - -/** - * get_have_image_data - check the image header - **/ -static char *get_have_image_data(void) -{ - char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP); - struct toi_header *toi_header; - - if (!output_buffer) { - printk(KERN_INFO "Output buffer null.\n"); - return NULL; - } - - /* Check for an image */ - if (!toiActiveAllocator->image_exists(1) || - toiActiveAllocator->read_header_init() || - toiActiveAllocator->rw_header_chunk(READ, NULL, - output_buffer, sizeof(struct toi_header))) { - sprintf(output_buffer, "0\n"); - /* - * From an initrd/ramfs, catting have_image and - * getting a result of 0 is sufficient. - */ - clear_toi_state(TOI_BOOT_TIME); - goto out; - } - - toi_header = (struct toi_header *) output_buffer; - - sprintf(output_buffer, "1\n%s\n%s\n", - toi_header->uts.machine, - toi_header->uts.version); - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) - strcat(output_buffer, "Resumed before.\n"); - -out: - noresume_reset_modules(); - return output_buffer; -} - -/** - * read_pageset2 - read second part of the image - * @overwrittenpagesonly: Read only pages which would have been - * verwritten by pageset1? - * - * Read in part or all of pageset2 of an image, depending upon - * whether we are hibernating and have only overwritten a portion - * with pageset1 pages, or are resuming and need to read them - * all. - * - * Returns: Int - * Zero if no error, otherwise the error value. 
- **/ -int read_pageset2(int overwrittenpagesonly) -{ - int result = 0; - - if (!pagedir2.size) - return 0; - - result = read_pageset(&pagedir2, overwrittenpagesonly); - - toi_cond_pause(1, "Pagedir 2 read."); - - return result; -} - -/** - * image_exists_read - has an image been found? - * @page: Output buffer - * - * Store 0 or 1 in page, depending on whether an image is found. - * Incoming buffer is PAGE_SIZE and result is guaranteed - * to be far less than that, so we don't worry about - * overflow. - **/ -int image_exists_read(const char *page, int count) -{ - int len = 0; - char *result; - - if (toi_activate_storage(0)) - return count; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(0); - - if (!toiActiveAllocator) { - len = sprintf((char *) page, "-1\n"); - } else { - result = get_have_image_data(); - if (result) { - len = sprintf((char *) page, "%s", result); - toi_free_page(26, (unsigned long) result); - } - } - - toi_deactivate_storage(0); - - return len; -} - -/** - * image_exists_write - invalidate an image if one exists - **/ -int image_exists_write(const char *buffer, int count) -{ - if (toi_activate_storage(0)) - return count; - - if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) - toiActiveAllocator->remove_image(); - - toi_deactivate_storage(0); - - clear_result_state(TOI_KEPT_IMAGE); - - return count; -} diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h deleted file mode 100644 index 56645a5c6..000000000 --- a/kernel/power/tuxonice_io.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * kernel/power/tuxonice_io.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. 
- * - */ - -#include <linux/utsname.h> -#include "tuxonice_pagedir.h" - -/* Non-module data saved in our image header */ -struct toi_header { - /* - * Mirror struct swsusp_info, but without - * the page aligned attribute - */ - struct new_utsname uts; - u32 version_code; - unsigned long num_physpages; - int cpus; - unsigned long image_pages; - unsigned long pages; - unsigned long size; - - /* Our own data */ - unsigned long orig_mem_free; - int page_size; - int pageset_2_size; - int param0; - int param1; - int param2; - int param3; - int progress0; - int progress1; - int progress2; - int progress3; - int io_time[2][2]; - struct pagedir pagedir; - dev_t root_fs; - unsigned long bkd; /* Boot kernel data locn */ -}; - -extern int write_pageset(struct pagedir *pagedir); -extern int write_image_header(void); -extern int read_pageset1(void); -extern int read_pageset2(int overwrittenpagesonly); - -extern int toi_attempt_to_parse_resume_device(int quiet); -extern void attempt_to_parse_resume_device2(void); -extern void attempt_to_parse_alt_resume_param(void); -int image_exists_read(const char *page, int count); -int image_exists_write(const char *buffer, int count); -extern void save_restore_alt_param(int replace, int quiet); -extern atomic_t toi_io_workers; - -/* Args to save_restore_alt_param */ -#define RESTORE 0 -#define SAVE 1 - -#define NOQUIET 0 -#define QUIET 1 - -extern wait_queue_head_t toi_io_queue_flusher; -extern int toi_bio_queue_flusher_should_finish; - -int fs_info_space_needed(void); - -extern int toi_max_workers; diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c deleted file mode 100644 index 18f22bdb6..000000000 --- a/kernel/power/tuxonice_modules.c +++ /dev/null @@ -1,520 +0,0 @@ -/* - * kernel/power/tuxonice_modules.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - */ - -#include <linux/suspend.h> -#include <linux/module.h> -#include "tuxonice.h" -#include "tuxonice_modules.h" 
-#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" - -LIST_HEAD(toi_filters); -LIST_HEAD(toiAllocators); - -LIST_HEAD(toi_modules); - -struct toi_module_ops *toiActiveAllocator; - -static int toi_num_filters; -int toiNumAllocators, toi_num_modules; - -/* - * toi_header_storage_for_modules - * - * Returns the amount of space needed to store configuration - * data needed by the modules prior to copying back the original - * kernel. We can exclude data for pageset2 because it will be - * available anyway once the kernel is copied back. - */ -long toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - } - } - - /* One more for the empty terminator */ - return bytes + sizeof(struct toi_module_header); -} - -void print_toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - printk(KERN_DEBUG "Header storage:\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", - this_module->name, - this_module->header_used, this); - } - } - - printk(KERN_DEBUG "+ empty terminator : %zu.\n", - sizeof(struct toi_module_header)); - printk(KERN_DEBUG " ====\n"); - printk(KERN_DEBUG " %zu\n", - bytes + sizeof(struct toi_module_header)); -} - -/* - * 
toi_memory_for_modules - * - * Returns the amount of memory requested by modules for - * doing their work during the cycle. - */ - -long toi_memory_for_modules(int print_parts) -{ - long bytes = 0, result; - struct toi_module_ops *this_module; - - if (print_parts) - printk(KERN_INFO "Memory for modules:\n===================\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - int this; - if (!this_module->enabled) - continue; - if (this_module->memory_needed) { - this = this_module->memory_needed(); - if (print_parts) - printk(KERN_INFO "%10d bytes (%5ld pages) for " - "module '%s'.\n", this, - DIV_ROUND_UP(this, PAGE_SIZE), - this_module->name); - bytes += this; - } - } - - result = DIV_ROUND_UP(bytes, PAGE_SIZE); - if (print_parts) - printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); - - return result; -} - -/* - * toi_expected_compression_ratio - * - * Returns the compression ratio expected when saving the image. - */ - -int toi_expected_compression_ratio(void) -{ - int ratio = 100; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->expected_compression) - ratio = ratio * this_module->expected_compression() - / 100; - } - - return ratio; -} - -/* toi_find_module_given_dir - * Functionality : Return a module (if found), given a pointer - * to its directory name - */ - -static struct toi_module_ops *toi_find_module_given_dir(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->directory)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* toi_find_module_given_name - * Functionality : Return a module (if found), given a pointer - * to its name - */ - -struct toi_module_ops *toi_find_module_given_name(char *name) -{ - struct toi_module_ops *this_module, *found_module = 
NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->name)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* - * toi_print_module_debug_info - * Functionality : Get debugging info from modules into a buffer. - */ -int toi_print_module_debug_info(char *buffer, int buffer_size) -{ - struct toi_module_ops *this_module; - int len = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->print_debug_info) { - int result; - result = this_module->print_debug_info(buffer + len, - buffer_size - len); - len += result; - } - } - - /* Ensure null terminated */ - buffer[buffer_size] = 0; - - return len; -} - -/* - * toi_register_module - * - * Register a module. - */ -int toi_register_module(struct toi_module_ops *module) -{ - int i; - struct kobject *kobj; - - if (!hibernation_available()) - return -ENODEV; - - module->enabled = 1; - - if (toi_find_module_given_name(module->name)) { - printk(KERN_INFO "TuxOnIce: Trying to load module %s," - " which is already registered.\n", - module->name); - return -EBUSY; - } - - switch (module->type) { - case FILTER_MODULE: - list_add_tail(&module->type_list, &toi_filters); - toi_num_filters++; - break; - case WRITER_MODULE: - list_add_tail(&module->type_list, &toiAllocators); - toiNumAllocators++; - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return -EINVAL; - } - list_add_tail(&module->module_list, &toi_modules); - toi_num_modules++; - - if ((!module->directory && !module->shared_directory) || - !module->sysfs_data || !module->num_sysfs_entries) - return 0; - - /* - * Modules may share a directory, but those with shared_dir - * set must be loaded (via symbol dependencies) after parents - * and unloaded beforehand. 
- */ - if (module->shared_directory) { - struct toi_module_ops *shared = - toi_find_module_given_dir(module->shared_directory); - if (!shared) { - printk(KERN_ERR "TuxOnIce: Module %s wants to share " - "%s's directory but %s isn't loaded.\n", - module->name, module->shared_directory, - module->shared_directory); - toi_unregister_module(module); - return -ENODEV; - } - kobj = shared->dir_kobj; - } else { - if (!strncmp(module->directory, "[ROOT]", 6)) - kobj = tuxonice_kobj; - else - kobj = make_toi_sysdir(module->directory); - } - module->dir_kobj = kobj; - for (i = 0; i < module->num_sysfs_entries; i++) { - int result = toi_register_sysfs_file(kobj, - &module->sysfs_data[i]); - if (result) - return result; - } - return 0; -} - -/* - * toi_unregister_module - * - * Remove a module. - */ -void toi_unregister_module(struct toi_module_ops *module) -{ - int i; - - if (module->dir_kobj) - for (i = 0; i < module->num_sysfs_entries; i++) - toi_unregister_sysfs_file(module->dir_kobj, - &module->sysfs_data[i]); - - if (!module->shared_directory && module->directory && - strncmp(module->directory, "[ROOT]", 6)) - remove_toi_sysdir(module->dir_kobj); - - switch (module->type) { - case FILTER_MODULE: - list_del(&module->type_list); - toi_num_filters--; - break; - case WRITER_MODULE: - list_del(&module->type_list); - toiNumAllocators--; - if (toiActiveAllocator == module) { - toiActiveAllocator = NULL; - clear_toi_state(TOI_CAN_RESUME); - clear_toi_state(TOI_CAN_HIBERNATE); - } - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - list_del(&module->module_list); - toi_num_modules--; -} - -/* - * toi_move_module_tail - * - * Rearrange modules when reloading the config. 
- */ -void toi_move_module_tail(struct toi_module_ops *module) -{ - switch (module->type) { - case FILTER_MODULE: - if (toi_num_filters > 1) - list_move_tail(&module->type_list, &toi_filters); - break; - case WRITER_MODULE: - if (toiNumAllocators > 1) - list_move_tail(&module->type_list, &toiAllocators); - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - if ((toi_num_filters + toiNumAllocators) > 1) - list_move_tail(&module->module_list, &toi_modules); -} - -/* - * toi_initialise_modules - * - * Get ready to do some work! - */ -int toi_initialise_modules(int starting_cycle, int early) -{ - struct toi_module_ops *this_module; - int result; - - list_for_each_entry(this_module, &toi_modules, module_list) { - this_module->header_requested = 0; - this_module->header_used = 0; - if (!this_module->enabled) - continue; - if (this_module->early != early) - continue; - if (this_module->initialise) { - result = this_module->initialise(starting_cycle); - if (result) { - toi_cleanup_modules(starting_cycle); - return result; - } - this_module->initialised = 1; - } - } - - return 0; -} - -/* - * toi_cleanup_modules - * - * Tell modules the work is done. - */ -void toi_cleanup_modules(int finishing_cycle) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->initialised) - continue; - if (this_module->cleanup) - this_module->cleanup(finishing_cycle); - this_module->initialised = 0; - } -} - -/* - * toi_pre_atomic_restore_modules - * - * Get ready to do some work! 
- */ -void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->pre_atomic_restore) - this_module->pre_atomic_restore(bkd); - } -} - -/* - * toi_post_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->post_atomic_restore) - this_module->post_atomic_restore(bkd); - } -} - -/* - * toi_get_next_filter - * - * Get the next filter in the pipeline. - */ -struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) -{ - struct toi_module_ops *last_filter = NULL, *this_filter = NULL; - - list_for_each_entry(this_filter, &toi_filters, type_list) { - if (!this_filter->enabled) - continue; - if ((last_filter == filter_sought) || (!filter_sought)) - return this_filter; - last_filter = this_filter; - } - - return toiActiveAllocator; -} - -/** - * toi_show_modules: Printk what support is loaded. - */ -void toi_print_modules(void) -{ - struct toi_module_ops *this_module; - int prev = 0; - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->type == MISC_HIDDEN_MODULE) - continue; - printk("%s %s%s%s", prev ? "," : "", - this_module->enabled ? "" : "[", - this_module->name, - this_module->enabled ? "" : "]"); - prev = 1; - } - - printk(".\n"); -} - -/* toi_get_modules - * - * Take a reference to modules so they can't go away under us. 
- */ - -int toi_get_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - struct toi_module_ops *this_module2; - - if (try_module_get(this_module->module)) - continue; - - /* Failed! Reverse gets and return error */ - list_for_each_entry(this_module2, &toi_modules, - module_list) { - if (this_module == this_module2) - return -EINVAL; - module_put(this_module2->module); - } - } - return 0; -} - -/* toi_put_modules - * - * Release our references to modules we used. - */ - -void toi_put_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) - module_put(this_module->module); -} diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h deleted file mode 100644 index 34ffe2ee3..000000000 --- a/kernel/power/tuxonice_modules.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * kernel/power/tuxonice_modules.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations for modules. Modules are additions to - * TuxOnIce that provide facilities such as image compression or - * encryption, backends for storage of the image and user interfaces. - * - */ - -#ifndef TOI_MODULES_H -#define TOI_MODULES_H - -/* This is the maximum size we store in the image header for a module name */ -#define TOI_MAX_MODULE_NAME_LENGTH 30 - -struct toi_boot_kernel_data; - -/* Per-module metadata */ -struct toi_module_header { - char name[TOI_MAX_MODULE_NAME_LENGTH]; - int enabled; - int type; - int index; - int data_length; - unsigned long signature; -}; - -enum { - FILTER_MODULE, - WRITER_MODULE, - BIO_ALLOCATOR_MODULE, - MISC_MODULE, - MISC_HIDDEN_MODULE, -}; - -enum { - TOI_ASYNC, - TOI_SYNC -}; - -enum { - TOI_VIRT, - TOI_PAGE, -}; - -#define TOI_MAP(type, addr) \ - (type == TOI_PAGE ? 
kmap(addr) : addr) - -#define TOI_UNMAP(type, addr) \ - do { \ - if (type == TOI_PAGE) \ - kunmap(addr); \ - } while(0) - -struct toi_module_ops { - /* Functions common to all modules */ - int type; - char *name; - char *directory; - char *shared_directory; - struct kobject *dir_kobj; - struct module *module; - int enabled, early, initialised; - struct list_head module_list; - - /* List of filters or allocators */ - struct list_head list, type_list; - - /* - * Requirements for memory and storage in - * the image header.. - */ - int (*memory_needed) (void); - int (*storage_needed) (void); - - int header_requested, header_used; - - int (*expected_compression) (void); - - /* - * Debug info - */ - int (*print_debug_info) (char *buffer, int size); - int (*save_config_info) (char *buffer); - void (*load_config_info) (char *buffer, int len); - - /* - * Initialise & cleanup - general routines called - * at the start and end of a cycle. - */ - int (*initialise) (int starting_cycle); - void (*cleanup) (int finishing_cycle); - - void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); - void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); - - /* - * Calls for allocating storage (allocators only). - * - * Header space is requested separately and cannot fail, but the - * reservation is only applied when main storage is allocated. - * The header space reservation is thus always set prior to - * requesting the allocation of storage - and prior to querying - * how much storage is available. - */ - - unsigned long (*storage_available) (void); - void (*reserve_header_space) (unsigned long space_requested); - int (*register_storage) (void); - int (*allocate_storage) (unsigned long space_requested); - unsigned long (*storage_allocated) (void); - void (*free_unused_storage) (void); - - /* - * Routines used in image I/O. 
- */ - int (*rw_init) (int rw, int stream_number); - int (*rw_cleanup) (int rw); - int (*write_page) (unsigned long index, int buf_type, void *buf, - unsigned int buf_size); - int (*read_page) (unsigned long *index, int buf_type, void *buf, - unsigned int *buf_size); - int (*io_flusher) (int rw); - - /* Reset module if image exists but reading aborted */ - void (*noresume_reset) (void); - - /* Read and write the metadata */ - int (*write_header_init) (void); - int (*write_header_cleanup) (void); - - int (*read_header_init) (void); - int (*read_header_cleanup) (void); - - /* To be called after read_header_init */ - int (*get_header_version) (void); - - int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, - char *buffer_start, int buffer_size); - - int (*rw_header_chunk_noreadahead) (int rw, - struct toi_module_ops *owner, char *buffer_start, - int buffer_size); - - /* Attempt to parse an image location */ - int (*parse_sig_location) (char *buffer, int only_writer, int quiet); - - /* Throttle I/O according to throughput */ - void (*update_throughput_throttle) (int jif_index); - - /* Flush outstanding I/O */ - int (*finish_all_io) (void); - - /* Determine whether image exists that we can restore */ - int (*image_exists) (int quiet); - - /* Mark the image as having tried to resume */ - int (*mark_resume_attempted) (int); - - /* Destroy image if one exists */ - int (*remove_image) (void); - - /* Sysfs Data */ - struct toi_sysfs_data *sysfs_data; - int num_sysfs_entries; - - /* Block I/O allocator */ - struct toi_bio_allocator_ops *bio_allocator_ops; -}; - -extern int toi_num_modules, toiNumAllocators; - -extern struct toi_module_ops *toiActiveAllocator; -extern struct list_head toi_filters, toiAllocators, toi_modules; - -extern void toi_prepare_console_modules(void); -extern void toi_cleanup_console_modules(void); - -extern struct toi_module_ops *toi_find_module_given_name(char *name); -extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); 
- -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_move_module_tail(struct toi_module_ops *module); - -extern long toi_header_storage_for_modules(void); -extern long toi_memory_for_modules(int print_parts); -extern void print_toi_header_storage_for_modules(void); -extern int toi_expected_compression_ratio(void); - -extern int toi_print_module_debug_info(char *buffer, int buffer_size); -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_unregister_module(struct toi_module_ops *module); - -extern int toi_initialise_modules(int starting_cycle, int early); -#define toi_initialise_modules_early(starting) \ - toi_initialise_modules(starting, 1) -#define toi_initialise_modules_late(starting) \ - toi_initialise_modules(starting, 0) -extern void toi_cleanup_modules(int finishing_cycle); - -extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); -extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); - -extern void toi_print_modules(void); - -int toi_get_modules(void); -void toi_put_modules(void); -#endif diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c deleted file mode 100644 index 0db58af8b..000000000 --- a/kernel/power/tuxonice_netlink.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Functions for communicating with a userspace helper via netlink. - */ - -#include <linux/suspend.h> -#include <linux/sched.h> -#include <linux/kmod.h> -#include "tuxonice_netlink.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct user_helper_data *uhd_list; - -/* - * Refill our pool of SKBs for use in emergencies (eg, when eating memory and - * none can be allocated). 
- */ -static void toi_fill_skb_pool(struct user_helper_data *uhd) -{ - while (uhd->pool_level < uhd->pool_limit) { - struct sk_buff *new_skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (!new_skb) - break; - - new_skb->next = uhd->emerg_skbs; - uhd->emerg_skbs = new_skb; - uhd->pool_level++; - } -} - -/* - * Try to allocate a single skb. If we can't get one, try to use one from - * our pool. - */ -static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) -{ - struct sk_buff *skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (skb) - return skb; - - skb = uhd->emerg_skbs; - if (skb) { - uhd->pool_level--; - uhd->emerg_skbs = skb->next; - skb->next = NULL; - } - - return skb; -} - -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *dest; - struct task_struct *t; - - if (uhd->pid == -1) - return; - - if (uhd->debug) - printk(KERN_ERR "toi_send_netlink_message: Send " - "message type %d.\n", type); - - skb = toi_get_skb(uhd); - if (!skb) { - printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); - return; - } - - nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); - uhd->sock_seq++; - - dest = NLMSG_DATA(nlh); - if (params && len > 0) - memcpy(dest, params, len); - - netlink_unicast(uhd->nl, skb, uhd->pid, 0); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - if (uhd->pid > -1) - printk(KERN_INFO "Hmm. Can't find the userspace task" - " %d.\n", uhd->pid); - return; - } - wake_up_process(t); - toi_read_unlock_tasklist(); - - yield(); -} - -static void send_whether_debugging(struct user_helper_data *uhd) -{ - static u8 is_debugging = 1; - - toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, - &is_debugging, sizeof(u8)); -} - -/* - * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we - * are hibernating. 
- */ -static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) -{ - struct task_struct *t; - - if (uhd->debug) - printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", - pid); - return -EINVAL; - } - - t->flags |= PF_NOFREEZE; - - toi_read_unlock_tasklist(); - uhd->pid = pid; - - toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); - - return 0; -} - -/* - * Called when the userspace process has informed us that it's ready to roll. - */ -static int nl_ready(struct user_helper_data *uhd, u32 version) -{ - if (version != uhd->interface_version) { - printk(KERN_INFO "%s userspace process using invalid interface" - " version (%d - kernel wants %d). Trying to " - "continue without it.\n", - uhd->name, version, uhd->interface_version); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - - complete(&uhd->wait_for_process); - - return 0; -} - -void toi_netlink_close_complete(struct user_helper_data *uhd) -{ - if (uhd->nl) { - netlink_kernel_release(uhd->nl); - uhd->nl = NULL; - } - - while (uhd->emerg_skbs) { - struct sk_buff *next = uhd->emerg_skbs->next; - kfree_skb(uhd->emerg_skbs); - uhd->emerg_skbs = next; - } - - uhd->pid = -1; -} - -static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, - struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type = nlh->nlmsg_type; - int *data; - int err; - - if (uhd->debug) - printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", - type); - - /* Let the more specific handler go first. It returns - * 1 for valid messages that it doesn't know. 
*/ - err = uhd->rcv_msg(skb, nlh); - if (err != 1) - return err; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { - printk(KERN_INFO "Received extra nofreeze me requests.\n"); - return -EBUSY; - } - - data = NLMSG_DATA(nlh); - - switch (type) { - case NETLINK_MSG_NOFREEZE_ME: - return nl_set_nofreeze(uhd, nlh->nlmsg_pid); - case NETLINK_MSG_GET_DEBUGGING: - send_whether_debugging(uhd); - return 0; - case NETLINK_MSG_READY: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { - printk(KERN_INFO "Invalid ready mesage.\n"); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - return nl_ready(uhd, (u32) *data); - case NETLINK_MSG_CLEANUP: - toi_netlink_close_complete(uhd); - return 0; - } - - return -EINVAL; -} - -static void toi_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - struct user_helper_data *uhd = uhd_list; - - while (uhd && uhd->netlink_id != skb->sk->sk_protocol) - uhd = uhd->next; - - if (!uhd) - return; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - - err = toi_nl_gen_rcv_msg(uhd, skb, nlh); - if (err) - netlink_ack(skb, nlh, err); - else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } -} - -static int netlink_prepare(struct user_helper_data *uhd) -{ - struct netlink_kernel_cfg cfg = { - .groups = 0, - .input = toi_user_rcv_skb, - }; - - uhd->next = uhd_list; - uhd_list = uhd; - - uhd->sock_seq = 0x42c0ffee; - uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); - if (!uhd->nl) { - printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", - uhd->name); - return -ENOMEM; - } - - toi_fill_skb_pool(uhd); - - return 0; -} - -void toi_netlink_close(struct user_helper_data *uhd) -{ - struct 
task_struct *t; - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (t) - t->flags &= ~PF_NOFREEZE; - toi_read_unlock_tasklist(); - - toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); -} -int toi_netlink_setup(struct user_helper_data *uhd) -{ - /* In case userui didn't cleanup properly on us */ - toi_netlink_close_complete(uhd); - - if (netlink_prepare(uhd) < 0) { - printk(KERN_INFO "Netlink prepare failed.\n"); - return 1; - } - - if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, - UMH_WAIT_EXEC, uhd->debug) < 0) { - printk(KERN_INFO "Launch userspace program failed.\n"); - toi_netlink_close_complete(uhd); - return 1; - } - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); - - if (uhd->pid == -1) { - printk(KERN_INFO "%s: Failed to contact userspace process.\n", - uhd->name); - toi_netlink_close_complete(uhd); - return 1; - } - - return 0; -} diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h deleted file mode 100644 index 89e154599..000000000 --- a/kernel/power/tuxonice_netlink.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for functions for communicating with a userspace helper - * via netlink. 
- */ - -#include <linux/netlink.h> -#include <net/sock.h> - -#define NETLINK_MSG_BASE 0x10 - -#define NETLINK_MSG_READY 0x10 -#define NETLINK_MSG_NOFREEZE_ME 0x16 -#define NETLINK_MSG_GET_DEBUGGING 0x19 -#define NETLINK_MSG_CLEANUP 0x24 -#define NETLINK_MSG_NOFREEZE_ACK 0x27 -#define NETLINK_MSG_IS_DEBUGGING 0x28 - -struct user_helper_data { - int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); - void (*not_ready) (void); - struct sock *nl; - u32 sock_seq; - pid_t pid; - char *comm; - char program[256]; - int pool_level; - int pool_limit; - struct sk_buff *emerg_skbs; - int skb_size; - int netlink_id; - char *name; - struct user_helper_data *next; - struct completion wait_for_process; - u32 interface_version; - int must_init; - int debug; -}; - -#ifdef CONFIG_NET -int toi_netlink_setup(struct user_helper_data *uhd); -void toi_netlink_close(struct user_helper_data *uhd); -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len); -void toi_netlink_close_complete(struct user_helper_data *uhd); -#else -static inline int toi_netlink_setup(struct user_helper_data *uhd) -{ - return 0; -} - -static inline void toi_netlink_close(struct user_helper_data *uhd) { }; -static inline void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) { }; -static inline void toi_netlink_close_complete(struct user_helper_data *uhd) - { }; -#endif diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c deleted file mode 100644 index 9ea185af1..000000000 --- a/kernel/power/tuxonice_pagedir.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - * Routines for handling pagesets. - * Note that pbes aren't actually stored as such. They're stored as - * bitmaps and extents. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/hardirq.h> -#include <linux/sched.h> -#include <linux/cpu.h> -#include <asm/tlbflush.h> - -#include "tuxonice_pageflags.h" -#include "tuxonice_ui.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" - -static int ptoi_pfn; -static struct pbe *this_low_pbe; -static struct pbe **last_low_pbe_ptr; - -void toi_reset_alt_image_pageset2_pfn(void) -{ - memory_bm_position_reset(pageset2_map); -} - -static struct page *first_conflicting_page; - -/* - * free_conflicting_pages - */ - -static void free_conflicting_pages(void) -{ - while (first_conflicting_page) { - struct page *next = - *((struct page **) kmap(first_conflicting_page)); - kunmap(first_conflicting_page); - toi__free_page(29, first_conflicting_page); - first_conflicting_page = next; - } -} - -/* __toi_get_nonconflicting_page - * - * Description: Gets order zero pages that won't be overwritten - * while copying the original pages. 
- */ - -struct page *___toi_get_nonconflicting_page(int can_be_highmem) -{ - struct page *page; - gfp_t flags = TOI_ATOMIC_GFP; - if (can_be_highmem) - flags |= __GFP_HIGHMEM; - - - if (test_toi_state(TOI_LOADING_ALT_IMAGE) && - pageset2_map && ptoi_pfn) { - do { - ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0); - if (ptoi_pfn != BM_END_OF_MAP) { - page = pfn_to_page(ptoi_pfn); - if (!PagePageset1(page) && - (can_be_highmem || !PageHighMem(page))) - return page; - } - } while (ptoi_pfn); - } - - do { - page = toi_alloc_page(29, flags | __GFP_ZERO); - if (!page) { - printk(KERN_INFO "Failed to get nonconflicting " - "page.\n"); - return NULL; - } - if (PagePageset1(page)) { - struct page **next = (struct page **) kmap(page); - *next = first_conflicting_page; - first_conflicting_page = page; - kunmap(page); - } - } while (PagePageset1(page)); - - return page; -} - -unsigned long __toi_get_nonconflicting_page(void) -{ - struct page *page = ___toi_get_nonconflicting_page(0); - return page ? (unsigned long) page_address(page) : 0; -} - -static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, - int highmem) -{ - if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) - + 2 * sizeof(struct pbe)) > PAGE_SIZE) { - struct page *new_page = - ___toi_get_nonconflicting_page(highmem); - if (!new_page) - return ERR_PTR(-ENOMEM); - this_pbe = (struct pbe *) kmap(new_page); - memset(this_pbe, 0, PAGE_SIZE); - *page_ptr = new_page; - } else - this_pbe++; - - return this_pbe; -} - -/** - * get_pageset1_load_addresses - generate pbes for conflicting pages - * - * We check here that pagedir & pages it points to won't collide - * with pages where we're going to restore from the loaded pages - * later. - * - * Returns: - * Zero on success, one if couldn't find enough pages (shouldn't - * happen). 
- **/ -int toi_get_pageset1_load_addresses(void) -{ - int pfn, highallocd = 0, lowallocd = 0; - int low_needed = pagedir1.size - get_highmem_size(pagedir1); - int high_needed = get_highmem_size(pagedir1); - int low_pages_for_highmem = 0; - gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; - struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, - *low_pbe_page, *last_low_pbe_page = NULL; - struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, - *this_high_pbe = NULL; - unsigned long orig_low_pfn, orig_high_pfn; - int high_pbes_done = 0, low_pbes_done = 0; - int low_direct = 0, high_direct = 0, result = 0, i; - int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; - - toi_trace_index++; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - last_low_pbe_ptr = &restore_pblist; - - /* First, allocate pages for the start of our pbe lists. */ - if (high_needed) { - high_pbe_page = ___toi_get_nonconflicting_page(1); - if (!high_pbe_page) { - result = -ENOMEM; - goto out; - } - this_high_pbe = (struct pbe *) kmap(high_pbe_page); - memset(this_high_pbe, 0, PAGE_SIZE); - } - - low_pbe_page = ___toi_get_nonconflicting_page(0); - if (!low_pbe_page) { - result = -ENOMEM; - goto out; - } - this_low_pbe = (struct pbe *) page_address(low_pbe_page); - - /* - * Next, allocate the number of pages we need. - */ - - i = low_needed + high_needed; - - do { - int is_high; - - if (i == low_needed) - flags &= ~__GFP_HIGHMEM; - - page = toi_alloc_page(30, flags); - BUG_ON(!page); - - SetPagePageset1Copy(page); - is_high = PageHighMem(page); - - if (PagePageset1(page)) { - if (is_high) - high_direct++; - else - low_direct++; - } else { - if (is_high) - highallocd++; - else - lowallocd++; - } - } while (--i); - - high_needed -= high_direct; - low_needed -= low_direct; - - /* - * Do we need to use some lowmem pages for the copies of highmem - * pages? 
- */ - if (high_needed > highallocd) { - low_pages_for_highmem = high_needed - highallocd; - high_needed -= low_pages_for_highmem; - low_needed += low_pages_for_highmem; - } - - /* - * Now generate our pbes (which will be used for the atomic restore), - * and free unneeded pages. - */ - memory_bm_position_reset(pageset1_copy_map); - for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) { - int is_high; - page = pfn_to_page(pfn); - is_high = PageHighMem(page); - - if (PagePageset1(page)) - continue; - - /* Nope. We're going to use this page. Add a pbe. */ - if (is_high || low_pages_for_highmem) { - struct page *orig_page; - high_pbes_done++; - if (!is_high) - low_pages_for_highmem--; - do { - orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_high_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_high_pfn); - } while (!PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_high_pbe->orig_address = (void *) orig_high_pfn; - this_high_pbe->address = page; - this_high_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", - high_page, high_offset, page, orig_high_pfn, orig_page); - if (last_high_pbe_page != high_pbe_page) { - *last_high_pbe_ptr = - (struct pbe *) high_pbe_page; - if (last_high_pbe_page) { - kunmap(last_high_pbe_page); - high_page++; - high_offset = 0; - } else - high_offset++; - last_high_pbe_page = high_pbe_page; - } else { - *last_high_pbe_ptr = this_high_pbe; - high_offset++; - } - last_high_pbe_ptr = &this_high_pbe->next; - this_high_pbe = get_next_pbe(&high_pbe_page, - this_high_pbe, 1); - if (IS_ERR(this_high_pbe)) { - printk(KERN_INFO - "This high pbe is an error.\n"); - return -ENOMEM; - } - } else { - struct page *orig_page; - low_pbes_done++; - do { - orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_low_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_low_pfn); - } while 
(PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_low_pbe->orig_address = page_address(orig_page); - this_low_pbe->address = page_address(page); - this_low_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", - low_page, low_offset, this_low_pbe->orig_address, - orig_low_pfn, this_low_pbe->address); - TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address); - *last_low_pbe_ptr = this_low_pbe; - last_low_pbe_ptr = &this_low_pbe->next; - this_low_pbe = get_next_pbe(&low_pbe_page, - this_low_pbe, 0); - if (low_pbe_page != last_low_pbe_page) { - if (last_low_pbe_page) { - low_page++; - low_offset = 0; - } else { - low_offset++; - } - last_low_pbe_page = low_pbe_page; - } else - low_offset++; - if (IS_ERR(this_low_pbe)) { - printk(KERN_INFO "this_low_pbe is an error.\n"); - return -ENOMEM; - } - } - } - - if (high_pbe_page) - kunmap(high_pbe_page); - - if (last_high_pbe_page != high_pbe_page) { - if (last_high_pbe_page) - kunmap(last_high_pbe_page); - toi__free_page(29, high_pbe_page); - } - - free_conflicting_pages(); - -out: - return result; -} - -int add_boot_kernel_data_pbe(void) -{ - this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); - if (!this_low_pbe->address) { - printk(KERN_INFO "Failed to get bkd atomic restore buffer."); - return -ENOMEM; - } - - toi_bkd.size = sizeof(toi_bkd); - memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); - - *last_low_pbe_ptr = this_low_pbe; - this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; - this_low_pbe->next = NULL; - return 0; -} diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h deleted file mode 100644 index 80d1a3d8c..000000000 --- a/kernel/power/tuxonice_pagedir.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This 
file is released under the GPLv2. - * - * Declarations for routines for handling pagesets. - */ - -#ifndef KERNEL_POWER_PAGEDIR_H -#define KERNEL_POWER_PAGEDIR_H - -/* Pagedir - * - * Contains the metadata for a set of pages saved in the image. - */ - -struct pagedir { - int id; - unsigned long size; -#ifdef CONFIG_HIGHMEM - unsigned long size_high; -#endif -}; - -#ifdef CONFIG_HIGHMEM -#define get_highmem_size(pagedir) (pagedir.size_high) -#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) -#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) -#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) -#else -#define get_highmem_size(pagedir) (0) -#define set_highmem_size(pagedir, sz) do { } while (0) -#define inc_highmem_size(pagedir) do { } while (0) -#define get_lowmem_size(pagedir) (pagedir.size) -#endif - -extern struct pagedir pagedir1, pagedir2; - -extern void toi_copy_pageset1(void); - -extern int toi_get_pageset1_load_addresses(void); - -extern unsigned long __toi_get_nonconflicting_page(void); -struct page *___toi_get_nonconflicting_page(int can_be_highmem); - -extern void toi_reset_alt_image_pageset2_pfn(void); -extern int add_boot_kernel_data_pbe(void); -#endif diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c deleted file mode 100644 index 307d09f33..000000000 --- a/kernel/power/tuxonice_pageflags.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for serialising and relocating pageflags in which we - * store our image metadata. 
- */ - -#include "tuxonice_pageflags.h" -#include "power.h" - -int toi_pageflags_space_needed(void) -{ - return memory_bm_space_needed(pageset1_map); -} diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h deleted file mode 100644 index 30ee577c3..000000000 --- a/kernel/power/tuxonice_pageflags.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H -#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H - -struct memory_bitmap; -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -void memory_bm_clear(struct memory_bitmap *bm); - -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index); -void memory_bm_position_reset(struct memory_bitmap *bm); -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -int toi_alloc_bitmap(struct memory_bitmap **bm); -void toi_free_bitmap(struct memory_bitmap **bm); -void memory_bm_clear(struct memory_bitmap *bm); -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); - -struct toi_module_ops; -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_read(struct memory_bitmap *bm, int 
(*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_space_needed(struct memory_bitmap *bm); - -extern struct memory_bitmap *pageset1_map; -extern struct memory_bitmap *pageset1_copy_map; -extern struct memory_bitmap *pageset2_map; -extern struct memory_bitmap *page_resave_map; -extern struct memory_bitmap *io_map; -extern struct memory_bitmap *nosave_map; -extern struct memory_bitmap *free_map; -extern struct memory_bitmap *compare_map; - -#define PagePageset1(page) \ - (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1(page) \ - (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1(page) \ - (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset1Copy(page) \ - (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1Copy(page) \ - (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1Copy(page) \ - (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset2(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset2(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset2(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageWasRW(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPageWasRW(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageWasRW(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageResave(page) (page_resave_map ? 
\ - memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageResave(page) \ - (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageResave(page) \ - (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosave(page) (nosave_map ? \ - memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosave(page) \ - (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosave(page) \ - (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosaveFree(page) (free_map ? \ - memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosaveFree(page) \ - (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosaveFree(page) \ - (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page))) - -#define PageCompareChanged(page) (compare_map ? \ - memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageCompareChanged(page) \ - (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageCompareChanged(page) \ - (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page))) - -extern void save_pageflags(struct memory_bitmap *pagemap); -extern int load_pageflags(struct memory_bitmap *pagemap); -extern int toi_pageflags_space_needed(void); -#endif diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c deleted file mode 100644 index f8e969625..000000000 --- a/kernel/power/tuxonice_power_off.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for powering down. 
- */ - -#include <linux/device.h> -#include <linux/suspend.h> -#include <linux/mm.h> -#include <linux/pm.h> -#include <linux/reboot.h> -#include <linux/cpu.h> -#include <linux/console.h> -#include <linux/fs.h> -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" - -unsigned long toi_poweroff_method; /* 0 - Kernel power off */ - -static int wake_delay; -static char lid_state_file[256], wake_alarm_dir[256]; -static struct file *lid_file, *alarm_file, *epoch_file; -static int post_wake_state = -1; - -static int did_suspend_to_both; - -/* - * __toi_power_down - * Functionality : Powers down or reboots the computer once the image - * has been written to disk. - * Key Assumptions : Able to reboot/power down via code called or that - * the warning emitted if the calls fail will be visible - * to the user (ie printk resumes devices). - */ - -static void __toi_power_down(int method) -{ - int error; - - toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : - "Powering down."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (test_action_state(TOI_REBOOT)) - kernel_restart(NULL); - - switch (method) { - case 0: - break; - case 3: - /* - * Re-read the overwritten part of pageset2 to make post-resume - * faster. - */ - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. " - "Try rebooting."); - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (!error) { - pm_restore_gfp_mask(); - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - pm_restrict_gfp_mask(); - if (!error) - did_suspend_to_both = 1; - } - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - - /* Success - we're now post-resume-from-ram */ - if (did_suspend_to_both) - return; - - /* Failed to suspend to ram - do normal power off */ - break; - case 4: - /* - * If succeeds, doesn't return. 
If fails, do a simple - * powerdown. - */ - hibernation_platform_enter(); - break; - case 5: - /* Historic entry only now */ - break; - } - - if (method && method != 5) - toi_cond_pause(1, - "Falling back to alternate power off method."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (pm_power_off) - kernel_power_off(); - kernel_halt(); - toi_cond_pause(1, "Powerdown failed."); - while (1) - cpu_relax(); - -out: - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. Try rebooting."); - return; -} - -#define CLOSE_FILE(file) \ - if (file) { \ - filp_close(file, NULL); file = NULL; \ - } - -static void powerdown_cleanup(int toi_or_resume) -{ - if (!toi_or_resume) - return; - - CLOSE_FILE(lid_file); - CLOSE_FILE(alarm_file); - CLOSE_FILE(epoch_file); -} - -static void open_file(char *format, char *arg, struct file **var, int mode, - char *desc) -{ - char buf[256]; - - if (strlen(arg)) { - sprintf(buf, format, arg); - *var = filp_open(buf, mode, 0); - if (IS_ERR(*var) || !*var) { - printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", - desc, buf, *var); - *var = NULL; - } - } -} - -static int powerdown_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - did_suspend_to_both = 0; - - open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, - O_RDONLY, "lid"); - - if (strlen(wake_alarm_dir)) { - open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, - &alarm_file, O_WRONLY, "alarm"); - - open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, - &epoch_file, O_RDONLY, "epoch"); - } - - return 0; -} - -static int lid_closed(void) -{ - char array[25]; - ssize_t size; - loff_t pos = 0; - - if (!lid_file) - return 0; - - size = vfs_read(lid_file, (char __user *) array, 25, &pos); - if ((int) size < 1) { - printk(KERN_INFO "Failed to read lid state file (%d).\n", - (int) size); - return 0; - } - - if (!strcmp(array, "state: closed\n")) - return 1; - - return 0; -} - -static void write_alarm_file(int value) -{ - ssize_t 
size; - char buf[40]; - loff_t pos = 0; - - if (!alarm_file) - return; - - sprintf(buf, "%d\n", value); - - size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); - - if (size < 0) - printk(KERN_INFO "Error %d writing alarm value %s.\n", - (int) size, buf); -} - -/** - * toi_check_resleep: See whether to powerdown again after waking. - * - * After waking, check whether we should powerdown again in a (usually - * different) way. We only do this if the lid switch is still closed. - */ -void toi_check_resleep(void) -{ - /* We only return if we suspended to ram and woke. */ - if (lid_closed() && post_wake_state >= 0) - __toi_power_down(post_wake_state); -} - -void toi_power_down(void) -{ - if (alarm_file && wake_delay) { - char array[25]; - loff_t pos = 0; - size_t size = vfs_read(epoch_file, (char __user *) array, 25, - &pos); - - if (((int) size) < 1) - printk(KERN_INFO "Failed to read epoch file (%d).\n", - (int) size); - else { - unsigned long since_epoch; - if (!kstrtoul(array, 0, &since_epoch)) { - /* Clear any wakeup time. */ - write_alarm_file(0); - - /* Set new wakeup time. 
*/ - write_alarm_file(since_epoch + wake_delay); - } - } - } - - __toi_power_down(toi_poweroff_method); - - toi_check_resleep(); -} - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_ACPI) - SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), - SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), - SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), - SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, - NULL), - SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), - SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, - 0, 0, 0, NULL) -#endif -}; - -static struct toi_module_ops powerdown_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "poweroff", - .initialise = powerdown_init, - .cleanup = powerdown_cleanup, - .directory = "[ROOT]", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_poweroff_init(void) -{ - return toi_register_module(&powerdown_ops); -} - -void toi_poweroff_exit(void) -{ - toi_unregister_module(&powerdown_ops); -} diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h deleted file mode 100644 index 6e1d8bb39..000000000 --- a/kernel/power/tuxonice_power_off.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for the powering down. 
- */ - -int toi_pm_state_finish(void); -void toi_power_down(void); -extern unsigned long toi_poweroff_method; -int toi_poweroff_init(void); -void toi_poweroff_exit(void); -void toi_check_resleep(void); - -extern int platform_begin(int platform_mode); -extern int platform_pre_snapshot(int platform_mode); -extern void platform_leave(int platform_mode); -extern void platform_end(int platform_mode); -extern void platform_finish(int platform_mode); -extern int platform_pre_restore(int platform_mode); -extern void platform_restore_cleanup(int platform_mode); diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c deleted file mode 100644 index e0593252f..000000000 --- a/kernel/power/tuxonice_prepare_image.c +++ /dev/null @@ -1,1080 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * We need to eat memory until we can: - * 1. Perform the save without changing anything (RAM_NEEDED < #pages) - * 2. Fit it all in available space (toiActiveAllocator->available_space() >= - * main_storage_needed()) - * 3. Reload the pagedir and pageset1 to places that don't collide with their - * final destinations, not knowing to what extent the resumed kernel will - * overlap with the one loaded at boot time. I think the resumed kernel - * should overlap completely, but I don't want to rely on this as it is - * an unproven assumption. We therefore assume there will be no overlap at - * all (worse case). - * 4. Meet the user's requested limit (if any) on the size of the image. - * The limit is in MB, so pages/256 (assuming 4K pages). 
- * - */ - -#include <linux/highmem.h> -#include <linux/freezer.h> -#include <linux/hardirq.h> -#include <linux/mmzone.h> -#include <linux/console.h> -#include <linux/tuxonice.h> - -#include "tuxonice_pageflags.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_checksum.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_builtin.h" - -static unsigned long num_nosave, main_storage_allocated, storage_limit, - header_storage_needed; -unsigned long extra_pd1_pages_allowance = - CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; -long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; -static int no_ps2_needed; - -struct attention_list { - struct task_struct *task; - struct attention_list *next; -}; - -static struct attention_list *attention_list; - -#define PAGESET1 0 -#define PAGESET2 1 - -void free_attention_list(void) -{ - struct attention_list *last = NULL; - - while (attention_list) { - last = attention_list; - attention_list = attention_list->next; - toi_kfree(6, last, sizeof(*last)); - } -} - -static int build_attention_list(void) -{ - int i, task_count = 0; - struct task_struct *p; - struct attention_list *next; - - /* - * Count all userspace process (with task->mm) marked PF_NOFREEZE. - */ - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) - task_count++; - toi_read_unlock_tasklist(); - - /* - * Allocate attention list structs. 
- */ - for (i = 0; i < task_count; i++) { - struct attention_list *this = - toi_kzalloc(6, sizeof(struct attention_list), - TOI_WAIT_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate slab for " - "attention list.\n"); - free_attention_list(); - return 1; - } - this->next = NULL; - if (attention_list) - this->next = attention_list; - attention_list = this; - } - - next = attention_list; - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) { - next->task = p; - next = next->next; - } - toi_read_unlock_tasklist(); - return 0; -} - -static void pageset2_full(void) -{ - struct zone *zone; - struct page *page; - unsigned long flags; - int i; - - toi_trace_index++; - - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lru_lock, flags); - for_each_lru(i) { - if (!zone_page_state(zone, NR_LRU_BASE + i)) - continue; - - list_for_each_entry(page, &zone->lruvec.lists[i], lru) { - struct address_space *mapping; - - mapping = page_mapping(page); - if (!mapping || !mapping->host || - !(mapping->host->i_flags & S_ATOMIC_COPY)) { - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified."); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full."); - SetPagePageset2(page); - } - } - } - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - } -} - -/* - * toi_mark_task_as_pageset - * Functionality : Marks all the saveable pages belonging to a given process - * as belonging to a particular pageset. 
- */ - -static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) -{ - struct vm_area_struct *vma; - struct mm_struct *mm; - - mm = t->active_mm; - - if (!mm || !mm->mmap) - return; - - toi_trace_index++; - - if (!irqs_disabled()) - down_read(&mm->mmap_sem); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long posn; - - if (!vma->vm_start || - vma->vm_flags & VM_PFNMAP) - continue; - - for (posn = vma->vm_start; posn < vma->vm_end; - posn += PAGE_SIZE) { - struct page *page = follow_page(vma, posn, 0); - struct address_space *mapping; - - if (!page || !pfn_valid(page_to_pfn(page))) - continue; - - mapping = page_mapping(page); - if (mapping && mapping->host && - mapping->host->i_flags & S_ATOMIC_COPY && pageset2) - continue; - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2); - continue; - } - - if (pageset2) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1"); - SetPagePageset2(page); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2"); - ClearPagePageset2(page); - SetPagePageset1(page); - } - } - } - - if (!irqs_disabled()) - up_read(&mm->mmap_sem); -} - -static void mark_tasks(int pageset) -{ - struct task_struct *p; - - toi_read_lock_tasklist(); - for_each_process(p) { - if (!p->mm) - continue; - - if (p->flags & PF_KTHREAD) - continue; - - toi_mark_task_as_pageset(p, pageset); - } - toi_read_unlock_tasklist(); - -} - -/* mark_pages_for_pageset2 - * - * Description: Mark unshared pages in processes not needed for hibernate as - * being able to be written out in a separate pagedir. - * HighMem pages are simply marked as pageset2. They won't be - * needed during hibernate. 
- */ - -static void toi_mark_pages_for_pageset2(void) -{ - struct attention_list *this = attention_list; - - memory_bm_clear(pageset2_map); - - if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) - return; - - if (test_action_state(TOI_PAGESET2_FULL)) - pageset2_full(); - else - mark_tasks(PAGESET2); - - /* - * Because the tasks in attention_list are ones related to hibernating, - * we know that they won't go away under us. - */ - - while (this) { - if (!test_result_state(TOI_ABORTED)) - toi_mark_task_as_pageset(this->task, PAGESET1); - this = this->next; - } -} - -/* - * The atomic copy of pageset1 is stored in pageset2 pages. - * But if pageset1 is larger (normally only just after boot), - * we need to allocate extra pages to store the atomic copy. - * The following data struct and functions are used to handle - * the allocation and freeing of that memory. - */ - -static unsigned long extra_pages_allocated; - -struct extras { - struct page *page; - int order; - struct extras *next; -}; - -static struct extras *extras_list; - -/* toi_free_extra_pagedir_memory - * - * Description: Free previously allocated extra pagedir memory. - */ -void toi_free_extra_pagedir_memory(void) -{ - /* Free allocated pages */ - while (extras_list) { - struct extras *this = extras_list; - int i; - - extras_list = this->next; - - for (i = 0; i < (1 << this->order); i++) - ClearPageNosave(this->page + i); - - toi_free_pages(9, this->page, this->order); - toi_kfree(7, this, sizeof(*this)); - } - - extra_pages_allocated = 0; -} - -/* toi_allocate_extra_pagedir_memory - * - * Description: Allocate memory for making the atomic copy of pagedir1 in the - * case where it is bigger than pagedir2. - * Arguments: int num_to_alloc: Number of extra pages needed. - * Result: int. Number of extra pages we now have allocated. 
- */ -static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) -{ - int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; - gfp_t flags = TOI_ATOMIC_GFP; - - if (num_to_alloc < 1) - return 0; - - order = fls(num_to_alloc); - if (order >= MAX_ORDER) - order = MAX_ORDER - 1; - - while (num_to_alloc) { - struct page *newpage; - unsigned long virt; - struct extras *extras_entry; - - while ((1 << order) > num_to_alloc) - order--; - - extras_entry = (struct extras *) toi_kzalloc(7, - sizeof(struct extras), TOI_ATOMIC_GFP); - - if (!extras_entry) - return extra_pages_allocated; - - virt = toi_get_free_pages(9, flags, order); - while (!virt && order) { - order--; - virt = toi_get_free_pages(9, flags, order); - } - - if (!virt) { - toi_kfree(7, extras_entry, sizeof(*extras_entry)); - return extra_pages_allocated; - } - - newpage = virt_to_page(virt); - - extras_entry->page = newpage; - extras_entry->order = order; - extras_entry->next = extras_list; - - extras_list = extras_entry; - - for (j = 0; j < (1 << order); j++) { - SetPageNosave(newpage + j); - SetPagePageset1Copy(newpage + j); - } - - extra_pages_allocated += (1 << order); - num_to_alloc -= (1 << order); - } - - return extra_pages_allocated; -} - -/* - * real_nr_free_pages: Count pcp pages for a zone type or all zones - * (-1 for all, otherwise zone_idx() result desired). - */ -unsigned long real_nr_free_pages(unsigned long zone_idx_mask) -{ - struct zone *zone; - int result = 0, cpu; - - /* PCP lists */ - for_each_populated_zone(zone) { - if (!(zone_idx_mask & (1 << zone_idx(zone)))) - continue; - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - result += pcp->count; - } - - result += zone_page_state(zone, NR_FREE_PAGES); - } - return result; -} - -/* - * Discover how much extra memory will be required by the drivers - * when they're asked to hibernate. 
We can then ensure that amount - * of memory is available when we really want it. - */ -static void get_extra_pd1_allowance(void) -{ - unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; - - toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); - - if (toi_go_atomic(PMSG_FREEZE, 1)) - return; - - final = real_nr_free_pages(all_zones_mask); - toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); - - extra_pd1_pages_allowance = (orig_num_free > final) ? - orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : - MIN_EXTRA_PAGES_ALLOWANCE; -} - -/* - * Amount of storage needed, possibly taking into account the - * expected compression ratio and possibly also ignoring our - * allowance for extra pages. - */ -static unsigned long main_storage_needed(int use_ecr, - int ignore_extra_pd1_allow) -{ - return (pagedir1.size + pagedir2.size + - (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * - (use_ecr ? toi_expected_compression_ratio() : 100) / 100; -} - -/* - * Storage needed for the image header, in bytes until the return. - */ -unsigned long get_header_storage_needed(void) -{ - unsigned long bytes = sizeof(struct toi_header) + - toi_header_storage_for_modules() + - toi_pageflags_space_needed() + - fs_info_space_needed(); - - return DIV_ROUND_UP(bytes, PAGE_SIZE); -} - -/* - * When freeing memory, pages from either pageset might be freed. - * - * When seeking to free memory to be able to hibernate, for every ps1 page - * freed, we need 2 less pages for the atomic copy because there is one less - * page to copy and one more page into which data can be copied. - * - * Freeing ps2 pages saves us nothing directly. No more memory is available - * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but - * that's too much work to figure out. - * - * => ps1_to_free functions - * - * Of course if we just want to reduce the image size, because of storage - * limitations or an image size limit either ps will do. 
- * - * => any_to_free function - */ - -static unsigned long lowpages_usable_for_highmem_copy(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return available > needed ? available - needed : 0; -} - -static unsigned long highpages_ps1_to_free(void) -{ - unsigned long need = get_highmem_size(pagedir1), - available = get_highmem_size(pagedir2) + - real_nr_free_high_pages() + - lowpages_usable_for_highmem_copy(); - - return need > available ? DIV_ROUND_UP(need - available, 2) : 0; -} - -static unsigned long lowpages_ps1_to_free(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0; -} - -static unsigned long current_image_size(void) -{ - return pagedir1.size + pagedir2.size + header_storage_needed; -} - -static unsigned long storage_still_required(void) -{ - unsigned long needed = main_storage_needed(1, 1); - return needed > storage_limit ? needed - storage_limit : 0; -} - -static unsigned long ram_still_required(void) -{ - unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + - 2 * extra_pd1_pages_allowance, - available = real_nr_free_low_pages() + extra_pages_allocated; - return needed > available ? needed - available : 0; -} - -unsigned long any_to_free(int use_image_size_limit) -{ - int use_soft_limit = use_image_size_limit && image_size_limit > 0; - unsigned long current_size = current_image_size(), - soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, - to_free = use_soft_limit ? (current_size > soft_limit ? 
- current_size - soft_limit : 0) : 0, - storage_limit = storage_still_required(), - ram_limit = ram_still_required(), - first_max = max(to_free, storage_limit); - - return max(first_max, ram_limit); -} - -static int need_pageset2(void) -{ - return (real_nr_free_low_pages() + extra_pages_allocated - - 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - - toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; -} - -/* amount_needed - * - * Calculates the amount by which the image size needs to be reduced to meet - * our constraints. - */ -static unsigned long amount_needed(int use_image_size_limit) -{ - return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), - any_to_free(use_image_size_limit)); -} - -static int image_not_ready(int use_image_size_limit) -{ - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Amount still needed (%lu) > 0:%u," - " Storage allocd: %lu < %lu: %u.\n", - amount_needed(use_image_size_limit), - (amount_needed(use_image_size_limit) > 0), - main_storage_allocated, - main_storage_needed(1, 1), - main_storage_allocated < main_storage_needed(1, 1)); - - toi_cond_pause(0, NULL); - - return (amount_needed(use_image_size_limit) > 0) || - main_storage_allocated < main_storage_needed(1, 1); -} - -static void display_failure_reason(int tries_exceeded) -{ - unsigned long storage_required = storage_still_required(), - ram_required = ram_still_required(), - high_ps1 = highpages_ps1_to_free(), - low_ps1 = lowpages_ps1_to_free(); - - printk(KERN_INFO "Failed to prepare the image because...\n"); - - if (!storage_limit) { - printk(KERN_INFO "- You need some storage available to be " - "able to hibernate.\n"); - return; - } - - if (tries_exceeded) - printk(KERN_INFO "- The maximum number of iterations was " - "reached without successfully preparing the " - "image.\n"); - - if (storage_required) { - printk(KERN_INFO " - We need at least %lu pages of storage " - "(ignoring the header), but only have %lu.\n", - main_storage_needed(1, 1), - main_storage_allocated); 
- set_abort_result(TOI_INSUFFICIENT_STORAGE); - } - - if (ram_required) { - printk(KERN_INFO " - We need %lu more free pages of low " - "memory.\n", ram_required); - printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); - printk(KERN_INFO " + Reqd. by modules : %8lu\n", - toi_memory_for_modules(0)); - printk(KERN_INFO " + 2 * extra allow : %8lu\n", - 2 * extra_pd1_pages_allowance); - printk(KERN_INFO " - Currently free : %8lu\n", - real_nr_free_low_pages()); - printk(KERN_INFO " - Pages allocd : %8lu\n", - extra_pages_allocated); - printk(KERN_INFO " : ========\n"); - printk(KERN_INFO " Still needed : %8lu\n", - ram_required); - - /* Print breakdown of memory needed for modules */ - toi_memory_for_modules(1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (high_ps1) { - printk(KERN_INFO "- We need to free %lu highmem pageset 1 " - "pages.\n", high_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (low_ps1) { - printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " - "pages.\n", low_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } -} - -static void display_stats(int always, int sub_extra_pd1_allow) -{ - char buffer[255]; - snprintf(buffer, 254, - "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " - "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). 
" - "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", - - /* Free */ - real_nr_free_pages(all_zones_mask), - real_nr_free_low_pages(), - - /* Sets */ - pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), - pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), - - /* Nosave */ - num_nosave, extra_pages_allocated, - num_nosave - extra_pages_allocated, - - /* Storage */ - main_storage_allocated, - storage_limit, - main_storage_needed(1, sub_extra_pd1_allow), - main_storage_needed(1, 1), - - /* Needed */ - lowpages_ps1_to_free(), highpages_ps1_to_free(), - any_to_free(1), - MIN_FREE_RAM, toi_memory_for_modules(0), - extra_pd1_pages_allowance, - image_size_limit, - - need_pageset2() ? "yes" : "no"); - - if (always) - printk("%s", buffer); - else - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); -} - -/* flag_image_pages - * - * This routine generates our lists of pages to be stored in each - * pageset. Since we store the data using extents, and adding new - * extents might allocate a new extent page, this routine may well - * be called more than once. - */ -static void flag_image_pages(int atomic_copy) -{ - int num_free = 0, num_unmodified = 0; - unsigned long loop; - struct zone *zone; - - pagedir1.size = 0; - pagedir2.size = 0; - - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - num_nosave = 0; - toi_trace_index++; - - memory_bm_clear(pageset1_map); - - toi_generate_free_page_map(); - - /* - * Pages not to be saved are marked Nosave irrespective of being - * reserved. 
- */ - for_each_populated_zone(zone) { - int highmem = is_highmem(zone); - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - TOI_TRACE_DEBUG(pfn, "_Flag Invalid"); - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - unsigned long y; - for (y = pfn; y < pfn + chunk_size; y++) { - page = pfn_to_page(y); - TOI_TRACE_DEBUG(y, "_Flag Free"); - ClearPagePageset1(page); - ClearPagePageset2(page); - } - num_free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page)) { - char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave"; - TOI_TRACE_DEBUG(pfn, "_Flag %s", desc); - num_nosave++; - continue; - } - - page = highmem ? saveable_highmem_page(zone, pfn) : - saveable_page(zone, pfn); - - if (!page) { - TOI_TRACE_DEBUG(pfn, "_Flag Nosave2"); - num_nosave++; - continue; - } - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(pfn, "_Unmodified"); - num_unmodified++; - continue; - } - - if (PagePageset2(page)) { - pagedir2.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS2"); - if (PageHighMem(page)) - inc_highmem_size(pagedir2); - else - SetPagePageset1Copy(page); - if (PageResave(page)) { - SetPagePageset1(page); - ClearPagePageset1Copy(page); - pagedir1.size++; - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } else { - pagedir1.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS1"); - SetPagePageset1(page); - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } - } - - if (!atomic_copy) - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, - "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" - " + Unmodified (%d) + NumFree (%d) = %d.\n", - pagedir1.size, pagedir2.size, num_nosave, num_unmodified, - num_free, pagedir1.size + pagedir2.size + num_nosave + num_free); -} - -void toi_recalculate_image_contents(int atomic_copy) -{ 
- memory_bm_clear(pageset1_map); - if (!atomic_copy) { - unsigned long pfn; - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); - pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) - ClearPagePageset1Copy(pfn_to_page(pfn)); - /* Need to call this before getting pageset1_size! */ - toi_mark_pages_for_pageset2(); - } - memory_bm_position_reset(pageset2_map); - flag_image_pages(atomic_copy); - - if (!atomic_copy) { - storage_limit = toiActiveAllocator->storage_available(); - display_stats(0, 0); - } -} - -int try_allocate_extra_memory(void) -{ - unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance - - get_lowmem_size(pagedir2); - if (wanted > extra_pages_allocated) { - unsigned long got = toi_allocate_extra_pagedir_memory(wanted); - if (wanted < got) { - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Want %d extra pages for pageset1, got %d.\n", - wanted, got); - return 1; - } - } - return 0; -} - -/* update_image - * - * Allocate [more] memory and storage for the image. - */ -static void update_image(int ps2_recalc) -{ - int old_header_req; - unsigned long seek; - - if (try_allocate_extra_memory()) - return; - - if (ps2_recalc) - goto recalc; - - thaw_kernel_threads(); - - /* - * Allocate remaining storage space, if possible, up to the - * maximum we know we'll need. It's okay to allocate the - * maximum if the writer is the swapwriter, but - * we don't want to grab all available space on an NFS share. - * We therefore ignore the expected compression ratio here, - * thereby trying to allocate the maximum image size we could - * need (assuming compression doesn't expand the image), but - * don't complain if we can't get the full amount we're after. - */ - - do { - int result; - - old_header_req = header_storage_needed; - toiActiveAllocator->reserve_header_space(header_storage_needed); - - /* How much storage is free with the reservation applied? 
*/ - storage_limit = toiActiveAllocator->storage_available(); - seek = min(storage_limit, main_storage_needed(0, 0)); - - result = toiActiveAllocator->allocate_storage(seek); - if (result) - printk("Failed to allocate storage (%d).\n", result); - - main_storage_allocated = - toiActiveAllocator->storage_allocated(); - - /* Need more header because more storage allocated? */ - header_storage_needed = get_header_storage_needed(); - - } while (header_storage_needed > old_header_req); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - -recalc: - toi_recalculate_image_contents(0); -} - -/* attempt_to_freeze - * - * Try to freeze processes. - */ - -static int attempt_to_freeze(void) -{ - int result; - - /* Stop processes before checking again */ - toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing " - "filesystems."); - result = freeze_processes(); - - if (result) - set_abort_result(TOI_FREEZING_FAILED); - - result = freeze_kernel_threads(); - - if (result) - set_abort_result(TOI_FREEZING_FAILED); - - return result; -} - -/* eat_memory - * - * Try to free some memory, either to meet hard or soft constraints on the image - * characteristics. - * - * Hard constraints: - * - Pageset1 must be < half of memory; - * - We must have enough memory free at resume time to have pageset1 - * be able to be loaded in pages that don't conflict with where it has to - * be restored. - * Soft constraints - * - User specificied image size limit. - */ -static void eat_memory(void) -{ - unsigned long amount_wanted = 0; - int did_eat_memory = 0; - - /* - * Note that if we have enough storage space and enough free memory, we - * may exit without eating anything. We give up when the last 10 - * iterations ate no extra pages because we're not going to get much - * more anyway, but the few pages we get will take a lot of time. - * - * We freeze processes before beginning, and then unfreeze them if we - * need to eat memory until we think we have enough. 
If our attempts - * to freeze fail, we give up and abort. - */ - - amount_wanted = amount_needed(1); - - switch (image_size_limit) { - case -1: /* Don't eat any memory */ - if (amount_wanted > 0) { - set_abort_result(TOI_WOULD_EAT_MEMORY); - return; - } - break; - case -2: /* Free caches only */ - drop_pagecache(); - toi_recalculate_image_contents(0); - amount_wanted = amount_needed(1); - break; - default: - break; - } - - if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) && - image_size_limit != -1) { - unsigned long request = amount_wanted; - unsigned long high_req = max(highpages_ps1_to_free(), - any_to_free(1)); - unsigned long low_req = lowpages_ps1_to_free(); - unsigned long got = 0; - - toi_prepare_status(CLEAR_BAR, - "Seeking to free %ldMB of memory.", - MB(amount_wanted)); - - thaw_kernel_threads(); - - /* - * Ask for too many because shrink_memory_mask doesn't - * currently return enough most of the time. - */ - - if (low_req) - got = shrink_memory_mask(low_req, GFP_KERNEL); - if (high_req) - shrink_memory_mask(high_req - got, GFP_HIGHUSER); - - did_eat_memory = 1; - - toi_recalculate_image_contents(0); - - amount_wanted = amount_needed(1); - - printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &" - " %ld pages from anywhere, got %ld.\n", - high_req, low_req, - request - amount_wanted); - - toi_cond_pause(0, NULL); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - } - - if (did_eat_memory) - toi_recalculate_image_contents(0); -} - -/* toi_prepare_image - * - * Entry point to the whole image preparation section. - * - * We do four things: - * - Freeze processes; - * - Ensure image size constraints are met; - * - Complete all the preparation for saving the image, - * including allocation of storage. The only memory - * that should be needed when we're finished is that - * for actually storing the image (and we know how - * much is needed for that because the modules tell - * us). 
- * - Make sure that all dirty buffers are written out. - */ -#define MAX_TRIES 2 -int toi_prepare_image(void) -{ - int result = 1, tries = 1; - - main_storage_allocated = 0; - no_ps2_needed = 0; - - if (attempt_to_freeze()) - return 1; - - lock_device_hotplug(); - set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - - if (!extra_pd1_pages_allowance) - get_extra_pd1_allowance(); - - storage_limit = toiActiveAllocator->storage_available(); - - if (!storage_limit) { - printk(KERN_INFO "No storage available. Didn't try to prepare " - "an image.\n"); - display_failure_reason(0); - set_abort_result(TOI_NOSTORAGE_AVAILABLE); - return 1; - } - - if (build_attention_list()) { - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - return 1; - } - - toi_recalculate_image_contents(0); - - do { - toi_prepare_status(CLEAR_BAR, - "Preparing Image. Try %d.", tries); - - eat_memory(); - - if (test_result_state(TOI_ABORTED)) - break; - - update_image(0); - - tries++; - - } while (image_not_ready(1) && tries <= MAX_TRIES && - !test_result_state(TOI_ABORTED)); - - result = image_not_ready(0); - - /* TODO: Handle case where need to remove existing image and resave - * instead of adding to incremental image. */ - - if (!test_result_state(TOI_ABORTED)) { - if (result) { - display_stats(1, 0); - display_failure_reason(tries > MAX_TRIES); - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - } else { - /* Pageset 2 needed? */ - if (!need_pageset2() && - test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { - no_ps2_needed = 1; - toi_recalculate_image_contents(0); - update_image(1); - } - - toi_cond_pause(1, "Image preparation complete."); - } - } - - return result ? 
result : allocate_checksum_pages(); -} diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h deleted file mode 100644 index af6769ee2..000000000 --- a/kernel/power/tuxonice_prepare_image.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include <asm/sections.h> - -extern int toi_prepare_image(void); -extern void toi_recalculate_image_contents(int storage_available); -extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); -extern long image_size_limit; -extern void toi_free_extra_pagedir_memory(void); -extern unsigned long extra_pd1_pages_allowance; -extern void free_attention_list(void); - -#define MIN_FREE_RAM 100 -#define MIN_EXTRA_PAGES_ALLOWANCE 500 - -#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) -#ifdef CONFIG_HIGHMEM -#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ - (1 << ZONE_HIGHMEM))) -#else -#define real_nr_free_high_pages() (0) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) - -/* For eat_memory function */ -#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) -#endif - -unsigned long get_header_storage_needed(void); -unsigned long any_to_free(int use_image_size_limit); -int try_allocate_extra_memory(void); diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c deleted file mode 100644 index 710e48dee..000000000 --- a/kernel/power/tuxonice_prune.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * kernel/power/tuxonice_prune.c - * - * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file implements a TuxOnIce module that seeks to prune the - * amount of data written to disk. 
It builds a table of hashes - * of the uncompressed data, and writes the pfn of the previous page - * with the same contents instead of repeating the data when a match - * is found. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> -#include <crypto/hash.h> - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -/* - * We never write a page bigger than PAGE_SIZE, so use a large number - * to indicate that data is a PFN. - */ -#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100) - -static unsigned long toi_pruned_pages; - -static struct toi_module_ops toi_prune_ops; -static struct toi_module_ops *next_driver; - -static char toi_prune_hash_algo_name[32] = "sha1"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - struct shash_desc desc; - char *digest; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. 
- */ -static int toi_prune_crypto_prepare(void) -{ - int cpu, ret, digestsize; - - if (!*toi_prune_hash_algo_name) { - printk(KERN_INFO "TuxOnIce: Pruning enabled but no " - "hash algorithm set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0); - if (IS_ERR(this->desc.tfm)) { - printk(KERN_INFO "TuxOnIce: Failed to allocate the " - "%s prune hash algorithm.\n", - toi_prune_hash_algo_name); - this->desc.tfm = NULL; - return 1; - } - - if (!digestsize) - digestsize = crypto_shash_digestsize(this->desc.tfm); - - this->digest = kmalloc(digestsize, GFP_KERNEL); - if (!this->digest) { - printk(KERN_INFO "TuxOnIce: Failed to allocate space " - "for digest output.\n"); - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - } - - this->desc.flags = 0; - - ret = crypto_shash_init(&this->desc); - if (ret < 0) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s prune hash algorithm.\n", - toi_prune_hash_algo_name); - kfree(this->digest); - this->digest = NULL; - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - return 1; - } - } - - return 0; -} - -static int toi_prune_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->desc.tfm) { - crypto_free_shash(this->desc.tfm); - this->desc.tfm = NULL; - } - - if (this->digest) { - kfree(this->digest); - this->digest = NULL; - } - } - - return 0; -} - -/* - * toi_prune_init - */ - -static int toi_prune_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_pruned_pages = 0; - - next_driver = toi_get_next_filter(&toi_prune_ops); - - return next_driver ? 
0 : -ECHILD; -} - -/* - * toi_prune_rw_init() - */ - -static int toi_prune_rw_init(int rw, int stream_number) -{ - if (toi_prune_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise prune " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "pruning the image.\n"); - toi_prune_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_prune_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be checked. - * - * Returns: 0 on success. Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. - */ -static int toi_prune_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(), write_data = 1; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - void *buffer_start; - u32 buf[4]; - - if (ctx->desc.tfm) { - - buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, &ctx->digest); - if (ret) { - printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret); - } else { - mutex_lock(&stats_lock); - - toi_pruned_pages++; - - mutex_unlock(&stats_lock); - - } - - TOI_UNMAP(buf_type, buffer_page); - } - - if (write_data) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - else - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_prune_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. 
- * - * Retrieve data from later modules or from a previously loaded page and - * fill the input buffer. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_prune_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->desc.tfm) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't handle - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, buf_type, buffer_page, &len); - - if (len == PRUNE_DATA_IS_PFN) { - buffer_start = kmap(buffer_page); - } - - return ret; -} - -/* - * toi_prune_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_prune_print_debug_stats(char *buffer, int size) -{ - int len; - - /* Output the number of pages pruned. */ - if (*toi_prune_hash_algo_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_prune_hash_algo_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (toi_pruned_pages) - len += scnprintf(buffer+len, size - len, " Pruned " - "%lu pages).\n", - toi_pruned_pages); - return len; -} - -/* - * toi_prune_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. 
- */ -static int toi_prune_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_prune_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_prune_hash_algo_name) + 1; -} - -/* - * toi_prune_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_prune_save_config_info(char *buffer) -{ - int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_pruned_pages; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_prune_hash_algo_name, len); - return offset + len; -} - -/* toi_prune_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for passing back to the - * resumed kernel. - */ -static void toi_prune_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_pruned_pages = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_prune_hash_algo_name, buffer + offset, len); -} - -static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->pruned_pages = toi_pruned_pages; -} - -static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_pruned_pages = bkd->pruned_pages; -} - -/* - * toi_expected_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 - we have no idea how many pages will be pruned. - */ - -static int toi_prune_expected_ratio(void) -{ - return 100; -} - -/* - * data for our sysfs entries. 
- */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_prune_ops = { - .type = FILTER_MODULE, - .name = "prune", - .directory = "prune", - .module = THIS_MODULE, - .initialise = toi_prune_init, - .memory_needed = toi_prune_memory_needed, - .print_debug_info = toi_prune_print_debug_stats, - .save_config_info = toi_prune_save_config_info, - .load_config_info = toi_prune_load_config_info, - .storage_needed = toi_prune_storage_needed, - .expected_compression = toi_prune_expected_ratio, - - .pre_atomic_restore = toi_prune_pre_atomic_restore, - .post_atomic_restore = toi_prune_post_atomic_restore, - - .rw_init = toi_prune_rw_init, - .rw_cleanup = toi_prune_rw_cleanup, - - .write_page = toi_prune_write_page, - .read_page = toi_prune_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_prune_load(void) -{ - return toi_register_module(&toi_prune_ops); -} - -late_initcall(toi_prune_load); diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c deleted file mode 100644 index e99f6e24f..000000000 --- a/kernel/power/tuxonice_storage.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * kernel/power/tuxonice_storage.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for talking to a userspace program that manages storage. 
- * - * The kernel side: - * - starts the userspace program; - * - sends messages telling it when to open and close the connection; - * - tells it when to quit; - * - * The user space side: - * - passes messages regarding status; - * - */ - -#include <linux/suspend.h> -#include <linux/freezer.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_netlink.h" -#include "tuxonice_storage.h" -#include "tuxonice_ui.h" - -static struct user_helper_data usm_helper_data; -static struct toi_module_ops usm_ops; -static int message_received, usm_prepare_count; -static int storage_manager_last_action, storage_manager_action; - -static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USM_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) - return -EBUSY; - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USM_MSG_SUCCESS: - case USM_MSG_FAILED: - message_received = type; - complete(&usm_helper_data.wait_for_process); - break; - default: - printk(KERN_INFO "Storage manager doesn't recognise " - "message %d.\n", type); - } - - return 1; -} - -#ifdef CONFIG_NET -static int activations; - -int toi_activate_storage(int force) -{ - int tries = 1; - - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations++; - - if (activations > 1 && !force) - return 0; - - while ((!message_received || message_received == USM_MSG_FAILED) && - tries < 2) { - toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " - "%d.\n", tries); - - init_completion(&usm_helper_data.wait_for_process); - - 
toi_send_netlink_message(&usm_helper_data, - USM_MSG_CONNECT, - NULL, 0); - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&usm_helper_data.wait_for_process, - 2*HZ); - - tries++; - } - - return 0; -} - -int toi_deactivate_storage(int force) -{ - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations--; - - if (activations && !force) - return 0; - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_DISCONNECT, - NULL, 0); - - wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); - - if (!message_received || message_received == USM_MSG_FAILED) { - printk(KERN_INFO "Returning failure disconnecting storage.\n"); - return 1; - } - - return 0; -} -#endif - -static void storage_manager_simulate(void) -{ - printk(KERN_INFO "--- Storage manager simulate ---\n"); - toi_prepare_usm(); - schedule(); - printk(KERN_INFO "--- Activate storage 1 ---\n"); - toi_activate_storage(1); - schedule(); - printk(KERN_INFO "--- Deactivate storage 1 ---\n"); - toi_deactivate_storage(1); - schedule(); - printk(KERN_INFO "--- Cleanup usm ---\n"); - toi_cleanup_usm(); - schedule(); - printk(KERN_INFO "--- Storage manager simulate ends ---\n"); -} - -static int usm_storage_needed(void) -{ - return sizeof(int) + strlen(usm_helper_data.program) + 1; -} - -static int usm_save_config_info(char *buf) -{ - int len = strlen(usm_helper_data.program); - memcpy(buf, usm_helper_data.program, len + 1); - return sizeof(int) + len + 1; -} - -static void usm_load_config_info(char *buf, int size) -{ - /* Don't load the saved path if one has already been set */ - if (usm_helper_data.program[0]) - return; - - memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); -} - -static int usm_memory_needed(void) -{ - /* ball park figure of 32 pages */ - return 32 * PAGE_SIZE; -} - -/* toi_prepare_usm - */ -int toi_prepare_usm(void) -{ - 
usm_prepare_count++; - - if (usm_prepare_count > 1 || !usm_ops.enabled) - return 0; - - usm_helper_data.pid = -1; - - if (!*usm_helper_data.program) - return 0; - - toi_netlink_setup(&usm_helper_data); - - if (usm_helper_data.pid == -1) - printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" - " start it.\n"); - - toi_activate_storage(0); - - return usm_helper_data.pid != -1; -} - -void toi_cleanup_usm(void) -{ - usm_prepare_count--; - - if (usm_helper_data.pid > -1 && !usm_prepare_count) { - toi_deactivate_storage(0); - toi_netlink_close(&usm_helper_data); - } -} - -static void storage_manager_activate(void) -{ - if (storage_manager_action == storage_manager_last_action) - return; - - if (storage_manager_action) - toi_prepare_usm(); - else - toi_cleanup_usm(); - - storage_manager_last_action = storage_manager_action; -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), - SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, - NULL), - SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, - 0, storage_manager_activate) -}; - -static struct toi_module_ops usm_ops = { - .type = MISC_MODULE, - .name = "usm", - .directory = "storage_manager", - .module = THIS_MODULE, - .storage_needed = usm_storage_needed, - .save_config_info = usm_save_config_info, - .load_config_info = usm_load_config_info, - .memory_needed = usm_memory_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* toi_usm_sysfs_init - * Description: Boot time initialisation for user interface. 
- */ -int toi_usm_init(void) -{ - usm_helper_data.nl = NULL; - usm_helper_data.program[0] = '\0'; - usm_helper_data.pid = -1; - usm_helper_data.skb_size = 0; - usm_helper_data.pool_limit = 6; - usm_helper_data.netlink_id = NETLINK_TOI_USM; - usm_helper_data.name = "userspace storage manager"; - usm_helper_data.rcv_msg = usm_user_rcv_msg; - usm_helper_data.interface_version = 2; - usm_helper_data.must_init = 0; - init_completion(&usm_helper_data.wait_for_process); - - return toi_register_module(&usm_ops); -} - -void toi_usm_exit(void) -{ - toi_netlink_close_complete(&usm_helper_data); - toi_unregister_module(&usm_ops); -} diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h deleted file mode 100644 index 1ed9ab156..000000000 --- a/kernel/power/tuxonice_storage.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_storage.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#ifdef CONFIG_NET -int toi_prepare_usm(void); -void toi_cleanup_usm(void); - -int toi_activate_storage(int force); -int toi_deactivate_storage(int force); -extern int toi_usm_init(void); -extern void toi_usm_exit(void); -#else -static inline int toi_usm_init(void) { return 0; } -static inline void toi_usm_exit(void) { } - -static inline int toi_activate_storage(int force) -{ - return 0; -} - -static inline int toi_deactivate_storage(int force) -{ - return 0; -} - -static inline int toi_prepare_usm(void) { return 0; } -static inline void toi_cleanup_usm(void) { } -#endif - -enum { - USM_MSG_BASE = 0x10, - - /* Kernel -> Userspace */ - USM_MSG_CONNECT = 0x30, - USM_MSG_DISCONNECT = 0x31, - USM_MSG_SUCCESS = 0x40, - USM_MSG_FAILED = 0x41, - - USM_MSG_MAX, -}; diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c deleted file mode 100644 index ce3215033..000000000 --- a/kernel/power/tuxonice_swap.c +++ /dev/null @@ -1,474 +0,0 @@ -/* - * 
kernel/power/tuxonice_swap.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of swap space as a - * backing store. - */ - -#include <linux/suspend.h> -#include <linux/blkdev.h> -#include <linux/swapops.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_extent.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct toi_module_ops toi_swapops; - -/* For swapfile automatically swapon/off'd. */ -static char swapfilename[255] = ""; -static int toi_swapon_status; - -/* Swap Pages */ -static unsigned long swap_allocated; - -static struct sysinfo swapinfo; - -static int is_ram_backed(struct swap_info_struct *si) -{ - if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || - !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) - return 1; - - return 0; -} - -/** - * enable_swapfile: Swapon the user specified swapfile prior to hibernating. - * - * Activate the given swapfile if it wasn't already enabled. Remember whether - * we really did swapon it for swapoffing later. - */ -static void enable_swapfile(void) -{ - int activateswapresult = -EINVAL; - - if (swapfilename[0]) { - /* Attempt to swap on with maximum priority */ - activateswapresult = sys_swapon(swapfilename, 0xFFFF); - if (activateswapresult && activateswapresult != -EBUSY) - printk(KERN_ERR "TuxOnIce: The swapfile/partition " - "specified by /sys/power/tuxonice/swap/swapfile" - " (%s) could not be turned on (error %d). " - "Attempting to continue.\n", - swapfilename, activateswapresult); - if (!activateswapresult) - toi_swapon_status = 1; - } -} - -/** - * disable_swapfile: Swapoff any file swaponed at the start of the cycle. 
- * - * If we did successfully swapon a file at the start of the cycle, swapoff - * it now (finishing up). - */ -static void disable_swapfile(void) -{ - if (!toi_swapon_status) - return; - - sys_swapoff(swapfilename); - toi_swapon_status = 0; -} - -static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, - unsigned long start, unsigned long end) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " - "chain %p.", start << chain->bmap_shift, - end << chain->bmap_shift, chain); - - return toi_add_to_extent_chain(&chain->blocks, start, end); -} - - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long address, extent_min = 0, extent_max = 0; - int empty = 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " - "chain %d.", chain->allocator_index); - - if (!chain->allocations.first) - return 0; - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - toi_extent_for_each(&chain->allocations, extentpointer, address) { - swp_entry_t swap_address = (swp_entry_t) { address }; - struct block_device *bdev; - sector_t new_sector = map_swap_entry(swap_address, &bdev); - - if (empty) { - empty = 0; - extent_min = extent_max = new_sector; - continue; - } - - if (new_sector == extent_max + 1) { - extent_max++; - continue; - } - - if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block " - "chains.\n"); - return -ENOMEM; - } - - extent_min = new_sector; - extent_max = new_sector; - } - - if (!empty && - add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block chains.\n"); - return -ENOMEM; - } - - return 0; -} - -/* - * Like si_swapinfo, except that we don't include ram backed swap (compcache!) 
- * and don't need to use the spinlocks (userspace is stopped when this - * function is called). - */ -void si_swapinfo_no_compcache(void) -{ - unsigned int i; - - si_swapinfo(&swapinfo); - swapinfo.freeswap = 0; - swapinfo.totalswap = 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { - swapinfo.totalswap += si->inuse_pages; - swapinfo.freeswap += si->pages - si->inuse_pages; - } - } -} -/* - * We can't just remember the value from allocation time, because other - * processes might have allocated swap in the mean time. - */ -static unsigned long toi_swap_storage_available(void) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); - si_swapinfo_no_compcache(); - return swapinfo.freeswap + swap_allocated; -} - -static int toi_swap_initialise(int starting_cycle) -{ - if (!starting_cycle) - return 0; - - enable_swapfile(); - return 0; -} - -static void toi_swap_cleanup(int ending_cycle) -{ - if (!ending_cycle) - return; - - disable_swapfile(); -} - -static void toi_swap_free_storage(struct toi_bdev_info *chain) -{ - /* Free swap entries */ - struct hibernate_extent *extentpointer; - unsigned long extentvalue; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", - chain); - - swap_allocated -= chain->allocations.size; - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) - swap_free((swp_entry_t) { extentvalue }); - - toi_put_extent_chain(&chain->allocations); -} - -static void free_swap_range(unsigned long min, unsigned long max) -{ - int j; - - for (j = min; j <= max; j++) - swap_free((swp_entry_t) { j }); - swap_allocated -= (max - min + 1); -} - -/* - * Allocation of a single swap type. Swap priorities are handled at the higher - * level. 
- */ -static int toi_swap_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long gotten = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" - " allocate %lu pages from device %d.", request, - chain->allocator_index); - - while (gotten < request) { - swp_entry_t start, end; - if (0) { - /* Broken at the moment for SSDs */ - get_swap_range_of_type(chain->allocator_index, &start, &end, - request - gotten + 1); - } else { - start = end = get_swap_page_of_type(chain->allocator_index); - } - if (start.val) { - int added = end.val - start.val + 1; - if (toi_add_to_extent_chain(&chain->allocations, - start.val, end.val)) { - printk(KERN_INFO "Failed to allocate extent for " - "%lu-%lu.\n", start.val, end.val); - free_swap_range(start.val, end.val); - break; - } - gotten += added; - swap_allocated += added; - } else - break; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); - return gotten; -} - -static int toi_swap_register_storage(void) -{ - int i, result = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - struct toi_bdev_info *devinfo; - unsigned char *p; - unsigned char buf[256]; - struct fs_info *fs_info; - - if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) - continue; - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), - GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate devinfo struct for swap " - "device %d.\n", i); - return -ENOMEM; - } - - devinfo->bdev = si->bdev; - devinfo->allocator = &toi_swapops; - devinfo->allocator_index = i; - - fs_info = fs_info_from_block_dev(si->bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!fs_info) - printk("fs_info from block dev returned %d.\n", result); - devinfo->dev_t = 
si->bdev->bd_dev; - devinfo->prio = si->prio; - devinfo->bmap_shift = 3; - devinfo->blocks_per_page = 1; - - p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); - sprintf(devinfo->name, "swap on %s", p); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" - " Device %d (%lx), prio %d.", i, - (unsigned long) devinfo->dev_t, devinfo->prio); - toi_bio_ops.register_storage(devinfo); - } - - return 0; -} - -static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long extentvalue; - unsigned long i = 0, first_freed = 0; - - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) { - i++; - if (i > used) { - swap_free((swp_entry_t) { extentvalue }); - if (!first_freed) - first_freed = extentvalue; - } - } - - return first_freed; -} - -/* - * workspace_size - * - * Description: - * Returns the number of bytes of RAM needed for this - * code to do its work. (Used when calculating whether - * we have enough memory to be able to hibernate & resume). 
- * - */ -static int toi_swap_memory_needed(void) -{ - return 1; -} - -/* - * Print debug info - * - * Description: - */ -static int toi_swap_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); - if (swapfilename[0]) - len += scnprintf(buffer+len, size-len, - " Attempting to automatically swapon: %s.\n", - swapfilename); - - si_swapinfo_no_compcache(); - - len += scnprintf(buffer+len, size-len, - " Swap available for image: %lu pages.\n", - swapinfo.freeswap + swap_allocated); - - return len; -} - -static int header_locations_read_sysfs(const char *page, int count) -{ - int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; - struct inode *swapf = NULL; - int zone; - char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); - char *path, *output = (char *) page; - int path_len; - - if (!page) - return 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - - if (!si || !(si->flags & SWP_WRITEOK)) - continue; - - if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { - haveswap = 1; - if (!printedpartitionsmessage) { - len += sprintf(output + len, - "For swap partitions, simply use the " - "format: resume=swap:/dev/hda1.\n"); - printedpartitionsmessage = 1; - } - } else { - path_len = 0; - - path = d_path(&si->swap_file->f_path, path_page, - PAGE_SIZE); - path_len = snprintf(path_page, PAGE_SIZE, "%s", path); - - haveswap = 1; - swapf = si->swap_file->f_mapping->host; - zone = bmap(swapf, 0); - if (!zone) { - len += sprintf(output + len, - "Swapfile %s has been corrupted. 
Reuse" - " mkswap on it and try again.\n", - path_page); - } else { - char name_buffer[BDEVNAME_SIZE]; - len += sprintf(output + len, - "For swapfile `%s`," - " use resume=swap:/dev/%s:0x%x.\n", - path_page, - bdevname(si->bdev, name_buffer), - zone << (swapf->i_blkbits - 9)); - } - } - } - - if (!haveswap) - len = sprintf(output, "You need to turn on swap partitions " - "before examining this file.\n"); - - toi_free_page(10, (unsigned long) path_page); - return len; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), - SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, - header_locations_read_sysfs, NULL, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, - attempt_to_parse_resume_device2), -}; - -static struct toi_bio_allocator_ops toi_bio_swapops = { - .register_storage = toi_swap_register_storage, - .storage_available = toi_swap_storage_available, - .allocate_storage = toi_swap_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_swap_free_storage, - .free_unused_storage = toi_swap_free_unused_storage, -}; - -static struct toi_module_ops toi_swapops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "swap storage", - .directory = "swap", - .module = THIS_MODULE, - .memory_needed = toi_swap_memory_needed, - .print_debug_info = toi_swap_print_debug_stats, - .initialise = toi_swap_initialise, - .cleanup = toi_swap_cleanup, - .bio_allocator_ops = &toi_bio_swapops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_swap_load(void) -{ - return toi_register_module(&toi_swapops); -} - -late_initcall(toi_swap_load); diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c deleted file mode 100644 index 79c9315b6..000000000 --- a/kernel/power/tuxonice_sysfs.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.c 
- * - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains support for sysfs entries for tuning TuxOnIce. - * - * We have a generic handler that deals with the most common cases, and - * hooks for special handlers to use. - */ - -#include <linux/suspend.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_alloc.h" - -static int toi_sysfs_initialised; - -static void toi_initialise_sysfs(void); - -static struct toi_sysfs_data sysfs_params[]; - -#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) - -static void toi_main_wrapper(void) -{ - toi_try_hibernate(); -} - -static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - int len = 0; - int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; - - if (full_prep && toi_start_anything(0)) - return -EBUSY; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - len = (sysfs_data->data.special.read_sysfs) ? 
- (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) - : 0; - break; - case TOI_SYSFS_DATA_BIT: - len = sprintf(page, "%d\n", - -test_bit(sysfs_data->data.bit.bit, - sysfs_data->data.bit.bit_vector)); - break; - case TOI_SYSFS_DATA_INTEGER: - len = sprintf(page, "%d\n", - *(sysfs_data->data.integer.variable)); - break; - case TOI_SYSFS_DATA_LONG: - len = sprintf(page, "%ld\n", - *(sysfs_data->data.a_long.variable)); - break; - case TOI_SYSFS_DATA_UL: - len = sprintf(page, "%lu\n", - *(sysfs_data->data.ul.variable)); - break; - case TOI_SYSFS_DATA_STRING: - len = sprintf(page, "%s\n", - sysfs_data->data.string.variable); - break; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_cleanup_usm(); - - if (full_prep) - toi_finish_anything(0); - - return len; -} - -#define BOUND(_variable, _type) do { \ - if (*_variable < sysfs_data->data._type.minimum) \ - *_variable = sysfs_data->data._type.minimum; \ - else if (*_variable > sysfs_data->data._type.maximum) \ - *_variable = sysfs_data->data._type.maximum; \ -} while (0) - -static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, - const char *my_buf, size_t count) -{ - int assigned_temp_buffer = 0, result = count; - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - - if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) - return -EBUSY; - - ((char *) my_buf)[count] = 0; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - if (sysfs_data->data.special.write_sysfs) - result = (sysfs_data->data.special.write_sysfs)(my_buf, - count); - break; - case TOI_SYSFS_DATA_BIT: - { - unsigned long value; - result = kstrtoul(my_buf, 0, &value); - if (result) - break; - if (value) - set_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - else - clear_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - } - break; - case TOI_SYSFS_DATA_INTEGER: - { - long 
temp; - result = kstrtol(my_buf, 0, &temp); - if (result) - break; - *(sysfs_data->data.integer.variable) = (int) temp; - BOUND(sysfs_data->data.integer.variable, integer); - break; - } - case TOI_SYSFS_DATA_LONG: - { - long *variable = - sysfs_data->data.a_long.variable; - result = kstrtol(my_buf, 0, variable); - if (result) - break; - BOUND(variable, a_long); - break; - } - case TOI_SYSFS_DATA_UL: - { - unsigned long *variable = - sysfs_data->data.ul.variable; - result = kstrtoul(my_buf, 0, variable); - if (result) - break; - BOUND(variable, ul); - break; - } - break; - case TOI_SYSFS_DATA_STRING: - { - int copy_len = count; - char *variable = - sysfs_data->data.string.variable; - - if (sysfs_data->data.string.max_length && - (copy_len > sysfs_data->data.string.max_length)) - copy_len = sysfs_data->data.string.max_length; - - if (!variable) { - variable = (char *) toi_get_zeroed_page(31, - TOI_ATOMIC_GFP); - sysfs_data->data.string.variable = variable; - assigned_temp_buffer = 1; - } - strncpy(variable, my_buf, copy_len); - if (copy_len && my_buf[copy_len - 1] == '\n') - variable[count - 1] = 0; - variable[count] = 0; - } - break; - } - - if (!result) - result = count; - - /* Side effect routine? */ - if (result == count && sysfs_data->write_side_effect) - sysfs_data->write_side_effect(); - - /* Free temporary buffers */ - if (assigned_temp_buffer) { - toi_free_page(31, - (unsigned long) sysfs_data->data.string.variable); - sysfs_data->data.string.variable = NULL; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_cleanup_usm(); - - toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); - - return result; -} - -static struct sysfs_ops toi_sysfs_ops = { - .show = &toi_attr_show, - .store = &toi_attr_store, -}; - -static struct kobj_type toi_ktype = { - .sysfs_ops = &toi_sysfs_ops, -}; - -struct kobject *tuxonice_kobj; - -/* Non-module sysfs entries. - * - * This array contains entries that are automatically registered at - * boot. 
Modules and the console code register their own entries separately. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_HIBERNATING, toi_main_wrapper), - SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_RESUMING, toi_try_resume) -}; - -void remove_toi_sysdir(struct kobject *kobj) -{ - if (!kobj) - return; - - kobject_put(kobj); -} - -struct kobject *make_toi_sysdir(char *name) -{ - struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); - - if (!kobj) { - printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " - "dir!\n"); - return NULL; - } - - kobj->ktype = &toi_ktype; - - return kobj; -} - -/* toi_register_sysfs_file - * - * Helper for registering a new /sysfs/tuxonice entry. - */ - -int toi_register_sysfs_file( - struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - int result; - - if (!toi_sysfs_initialised) - toi_initialise_sysfs(); - - result = sysfs_create_file(kobj, &toi_sysfs_data->attr); - if (result) - printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " - "returned %d.\n", - toi_sysfs_data->attr.name, result); - kobj->ktype = &toi_ktype; - - return result; -} - -/* toi_unregister_sysfs_file - * - * Helper for removing unwanted /sys/power/tuxonice entries. - * - */ -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - sysfs_remove_file(kobj, &toi_sysfs_data->attr); -} - -void toi_cleanup_sysfs(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (!toi_sysfs_initialised) - return; - - for (i = 0; i < numfiles; i++) - toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - kobject_put(tuxonice_kobj); - toi_sysfs_initialised = 0; -} - -/* toi_initialise_sysfs - * - * Initialise the /sysfs/tuxonice directory. 
- */ - -static void toi_initialise_sysfs(void) -{ - int i; - int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (toi_sysfs_initialised) - return; - - /* Make our TuxOnIce directory a child of /sys/power */ - tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); - if (!tuxonice_kobj) - return; - - toi_sysfs_initialised = 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); -} - -int toi_sysfs_init(void) -{ - toi_initialise_sysfs(); - return 0; -} - -void toi_sysfs_exit(void) -{ - toi_cleanup_sysfs(); -} diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h deleted file mode 100644 index 5b331b19a..000000000 --- a/kernel/power/tuxonice_sysfs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#include <linux/sysfs.h> - -struct toi_sysfs_data { - struct attribute attr; - int type; - int flags; - union { - struct { - unsigned long *bit_vector; - int bit; - } bit; - struct { - int *variable; - int minimum; - int maximum; - } integer; - struct { - long *variable; - long minimum; - long maximum; - } a_long; - struct { - unsigned long *variable; - unsigned long minimum; - unsigned long maximum; - } ul; - struct { - char *variable; - int max_length; - } string; - struct { - int (*read_sysfs) (const char *buffer, int count); - int (*write_sysfs) (const char *buffer, int count); - void *data; - } special; - } data; - - /* Side effects routine. 
Used, eg, for reparsing the - * resume= entry when it changes */ - void (*write_side_effect) (void); - struct list_head sysfs_data_list; -}; - -enum { - TOI_SYSFS_DATA_NONE = 1, - TOI_SYSFS_DATA_CUSTOM, - TOI_SYSFS_DATA_BIT, - TOI_SYSFS_DATA_INTEGER, - TOI_SYSFS_DATA_UL, - TOI_SYSFS_DATA_LONG, - TOI_SYSFS_DATA_STRING -}; - -#define SYSFS_WRITEONLY 0200 -#define SYSFS_READONLY 0444 -#define SYSFS_RW 0644 - -#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_BIT, \ - .flags = _flags, \ - .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } - -#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_INTEGER, \ - .flags = _flags, \ - .data = { .integer = { .variable = _int, .minimum = _min, \ - .maximum = _max } }, \ - .write_side_effect = _wse } - -#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_UL, \ - .flags = _flags, \ - .data = { .ul = { .variable = _ul, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_LONG, \ - .flags = _flags, \ - .data = { .a_long = { .variable = _long, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_STRING, \ - .flags = _flags, \ - .data = { .string = { .variable = _string, .max_length = _max_len } }, \ - .write_side_effect = _wse } - -#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_CUSTOM, \ - .flags = _flags, \ - .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \ - .write_side_effect = _wse } - -#define SYSFS_NONE(_name, 
_wse) { \ - .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ - .type = TOI_SYSFS_DATA_NONE, \ - .write_side_effect = _wse, \ -} - -/* Flags */ -#define SYSFS_NEEDS_SM_FOR_READ 1 -#define SYSFS_NEEDS_SM_FOR_WRITE 2 -#define SYSFS_HIBERNATE 4 -#define SYSFS_RESUME 8 -#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) -#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_NEEDS_SM_FOR_BOTH \ - (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) - -int toi_register_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); - -extern struct kobject *tuxonice_kobj; - -struct kobject *make_toi_sysdir(char *name); -void remove_toi_sysdir(struct kobject *obj); -extern void toi_cleanup_sysfs(void); - -extern int toi_sysfs_init(void); -extern void toi_sysfs_exit(void); diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c deleted file mode 100644 index c405f9b9a..000000000 --- a/kernel/power/tuxonice_ui.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * kernel/power/tuxonice_ui.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. 
- * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include <linux/reboot.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" -#include "tuxonice_builtin.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ -struct ui_ops *toi_current_ui; - -/** - * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress, either from userui or /dev/console if userui isn't - * available. The non-userui path is particularly for at boot-time, prior - * to userui being started, when we have an important warning to give to - * the user. - */ -static char toi_wait_for_keypress(int timeout) -{ - if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) - return ' '; - - return toi_wait_for_keypress_dev_console(timeout); -} - -/* toi_early_boot_message() - * Description: Handle errors early in the process of booting. - * The user may press C to continue booting, perhaps - * invalidating the image, or space to reboot. - * This works from either the serial console or normally - * attached keyboard. - * - * Note that we come in here from init, while the kernel is - * locked. If we want to get events from the serial console, - * we need to temporarily unlock the kernel. - * - * toi_early_boot_message may also be called post-boot. - * In this case, it simply printks the message and returns. - * - * Arguments: int Whether we are able to erase the image. - * int default_answer. What to do when we timeout. This - * will normally be continue, but the user might - * provide command line options (__setup) to override - * particular cases. - * Char *. 
Pointer to a string explaining why we're moaning. - */ - -#define say(message, a...) printk(KERN_EMERG message, ##a) - -void toi_early_boot_message(int message_detail, int default_answer, - char *warning_reason, ...) -{ -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - unsigned long orig_state = get_toi_state(), continue_req = 0; - unsigned long orig_loglevel = console_loglevel; - int can_ask = 1; -#else - int can_ask = 0; -#endif - - va_list args; - int printed_len; - - if (!toi_wait) { - set_toi_state(TOI_CONTINUE_REQ); - can_ask = 0; - } - - if (warning_reason) { - va_start(args, warning_reason); - printed_len = vsnprintf(local_printf_buf, - sizeof(local_printf_buf), - warning_reason, - args); - va_end(args); - } - - if (!test_toi_state(TOI_BOOT_TIME)) { - printk("TuxOnIce: %s\n", local_printf_buf); - return; - } - - if (!can_ask) { - continue_req = !!default_answer; - goto post_ask; - } - -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - console_loglevel = 7; - - say("=== TuxOnIce ===\n\n"); - if (warning_reason) { - say("BIG FAT WARNING!! %s\n\n", local_printf_buf); - switch (message_detail) { - case 0: - say("If you continue booting, note that any image WILL" - "NOT BE REMOVED.\nTuxOnIce is unable to do so " - "because the appropriate modules aren't\n" - "loaded. You should manually remove the image " - "to avoid any\npossibility of corrupting your " - "filesystem(s) later.\n"); - break; - case 1: - say("If you want to use the current TuxOnIce image, " - "reboot and try\nagain with the same kernel " - "that you hibernated from. If you want\n" - "to forget that image, continue and the image " - "will be erased.\n"); - break; - } - say("Press SPACE to reboot or C to continue booting with " - "this kernel\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", - toi_wait, - default_answer == TOI_CONTINUE_REQ ? 
- "continue booting" : "reboot"); - } else { - say("BIG FAT WARNING!!\n\n" - "You have tried to resume from this image before.\n" - "If it failed once, it may well fail again.\n" - "Would you like to remove the image and boot " - "normally?\nThis will be equivalent to entering " - "noresume on the\nkernel command line.\n\n" - "Press SPACE to remove the image or C to continue " - "resuming.\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", toi_wait, - !!default_answer ? - "continue resuming" : "remove the image"); - } - console_loglevel = orig_loglevel; - - set_toi_state(TOI_SANITY_CHECK_PROMPT); - clear_toi_state(TOI_CONTINUE_REQ); - - if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ - continue_req = !!default_answer; - else - continue_req = test_toi_state(TOI_CONTINUE_REQ); - -#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ - -post_ask: - if ((warning_reason) && (!continue_req)) - kernel_restart(NULL); - - restore_toi_state(orig_state); - if (continue_req) - set_toi_state(TOI_CONTINUE_REQ); -} - -#undef say - -/* - * User interface specific /sys/power/tuxonice entries. 
- */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_INT("default_console_level", SYSFS_RW, - &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), - SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, - 1 << 30, 0), - SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, - 0) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "printk ui", - .directory = "user_interface", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_register_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui) { - printk(KERN_INFO "Only one TuxOnIce user interface module can " - "be loaded at a time."); - return -EBUSY; - } - - toi_current_ui = this_ui; - - return 0; -} - -void toi_remove_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui != this_ui) - return; - - toi_current_ui = NULL; -} - -/* toi_console_sysfs_init - * Description: Boot time initialisation for user interface. 
- */ - -int toi_ui_init(void) -{ - return toi_register_module(&userui_ops); -} - -void toi_ui_exit(void) -{ - toi_unregister_module(&userui_ops); -} diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h deleted file mode 100644 index d71c607f6..000000000 --- a/kernel/power/tuxonice_ui.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * kernel/power/tuxonice_ui.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - */ - -enum { - DONT_CLEAR_BAR, - CLEAR_BAR -}; - -enum { - /* Userspace -> Kernel */ - USERUI_MSG_ABORT = 0x11, - USERUI_MSG_SET_STATE = 0x12, - USERUI_MSG_GET_STATE = 0x13, - USERUI_MSG_GET_DEBUG_STATE = 0x14, - USERUI_MSG_SET_DEBUG_STATE = 0x15, - USERUI_MSG_SPACE = 0x18, - USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, - USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, - USERUI_MSG_GET_LOGLEVEL = 0x1C, - USERUI_MSG_SET_LOGLEVEL = 0x1D, - USERUI_MSG_PRINTK = 0x1E, - - /* Kernel -> Userspace */ - USERUI_MSG_MESSAGE = 0x21, - USERUI_MSG_PROGRESS = 0x22, - USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, - - USERUI_MSG_MAX, -}; - -struct userui_msg_params { - u32 a, b, c, d; - char text[255]; -}; - -struct ui_ops { - char (*wait_for_key) (int timeout); - u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); - void (*prepare_status) (int clearbar, const char *fmt, ...); - void (*cond_pause) (int pause, char *message); - void (*abort)(int result_code, const char *fmt, ...); - void (*prepare)(void); - void (*cleanup)(void); - void (*message)(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...); -}; - -extern struct ui_ops *toi_current_ui; - -#define toi_update_status(val, max, fmt, args...) \ - (toi_current_ui ? 
(toi_current_ui->update_status) (val, max, fmt, ##args) : \ - max) - -#define toi_prepare_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->prepare)(); \ - } while (0) - -#define toi_cleanup_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->cleanup)(); \ - } while (0) - -#define abort_hibernate(result, fmt, args...) \ - do { if (toi_current_ui) \ - (toi_current_ui->abort)(result, fmt, ##args); \ - else { \ - set_abort_result(result); \ - } \ - } while (0) - -#define toi_cond_pause(pause, message) \ - do { if (toi_current_ui) \ - (toi_current_ui->cond_pause)(pause, message); \ - } while (0) - -#define toi_prepare_status(clear, fmt, args...) \ - do { if (toi_current_ui) \ - (toi_current_ui->prepare_status)(clear, fmt, ##args); \ - else \ - printk(KERN_INFO fmt "%s", ##args, "\n"); \ - } while (0) - -#define toi_message(sn, lev, log, fmt, a...) \ -do { \ - if (toi_current_ui && (!sn || test_debug_state(sn))) \ - toi_current_ui->message(sn, lev, log, fmt, ##a); \ -} while (0) - -__exit void toi_ui_cleanup(void); -extern int toi_ui_init(void); -extern void toi_ui_exit(void); -extern int toi_register_ui_ops(struct ui_ops *this_ui); -extern void toi_remove_ui_ops(struct ui_ops *this_ui); diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c deleted file mode 100644 index edc885c72..000000000 --- a/kernel/power/tuxonice_userui.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * kernel/power/user_ui.c - * - * Copyright (C) 2005-2007 Bernard Blackham - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. 
- * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include <linux/suspend.h> -#include <linux/freezer.h> -#include <linux/console.h> -#include <linux/ctype.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> -#include <linux/reboot.h> -#include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/vt.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ - -static struct user_helper_data ui_helper_data; -static struct toi_module_ops userui_ops; -static int orig_kmsg; - -static char lastheader[512]; -static int lastheader_message_len; -static int ui_helper_changed; /* Used at resume-time so don't overwrite value - set from initrd/ramfs. */ - -/* Number of distinct progress amounts that userspace can display */ -static int progress_granularity = 30; - -static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); -static int userui_wait_should_wake; - -#define toi_stop_waiting_for_userui_key() \ -{ \ - userui_wait_should_wake = true; \ - wake_up_interruptible(&userui_wait_for_key); \ -} - -/** - * ui_nl_set_state - Update toi_action based on a message from userui. - * - * @n: The bit (1 << bit) to set. - */ -static void ui_nl_set_state(int n) -{ - /* Only let them change certain settings */ - static const u32 toi_action_mask = - (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | - (1 << TOI_LOGALL) | - (1 << TOI_SINGLESTEP) | - (1 << TOI_PAUSE_NEAR_PAGESET_END); - static unsigned long new_action; - - new_action = (toi_bkd.toi_action & (~toi_action_mask)) | - (n & toi_action_mask); - - printk(KERN_DEBUG "n is %x. 
Action flags being changed from %lx " - "to %lx.", n, toi_bkd.toi_action, new_action); - toi_bkd.toi_action = new_action; - - if (!test_action_state(TOI_PAUSE) && - !test_action_state(TOI_SINGLESTEP)) - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_post_atomic_restore - Tell userui that atomic restore just happened. - * - * Tell userui that atomic restore just occured, so that it can do things like - * redrawing the screen, re-getting settings and so on. - */ -static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); -} - -/** - * userui_storage_needed - Report how much memory in image header is needed. - */ -static int userui_storage_needed(void) -{ - return sizeof(ui_helper_data.program) + 1 + sizeof(int); -} - -/** - * userui_save_config_info - Fill buffer with config info for image header. - * - * @buf: Buffer into which to put the config info we want to save. - */ -static int userui_save_config_info(char *buf) -{ - *((int *) buf) = progress_granularity; - memcpy(buf + sizeof(int), ui_helper_data.program, - sizeof(ui_helper_data.program)); - return sizeof(ui_helper_data.program) + sizeof(int) + 1; -} - -/** - * userui_load_config_info - Restore config info from buffer. - * - * @buf: Buffer containing header info loaded. - * @size: Size of data loaded for this module. - */ -static void userui_load_config_info(char *buf, int size) -{ - progress_granularity = *((int *) buf); - size -= sizeof(int); - - /* Don't load the saved path if one has already been set */ - if (ui_helper_changed) - return; - - if (size > sizeof(ui_helper_data.program)) - size = sizeof(ui_helper_data.program); - - memcpy(ui_helper_data.program, buf + sizeof(int), size); - ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; -} - -/** - * set_ui_program_set: Record that userui program was changed. - * - * Side effect routine for when the userui program is set. 
In an initrd or - * ramfs, the user may set a location for the userui program. If this happens, - * we don't want to reload the value that was saved in the image header. This - * routine allows us to flag that we shouldn't restore the program name from - * the image header. - */ -static void set_ui_program_set(void) -{ - ui_helper_changed = 1; -} - -/** - * userui_memory_needed - Tell core how much memory to reserve for us. - */ -static int userui_memory_needed(void) -{ - /* ball park figure of 128 pages */ - return 128 * PAGE_SIZE; -} - -/** - * userui_update_status - Update the progress bar and (if on) in-bar message. - * - * @value: Current progress percentage numerator. - * @maximum: Current progress percentage denominator. - * @fmt: Message to be displayed in the middle of the progress bar. - * - * Note that a NULL message does not mean that any previous message is erased! - * For that, you need toi_prepare_status with clearbar on. - * - * Returns an unsigned long, being the next numerator (as determined by the - * maximum and progress granularity) where status needs to be updated. - * This is to reduce unnecessary calls to update_status. - */ -static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) -{ - static u32 last_step = 9999; - struct userui_msg_params msg; - u32 this_step, next_update; - int bitshift; - - if (ui_helper_data.pid == -1) - return 0; - - if ((!maximum) || (!progress_granularity)) - return maximum; - - if (value < 0) - value = 0; - - if (value > maximum) - value = maximum; - - /* Try to avoid math problems - we can't do 64 bit math here - * (and shouldn't need it - anyone got screen resolution - * of 65536 pixels or more?) 
*/ - bitshift = fls(maximum) - 16; - if (bitshift > 0) { - u32 temp_maximum = maximum >> bitshift; - u32 temp_value = value >> bitshift; - this_step = (u32) - (temp_value * progress_granularity / temp_maximum); - next_update = (((this_step + 1) * temp_maximum / - progress_granularity) + 1) << bitshift; - } else { - this_step = (u32) (value * progress_granularity / maximum); - next_update = ((this_step + 1) * maximum / - progress_granularity) + 1; - } - - if (this_step == last_step) - return next_update; - - memset(&msg, 0, sizeof(msg)); - - msg.a = this_step; - msg.b = progress_granularity; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, - &msg, sizeof(msg)); - last_step = this_step; - - return next_update; -} - -/** - * userui_message - Display a message without necessarily logging it. - * - * @section: Type of message. Messages can be filtered by type. - * @level: Degree of importance of the message. Lower values = higher priority. - * @normally_logged: Whether logged even if log_everything is off. - * @fmt: Message (and parameters). - * - * This function is intended to do the same job as printk, but without normally - * logging what is printed. The point is to be able to get debugging info on - * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" - * - * It may be called from an interrupt context - can't sleep! - */ -static void userui_message(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...) 
-{ - struct userui_msg_params msg; - - if ((level) && (level > console_loglevel)) - return; - - memset(&msg, 0, sizeof(msg)); - - msg.a = section; - msg.b = level; - msg.c = normally_logged; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - if (test_action_state(TOI_LOGALL)) - printk(KERN_INFO "%s\n", msg.text); - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, - &msg, sizeof(msg)); -} - -/** - * wait_for_key_via_userui - Wait for userui to receive a keypress. - */ -static void wait_for_key_via_userui(void) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&userui_wait_for_key, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake); - userui_wait_should_wake = false; - - set_current_state(TASK_RUNNING); - remove_wait_queue(&userui_wait_for_key, &wait); -} - -/** - * userui_prepare_status - Display high level messages. - * - * @clearbar: Whether to clear the progress bar. - * @fmt...: New message for the title. - * - * Prepare the 'nice display', drawing the header and version, along with the - * current action and perhaps also resetting the progress bar. - */ -static void userui_prepare_status(int clearbar, const char *fmt, ...) -{ - va_list args; - - if (fmt) { - va_start(args, fmt); - lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); - va_end(args); - } - - if (clearbar) - toi_update_status(0, 1, NULL); - - if (ui_helper_data.pid == -1) - printk(KERN_EMERG "%s\n", lastheader); - else - toi_message(0, TOI_STATUS, 1, lastheader, NULL); -} - -/** - * toi_wait_for_keypress - Wait for keypress via userui. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress from userui. - * - * FIXME: Implement timeout? 
- */ -static char userui_wait_for_keypress(int timeout) -{ - char key = '\0'; - - if (ui_helper_data.pid != -1) { - wait_for_key_via_userui(); - key = ' '; - } - - return key; -} - -/** - * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. - * - * @result_code: Reason why we're aborting (1 << bit). - * @fmt: Message to display if telling the user what's going on. - * - * Abort a cycle. If this wasn't at the user's request (and we're displaying - * output), tell the user why and wait for them to acknowledge the message. - */ -static void userui_abort_hibernate(int result_code, const char *fmt, ...) -{ - va_list args; - int printed_len = 0; - - set_result_state(result_code); - - if (test_result_state(TOI_ABORTED)) - return; - - set_result_state(TOI_ABORTED); - - if (test_result_state(TOI_ABORT_REQUESTED)) - return; - - va_start(args, fmt); - printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), - fmt, args); - va_end(args); - if (ui_helper_data.pid != -1) - printed_len = sprintf(local_printf_buf + printed_len, - " (Press SPACE to continue)"); - - toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); - - if (ui_helper_data.pid != -1) - userui_wait_for_keypress(0); -} - -/** - * request_abort_hibernate - Abort hibernating or resuming at user request. - * - * Handle the user requesting the cancellation of a hibernation or resume by - * pressing escape. - */ -static void request_abort_hibernate(void) -{ - if (test_result_state(TOI_ABORT_REQUESTED) || - !test_action_state(TOI_CAN_CANCEL)) - return; - - if (test_toi_state(TOI_NOW_RESUMING)) { - toi_prepare_status(CLEAR_BAR, "Escape pressed. 
" - "Powering down again."); - set_toi_state(TOI_STOP_RESUME); - while (!test_toi_state(TOI_IO_STOPPED)) - schedule(); - if (toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(0); - toi_power_down(); - } - - toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" - " ABORTING HIBERNATION ---"); - set_abort_result(TOI_ABORT_REQUESTED); - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_user_rcv_msg - Receive a netlink message from userui. - * - * @skb: skb received. - * @nlh: Netlink header received. - */ -static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USERUI_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { - printk(KERN_INFO "Got NOFREEZE_ME request when " - "ui_helper_data.pid is %d.\n", ui_helper_data.pid); - return -EBUSY; - } - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USERUI_MSG_ABORT: - request_abort_hibernate(); - return 0; - case USERUI_MSG_GET_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_STATE, &toi_bkd.toi_action, - sizeof(toi_bkd.toi_action)); - return 0; - case USERUI_MSG_GET_DEBUG_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_DEBUG_STATE, - &toi_bkd.toi_debug_state, - sizeof(toi_bkd.toi_debug_state)); - return 0; - case USERUI_MSG_SET_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - ui_nl_set_state(*data); - return 0; - case USERUI_MSG_SET_DEBUG_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_debug_state = (*data); - return 0; - case USERUI_MSG_SPACE: - 
toi_stop_waiting_for_userui_key(); - return 0; - case USERUI_MSG_GET_POWERDOWN_METHOD: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_POWERDOWN_METHOD, - &toi_poweroff_method, - sizeof(toi_poweroff_method)); - return 0; - case USERUI_MSG_SET_POWERDOWN_METHOD: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) - return -EINVAL; - toi_poweroff_method = (unsigned long)(*data); - return 0; - case USERUI_MSG_GET_LOGLEVEL: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_LOGLEVEL, - &toi_bkd.toi_default_console_level, - sizeof(toi_bkd.toi_default_console_level)); - return 0; - case USERUI_MSG_SET_LOGLEVEL: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_default_console_level = (*data); - return 0; - case USERUI_MSG_PRINTK: - printk(KERN_INFO "%s", (char *) data); - return 0; - } - - /* Unhandled here */ - return 1; -} - -/** - * userui_cond_pause - Possibly pause at user request. - * - * @pause: Whether to pause or just display the message. - * @message: Message to display at the start of pausing. - * - * Potentially pause and wait for the user to tell us to continue. We normally - * only pause when @pause is set. While paused, the user can do things like - * changing the loglevel, toggling the display of debugging sections and such - * like. - */ -static void userui_cond_pause(int pause, char *message) -{ - int displayed_message = 0, last_key = 0; - - while (last_key != 32 && - ui_helper_data.pid != -1 && - ((test_action_state(TOI_PAUSE) && pause) || - (test_action_state(TOI_SINGLESTEP)))) { - if (!displayed_message) { - toi_prepare_status(DONT_CLEAR_BAR, - "%s Press SPACE to continue.%s", - message ? message : "", - (test_action_state(TOI_SINGLESTEP)) ? - " Single step on." : ""); - displayed_message = 1; - } - last_key = userui_wait_for_keypress(0); - } - schedule(); -} - -/** - * userui_prepare_console - Prepare the console for use. 
- * - * Prepare a console for use, saving current kmsg settings and attempting to - * start userui. Console loglevel changes are handled by userui. - */ -static void userui_prepare_console(void) -{ - orig_kmsg = vt_kmsg_redirect(fg_console + 1); - - ui_helper_data.pid = -1; - - if (!userui_ops.enabled) { - printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); - return; - } - - if (*ui_helper_data.program) - toi_netlink_setup(&ui_helper_data); - else - printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); -} - -/** - * userui_cleanup_console - Cleanup after a cycle. - * - * Tell userui to cleanup, and restore kmsg_redirect to its original value. - */ - -static void userui_cleanup_console(void) -{ - if (ui_helper_data.pid > -1) - toi_netlink_close(&ui_helper_data); - - vt_kmsg_redirect(orig_kmsg); -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, - TOI_CAN_CANCEL, 0), - SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAUSE, 0), - SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), - SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, - 2048, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, - set_ui_program_set), - SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_MODULE, - .name = "userui", - .shared_directory = "user_interface", - .module = THIS_MODULE, - .storage_needed = userui_storage_needed, - .save_config_info = userui_save_config_info, - .load_config_info = userui_load_config_info, - .memory_needed = userui_memory_needed, - .post_atomic_restore = userui_post_atomic_restore, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -static struct 
ui_ops my_ui_ops = { - .update_status = userui_update_status, - .message = userui_message, - .prepare_status = userui_prepare_status, - .abort = userui_abort_hibernate, - .cond_pause = userui_cond_pause, - .prepare = userui_prepare_console, - .cleanup = userui_cleanup_console, - .wait_for_key = userui_wait_for_keypress, -}; - -/** - * toi_user_ui_init - Boot time initialisation for user interface. - * - * Invoked from the core init routine. - */ -static __init int toi_user_ui_init(void) -{ - int result; - - ui_helper_data.nl = NULL; - strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); - ui_helper_data.pid = -1; - ui_helper_data.skb_size = sizeof(struct userui_msg_params); - ui_helper_data.pool_limit = 6; - ui_helper_data.netlink_id = NETLINK_TOI_USERUI; - ui_helper_data.name = "userspace ui"; - ui_helper_data.rcv_msg = userui_user_rcv_msg; - ui_helper_data.interface_version = 8; - ui_helper_data.must_init = 0; - ui_helper_data.not_ready = userui_cleanup_console; - init_completion(&ui_helper_data.wait_for_process); - result = toi_register_module(&userui_ops); - if (!result) { - result = toi_register_ui_ops(&my_ui_ops); - if (result) - toi_unregister_module(&userui_ops); - } - - return result; -} - -late_initcall(toi_user_ui_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 83cf08088..f62f2d3f9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -33,7 +33,6 @@ #include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/syscalls.h> -#include <linux/suspend.h> #include <linux/kexec.h> #include <linux/kdb.h> #include <linux/ratelimit.h> @@ -86,6 +85,18 @@ static struct lockdep_map console_lock_dep_map = { #endif /* + * Number of registered extended console drivers. + * + * If extended consoles are present, in-kernel cont reassembly is disabled + * and each fragment is stored as a separate log entry with proper + * continuation flag so that every emitted message has full metadata. 
This + * doesn't change the result for regular consoles or /proc/kmsg. For + * /dev/kmsg, as long as the reader concatenates messages according to + * consecutive continuation flags, the end result should be the same too. + */ +static int nr_ext_console_drivers; + +/* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ @@ -196,14 +207,14 @@ static int console_may_schedule; * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: - * "level,sequnum,timestamp;<message text>\n" + * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" + * + * Users of the export format should ignore possible additional values + * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. - * - * Users of the export format should ignore possible additional values - * separated by ',', and find the message after the ';' character. 
*/ enum log_flags { @@ -269,20 +280,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; -#ifdef CONFIG_TOI_INCREMENTAL -void toi_set_logbuf_untracked(void) -{ - int i; - struct page *log_buf_start_page = virt_to_page(__log_buf); - - printk("Not protecting kernel printk log buffer (%p-%p).\n", - __log_buf, __log_buf + __LOG_BUF_LEN); - - for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++) - SetPageTOI_Untracked(log_buf_start_page + i); -} -#endif - /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -492,13 +489,13 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, bool from_file) +int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ - if (from_file && type != SYSLOG_ACTION_OPEN) + if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { @@ -521,6 +518,86 @@ ok: return security_syslog(type); } +static void append_char(char **pp, char *e, char c) +{ + if (*pp < e) + *(*pp)++ = c; +} + +static ssize_t msg_print_ext_header(char *buf, size_t size, + struct printk_log *msg, u64 seq, + enum log_flags prev_flags) +{ + u64 ts_usec = msg->ts_nsec; + char cont = '-'; + + do_div(ts_usec, 1000); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. 
+ */ + if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + return scnprintf(buf, size, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, seq, ts_usec, cont); +} + +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *dict, size_t dict_len, + char *text, size_t text_len) +{ + char *p = buf, *e = buf + size; + size_t i; + + /* escape non-printable characters */ + for (i = 0; i < text_len; i++) { + unsigned char c = text[i]; + + if (c < ' ' || c >= 127 || c == '\\') + p += scnprintf(p, e - p, "\\x%02x", c); + else + append_char(&p, e, c); + } + append_char(&p, e, '\n'); + + if (dict_len) { + bool line = true; + + for (i = 0; i < dict_len; i++) { + unsigned char c = dict[i]; + + if (line) { + append_char(&p, e, ' '); + line = false; + } + + if (c == '\0') { + append_char(&p, e, '\n'); + line = true; + continue; + } + + if (c < ' ' || c >= 127 || c == '\\') { + p += scnprintf(p, e - p, "\\x%02x", c); + continue; + } + + append_char(&p, e, c); + } + append_char(&p, e, '\n'); + } + + return p - buf; +} /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { @@ -528,7 +605,7 @@ struct devkmsg_user { u32 idx; enum log_flags prev; struct mutex lock; - char buf[8192]; + char buf[CONSOLE_EXT_LOG_MAX]; }; static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) @@ -586,9 +663,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, { struct devkmsg_user *user = file->private_data; struct printk_log *msg; - u64 ts_usec; - size_t i; - char cont = '-'; size_t len; ssize_t ret; @@ -624,66 +698,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, } msg = log_from_idx(user->idx); - ts_usec = msg->ts_nsec; - do_div(ts_usec, 1000); + len = msg_print_ext_header(user->buf, sizeof(user->buf), + msg, user->seq, user->prev); + len += msg_print_ext_body(user->buf + len, 
sizeof(user->buf) - len, + log_dict(msg), msg->dict_len, + log_text(msg), msg->text_len); - /* - * If we couldn't merge continuation line fragments during the print, - * export the stored flags to allow an optional external merge of the - * records. Merging the records isn't always neccessarily correct, like - * when we hit a race during printing. In most cases though, it produces - * better readable output. 'c' in the record flags mark the first - * fragment of a line, '+' the following. - */ - if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) - cont = 'c'; - else if ((msg->flags & LOG_CONT) || - ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) - cont = '+'; - - len = sprintf(user->buf, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, - user->seq, ts_usec, cont); user->prev = msg->flags; - - /* escape non-printable characters */ - for (i = 0; i < msg->text_len; i++) { - unsigned char c = log_text(msg)[i]; - - if (c < ' ' || c >= 127 || c == '\\') - len += sprintf(user->buf + len, "\\x%02x", c); - else - user->buf[len++] = c; - } - user->buf[len++] = '\n'; - - if (msg->dict_len) { - bool line = true; - - for (i = 0; i < msg->dict_len; i++) { - unsigned char c = log_dict(msg)[i]; - - if (line) { - user->buf[len++] = ' '; - line = false; - } - - if (c == '\0') { - user->buf[len++] = '\n'; - line = true; - continue; - } - - if (c < ' ' || c >= 127 || c == '\\') { - len += sprintf(user->buf + len, "\\x%02x", c); - continue; - } - - user->buf[len++] = c; - } - user->buf[len++] = '\n'; - } - user->idx = log_next(user->idx); user->seq++; raw_spin_unlock_irq(&logbuf_lock); @@ -1269,13 +1290,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return len; } -int do_syslog(int type, char __user *buf, int len, bool from_file) +int do_syslog(int type, char __user *buf, int len, int source) { bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; - error = check_syslog_permissions(type, from_file); + error 
= check_syslog_permissions(type, source); if (error) goto out; @@ -1358,7 +1379,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) syslog_prev = 0; syslog_partial = 0; } - if (from_file) { + if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of @@ -1405,7 +1426,9 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(int level, const char *text, size_t len) +static void call_console_drivers(int level, + const char *ext_text, size_t ext_len, + const char *text, size_t len) { struct console *con; @@ -1426,7 +1449,10 @@ static void call_console_drivers(int level, const char *text, size_t len) if (!cpu_online(smp_processor_id()) && !(con->flags & CON_ANYTIME)) continue; - con->write(con, text, len, level); + if (con->flags & CON_EXTENDED) + con->write(con, ext_text, ext_len, level); + else + con->write(con, text, len, level); } } @@ -1569,8 +1595,12 @@ static bool cont_add(int facility, int level, const char *text, size_t len) if (cont.len && cont.flushed) return false; - if (cont.len + len > sizeof(cont.buf)) { - /* the line gets too long, split it up in separate records */ + /* + * If ext consoles are present, flush and skip in-kernel + * continuation. See nr_ext_console_drivers definition. Also, if + * the line gets too long, split it up in separate records. 
+ */ + if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { cont_flush(LOG_CONT); return false; } @@ -1905,9 +1935,19 @@ static struct cont { u8 level; bool flushed:1; } cont; +static char *log_text(const struct printk_log *msg) { return NULL; } +static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } -static void call_console_drivers(int level, const char *text, size_t len) {} +static ssize_t msg_print_ext_header(char *buf, size_t size, + struct printk_log *msg, u64 seq, + enum log_flags prev_flags) { return 0; } +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *dict, size_t dict_len, + char *text, size_t text_len) { return 0; } +static void call_console_drivers(int level, + const char *ext_text, size_t ext_len, + const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } @@ -2160,7 +2200,7 @@ static void console_cont_flush(char *text, size_t size) len = cont_print_text(text, size); raw_spin_unlock(&logbuf_lock); stop_critical_timings(); - call_console_drivers(cont.level, text, len); + call_console_drivers(cont.level, NULL, 0, text, len); start_critical_timings(); local_irq_restore(flags); return; @@ -2184,6 +2224,7 @@ out: */ void console_unlock(void) { + static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; @@ -2202,6 +2243,7 @@ void console_unlock(void) again: for (;;) { struct printk_log *msg; + size_t ext_len = 0; size_t len; int level; @@ -2247,13 +2289,22 @@ skip: level = msg->level; len += msg_print_text(msg, console_prev, false, text + len, sizeof(text) - len); + if (nr_ext_console_drivers) { + ext_len = msg_print_ext_header(ext_text, + sizeof(ext_text), + msg, 
console_seq, console_prev); + ext_len += msg_print_ext_body(ext_text + ext_len, + sizeof(ext_text) - ext_len, + log_dict(msg), msg->dict_len, + log_text(msg), msg->text_len); + } console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(level, text, len); + call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); local_irq_restore(flags); } @@ -2509,6 +2560,11 @@ void register_console(struct console *newcon) newcon->next = console_drivers->next; console_drivers->next = newcon; } + + if (newcon->flags & CON_EXTENDED) + if (!nr_ext_console_drivers++) + pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n"); + if (newcon->flags & CON_PRINTBUFFER) { /* * console_unlock(); will print out the buffered messages @@ -2581,6 +2637,9 @@ int unregister_console(struct console *console) } } + if (!res && (console->flags & CON_EXTENDED)) + nr_ext_console_drivers--; + /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8dbe27611..59e32684c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -241,6 +241,7 @@ rcu_torture_free(struct rcu_torture *p) struct rcu_torture_ops { int ttype; void (*init)(void); + void (*cleanup)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); @@ -477,10 +478,12 @@ static struct rcu_torture_ops rcu_busted_ops = { */ DEFINE_STATIC_SRCU(srcu_ctl); +static struct srcu_struct srcu_ctld; +static struct srcu_struct *srcu_ctlp = &srcu_ctl; -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) +static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) { - return srcu_read_lock(&srcu_ctl); + return srcu_read_lock(srcu_ctlp); } static void srcu_read_delay(struct torture_random_state *rrsp) @@ -499,49 +502,49 @@ static void srcu_read_delay(struct torture_random_state *rrsp) rcu_read_delay(rrsp); } -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) +static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) { - srcu_read_unlock(&srcu_ctl, idx); + srcu_read_unlock(srcu_ctlp, idx); } static unsigned long srcu_torture_completed(void) { - return srcu_batches_completed(&srcu_ctl); + return srcu_batches_completed(srcu_ctlp); } static void srcu_torture_deferred_free(struct rcu_torture *rp) { - call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); + call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb); } static void srcu_torture_synchronize(void) { - synchronize_srcu(&srcu_ctl); + synchronize_srcu(srcu_ctlp); } static void srcu_torture_call(struct rcu_head *head, void (*func)(struct rcu_head *head)) { - call_srcu(&srcu_ctl, head, func); + call_srcu(srcu_ctlp, head, func); } static void srcu_torture_barrier(void) { - srcu_barrier(&srcu_ctl); + srcu_barrier(srcu_ctlp); } static void srcu_torture_stats(void) { int cpu; - int idx = srcu_ctl.completed & 0x1; + int idx = srcu_ctlp->completed & 
0x1; pr_alert("%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { long c0, c1; - c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; - c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; + c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx]; + c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx]; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); @@ -549,7 +552,7 @@ static void srcu_torture_stats(void) static void srcu_torture_synchronize_expedited(void) { - synchronize_srcu_expedited(&srcu_ctl); + synchronize_srcu_expedited(srcu_ctlp); } static struct rcu_torture_ops srcu_ops = { @@ -569,6 +572,38 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +static void srcu_torture_init(void) +{ + rcu_sync_torture_init(); + WARN_ON(init_srcu_struct(&srcu_ctld)); + srcu_ctlp = &srcu_ctld; +} + +static void srcu_torture_cleanup(void) +{ + cleanup_srcu_struct(&srcu_ctld); + srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ +} + +/* As above, but dynamically allocated. */ +static struct rcu_torture_ops srcud_ops = { + .ttype = SRCU_FLAVOR, + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .started = NULL, + .completed = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .name = "srcud" +}; + /* * Definitions for sched torture testing. 
*/ @@ -672,8 +707,8 @@ static void rcu_torture_boost_cb(struct rcu_head *head) struct rcu_boost_inflight *rbip = container_of(head, struct rcu_boost_inflight, rcu); - smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ - rbip->inflight = 0; + /* Ensure RCU-core accesses precede clearing ->inflight */ + smp_store_release(&rbip->inflight, 0); } static int rcu_torture_boost(void *arg) @@ -710,9 +745,9 @@ static int rcu_torture_boost(void *arg) call_rcu_time = jiffies; while (ULONG_CMP_LT(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ - if (!rbi.inflight) { - smp_mb(); /* RCU core before ->inflight = 1. */ - rbi.inflight = 1; + if (!smp_load_acquire(&rbi.inflight)) { + /* RCU core before ->inflight = 1. */ + smp_store_release(&rbi.inflight, 1); call_rcu(&rbi.rcu, rcu_torture_boost_cb); if (jiffies - call_rcu_time > test_boost_duration * HZ - HZ / 2) { @@ -751,11 +786,10 @@ checkwait: stutter_wait("rcu_torture_boost"); } while (!torture_must_stop()); /* Clean up and exit. */ - while (!kthread_should_stop() || rbi.inflight) { + while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) { torture_shutdown_absorb("rcu_torture_boost"); schedule_timeout_uninterruptible(1); } - smp_mb(); /* order accesses to ->inflight before stack-frame death. 
*/ destroy_rcu_head_on_stack(&rbi.rcu); torture_kthread_stopping("rcu_torture_boost"); return 0; @@ -1054,7 +1088,7 @@ static void rcu_torture_timer(unsigned long unused) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); + srcu_read_lock_held(srcu_ctlp)); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -1128,7 +1162,7 @@ rcu_torture_reader(void *arg) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); + srcu_read_lock_held(srcu_ctlp)); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1413,12 +1447,15 @@ static int rcu_torture_barrier_cbs(void *arg) do { wait_event(barrier_cbs_wq[myid], (newphase = - ACCESS_ONCE(barrier_phase)) != lastphase || + smp_load_acquire(&barrier_phase)) != lastphase || torture_must_stop()); lastphase = newphase; - smp_mb(); /* ensure barrier_phase load before ->call(). */ if (torture_must_stop()) break; + /* + * The above smp_load_acquire() ensures barrier_phase load + * is ordered before the following ->call(). + */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); @@ -1439,8 +1476,8 @@ static int rcu_torture_barrier(void *arg) do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); - smp_mb(); /* Ensure barrier_phase after prior assignments. */ - barrier_phase = !barrier_phase; + /* Ensure barrier_phase ordered after prior assignments. */ + smp_store_release(&barrier_phase, !barrier_phase); for (i = 0; i < n_barrier_cbs; i++) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, @@ -1588,10 +1625,14 @@ rcu_torture_cleanup(void) rcutorture_booster_cleanup(i); } - /* Wait for all RCU callbacks to fire. 
*/ - + /* + * Wait for all RCU callbacks to fire, then do flavor-specific + * cleanup operations. + */ if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ @@ -1668,8 +1709,8 @@ rcu_torture_init(void) int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, - RCUTORTURE_TASKS_OPS + &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, + &sched_ops, RCUTORTURE_TASKS_OPS }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) @@ -1701,7 +1742,7 @@ rcu_torture_init(void) if (nreaders >= 0) { nrealreaders = nreaders; } else { - nrealreaders = num_online_cpus() - 1; + nrealreaders = num_online_cpus() - 2 - nreaders; if (nrealreaders <= 0) nrealreaders = 1; } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index cad76e76b..fb33d35ee 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) unsigned long t; for_each_possible_cpu(cpu) { - t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); + t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); sum += t; } return sum; @@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) unsigned long t; for_each_possible_cpu(cpu) { - t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); + t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); sum += t; } return sum; @@ -265,8 +265,8 @@ static int srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); + sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); } return sum; 
} @@ -296,7 +296,7 @@ int __srcu_read_lock(struct srcu_struct *sp) { int idx; - idx = ACCESS_ONCE(sp->completed) & 0x1; + idx = READ_ONCE(sp->completed) & 0x1; preempt_disable(); __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index ec3086879..c291bd65d 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -35,7 +35,7 @@ #include <linux/time.h> #include <linux/cpu.h> #include <linux/prefetch.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include "rcu.h" @@ -49,39 +49,6 @@ static void __call_rcu(struct rcu_head *head, #include "tiny_plugin.h" -/* - * Enter idle, which is an extended quiescent state if we have fully - * entered that mode. - */ -void rcu_idle_enter(void) -{ -} -EXPORT_SYMBOL_GPL(rcu_idle_enter); - -/* - * Exit an interrupt handler towards idle. - */ -void rcu_irq_exit(void) -{ -} -EXPORT_SYMBOL_GPL(rcu_irq_exit); - -/* - * Exit idle, so that we are no longer in an extended quiescent state. - */ -void rcu_idle_exit(void) -{ -} -EXPORT_SYMBOL_GPL(rcu_idle_exit); - -/* - * Enter an interrupt handler, moving away from idle. 
- */ -void rcu_irq_enter(void) -{ -} -EXPORT_SYMBOL_GPL(rcu_irq_enter); - #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) /* diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index f94e209a1..e492a5253 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) return; rcp->ticks_this_gp++; j = jiffies; - js = ACCESS_ONCE(rcp->jiffies_stall); + js = READ_ONCE(rcp->jiffies_stall); if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); - ACCESS_ONCE(rcp->jiffies_stall) = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); } else if (ULONG_CMP_GE(j, js)) { - ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + WRITE_ONCE(rcp->jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); } } @@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) { rcp->ticks_this_gp = 0; rcp->gp_start = jiffies; - ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + WRITE_ONCE(rcp->jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); } static void check_cpu_stalls(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8cf7304b2..65137bc28 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -54,7 +54,7 @@ #include <linux/delay.h> #include <linux/stop_machine.h> #include <linux/random.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/suspend.h> #include "tree.h" @@ -91,7 +91,7 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ DEFINE_RCU_TPS(sname) \ -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, 
sname##_data); \ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .rda = &sname##_data, \ @@ -110,11 +110,18 @@ struct rcu_state sname##_state = { \ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); -static struct rcu_state *rcu_state_p; +static struct rcu_state *const rcu_state_p; +static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); -/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ -static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +/* Dump rcu_node combining tree at boot to verify correct setup. */ +static bool dump_tree; +module_param(dump_tree, bool, 0444); +/* Control rcu_node-tree auto-balancing at boot time. */ +static bool rcu_fanout_exact; +module_param(rcu_fanout_exact, bool, 0444); +/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ @@ -159,17 +166,46 @@ static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); /* rcuc/rcub kthread realtime priority */ +#ifdef CONFIG_RCU_KTHREAD_PRIO static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */ +static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; +#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. 
*/ + +#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT +static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; +module_param(gp_preinit_delay, int, 0644); +#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ +static const int gp_preinit_delay; +#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ + #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY; module_param(gp_init_delay, int, 0644); #else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ static const int gp_init_delay; #endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ -#define PER_RCU_NODE_PERIOD 10 /* Number of grace periods between delays. */ + +#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP +static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY; +module_param(gp_cleanup_delay, int, 0644); +#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ +static const int gp_cleanup_delay; +#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ + +/* + * Number of grace periods between delays, normalized by the duration of + * the delay. The longer the delay, the more the grace periods between + * each delay. The reason for this normalization is that it means that, + * for non-zero delays, the overall slowdown of grace periods is constant + * regardless of the duration of the delay. This arrangement balances + * the need for long delays to increase some race probabilities with the + * need for fast grace periods to increase other race probabilities. + */ +#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ /* * Track the rcutorture test sequence number and the update version @@ -191,17 +227,17 @@ unsigned long rcutorture_vernum; */ unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { - return ACCESS_ONCE(rnp->qsmaskinitnext); + return READ_ONCE(rnp->qsmaskinitnext); } /* - * Return true if an RCU grace period is in progress. 
The ACCESS_ONCE()s + * Return true if an RCU grace period is in progress. The READ_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. */ static int rcu_gp_in_progress(struct rcu_state *rsp) { - return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); + return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum); } /* @@ -278,8 +314,8 @@ static void rcu_momentary_dyntick_idle(void) if (!(resched_mask & rsp->flavor_mask)) continue; smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ - if (ACCESS_ONCE(rdp->mynode->completed) != - ACCESS_ONCE(rdp->cond_resched_completed)) + if (READ_ONCE(rdp->mynode->completed) != + READ_ONCE(rdp->cond_resched_completed)) continue; /* @@ -491,9 +527,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, break; } if (rsp != NULL) { - *flags = ACCESS_ONCE(rsp->gp_flags); - *gpnum = ACCESS_ONCE(rsp->gpnum); - *completed = ACCESS_ONCE(rsp->completed); + *flags = READ_ONCE(rsp->gp_flags); + *gpnum = READ_ONCE(rsp->gpnum); + *completed = READ_ONCE(rsp->completed); return; } *flags = 0; @@ -539,10 +575,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static int rcu_future_needs_gp(struct rcu_state *rsp) { struct rcu_node *rnp = rcu_get_root(rsp); - int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; + int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; - return ACCESS_ONCE(*fp); + return READ_ONCE(*fp); } /* @@ -565,7 +601,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Yes, this CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && - ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), + ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) return 1; /* Yes, CBs for future grace period. */ return 0; /* No grace period needed. 
*/ @@ -585,7 +621,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); - if (!user && !is_idle_task(current)) { + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); @@ -604,7 +641,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user) smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); smp_mb__after_atomic(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + atomic_read(&rdtp->dynticks) & 0x1); rcu_dynticks_task_enter(); /* @@ -630,7 +668,8 @@ static void rcu_eqs_enter(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; rcu_eqs_enter_common(oldval, user); @@ -703,7 +742,8 @@ void rcu_irq_exit(void) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; - WARN_ON_ONCE(rdtp->dynticks_nesting < 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + rdtp->dynticks_nesting < 0); if (rdtp->dynticks_nesting) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else @@ -728,10 +768,12 @@ static void rcu_eqs_exit_common(long long oldval, int user) atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic(); /* See above. 
*/ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); - if (!user && !is_idle_task(current)) { + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); @@ -755,7 +797,7 @@ static void rcu_eqs_exit(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval < 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { @@ -828,7 +870,8 @@ void rcu_irq_enter(void) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; - WARN_ON_ONCE(rdtp->dynticks_nesting == 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + rdtp->dynticks_nesting == 0); if (oldval) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else @@ -1011,9 +1054,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { - if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) - ACCESS_ONCE(rdp->gpwrap) = true; + WRITE_ONCE(rdp->gpwrap, true); return 0; } } @@ -1093,12 +1136,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs) || ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { - ACCESS_ONCE(rdp->cond_resched_completed) = - ACCESS_ONCE(rdp->mynode->completed); + if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { + WRITE_ONCE(rdp->cond_resched_completed, + READ_ONCE(rdp->mynode->completed)); smp_mb(); /* 
->cond_resched_completed before *rcrmp. */ - ACCESS_ONCE(*rcrmp) = - ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; + WRITE_ONCE(*rcrmp, + READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ rdp->rsp->jiffies_resched += 5; /* Enable beating. */ } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { @@ -1119,9 +1162,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - ACCESS_ONCE(rsp->jiffies_stall) = j + j1; + WRITE_ONCE(rsp->jiffies_stall, j + j1); rsp->jiffies_resched = j + j1 / 2; - rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); + rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); } /* @@ -1133,10 +1176,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) unsigned long j; j = jiffies; - gpa = ACCESS_ONCE(rsp->gp_activity); + gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies!\n", - rsp->name, j - gpa); + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", + rsp->name, j - gpa, + rsp->gpnum, rsp->completed, rsp->gp_flags); } /* @@ -1173,12 +1217,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. 
*/ raw_spin_lock_irqsave(&rnp->lock, flags); - delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); + delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rsp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -1212,12 +1257,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) if (ndetected) { rcu_dump_cpu_stacks(rsp); } else { - if (ACCESS_ONCE(rsp->gpnum) != gpnum || - ACCESS_ONCE(rsp->completed) == gpnum) { + if (READ_ONCE(rsp->gpnum) != gpnum || + READ_ONCE(rsp->completed) == gpnum) { pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = ACCESS_ONCE(rsp->gp_activity); + gpa = READ_ONCE(rsp->gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rsp->name, j - gpa, j, gpa, jiffies_till_next_fqs, @@ -1262,9 +1307,9 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) - ACCESS_ONCE(rsp->jiffies_stall) = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; + if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) + WRITE_ONCE(rsp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -1307,20 +1352,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) * Given this check, comparisons of jiffies, rsp->jiffies_stall, * and rsp->gp_start suffice to forestall false positives. */ - gpnum = ACCESS_ONCE(rsp->gpnum); + gpnum = READ_ONCE(rsp->gpnum); smp_rmb(); /* Pick up ->gpnum first... 
*/ - js = ACCESS_ONCE(rsp->jiffies_stall); + js = READ_ONCE(rsp->jiffies_stall); smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = ACCESS_ONCE(rsp->gp_start); + gps = READ_ONCE(rsp->gp_start); smp_rmb(); /* ...and finally ->gp_start before ->completed. */ - completed = ACCESS_ONCE(rsp->completed); + completed = READ_ONCE(rsp->completed); if (ULONG_CMP_GE(completed, gpnum) || ULONG_CMP_LT(j, js) || ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; if (rcu_gp_in_progress(rsp) && - (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { + (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -1347,7 +1392,7 @@ void rcu_cpu_stall_reset(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; + WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); } /* @@ -1457,7 +1502,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * doing some extra useless work. */ if (rnp->gpnum != rnp->completed || - ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { + READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; @@ -1542,7 +1587,7 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) static void rcu_gp_kthread_wake(struct rcu_state *rsp) { if (current == rsp->gp_kthread || - !ACCESS_ONCE(rsp->gp_flags) || + !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; wake_up(&rsp->gp_wq); @@ -1677,7 +1722,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && - !unlikely(ACCESS_ONCE(rdp->gpwrap))) { + !unlikely(READ_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. 
*/ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1692,7 +1737,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { + if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1704,7 +1749,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); - ACCESS_ONCE(rdp->gpwrap) = false; + WRITE_ONCE(rdp->gpwrap, false); } return ret; } @@ -1717,9 +1762,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; - if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed) && - !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ + if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && + rdp->completed == READ_ONCE(rnp->completed) && + !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; @@ -1731,6 +1776,13 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) rcu_gp_kthread_wake(rsp); } +static void rcu_gp_slow(struct rcu_state *rsp, int delay) +{ + if (delay > 0 && + !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + schedule_timeout_uninterruptible(delay); +} + /* * Initialize a new grace period. Return 0 if no grace period required. 
*/ @@ -1740,15 +1792,15 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - if (!ACCESS_ONCE(rsp->gp_flags)) { + if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); return 0; } - ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */ + WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { /* @@ -1773,6 +1825,7 @@ static int rcu_gp_init(struct rcu_state *rsp) * will handle subsequent offline CPUs. */ rcu_for_each_leaf_node(rsp, rnp) { + rcu_gp_slow(rsp, gp_preinit_delay); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1829,14 +1882,15 @@ static int rcu_gp_init(struct rcu_state *rsp) * process finishes, because this kthread handles both. 
*/ rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_gp_slow(rsp, gp_init_delay); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; + WRITE_ONCE(rnp->gpnum, rsp->gpnum); if (WARN_ON_ONCE(rnp->completed != rsp->completed)) - ACCESS_ONCE(rnp->completed) = rsp->completed; + WRITE_ONCE(rnp->completed, rsp->completed); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -1845,10 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; - if (gp_init_delay > 0 && - !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD))) - schedule_timeout_uninterruptible(gp_init_delay); + WRITE_ONCE(rsp->gp_activity, jiffies); } return 1; @@ -1864,7 +1915,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ @@ -1882,11 +1933,11 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. 
*/ - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - ACCESS_ONCE(rsp->gp_flags) = - ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS; + WRITE_ONCE(rsp->gp_flags, + READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); } return fqs_state; @@ -1903,7 +1954,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1934,7 +1985,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) smp_mb__after_unlock_lock(); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); - ACCESS_ONCE(rnp->completed) = rsp->gpnum; + WRITE_ONCE(rnp->completed, rsp->gpnum); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; @@ -1942,7 +1993,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); + rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1950,16 +2002,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ - ACCESS_ONCE(rsp->completed) = rsp->gpnum; + WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); /* Advance CBs to reduce false positives below. 
*/ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; if (needgp || cpu_needs_another_gp(rsp, rdp)) { - ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; + WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("newreq")); } raw_spin_unlock_irq(&rnp->lock); @@ -1983,20 +2035,20 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; wait_event_interruptible(rsp->gp_wq, - ACCESS_ONCE(rsp->gp_flags) & + READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("reqwaitsig")); } @@ -2012,39 +2064,39 @@ static int __noreturn rcu_gp_kthread(void *arg) if (!ret) rsp->jiffies_force_qs = jiffies + j; trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, - ((gf = ACCESS_ONCE(rsp->gp_flags)) & + ((gf = READ_ONCE(rsp->gp_flags)) & RCU_GP_FLAG_FQS) || - (!ACCESS_ONCE(rnp->qsmask) && + (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ - if (!ACCESS_ONCE(rnp->qsmask) && + if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. 
*/ if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("fqsstart")); fqs_state = rcu_gp_fqs(rsp, fqs_state); trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("fqsend")); cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); } j = jiffies_till_next_fqs; @@ -2086,8 +2138,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, */ return false; } - ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; - trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); /* @@ -2137,6 +2189,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); rcu_gp_kthread_wake(rsp); } @@ -2334,8 +2387,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } -#ifdef CONFIG_HOTPLUG_CPU - /* * Send the specified CPU's RCU callbacks to the orphanage. The * specified CPU must be offline, and the caller must hold the @@ -2346,7 +2397,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* No-CBs CPUs do not have orphanable callbacks. 
*/ - if (rcu_is_nocb_cpu(rdp->cpu)) + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) return; /* @@ -2359,7 +2410,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->qlen += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; + WRITE_ONCE(rdp->qlen, 0); } /* @@ -2405,7 +2456,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ - if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) return; /* Do the accounting first. */ @@ -2452,6 +2504,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + RCU_TRACE(mask = rdp->grpmask); trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), @@ -2480,7 +2535,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; for (;;) { mask = rnp->grpmask; @@ -2511,6 +2567,9 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave(&rnp->lock, flags); @@ -2532,6 +2591,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. 
*/ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); @@ -2546,26 +2608,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) cpu, rdp->qlen, rdp->nxtlist); } -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) -{ -} - -static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) -{ -} - -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ -} - -static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. @@ -2580,7 +2622,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); - trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), + trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); return; @@ -2636,7 +2678,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - ACCESS_ONCE(rdp->qlen) = rdp->qlen - count; + WRITE_ONCE(rdp->qlen, rdp->qlen - count); rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. 
*/ @@ -2730,10 +2772,6 @@ static void force_qs_rnp(struct rcu_state *rsp, mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (!rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2763,8 +2801,6 @@ static void force_qs_rnp(struct rcu_state *rsp, bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0) { - if ((rnp->qsmaskinit & bit) == 0) - *isidle = false; /* Pending hotplug. */ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } @@ -2793,7 +2829,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* Funnel through hierarchy to reduce memory contention. */ rnp = __this_cpu_read(rsp->rda->mynode); for (; rnp != NULL; rnp = rnp->parent) { - ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); @@ -2809,13 +2845,12 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_lock_irqsave(&rnp_old->lock, flags); smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. 
*/ } - ACCESS_ONCE(rsp->gp_flags) = - ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; + WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rnp_old->lock, flags); rcu_gp_kthread_wake(rsp); } @@ -2881,7 +2916,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { - if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) + if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) return; if (likely(!rsp->boost)) { rcu_do_batch(rsp, rdp); @@ -2972,7 +3007,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ - ACCESS_ONCE(head->func) = rcu_leak_callback; + WRITE_ONCE(head->func, rcu_leak_callback); WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); return; } @@ -3011,7 +3046,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (!likely(rdp->nxtlist)) init_default_callback_list(rdp); } - ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; + WRITE_ONCE(rdp->qlen, rdp->qlen + 1); if (lazy) rdp->qlen_lazy++; else @@ -3287,7 +3322,7 @@ void synchronize_sched_expedited(void) if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), (ulong)atomic_long_read(&rsp->expedited_done) + ULONG_MAX / 8)) { - synchronize_sched(); + wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_wrap); return; } @@ -3450,14 +3485,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ + if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ rdp->n_rp_gp_completed++; return 1; } /* Has a new RCU grace period started? 
*/ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || - unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ + if (READ_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } @@ -3493,7 +3528,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; @@ -3564,7 +3599,7 @@ static void _rcu_barrier(struct rcu_state *rsp) { int cpu; struct rcu_data *rdp; - unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); + unsigned long snap = READ_ONCE(rsp->n_barrier_done); unsigned long snap_done; _rcu_barrier_trace(rsp, "Begin", -1, snap); @@ -3606,10 +3641,10 @@ static void _rcu_barrier(struct rcu_state *rsp) /* * Increment ->n_barrier_done to avoid duplicate work. Use - * ACCESS_ONCE() to prevent the compiler from speculating + * WRITE_ONCE() to prevent the compiler from speculating * the increment to precede the early-exit check. */ - ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; + WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ @@ -3645,7 +3680,7 @@ static void _rcu_barrier(struct rcu_state *rsp) __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); } - } else if (ACCESS_ONCE(rdp->qlen)) { + } else if (READ_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); @@ -3665,7 +3700,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Increment ->n_barrier_done to prevent duplicate work. */ smp_mb(); /* Keep increment after above mechanism. 
*/ - ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; + WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); smp_mb(); /* Keep increment before caller's subsequent code. */ @@ -3780,7 +3815,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->passed_quiesce = false; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->qs_pending = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -3924,16 +3959,16 @@ void rcu_scheduler_starting(void) /* * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. + * or balancing the tree, depending on the rcu_fanout_exact boot parameter. */ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { + if (rcu_fanout_exact) { rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = CONFIG_RCU_FANOUT; + rsp->levelspread[i] = RCU_FANOUT; } else { int ccur; int cprv; @@ -3971,9 +4006,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ - /* Silence gcc 4.8 warning about array index out of range. */ - if (rcu_num_lvls > RCU_NUM_LVLS) - panic("rcu_init_one: rcu_num_lvls overflow"); + /* Silence gcc 4.8 false positive about array index out of range. */ + if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) + panic("rcu_init_one: rcu_num_lvls out of range"); /* Initialize the level-tracking arrays. 
*/ @@ -4059,7 +4094,7 @@ static void __init rcu_init_geometry(void) jiffies_till_next_fqs = d; /* If the compile-time values are accurate, just leave. */ - if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && + if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", @@ -4073,7 +4108,7 @@ static void __init rcu_init_geometry(void) rcu_capacity[0] = 1; rcu_capacity[1] = rcu_fanout_leaf; for (i = 2; i <= MAX_RCU_LVLS; i++) - rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* * The boot-time rcu_fanout_leaf parameter is only permitted @@ -4083,7 +4118,7 @@ static void __init rcu_init_geometry(void) * the configured number of CPUs. Complain and fall back to the * compile-time values if these limits are exceeded. */ - if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || + if (rcu_fanout_leaf < RCU_FANOUT_LEAF || rcu_fanout_leaf > sizeof(unsigned long) * 8 || n > rcu_capacity[MAX_RCU_LVLS]) { WARN_ON(1); @@ -4109,6 +4144,28 @@ static void __init rcu_init_geometry(void) rcu_num_nodes -= n; } +/* + * Dump out the structure of the rcu_node combining tree associated + * with the rcu_state structure referenced by rsp. 
+ */ +static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) +{ + int level = 0; + struct rcu_node *rnp; + + pr_info("rcu_node tree layout dump\n"); + pr_info(" "); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp->level != level) { + pr_cont("\n"); + pr_info(" "); + level = rnp->level; + } + pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum); + } + pr_cont("\n"); +} + void __init rcu_init(void) { int cpu; @@ -4119,6 +4176,8 @@ void __init rcu_init(void) rcu_init_geometry(); rcu_init_one(&rcu_bh_state, &rcu_bh_data); rcu_init_one(&rcu_sched_state, &rcu_sched_data); + if (dump_tree) + rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a69d3dab2..4adb7ca0b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -35,11 +35,33 @@ * In practice, this did work well going from three levels to four. * Of course, your mileage may vary. */ + #define MAX_RCU_LVLS 4 -#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) + +#ifdef CONFIG_RCU_FANOUT +#define RCU_FANOUT CONFIG_RCU_FANOUT +#else /* #ifdef CONFIG_RCU_FANOUT */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT 64 +# else +# define RCU_FANOUT 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT */ + +#ifdef CONFIG_RCU_FANOUT_LEAF +#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT_LEAF 64 +# else +# define RCU_FANOUT_LEAF 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ + +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT_1 # define RCU_NUM_LVLS 1 @@ -170,7 +192,6 
@@ struct rcu_node { /* if there is no such task. If there */ /* is no current expedited grace period, */ /* then there can cannot be any such task. */ -#ifdef CONFIG_RCU_BOOST struct list_head *boost_tasks; /* Pointer to first task that needs to be */ /* priority boosted, or NULL if no priority */ @@ -208,7 +229,6 @@ struct rcu_node { unsigned long n_balk_nos; /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU wait_queue_head_t nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -519,14 +539,11 @@ extern struct list_head rcu_struct_flavors; * RCU implementation internal declarations: */ extern struct rcu_state rcu_sched_state; -DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); #ifdef CONFIG_PREEMPT_RCU extern struct rcu_state rcu_preempt_state; -DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8c0ec0f5a..013485fb2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -43,7 +43,17 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, + * all uses are in dead code. Provide a definition to keep the compiler + * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. + * This probably needs to be excluded from -rt builds. + */ +#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. 
*/ @@ -60,11 +70,11 @@ static void __init rcu_bootup_announce_oddness(void) { if (IS_ENABLED(CONFIG_RCU_TRACE)) pr_info("\tRCU debugfs-based tracing is enabled.\n"); - if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || - (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)) + if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || + (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - CONFIG_RCU_FANOUT); - if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) + RCU_FANOUT); + if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); @@ -76,10 +86,10 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tAdditional per-CPU info printed with stalls.\n"); if (NUM_RCU_LVL_4 != 0) pr_info("\tFour-level hierarchy is enabled.\n"); - if (CONFIG_RCU_FANOUT_LEAF != 16) + if (RCU_FANOUT_LEAF != 16) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", - CONFIG_RCU_FANOUT_LEAF); - if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) + RCU_FANOUT_LEAF); + if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); @@ -90,7 +100,8 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *rcu_state_p = &rcu_preempt_state; +static struct rcu_state *const rcu_state_p = &rcu_preempt_state; +static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; static int rcu_preempted_readers_exp(struct rcu_node *rnp); static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, @@ -116,11 +127,11 @@ static void __init rcu_bootup_announce(void) */ static void rcu_preempt_qs(void) { - if 
(!__this_cpu_read(rcu_preempt_data.passed_quiesce)) { + if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { trace_rcu_grace_period(TPS("rcu_preempt"), - __this_cpu_read(rcu_preempt_data.gpnum), + __this_cpu_read(rcu_data_p->gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_preempt_data.passed_quiesce, 1); + __this_cpu_write(rcu_data_p->passed_quiesce, 1); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } @@ -150,7 +161,7 @@ static void rcu_preempt_note_context_switch(void) !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = this_cpu_ptr(rcu_preempt_state.rda); + rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); @@ -180,10 +191,9 @@ static void rcu_preempt_note_context_switch(void) if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); rnp->gp_tasks = &t->rcu_node_entry; -#ifdef CONFIG_RCU_BOOST - if (rnp->boost_tasks != NULL) + if (IS_ENABLED(CONFIG_RCU_BOOST) && + rnp->boost_tasks != NULL) rnp->boost_tasks = rnp->gp_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ } else { list_add(&t->rcu_node_entry, &rnp->blkd_tasks); if (rnp->qsmask & rdp->grpmask) @@ -263,9 +273,7 @@ void rcu_read_unlock_special(struct task_struct *t) bool empty_exp_now; unsigned long flags; struct list_head *np; -#ifdef CONFIG_RCU_BOOST bool drop_boost_mutex = false; -#endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; union rcu_special special; @@ -307,9 +315,11 @@ void rcu_read_unlock_special(struct task_struct *t) t->rcu_read_unlock_special.b.blocked = false; /* - * Remove this task from the list it blocked on. The - * task can migrate while we acquire the lock, but at - * most one time. So at most two passes through loop. + * Remove this task from the list it blocked on. 
The task + * now remains queued on the rcu_node corresponding to + * the CPU it first blocked on, so the first attempt to + * acquire the task's rcu_node's ->lock will succeed. + * Keep the loop and add a WARN_ON() out of sheer paranoia. */ for (;;) { rnp = t->rcu_blocked_node; @@ -317,6 +327,7 @@ void rcu_read_unlock_special(struct task_struct *t) smp_mb__after_unlock_lock(); if (rnp == t->rcu_blocked_node) break; + WARN_ON_ONCE(1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); @@ -331,12 +342,12 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; - /* Snapshot ->boost_mtx ownership with rcu_node lock held. */ - drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; -#endif /* #ifdef CONFIG_RCU_BOOST */ + if (IS_ENABLED(CONFIG_RCU_BOOST)) { + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; + /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + } /* * If this was the last task on the current list, and if @@ -353,24 +364,21 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grplo, rnp->grphi, !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(&rcu_preempt_state, - rnp, flags); + rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); } -#ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) + if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); -#endif /* #ifdef CONFIG_RCU_BOOST */ /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. 
*/ if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); + rcu_report_exp_rnp(rcu_state_p, rnp, true); } else { local_irq_restore(flags); } @@ -390,7 +398,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - t = list_entry(rnp->gp_tasks, + t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); @@ -447,7 +455,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; rcu_print_task_stall_begin(rnp); - t = list_entry(rnp->gp_tasks, + t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); @@ -491,8 +499,8 @@ static void rcu_preempt_check_callbacks(void) return; } if (t->rcu_read_lock_nesting > 0 && - __this_cpu_read(rcu_preempt_data.qs_pending) && - !__this_cpu_read(rcu_preempt_data.passed_quiesce)) + __this_cpu_read(rcu_data_p->qs_pending) && + !__this_cpu_read(rcu_data_p->passed_quiesce)) t->rcu_read_unlock_special.b.need_qs = true; } @@ -500,7 +508,7 @@ static void rcu_preempt_check_callbacks(void) static void rcu_preempt_do_callbacks(void) { - rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); + rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p)); } #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -510,7 +518,7 @@ static void rcu_preempt_do_callbacks(void) */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state, -1, 0); + __call_rcu(head, func, rcu_state_p, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu); @@ -570,7 +578,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp) static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { return !rcu_preempted_readers_exp(rnp) && - ACCESS_ONCE(rnp->expmask) == 0; + 
READ_ONCE(rnp->expmask) == 0; } /* @@ -711,12 +719,12 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) void synchronize_rcu_expedited(void) { struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_preempt_state; + struct rcu_state *rsp = rcu_state_p; unsigned long snap; int trycount = 0; smp_mb(); /* Caller's modifications seen first by other CPUs. */ - snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; + snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1; smp_mb(); /* Above access cannot bleed into critical section. */ /* @@ -740,7 +748,7 @@ void synchronize_rcu_expedited(void) */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { if (ULONG_CMP_LT(snap, - ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + READ_ONCE(sync_rcu_preempt_exp_count))) { put_online_cpus(); goto mb_ret; /* Others did our work for us. */ } @@ -752,7 +760,7 @@ void synchronize_rcu_expedited(void) return; } } - if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) { put_online_cpus(); goto unlock_mb_ret; /* Others did our work for us. */ } @@ -780,8 +788,7 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ smp_mb(); /* ensure expedited GP seen before counter increment. 
*/ - ACCESS_ONCE(sync_rcu_preempt_exp_count) = - sync_rcu_preempt_exp_count + 1; + WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1); unlock_mb_ret: mutex_unlock(&sync_rcu_preempt_exp_mutex); mb_ret: @@ -799,7 +806,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); */ void rcu_barrier(void) { - _rcu_barrier(&rcu_preempt_state); + _rcu_barrier(rcu_state_p); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -808,7 +815,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); + rcu_init_one(rcu_state_p, rcu_data_p); } /* @@ -831,7 +838,8 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state *rcu_state_p = &rcu_sched_state; +static struct rcu_state *const rcu_state_p = &rcu_sched_state; +static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. @@ -994,8 +1002,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (ACCESS_ONCE(rnp->exp_tasks) == NULL && - ACCESS_ONCE(rnp->boost_tasks) == NULL) + if (READ_ONCE(rnp->exp_tasks) == NULL && + READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1048,8 +1056,8 @@ static int rcu_boost(struct rcu_node *rnp) rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. 
*/ - return ACCESS_ONCE(rnp->exp_tasks) != NULL || - ACCESS_ONCE(rnp->boost_tasks) != NULL; + return READ_ONCE(rnp->exp_tasks) != NULL || + READ_ONCE(rnp->boost_tasks) != NULL; } /* @@ -1173,7 +1181,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct sched_param sp; struct task_struct *t; - if (&rcu_preempt_state != rsp) + if (rcu_state_p != rsp) return 0; if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) @@ -1367,13 +1375,12 @@ static void rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -#ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *delta_jiffies) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(NULL); + *nextevt = KTIME_MAX; + return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) + ? 0 : rcu_cpu_has_callbacks(NULL); } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up @@ -1432,8 +1439,6 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_active; - /* * Try to advance callbacks for all flavors of RCU on the current CPU, but * only if it has been awhile since the last time we did so. Afterwards, @@ -1462,7 +1467,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * callbacks not yet ready to invoke. */ if ((rdp->completed != rnp->completed || - unlikely(ACCESS_ONCE(rdp->gpwrap))) && + unlikely(READ_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); @@ -1480,17 +1485,22 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * * The caller must have disabled interrupts. 
*/ -#ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *dj) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + unsigned long dj; + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { + *nextevt = KTIME_MAX; + return 0; + } /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { - *dj = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } @@ -1504,14 +1514,14 @@ int rcu_needs_cpu(unsigned long *dj) /* Request timer delay depending on laziness, and round. */ if (!rdtp->all_lazy) { - *dj = round_up(rcu_idle_gp_delay + jiffies, + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { - *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } + *nextevt = basemono + dj * TICK_NSEC; return 0; } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Prepare a CPU for idle from an RCU perspective. The first major task @@ -1525,7 +1535,6 @@ int rcu_needs_cpu(unsigned long *dj) */ static void rcu_prepare_for_idle(void) { -#ifndef CONFIG_RCU_NOCB_CPU_ALL bool needwake; struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); @@ -1533,8 +1542,11 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + return; + /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_active); + tne = READ_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(NULL)) invoke_rcu_core(); /* force nohz to see update. 
*/ @@ -1580,7 +1592,6 @@ static void rcu_prepare_for_idle(void) if (needwake) rcu_gp_kthread_wake(rsp); } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1590,12 +1601,11 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { -#ifndef CONFIG_RCU_NOCB_CPU_ALL - if (rcu_is_nocb_cpu(smp_processor_id())) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1760,7 +1770,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, + READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } @@ -1898,11 +1908,11 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) { struct rcu_data *rdp_leader = rdp->nocb_leader; - if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) + if (!READ_ONCE(rdp_leader->nocb_kthread)) return; - if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { + if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ - ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; + WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); wake_up(&rdp_leader->nocb_wq); } } @@ -1934,14 +1944,14 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) ret = atomic_long_read(&rdp->nocb_q_count); #ifdef CONFIG_PROVE_RCU - rhp = ACCESS_ONCE(rdp->nocb_head); + rhp = READ_ONCE(rdp->nocb_head); if (!rhp) - rhp = ACCESS_ONCE(rdp->nocb_gp_head); + rhp = READ_ONCE(rdp->nocb_gp_head); if (!rhp) - rhp = ACCESS_ONCE(rdp->nocb_follower_head); + rhp = READ_ONCE(rdp->nocb_follower_head); /* Having no rcuo kthread but CBs after scheduler starts is bad! 
*/ - if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp && + if (!READ_ONCE(rdp->nocb_kthread) && rhp && rcu_scheduler_fully_active) { /* RCU callback enqueued before CPU first came online??? */ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", @@ -1975,12 +1985,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, atomic_long_add(rhcount, &rdp->nocb_q_count); /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ old_rhpp = xchg(&rdp->nocb_tail, rhtp); - ACCESS_ONCE(*old_rhpp) = rhp; + WRITE_ONCE(*old_rhpp, rhp); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ /* If we are not being polled and there is a kthread, awaken it ... */ - t = ACCESS_ONCE(rdp->nocb_kthread); + t = READ_ONCE(rdp->nocb_kthread); if (rcu_nocb_poll || !t) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNotPoll")); @@ -2118,7 +2128,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) for (;;) { wait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], - (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); + (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) break; WARN_ON(signal_pending(current)); @@ -2145,7 +2155,7 @@ wait_again: if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); wait_event_interruptible(my_rdp->nocb_wq, - !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); + !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ @@ -2159,12 +2169,12 @@ wait_again: */ gotcbs = false; for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); + rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) continue; /* No CBs here, try next follower. */ /* Move callbacks to wait-for-GP list, which is empty. 
*/ - ACCESS_ONCE(rdp->nocb_head) = NULL; + WRITE_ONCE(rdp->nocb_head, NULL); rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); gotcbs = true; } @@ -2184,7 +2194,7 @@ wait_again: my_rdp->nocb_leader_sleep = true; smp_mb(); /* Ensure _sleep true before scan. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) - if (ACCESS_ONCE(rdp->nocb_head)) { + if (READ_ONCE(rdp->nocb_head)) { /* Found CB, so short-circuit next wait. */ my_rdp->nocb_leader_sleep = false; break; @@ -2205,7 +2215,7 @@ wait_again: /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (ACCESS_ONCE(rdp->nocb_head)) + if (READ_ONCE(rdp->nocb_head)) my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ @@ -2241,7 +2251,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); wait_event_interruptible(rdp->nocb_wq, - ACCESS_ONCE(rdp->nocb_follower_head)); + READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! */ firsttime = false; @@ -2282,10 +2292,10 @@ static int rcu_nocb_kthread(void *arg) nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. */ - list = ACCESS_ONCE(rdp->nocb_follower_head); + list = READ_ONCE(rdp->nocb_follower_head); BUG_ON(!list); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); - ACCESS_ONCE(rdp->nocb_follower_head) = NULL; + WRITE_ONCE(rdp->nocb_follower_head, NULL); tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); /* Each pass through the following loop invokes a callback. */ @@ -2324,7 +2334,7 @@ static int rcu_nocb_kthread(void *arg) /* Is a deferred wakeup of rcu_nocb_kthread() required? 
*/ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { - return ACCESS_ONCE(rdp->nocb_defer_wakeup); + return READ_ONCE(rdp->nocb_defer_wakeup); } /* Do a deferred wakeup of rcu_nocb_kthread(). */ @@ -2334,8 +2344,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; - ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup); - ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT; + ndw = READ_ONCE(rdp->nocb_defer_wakeup); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } @@ -2448,7 +2458,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) t = kthread_run(rcu_nocb_kthread, rdp_spawn, "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); - ACCESS_ONCE(rdp_spawn->nocb_kthread) = t; + WRITE_ONCE(rdp_spawn->nocb_kthread, t); } /* @@ -2663,7 +2673,7 @@ static void rcu_sysidle_enter(int irq) /* Record start of fully idle period. */ j = jiffies; - ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; + WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); smp_mb__before_atomic(); atomic_inc(&rdtp->dynticks_idle); smp_mb__after_atomic(); @@ -2681,7 +2691,7 @@ static void rcu_sysidle_enter(int irq) */ void rcu_sysidle_force_exit(void) { - int oldstate = ACCESS_ONCE(full_sysidle_state); + int oldstate = READ_ONCE(full_sysidle_state); int newoldstate; /* @@ -2794,7 +2804,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, smp_mb(); /* Read counters before timestamps. */ /* Pick up timestamps. */ - j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); + j = READ_ONCE(rdtp->dynticks_idle_jiffies); /* If this CPU entered idle more recently, update maxj timestamp. */ if (ULONG_CMP_LT(*maxj, j)) *maxj = j; @@ -2831,11 +2841,11 @@ static unsigned long rcu_sysidle_delay(void) static void rcu_sysidle(unsigned long j) { /* Check the current state. 
*/ - switch (ACCESS_ONCE(full_sysidle_state)) { + switch (READ_ONCE(full_sysidle_state)) { case RCU_SYSIDLE_NOT: /* First time all are idle, so note a short idle period. */ - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; + WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); break; case RCU_SYSIDLE_SHORT: @@ -2873,7 +2883,7 @@ static void rcu_sysidle_cancel(void) { smp_mb(); if (full_sysidle_state > RCU_SYSIDLE_SHORT) - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; + WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); } /* @@ -2925,7 +2935,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) smp_mb(); /* grace period precedes setting inuse. */ rshp = container_of(rhp, struct rcu_sysidle_head, rh); - ACCESS_ONCE(rshp->inuse) = 0; + WRITE_ONCE(rshp->inuse, 0); } /* @@ -2936,7 +2946,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) bool rcu_sys_is_idle(void) { static struct rcu_sysidle_head rsh; - int rss = ACCESS_ONCE(full_sysidle_state); + int rss = READ_ONCE(full_sysidle_state); if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) return false; @@ -2964,7 +2974,7 @@ bool rcu_sys_is_idle(void) } rcu_sysidle_report(rcu_state_p, isidle, maxj, false); oldrss = rss; - rss = ACCESS_ONCE(full_sysidle_state); + rss = READ_ONCE(full_sysidle_state); } } @@ -3048,10 +3058,10 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress(rsp) || - ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ))) - return 1; + ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ))) + return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ - return 0; + return false; } /* @@ -3077,7 +3087,7 @@ static void rcu_bind_gp_kthread(void) static void rcu_dynticks_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id(); + WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); #endif /* #if 
defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } @@ -3085,6 +3095,6 @@ static void rcu_dynticks_task_enter(void) static void rcu_dynticks_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1; + WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index f92361efd..3ea7ffc7d 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -277,7 +277,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); + READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); @@ -323,8 +323,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp = &rsp->node[0]; raw_spin_lock_irqsave(&rnp->lock, flags); - completed = ACCESS_ONCE(rsp->completed); - gpnum = ACCESS_ONCE(rsp->gpnum); + completed = READ_ONCE(rsp->completed); + gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) gpage = 0; else diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1f133350d..afaecb7a7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -150,14 +150,14 @@ void __rcu_read_unlock(void) barrier(); /* critical section before exit code. 
*/ t->rcu_read_lock_nesting = INT_MIN; barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s))) + if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; } #ifdef CONFIG_PROVE_LOCKING { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + int rrln = READ_ONCE(t->rcu_read_lock_nesting); WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); } @@ -389,17 +389,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644); int rcu_jiffies_till_stall_check(void) { - int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); /* * Limit check must be consistent with the Kconfig limits * for CONFIG_RCU_CPU_STALL_TIMEOUT. */ if (till_stall_check < 3) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + WRITE_ONCE(rcu_cpu_stall_timeout, 3); till_stall_check = 3; } else if (till_stall_check > 300) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + WRITE_ONCE(rcu_cpu_stall_timeout, 300); till_stall_check = 300; } return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; @@ -550,12 +550,12 @@ static void check_holdout_task(struct task_struct *t, { int cpu; - if (!ACCESS_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || - !ACCESS_ONCE(t->on_rq) || + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || (IS_ENABLED(CONFIG_NO_HZ_FULL) && !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - ACCESS_ONCE(t->rcu_tasks_holdout) = false; + WRITE_ONCE(t->rcu_tasks_holdout, false); list_del_init(&t->rcu_tasks_holdout_list); put_task_struct(t); return; @@ -639,11 +639,11 @@ static int __noreturn rcu_tasks_kthread(void *arg) */ rcu_read_lock(); for_each_process_thread(g, t) { - if (t != current && ACCESS_ONCE(t->on_rq) && + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { 
get_task_struct(t); - t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw); - ACCESS_ONCE(t->rcu_tasks_holdout) = true; + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); list_add(&t->rcu_tasks_holdout_list, &rcu_tasks_holdouts); } @@ -672,7 +672,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct task_struct *t1; schedule_timeout_interruptible(HZ); - rtst = ACCESS_ONCE(rcu_task_stall_timeout); + rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); if (needreport) @@ -728,7 +728,7 @@ static void rcu_spawn_tasks_kthread(void) static struct task_struct *rcu_tasks_kthread_ptr; struct task_struct *t; - if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) { + if (READ_ONCE(rcu_tasks_kthread_ptr)) { smp_mb(); /* Ensure caller sees full kthread. */ return; } @@ -740,7 +740,7 @@ static void rcu_spawn_tasks_kthread(void) t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); BUG_ON(IS_ERR(t)); smp_mb(); /* Ensure others see full kthread. 
*/ - ACCESS_ONCE(rcu_tasks_kthread_ptr) = t; + WRITE_ONCE(rcu_tasks_kthread_ptr, t); mutex_unlock(&rcu_tasks_kthread_mutex); } diff --git a/kernel/relay.c b/kernel/relay.c index e9dbaeb8f..0b4570cfa 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -81,10 +81,7 @@ static struct page **relay_alloc_page_array(unsigned int n_pages) */ static void relay_free_page_array(struct page **array) { - if (is_vmalloc_addr(array)) - vfree(array); - else - kfree(array); + kvfree(array); } /** diff --git a/kernel/resource.c b/kernel/resource.c index 90552aab5..fed052a1b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -504,13 +504,13 @@ int region_is_ram(resource_size_t start, unsigned long size) { struct resource *p; resource_size_t end = start + size - 1; - int flags = IORESOURCE_MEM | IORESOURCE_BUSY; + unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; const char *name = "System RAM"; int ret = -1; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - if (end < p->start) + if (p->end < start) continue; if (p->start <= start && end <= p->end) { @@ -521,7 +521,7 @@ int region_is_ram(resource_size_t start, unsigned long size) ret = 1; break; } - if (p->end < start) + if (end < p->start) break; /* not found */ } read_unlock(&resource_lock); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54b88a1c0..67687973c 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,16 +11,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -ifdef CONFIG_SCHED_BFS -obj-y += bfs.o clock.o -else -obj-y += core.o proc.o clock.o cputime.o +obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-$(CONFIG_SMP) += cpudeadline.o +obj-y += wait.o completion.o idle.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o +obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o 
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -endif -obj-y += wait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o -obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index eae160dd6..750ed601d 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -1,5 +1,3 @@ -#ifdef CONFIG_SCHED_AUTOGROUP - #include "sched.h" #include <linux/proc_fs.h> @@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + if (!READ_ONCE(sysctl_sched_autogroup_enabled)) goto out; for_each_thread(p, t) @@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } #endif /* CONFIG_SCHED_DEBUG */ - -#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142..890c95f25 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c deleted file mode 100644 index 5366182bd..000000000 --- a/kernel/sched/bfs.c +++ /dev/null @@ -1,7420 +0,0 @@ -/* - * kernel/sched/bfs.c, was kernel/sched.c - * - * Kernel scheduler and related syscalls - * - * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New 
ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz - * now Brainfuck deadline scheduling policy by Con Kolivas deletes - * a whole lot of those previous things. - */ - -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <asm/uaccess.h> -#include <linux/highmem.h> -#include <asm/mmu_context.h> -#include <linux/interrupt.h> -#include <linux/capability.h> -#include <linux/completion.h> -#include <linux/kernel_stat.h> -#include <linux/debug_locks.h> -#include <linux/perf_event.h> -#include <linux/security.h> -#include <linux/notifier.h> -#include <linux/profile.h> -#include <linux/freezer.h> -#include <linux/vmalloc.h> -#include <linux/blkdev.h> -#include <linux/delay.h> -#include <linux/smp.h> -#include <linux/threads.h> -#include <linux/timer.h> -#include <linux/rcupdate.h> -#include <linux/cpu.h> -#include <linux/cpuset.h> -#include <linux/cpumask.h> -#include <linux/percpu.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/syscalls.h> -#include <linux/sched/sysctl.h> -#include <linux/times.h> -#include <linux/tsacct_kern.h> -#include <linux/kprobes.h> -#include <linux/delayacct.h> -#include <linux/log2.h> -#include 
<linux/bootmem.h> -#include <linux/ftrace.h> -#include <linux/slab.h> -#include <linux/init_task.h> -#include <linux/binfmts.h> -#include <linux/context_tracking.h> -#include <linux/sched/prio.h> - -#include <asm/irq_regs.h> -#include <asm/switch_to.h> -#include <asm/tlb.h> -#include <asm/unistd.h> -#include <asm/mutex.h> -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#endif - -#include "cpupri.h" -#include "../workqueue_internal.h" -#include "../smpboot.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> - -#include "bfs_sched.h" - -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -#define rt_task(p) rt_prio((p)->prio) -#define rt_queue(rq) rt_prio((rq)->rq_prio) -#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ - (policy) == SCHED_RR) -#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) - -#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) -#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) -#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy))) - -#define is_iso_policy(policy) ((policy) == SCHED_ISO) -#define iso_task(p) unlikely(is_iso_policy((p)->policy)) -#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy)) -#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO) - -#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) - -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) - -#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) -#define STOP_PRIO (MAX_RT_PRIO - 1) - -/* - * Some helpers for converting to/from various scales. Use shifts to get - * approximate multiples of ten for less overhead. 
- */ -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define JIFFY_NS (1000000000 / HZ) -#define HALF_JIFFY_NS (1000000000 / HZ / 2) -#define HALF_JIFFY_US (1000000 / HZ / 2) -#define MS_TO_NS(TIME) ((TIME) << 20) -#define MS_TO_US(TIME) ((TIME) << 10) -#define NS_TO_MS(TIME) ((TIME) >> 20) -#define NS_TO_US(TIME) ((TIME) >> 10) - -#define RESCHED_US (100) /* Reschedule if less than this many μs left */ - -void print_scheduler_version(void) -{ - printk(KERN_INFO "BFS CPU scheduler v0.464 by Con Kolivas.\n"); -} - -/* - * This is the time all tasks within the same priority round robin. - * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. - * Tunable via /proc interface. - */ -#ifdef CONFIG_PCK_INTERACTIVE -int rr_interval __read_mostly = 3; -#else -int rr_interval __read_mostly = 6; -#endif - -/* - * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks - * are allowed to run five seconds as real time tasks. This is the total over - * all online cpus. - */ -#ifdef CONFIG_PCK_INTERACTIVE -int sched_iso_cpu __read_mostly = 25; -#else -int sched_iso_cpu __read_mostly = 70; -#endif - -/* - * The relative length of deadline for each priority(nice) level. - */ -static int prio_ratios[NICE_WIDTH] __read_mostly; - -/* - * The quota handed out to tasks of all priority levels when refilling their - * time_slice. - */ -static inline int timeslice(void) -{ - return MS_TO_US(rr_interval); -} - -/* - * The global runqueue data that all CPUs work off. Data is protected either - * by the global grq lock, or the discrete lock that precedes the data in this - * struct. 
- */ -struct global_rq { - raw_spinlock_t lock; - unsigned long nr_running; - unsigned long nr_uninterruptible; - unsigned long long nr_switches; - struct list_head queue[PRIO_LIMIT]; - DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); - unsigned long qnr; /* queued not running */ -#ifdef CONFIG_SMP - cpumask_t cpu_idle_map; - bool idle_cpus; -#endif - int noc; /* num_online_cpus stored and updated when it changes */ - u64 niffies; /* Nanosecond jiffies */ - unsigned long last_jiffy; /* Last jiffy we updated niffies */ - - raw_spinlock_t iso_lock; - int iso_ticks; - bool iso_refractory; -}; - -#ifdef CONFIG_SMP -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. - * - */ -struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_var_t rto_mask; - struct cpupri cpupri; -}; - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -#endif /* CONFIG_SMP */ - -/* There can be only one */ -static struct global_rq grq; - -static DEFINE_MUTEX(sched_hotcpu_mutex); - -DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SMP -struct rq *cpu_rq(int cpu) -{ - return &per_cpu(runqueues, (cpu)); -} -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * sched_domains_mutex serialises calls to init_sched_domains, - * detach_destroy_domains and partition_sched_domains. 
- */ -static DEFINE_MUTEX(sched_domains_mutex); - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} -#else -struct rq *uprq; -#endif /* CONFIG_SMP */ - -static inline void update_rq_clock(struct rq *rq); - -/* - * Sanity check should sched_clock return bogus values. We make sure it does - * not appear to go backwards, and use jiffies to determine the maximum and - * minimum it could possibly have increased, and round down to the nearest - * jiffy when it falls outside this. - */ -static inline void niffy_diff(s64 *niff_diff, int jiff_diff) -{ - unsigned long min_diff, max_diff; - - if (jiff_diff > 1) - min_diff = JIFFIES_TO_NS(jiff_diff - 1); - else - min_diff = 1; - /* Round up to the nearest tick for maximum */ - max_diff = JIFFIES_TO_NS(jiff_diff + 1); - - if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff)) - *niff_diff = min_diff; -} - -#ifdef CONFIG_SMP -static inline int cpu_of(struct rq *rq) -{ - return rq->cpu; -} - -/* - * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the - * niffies value. Any CPU can update it by adding how much its clock has - * increased since it last updated niffies, minus any added niffies by other - * CPUs. 
- */ -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - /* old_clock is only updated when we are updating niffies */ - rq->old_clock = rq->clock; - ndiff -= grq.niffies - rq->last_niffy; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; - rq->last_niffy = grq.niffies; -} -#else /* CONFIG_SMP */ -static inline int cpu_of(struct rq *rq) -{ - return 0; -} - -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - rq->old_clock = rq->clock; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; -} -#endif - -#include "stats.h" - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif -#ifndef finish_arch_post_lock_switch -# define finish_arch_post_lock_switch() do { } while (0) -#endif - -/* - * All common locking functions performed on grq.lock. rq->clock is local to - * the CPU accessing it so it can be modified just with interrupts disabled - * when we're not updating niffies. - * Looking up task_rq must be done under grq.lock to be safe. 
- */ -static void update_rq_clock_task(struct rq *rq, s64 delta); - -static inline void update_rq_clock(struct rq *rq) -{ - s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - - if (unlikely(delta < 0)) - return; - rq->clock += delta; - update_rq_clock_task(rq, delta); -} - -static inline bool task_running(struct task_struct *p) -{ - return p->on_cpu; -} - -static inline void grq_lock(void) - __acquires(grq.lock) -{ - raw_spin_lock(&grq.lock); -} - -static inline void grq_unlock(void) - __releases(grq.lock) -{ - raw_spin_unlock(&grq.lock); -} - -static inline void grq_lock_irq(void) - __acquires(grq.lock) -{ - raw_spin_lock_irq(&grq.lock); -} - -static inline void time_lock_grq(struct rq *rq) - __acquires(grq.lock) -{ - grq_lock(); - update_clocks(rq); -} - -static inline void grq_unlock_irq(void) - __releases(grq.lock) -{ - raw_spin_unlock_irq(&grq.lock); -} - -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) -{ - raw_spin_lock_irqsave(&grq.lock, *flags); -} - -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) -{ - raw_spin_unlock_irqrestore(&grq.lock, *flags); -} - -static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) -{ - grq_lock_irqsave(flags); - return task_rq(p); -} - -static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) -{ - struct rq *rq = task_grq_lock(p, flags); - update_clocks(rq); - return rq; -} - -static inline struct rq *task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) -{ - grq_lock_irq(); - return task_rq(p); -} - -static inline void time_task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) -{ - struct rq *rq = task_grq_lock_irq(p); - update_clocks(rq); -} - -static inline void task_grq_unlock_irq(void) - __releases(grq.lock) -{ - grq_unlock_irq(); -} - -static inline void task_grq_unlock(unsigned long *flags) - __releases(grq.lock) -{ - 
grq_unlock_irqrestore(flags); -} - -/** - * grunqueue_is_locked - * - * Returns true if the global runqueue is locked. - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. - */ -bool grunqueue_is_locked(void) -{ - return raw_spin_is_locked(&grq.lock); -} - -void grq_unlock_wait(void) - __releases(grq.lock) -{ - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&grq.lock); -} - -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) - __acquires(grq.lock) -{ - local_irq_save(*flags); - time_lock_grq(rq); -} - -static inline struct rq *__task_grq_lock(struct task_struct *p) - __acquires(grq.lock) -{ - grq_lock(); - return task_rq(p); -} - -static inline void __task_grq_unlock(void) - __releases(grq.lock) -{ - grq_unlock(); -} - -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - grq.lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); - - grq_unlock_irq(); -} - -static inline bool deadline_before(u64 deadline, u64 time) -{ - return (deadline < time); -} - -static inline bool deadline_after(u64 deadline, u64 time) -{ - return (deadline > time); -} - -/* - * A task that is queued but not running will be on the grq run list. - * A task that is not running or queued will not be on the grq run list. - * A task that is currently running will have ->on_cpu set but not on the - * grq run list. - */ -static inline bool task_queued(struct task_struct *p) -{ - return (!list_empty(&p->run_list)); -} - -/* - * Removing from the global runqueue. 
Enter with grq locked. - */ -static void dequeue_task(struct task_struct *p) -{ - list_del_init(&p->run_list); - if (list_empty(grq.queue + p->prio)) - __clear_bit(p->prio, grq.prio_bitmap); - sched_info_dequeued(task_rq(p), p); -} - -/* - * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as - * an idle task, we ensure none of the following conditions are met. - */ -static bool idleprio_suitable(struct task_struct *p) -{ - return (!freezing(p) && !signal_pending(p) && - !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); -} - -/* - * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check - * that the iso_refractory flag is not set. - */ -static bool isoprio_suitable(void) -{ - return !grq.iso_refractory; -} - -/* - * Adding to the global runqueue. Enter with grq locked. - */ -static void enqueue_task(struct task_struct *p, struct rq *rq) -{ - if (!rt_task(p)) { - /* Check it hasn't gotten rt from PI */ - if ((idleprio_task(p) && idleprio_suitable(p)) || - (iso_task(p) && isoprio_suitable())) - p->prio = p->normal_prio; - else - p->prio = NORMAL_PRIO; - } - __set_bit(p->prio, grq.prio_bitmap); - list_add_tail(&p->run_list, grq.queue + p->prio); - sched_info_queued(rq, p); -} - -static inline void requeue_task(struct task_struct *p) -{ - sched_info_queued(task_rq(p), p); -} - -/* - * Returns the relative length of deadline all compared to the shortest - * deadline which is that of nice -20. - */ -static inline int task_prio_ratio(struct task_struct *p) -{ - return prio_ratios[TASK_USER_PRIO(p)]; -} - -/* - * task_timeslice - all tasks of all priorities get the exact same timeslice - * length. CPU distribution is handled by giving different deadlines to - * tasks of different priorities. Use 128 as the base value for fast shifts. 
- */ -static inline int task_timeslice(struct task_struct *p) -{ - return (rr_interval * task_prio_ratio(p) / 128); -} - -static void resched_task(struct task_struct *p); - -static inline void resched_curr(struct rq *rq) -{ - resched_task(rq->curr); -} - -/* - * qnr is the "queued but not running" count which is the total number of - * tasks on the global runqueue list waiting for cpu time but not actually - * currently running on a cpu. - */ -static inline void inc_qnr(void) -{ - grq.qnr++; -} - -static inline void dec_qnr(void) -{ - grq.qnr--; -} - -static inline int queued_notrunning(void) -{ - return grq.qnr; -} - -#ifdef CONFIG_SMP -/* - * The cpu_idle_map stores a bitmap of all the CPUs currently idle to - * allow easy lookup of whether any suitable idle CPUs are available. - * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the - * idle_cpus variable than to do a full bitmask check when we are busy. - */ -static inline void set_cpuidle_map(int cpu) -{ - if (likely(cpu_online(cpu))) { - cpumask_set_cpu(cpu, &grq.cpu_idle_map); - grq.idle_cpus = true; - } -} - -static inline void clear_cpuidle_map(int cpu) -{ - cpumask_clear_cpu(cpu, &grq.cpu_idle_map); - if (cpumask_empty(&grq.cpu_idle_map)) - grq.idle_cpus = false; -} - -static bool suitable_idle_cpus(struct task_struct *p) -{ - if (!grq.idle_cpus) - return false; - return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); -} - -#define CPUIDLE_DIFF_THREAD (1) -#define CPUIDLE_DIFF_CORE (2) -#define CPUIDLE_CACHE_BUSY (4) -#define CPUIDLE_DIFF_CPU (8) -#define CPUIDLE_THREAD_BUSY (16) -#define CPUIDLE_THROTTLED (32) -#define CPUIDLE_DIFF_NODE (64) - -static inline bool scaling_rq(struct rq *rq); - -/* - * The best idle CPU is chosen according to the CPUIDLE ranking above where the - * lowest value would give the most suitable CPU to schedule p onto next. 
The - * order works out to be the following: - * - * Same core, idle or busy cache, idle or busy threads - * Other core, same cache, idle or busy cache, idle threads. - * Same node, other CPU, idle cache, idle threads. - * Same node, other CPU, busy cache, idle threads. - * Other core, same cache, busy threads. - * Same node, other CPU, busy threads. - * Other node, other CPU, idle cache, idle threads. - * Other node, other CPU, busy cache, idle threads. - * Other node, other CPU, busy threads. - */ -static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -{ - int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED | - CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | - CPUIDLE_DIFF_CORE | CPUIDLE_DIFF_THREAD; - int cpu_tmp; - - if (cpumask_test_cpu(best_cpu, tmpmask)) - goto out; - - for_each_cpu(cpu_tmp, tmpmask) { - int ranking, locality; - struct rq *tmp_rq; - - ranking = 0; - tmp_rq = cpu_rq(cpu_tmp); - - locality = rq->cpu_locality[cpu_tmp]; -#ifdef CONFIG_NUMA - if (locality > 3) - ranking |= CPUIDLE_DIFF_NODE; - else -#endif - if (locality > 2) - ranking |= CPUIDLE_DIFF_CPU; -#ifdef CONFIG_SCHED_MC - else if (locality == 2) - ranking |= CPUIDLE_DIFF_CORE; - if (!(tmp_rq->cache_idle(cpu_tmp))) - ranking |= CPUIDLE_CACHE_BUSY; -#endif -#ifdef CONFIG_SCHED_SMT - if (locality == 1) - ranking |= CPUIDLE_DIFF_THREAD; - if (!(tmp_rq->siblings_idle(cpu_tmp))) - ranking |= CPUIDLE_THREAD_BUSY; -#endif - if (scaling_rq(tmp_rq)) - ranking |= CPUIDLE_THROTTLED; - - if (ranking < best_ranking) { - best_cpu = cpu_tmp; - best_ranking = ranking; - } - } -out: - return best_cpu; -} - -static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -{ - best_cpu = best_mask_cpu(best_cpu, rq, tmpmask); - resched_curr(cpu_rq(best_cpu)); -} - -bool cpus_share_cache(int this_cpu, int that_cpu) -{ - struct rq *this_rq = cpu_rq(this_cpu); - - return (this_rq->cpu_locality[that_cpu] < 3); -} - -#ifdef CONFIG_SCHED_SMT -#ifdef 
CONFIG_SMT_NICE -static const cpumask_t *thread_cpumask(int cpu); - -/* Find the best real time priority running on any SMT siblings of cpu and if - * none are running, the static priority of the best deadline task running. - * The lookups to the other runqueues is done lockless as the occasional wrong - * value would be harmless. */ -static int best_smt_bias(int cpu) -{ - int other_cpu, best_bias = 0; - - for_each_cpu(other_cpu, thread_cpumask(cpu)) { - struct rq *rq; - - if (other_cpu == cpu) - continue; - rq = cpu_rq(other_cpu); - if (rq_idle(rq)) - continue; - if (!rq->online) - continue; - if (!rq->rq_mm) - continue; - if (likely(rq->rq_smt_bias > best_bias)) - best_bias = rq->rq_smt_bias; - } - return best_bias; -} - -static int task_prio_bias(struct task_struct *p) -{ - if (rt_task(p)) - return 1 << 30; - else if (task_running_iso(p)) - return 1 << 29; - else if (task_running_idle(p)) - return 0; - return MAX_PRIO - p->static_prio; -} - -/* We've already decided p can run on CPU, now test if it shouldn't for SMT - * nice reasons. 
*/ -static bool smt_should_schedule(struct task_struct *p, int cpu) -{ - int best_bias, task_bias; - - /* Kernel threads always run */ - if (unlikely(!p->mm)) - return true; - if (rt_task(p)) - return true; - if (!idleprio_suitable(p)) - return true; - best_bias = best_smt_bias(cpu); - /* The smt siblings are all idle or running IDLEPRIO */ - if (best_bias < 1) - return true; - task_bias = task_prio_bias(p); - if (task_bias < 1) - return false; - if (task_bias >= best_bias) - return true; - /* Dither 25% cpu of normal tasks regardless of nice difference */ - if (best_bias % 4 == 1) - return true; - /* Sorry, you lose */ - return false; -} -#endif -#endif - -static bool resched_best_idle(struct task_struct *p) -{ - cpumask_t tmpmask; - int best_cpu; - - cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); - best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask); -#ifdef CONFIG_SMT_NICE - if (!smt_should_schedule(p, best_cpu)) - return false; -#endif - resched_curr(cpu_rq(best_cpu)); - return true; -} - -static inline void resched_suitable_idle(struct task_struct *p) -{ - if (suitable_idle_cpus(p)) - resched_best_idle(p); -} -/* - * Flags to tell us whether this CPU is running a CPU frequency governor that - * has slowed its speed or not. No locking required as the very rare wrongly - * read value would be harmless. 
- */ -void cpu_scaling(int cpu) -{ - cpu_rq(cpu)->scaling = true; -} - -void cpu_nonscaling(int cpu) -{ - cpu_rq(cpu)->scaling = false; -} - -static inline bool scaling_rq(struct rq *rq) -{ - return rq->scaling; -} - -static inline int locality_diff(struct task_struct *p, struct rq *rq) -{ - return rq->cpu_locality[task_cpu(p)]; -} -#else /* CONFIG_SMP */ -static inline void set_cpuidle_map(int cpu) -{ -} - -static inline void clear_cpuidle_map(int cpu) -{ -} - -static inline bool suitable_idle_cpus(struct task_struct *p) -{ - return uprq->curr == uprq->idle; -} - -static inline void resched_suitable_idle(struct task_struct *p) -{ -} - -void cpu_scaling(int __unused) -{ -} - -void cpu_nonscaling(int __unused) -{ -} - -/* - * Although CPUs can scale in UP, there is nowhere else for tasks to go so this - * always returns 0. - */ -static inline bool scaling_rq(struct rq *rq) -{ - return false; -} - -static inline int locality_diff(struct task_struct *p, struct rq *rq) -{ - return 0; -} -#endif /* CONFIG_SMP */ -EXPORT_SYMBOL_GPL(cpu_scaling); -EXPORT_SYMBOL_GPL(cpu_nonscaling); - -static inline int normal_prio(struct task_struct *p) -{ - if (has_rt_policy(p)) - return MAX_RT_PRIO - 1 - p->rt_priority; - if (idleprio_task(p)) - return IDLE_PRIO; - if (iso_task(p)) - return ISO_PRIO; - return NORMAL_PRIO; -} - -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks as it will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; -} - -/* - * activate_task - move a task to the runqueue. Enter with grq locked. 
- */ -static void activate_task(struct task_struct *p, struct rq *rq) -{ - update_clocks(rq); - - /* - * Sleep time is in units of nanosecs, so shift by 20 to get a - * milliseconds-range estimation of the amount of time that the task - * spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - if (p->state == TASK_UNINTERRUPTIBLE) - profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (rq->clock_task - p->last_ran) >> 20); - } - - p->prio = effective_prio(p); - if (task_contributes_to_load(p)) - grq.nr_uninterruptible--; - enqueue_task(p, rq); - rq->soft_affined++; - p->on_rq = 1; - grq.nr_running++; - inc_qnr(); -} - -static inline void clear_sticky(struct task_struct *p); - -/* - * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. Enter with grq locked. - */ -static inline void deactivate_task(struct task_struct *p, struct rq *rq) -{ - if (task_contributes_to_load(p)) - grq.nr_uninterruptible++; - rq->soft_affined--; - p->on_rq = 0; - grq.nr_running--; - clear_sticky(p); -} - -#ifdef CONFIG_SMP -void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_LOCKDEP - /* - * The caller should hold grq lock. - */ - WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); -#endif - if (task_cpu(p) == cpu) - return; - trace_sched_migrate_task(p, cpu); - perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); - - /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be - * successfully executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - if (p->on_rq) { - task_rq(p)->soft_affined--; - cpu_rq(cpu)->soft_affined++; - } - task_thread_info(p)->cpu = cpu; -} - -static inline void clear_sticky(struct task_struct *p) -{ - p->sticky = false; -} - -static inline bool task_sticky(struct task_struct *p) -{ - return p->sticky; -} - -/* Reschedule the best idle CPU that is not this one. 
*/ -static void -resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p) -{ - cpumask_t tmpmask; - - cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); - cpumask_clear_cpu(cpu, &tmpmask); - if (cpumask_empty(&tmpmask)) - return; - resched_best_mask(cpu, rq, &tmpmask); -} - -/* - * We set the sticky flag on a task that is descheduled involuntarily meaning - * it is awaiting further CPU time. If the last sticky task is still sticky - * but unlucky enough to not be the next task scheduled, we unstick it and try - * to find it an idle CPU. Realtime tasks do not stick to minimise their - * latency at all times. - */ -static inline void -swap_sticky(struct rq *rq, int cpu, struct task_struct *p) -{ - if (rq->sticky_task) { - if (rq->sticky_task == p) { - p->sticky = true; - return; - } - if (task_sticky(rq->sticky_task)) { - clear_sticky(rq->sticky_task); - resched_closest_idle(rq, cpu, rq->sticky_task); - } - } - if (!rt_task(p)) { - p->sticky = true; - rq->sticky_task = p; - } else { - resched_closest_idle(rq, cpu, p); - rq->sticky_task = NULL; - } -} - -static inline void unstick_task(struct rq *rq, struct task_struct *p) -{ - rq->sticky_task = NULL; - clear_sticky(p); -} -#else -static inline void clear_sticky(struct task_struct *p) -{ -} - -static inline bool task_sticky(struct task_struct *p) -{ - return false; -} - -static inline void -swap_sticky(struct rq *rq, int cpu, struct task_struct *p) -{ -} - -static inline void unstick_task(struct rq *rq, struct task_struct *p) -{ -} -#endif - -/* - * Move a task off the global queue and take it to a cpu for it will - * become the running task. - */ -static inline void take_task(int cpu, struct task_struct *p) -{ - set_task_cpu(p, cpu); - dequeue_task(p); - clear_sticky(p); - dec_qnr(); -} - -/* - * Returns a descheduling task to the grq runqueue unless it is being - * deactivated. 
- */ -static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate) -{ - if (deactivate) - deactivate_task(p, rq); - else { - inc_qnr(); - enqueue_task(p, rq); - } -} - -/* Enter with grq lock held. We know p is on the local cpu */ -static inline void __set_tsk_resched(struct task_struct *p) -{ - set_tsk_need_resched(p); - set_preempt_need_resched(); -} - -/* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -void resched_task(struct task_struct *p) -{ - int cpu; - - lockdep_assert_held(&grq.lock); - - if (test_tsk_need_resched(p)) - return; - - set_tsk_need_resched(p); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) { - set_preempt_need_resched(); - return; - } - - smp_send_reschedule(cpu); -} - -/** - * task_curr - is this task currently executing on a CPU? - * @p: the task in question. - * - * Return: 1 if the task is currently executing. 0 otherwise. - */ -inline int task_curr(const struct task_struct *p) -{ - return cpu_curr(task_cpu(p)) == p; -} - -#ifdef CONFIG_SMP -struct migration_req { - struct task_struct *task; - int dest_cpu; -}; - -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. 
This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) -{ - unsigned long flags; - bool running, on_rq; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since this will return false - * if the runqueue has changed and p is actually now - * running somewhere else! - */ - while (task_running(p) && p == rq->curr) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the grq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_grq_lock(p, &flags); - trace_sched_wait_task(p); - running = task_running(p); - on_rq = p->on_rq; - ncsw = 0; - if (!match_state || p->state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(&flags); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. 
- */ - if (unlikely(on_rq)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ - break; - } - - return ncsw; -} - -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) - * - * NOTE: this function doesn't have to take the runqueue lock, - * because all it wants to ensure is that the remote task enters - * the kernel. If the IPI races and the task has been migrated - * to another CPU then no harm is done and the purpose has been - * achieved as well. - */ -void kick_process(struct task_struct *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} -EXPORT_SYMBOL_GPL(kick_process); -#endif - -/* - * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the - * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or - * between themselves, they cooperatively multitask. An idle rq scores as - * prio PRIO_LIMIT so it is always preempted. 
- */ -static inline bool -can_preempt(struct task_struct *p, int prio, u64 deadline) -{ - /* Better static priority RT task or better policy preemption */ - if (p->prio < prio) - return true; - if (p->prio > prio) - return false; - /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ - if (!deadline_before(p->deadline, deadline)) - return false; - return true; -} - -#ifdef CONFIG_SMP -#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -#ifdef CONFIG_HOTPLUG_CPU -/* - * Check to see if there is a task that is affined only to offline CPUs but - * still wants runtime. This happens to kernel threads during suspend/halt and - * disabling of CPUs. - */ -static inline bool online_cpus(struct task_struct *p) -{ - return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed))); -} -#else /* CONFIG_HOTPLUG_CPU */ -/* All available CPUs are always online without hotplug. */ -static inline bool online_cpus(struct task_struct *p) -{ - return true; -} -#endif - -/* - * Check to see if p can run on cpu, and if not, whether there are any online - * CPUs it can run on instead. - */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) - return true; - return false; -} - -/* - * When all else is equal, still prefer this_rq. - */ -static void try_preempt(struct task_struct *p, struct rq *this_rq) -{ - struct rq *highest_prio_rq = NULL; - int cpu, highest_prio; - u64 latest_deadline; - cpumask_t tmp; - - /* - * We clear the sticky flag here because for a task to have called - * try_preempt with the sticky flag enabled means some complicated - * re-scheduling has occurred and we should ignore the sticky flag. 
- */ - clear_sticky(p); - - if (suitable_idle_cpus(p) && resched_best_idle(p)) - return; - - /* IDLEPRIO tasks never preempt anything but idle */ - if (p->policy == SCHED_IDLEPRIO) - return; - - if (likely(online_cpus(p))) - cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); - else - return; - - highest_prio = latest_deadline = 0; - - for_each_cpu(cpu, &tmp) { - struct rq *rq; - int rq_prio; - - rq = cpu_rq(cpu); - rq_prio = rq->rq_prio; - if (rq_prio < highest_prio) - continue; - - if (rq_prio > highest_prio || - deadline_after(rq->rq_deadline, latest_deadline)) { - latest_deadline = rq->rq_deadline; - highest_prio = rq_prio; - highest_prio_rq = rq; - } - } - - if (likely(highest_prio_rq)) { -#ifdef CONFIG_SMT_NICE - cpu = cpu_of(highest_prio_rq); - if (!smt_should_schedule(p, cpu)) - return; -#endif - if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline)) - resched_curr(highest_prio_rq); - } -} -#else /* CONFIG_SMP */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - return false; -} - -static void try_preempt(struct task_struct *p, struct rq *this_rq) -{ - if (p->policy == SCHED_IDLEPRIO) - return; - if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) - resched_curr(uprq); -} -#endif /* CONFIG_SMP */ - -static void -ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -{ -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); - -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - rcu_read_unlock(); - } - -#endif /* CONFIG_SMP */ - - schedstat_inc(rq, ttwu_count); -#endif /* CONFIG_SCHEDSTATS */ -} - -void wake_up_if_idle(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - rcu_read_lock(); - - if (!is_idle_task(rcu_dereference(rq->curr))) - goto 
out; - - grq_lock_irqsave(&flags); - if (likely(is_idle_task(rq->curr))) - smp_send_reschedule(cpu); - /* Else cpu is not in idle, do nothing here */ - grq_unlock_irqrestore(&flags); - -out: - rcu_read_unlock(); -} - -#ifdef CONFIG_SMP -void scheduler_ipi(void) -{ - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); -} -#endif - -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync) -{ - activate_task(p, rq); - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption if there are no idle cpus, - * instead waiting for current to deschedule. - */ - if (!is_sync || suitable_idle_cpus(p)) - try_preempt(p, rq); -} - -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - bool success) -{ - trace_sched_wakeup(p, success); - p->state = TASK_RUNNING; - - /* - * if a worker is waking up, notify workqueue. Note that on BFS, we - * don't really know what cpu it will be, so we fake it for - * wq_worker_waking_up :/ - */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); -} - -/* - * wake flags - */ -#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x4 /* internal use, task got migrated */ - -/*** - * try_to_wake_up - wake up a thread - * @p: the thread to be awakened - * @state: the mask of task states that can be woken - * @wake_flags: wake modifier flags (WF_*) - * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. 
- * - * Return: %true if @p was woken up, %false if it was already running. - * or @state didn't match @p's state. - */ -static bool try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) -{ - bool success = false; - unsigned long flags; - struct rq *rq; - int cpu; - - get_cpu(); - - /* - * If we are going to wake up a thread waiting for CONDITION we - * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. - */ - smp_mb__before_spinlock(); - - /* - * No need to do time_lock_grq as we only need to update the rq clock - * if we activate the task - */ - rq = task_grq_lock(p, &flags); - cpu = task_cpu(p); - - /* state is a volatile long, どうして、分からない */ - if (!((unsigned int)p->state & state)) - goto out_unlock; - - if (task_queued(p) || task_running(p)) - goto out_running; - - ttwu_activate(p, rq, wake_flags & WF_SYNC); - success = true; - -out_running: - ttwu_post_activation(p, rq, success); -out_unlock: - task_grq_unlock(&flags); - - ttwu_stat(p, cpu, wake_flags); - - put_cpu(); - - return success; -} - -/** - * try_to_wake_up_local - try to wake up a local task with grq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that grq is locked and, @p is not the current task. - * grq stays locked over invocation. 
- */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - bool success = false; - - lockdep_assert_held(&grq.lock); - - if (!(p->state & TASK_NORMAL)) - return; - - if (!task_queued(p)) { - if (likely(!task_running(p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false); - ttwu_stat(p, smp_processor_id(), 0); - success = true; - } - ttwu_post_activation(p, rq, success); -} - -/** - * wake_up_process - Wake up a specific process - * @p: The process to be woken up. - * - * Attempt to wake up the nominated process and move it to the set of runnable - * processes. - * - * Return: 1 if the process was woken up, 0 if it was already running. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -int wake_up_process(struct task_struct *p) -{ - WARN_ON(task_is_stopped_or_traced(p)); - return try_to_wake_up(p, TASK_NORMAL, 0); -} -EXPORT_SYMBOL(wake_up_process); - -int wake_up_state(struct task_struct *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - -static void time_slice_expired(struct task_struct *p); - -/* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. - */ -int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -{ -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); -#endif - /* - * The process state is set to the same value of the process executing - * do_fork() code. That is running. This guarantees that nobody will - * actually run it, and a signal or other external event cannot wake - * it up and insert it on the runqueue either. 
- */ - - /* Should be reset in fork.c but done here for ease of bfs patching */ - p->on_rq = - p->utime = - p->stime = - p->utimescaled = - p->stimescaled = - p->sched_time = - p->stime_pc = - p->utime_pc = 0; - - /* - * Revert to default priority/policy on fork if requested. - */ - if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { - p->policy = SCHED_NORMAL; - p->normal_prio = normal_prio(p); - } - - if (PRIO_TO_NICE(p->static_prio) < 0) { - p->static_prio = NICE_TO_PRIO(0); - p->normal_prio = p->static_prio; - } - - /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: - */ - p->sched_reset_on_fork = 0; - } - - INIT_LIST_HEAD(&p->run_list); -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (unlikely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -#endif - p->on_cpu = false; - clear_sticky(p); - init_task_preempt_count(p); - return 0; -} - -/* - * wake_up_new_task - wake up a newly created task for the first time. - * - * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created context, then puts the task - * on the runqueue and wakes it. - */ -void wake_up_new_task(struct task_struct *p) -{ - struct task_struct *parent; - unsigned long flags; - struct rq *rq; - - parent = p->parent; - rq = task_grq_lock(p, &flags); - - /* - * Reinit new task deadline as its creator deadline could have changed - * since call to dup_task_struct(). - */ - p->deadline = rq->rq_deadline; - - /* - * If the task is a new process, current and parent are the same. If - * the task is a new thread in the thread group, it will have much more - * in common with current than with the parent. - */ - set_task_cpu(p, task_cpu(rq->curr)); - - /* - * Make sure we do not leak PI boosting priority to the child. 
- */ - p->prio = rq->curr->normal_prio; - - activate_task(p, rq); - trace_sched_wakeup_new(p, 1); - if (unlikely(p->policy == SCHED_FIFO)) - goto after_ts_init; - - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. If it's negative, it won't - * matter since that's the same as being 0. current's time_slice is - * actually in rq_time_slice when it's running, as is its last_ran - * value. rq->rq_deadline is only modified within schedule() so it - * is always equal to current->deadline. - */ - p->last_ran = rq->rq_last_ran; - if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { - rq->rq_time_slice /= 2; - p->time_slice = rq->rq_time_slice; -after_ts_init: - if (rq->curr == parent && !suitable_idle_cpus(p)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. - */ - __set_tsk_resched(parent); - } else - try_preempt(p, rq); - } else { - if (rq->curr == parent) { - /* - * Forking task has run out of timeslice. Reschedule it and - * start its child with a new time slice and deadline. The - * child will end up running first because its deadline will - * be slightly earlier. - */ - rq->rq_time_slice = 0; - __set_tsk_resched(parent); - } - time_slice_expired(p); - } - task_grq_unlock(&flags); -} - -#ifdef CONFIG_PREEMPT_NOTIFIERS - -/** - * preempt_notifier_register - tell me when current is being preempted & rescheduled - * @notifier: notifier struct to register - */ -void preempt_notifier_register(struct preempt_notifier *notifier) -{ - hlist_add_head(&notifier->link, &current->preempt_notifiers); -} -EXPORT_SYMBOL_GPL(preempt_notifier_register); - -/** - * preempt_notifier_unregister - no longer interested in preemption notifications - * @notifier: notifier struct to unregister - * - * This is safe to call from within a preemption notifier. 
- */ -void preempt_notifier_unregister(struct preempt_notifier *notifier) -{ - hlist_del(&notifier->link); -} -EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - struct preempt_notifier *notifier; - - hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) - notifier->ops->sched_in(notifier, raw_smp_processor_id()); -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - struct preempt_notifier *notifier; - - hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) - notifier->ops->sched_out(notifier, next); -} - -#else /* !CONFIG_PREEMPT_NOTIFIERS */ - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ -} - -#endif /* CONFIG_PREEMPT_NOTIFIERS */ - -/** - * prepare_task_switch - prepare to switch tasks - * @rq: the runqueue preparing to switch - * @next: the task we are going to switch to. - * - * This is called with the rq lock held and interrupts off. It must - * be paired with a subsequent finish_task_switch after the context - * switch. - * - * prepare_task_switch sets up locking and calls architecture specific - * hooks. - */ -static inline void -prepare_task_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - sched_info_switch(rq, prev, next); - perf_event_task_sched_out(prev, next); - fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); - prepare_arch_switch(next); - trace_sched_switch(prev, next); -} - -/** - * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch - * @prev: the thread we just switched away from. - * - * finish_task_switch must be called after the context switch, paired - * with a prepare_task_switch call before the context switch. 
- * finish_task_switch will reconcile locking set up by prepare_task_switch, - * and do any other architecture-specific cleanup actions. - * - * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it - * with the lock held can cause deadlocks; see schedule() for - * details.) - * - * The context switch have flipped the stack from under us and restored the - * local variables which were saved when this task called schedule() in the - * past. prev == current is still correct but we need to recalculate this_rq - * because prev may have moved to another CPU. - */ -static struct rq *finish_task_switch(struct task_struct *prev) - __releases(grq.lock) -{ - struct rq *rq = this_rq(); - struct mm_struct *mm = rq->prev_mm; - long prev_state; - - rq->prev_mm = NULL; - - /* - * A task struct has one reference for the use as "current". - * If a task dies, then it sets TASK_DEAD in tsk->state and calls - * schedule one last time. The schedule call will never return, and - * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul <manfred@colorfullife.com> - */ - prev_state = prev->state; - vtime_task_switch(prev); - finish_arch_switch(prev); - perf_event_task_sched_in(prev, current); - finish_lock_switch(rq, prev); - finish_arch_post_lock_switch(); - - fire_sched_in_preempt_notifiers(current); - if (mm) - mmdrop(mm); - if (unlikely(prev_state == TASK_DEAD)) { - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - put_task_struct(prev); - } - return rq; -} - -/** - * schedule_tail - first thing a freshly forked thread must call. 
- * @prev: the thread we just switched away from. - */ -asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(grq.lock) -{ - struct rq *rq; - - /* finish_task_switch() drops rq->lock and enables preemption */ - preempt_disable(); - rq = finish_task_switch(prev); - preempt_enable(); - - if (current->set_child_tid) - put_user(task_pid_vnr(current), current->set_child_tid); -} - -/* - * context_switch - switch to the new MM and the new thread's register state. - */ -static inline struct rq * -context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - struct mm_struct *mm, *oldmm; - - prepare_task_switch(rq, prev, next); - - mm = next->mm; - oldmm = prev->active_mm; - /* - * For paravirt, this is coupled with an exit in switch_to to - * combine the page table reload and the switch backend into - * one hypercall. - */ - arch_start_context_switch(prev); - - if (!mm) { - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next); - } else - switch_mm(oldmm, mm, next); - - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; - } - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - - /* Here we just switch the register state and the stack. */ - context_tracking_task_switch(prev, next); - switch_to(prev, next, prev); - - barrier(); - - return finish_task_switch(prev); -} - -/* - * nr_running, nr_uninterruptible and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. All are - * measured without grabbing the grq lock but the occasional inaccurate result - * doesn't matter so long as it's positive. 
- */ -unsigned long nr_running(void) -{ - long nr = grq.nr_running; - - if (unlikely(nr < 0)) - nr = 0; - return (unsigned long)nr; -} - -static unsigned long nr_uninterruptible(void) -{ - long nu = grq.nr_uninterruptible; - - if (unlikely(nu < 0)) - nu = 0; - return nu; -} - -/* - * Check if only the current task is running on the cpu. - */ -bool single_task_running(void) -{ - if (cpu_rq(smp_processor_id())->soft_affined == 1) - return true; - else - return false; -} -EXPORT_SYMBOL(single_task_running); - -unsigned long long nr_context_switches(void) -{ - long long ns = grq.nr_switches; - - /* This is of course impossible */ - if (unlikely(ns < 0)) - ns = 1; - return (unsigned long long)ns; -} - -unsigned long nr_iowait(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); - - return sum; -} - -unsigned long nr_iowait_cpu(int cpu) -{ - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); -} - -unsigned long nr_active(void) -{ - return nr_running() + nr_uninterruptible(); -} - -/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we - * base it on the number of running or queued tasks with their ->rq pointer - * set to this cpu as being the CPU they're more likely to run on. */ -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -{ - struct rq *this = this_rq(); - - *nr_waiters = atomic_read(&this->nr_iowait); - *load = this->soft_affined; -} - -/* Variables and functions for calc_load */ -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. 
- */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - return load >> FSHIFT; -} - -/* - * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. - */ -void calc_global_load(unsigned long ticks) -{ - long active; - - if (time_before(jiffies, calc_load_update)) - return; - active = nr_active() * FIXED_1; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update = jiffies + LOAD_FREQ; -} - -DEFINE_PER_CPU(struct kernel_stat, kstat); -DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); - -EXPORT_PER_CPU_SYMBOL(kstat); -EXPORT_PER_CPU_SYMBOL(kernel_cpustat); - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. 
- */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. - */ -void irqtime_account_irq(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. 
- */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(irqtime_account_irq); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ -/* - * In theory, the compile should just see 0 here, and optimize out the call - * to sched_rt_avg_update. But I don't trust it... - */ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}irq - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. 
- */ - if (irq_delta > delta) - irq_delta = delta; - - rq->prev_irq_time += irq_delta; - delta -= irq_delta; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_key_false((&paravirt_steal_rq_enabled))) { - s64 steal = paravirt_steal_clock(cpu_of(rq)); - - steal -= rq->prev_steal_time_rq; - - if (unlikely(steal > delta)) - steal = delta; - - rq->prev_steal_time_rq += steal; - - delta -= steal; - } -#endif - - rq->clock_task += delta; -} - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static void irqtime_account_hi_si(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - u64 latest_ns; - - latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)); - if (latest_ns > cpustat[CPUTIME_IRQ]) - cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy; - - latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)); - if (latest_ns > cpustat[CPUTIME_SOFTIRQ]) - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy; -} -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -static inline void irqtime_account_hi_si(void) -{ -} -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -static __always_inline bool steal_account_process_tick(void) -{ -#ifdef CONFIG_PARAVIRT - if (static_key_false(&paravirt_steal_enabled)) { - u64 steal; - cputime_t steal_ct; - - steal = paravirt_steal_clock(smp_processor_id()); - steal -= this_rq()->prev_steal_time; - - /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime - * granularity and account the rest on the next rounds. - */ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); - - account_steal_time(steal_ct); - return steal_ct; - } -#endif - return false; -} - -/* - * Accumulate raw cputime values of dead tasks (sig->[us]time) and live - * tasks (sum on group iteration) belonging to @tsk's group. 
- */ -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) -{ - struct signal_struct *sig = tsk->signal; - cputime_t utime, stime; - struct task_struct *t; - unsigned int seq, nextseq; - unsigned long flags; - - rcu_read_lock(); - /* Attempt a lockless read on the first round. */ - nextseq = 0; - do { - seq = nextseq; - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; - - for_each_thread(tsk, t) { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } - /* If lockless access failed, take the lock. */ - nextseq = 1; - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); -} - -/* - * On each tick, see what percentage of that tick was attributed to each - * component and add the percentage to the _pc values. Once a _pc value has - * accumulated one tick's worth, account for that. This means the total - * percentage of load components will always be 128 (pseudo 100) per tick. 
- */ -static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (atomic_read(&rq->nr_iowait) > 0) { - rq->iowait_pc += pc; - if (rq->iowait_pc >= 128) { - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128; - rq->iowait_pc %= 128; - } - } else { - rq->idle_pc += pc; - if (rq->idle_pc >= 128) { - cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128; - rq->idle_pc %= 128; - } - } - acct_update_integrals(idle); -} - -static void -pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset, - unsigned long pc, unsigned long ns) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - - p->stime_pc += pc; - if (p->stime_pc >= 128) { - int jiffs = p->stime_pc / 128; - - p->stime_pc %= 128; - p->stime += (__force u64)cputime_one_jiffy * jiffs; - p->stimescaled += one_jiffy_scaled * jiffs; - account_group_system_time(p, cputime_one_jiffy * jiffs); - } - p->sched_time += ns; - account_group_exec_runtime(p, ns); - - if (hardirq_count() - hardirq_offset) { - rq->irq_pc += pc; - if (rq->irq_pc >= 128) { - cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128; - rq->irq_pc %= 128; - } - } else if (in_serving_softirq()) { - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; - } - } else { - rq->system_pc += pc; - if (rq->system_pc >= 128) { - cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128; - rq->system_pc %= 128; - } - } - acct_update_integrals(p); -} - -static void pc_user_time(struct rq *rq, struct task_struct *p, - unsigned long pc, unsigned long ns) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - - p->utime_pc += pc; - if (p->utime_pc >= 
128) { - int jiffs = p->utime_pc / 128; - - p->utime_pc %= 128; - p->utime += (__force u64)cputime_one_jiffy * jiffs; - p->utimescaled += one_jiffy_scaled * jiffs; - account_group_user_time(p, cputime_one_jiffy * jiffs); - } - p->sched_time += ns; - account_group_exec_runtime(p, ns); - - if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - */ - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; - } - } - - if (task_nice(p) > 0 || idleprio_task(p)) { - rq->nice_pc += pc; - if (rq->nice_pc >= 128) { - cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128; - rq->nice_pc %= 128; - } - } else { - rq->user_pc += pc; - if (rq->user_pc >= 128) { - cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128; - rq->user_pc %= 128; - } - } - acct_update_integrals(p); -} - -/* - * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast - * shifts instead of 100 - */ -#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS) - -/* - * This is called on clock ticks. - * Bank in p->sched_time the ns elapsed since the last tick or switch. - * CPU scheduler quota accounting is also performed here in microseconds. 
- */ -static void -update_cpu_clock_tick(struct rq *rq, struct task_struct *p) -{ - long account_ns = rq->clock_task - rq->rq_last_ran; - struct task_struct *idle = rq->idle; - unsigned long account_pc; - - if (unlikely(account_ns < 0) || steal_account_process_tick()) - goto ts_account; - - account_pc = NS_TO_PC(account_ns); - - /* Accurate tick timekeeping */ - if (user_mode(get_irq_regs())) - pc_user_time(rq, p, account_pc, account_ns); - else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) - pc_system_time(rq, p, HARDIRQ_OFFSET, - account_pc, account_ns); - else - pc_idle_time(rq, idle, account_pc); - - if (sched_clock_irqtime) - irqtime_account_hi_si(); - -ts_account: - /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } - - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; -} - -/* - * This is called on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. - * CPU scheduler quota accounting is also performed here in microseconds. 
- */ -static void -update_cpu_clock_switch(struct rq *rq, struct task_struct *p) -{ - long account_ns = rq->clock_task - rq->rq_last_ran; - struct task_struct *idle = rq->idle; - unsigned long account_pc; - - if (unlikely(account_ns < 0)) - goto ts_account; - - account_pc = NS_TO_PC(account_ns); - - /* Accurate subtick timekeeping */ - if (p != idle) { - pc_user_time(rq, p, account_pc, account_ns); - } - else - pc_idle_time(rq, idle, account_pc); - -ts_account: - /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } - - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; -} - -/* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_grq_lock() held. - */ -static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - /* - * Must be ->curr _and_ ->on_rq. If dequeued, we would - * project cycles that may never be accounted to this - * thread, breaking clock_gettime(). - */ - if (p == rq->curr && p->on_rq) { - update_clocks(rq); - ns = rq->clock_task - rq->rq_last_ran; - if (unlikely((s64)ns < 0)) - ns = 0; - } - - return ns; -} - -/* - * Return accounted runtime for the task. - * Return separately the current's pending runtime that have not been - * accounted yet. - * - */ -unsigned long long task_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns; - -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) - /* - * 64-bit doesn't need locks to atomically read a 64bit value. - * So we have a optimization chance when the task's delta_exec is 0. - * Reading ->on_cpu is racy, but this is ok. - * - * If we race with it leaving cpu, we'll take a lock. So we're correct. 
- * If we race with it entering cpu, unaccounted time is 0. This is - * indistinguishable from the read occurring a few cycles earlier. - * If we see ->on_cpu without ->on_rq, the task is leaving, and has - * been accounted, so we're correct here as well. - */ - if (!p->on_cpu || !p->on_rq) - return tsk_seruntime(p); -#endif - - rq = task_grq_lock(p, &flags); - ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(&flags); - - return ns; -} - -/* Compatibility crap */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ -} - -void account_idle_time(cputime_t cputime) -{ -} - -void update_cpu_load_nohz(void) -{ -} - -#ifdef CONFIG_NO_HZ_COMMON -void calc_load_enter_idle(void) -{ -} - -void calc_load_exit_idle(void) -{ -} -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += (__force u64)cputime; - p->utimescaled += (__force u64)cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += (__force u64)cputime; - - /* Add guest time to cpustat. 
*/ - if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64)cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64)cputime; - cpustat[CPUTIME_GUEST] += (__force u64)cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, cputime64_t *target_cputime64) -{ - /* Add system time to process. */ - p->stime += (__force u64)cputime; - p->stimescaled += (__force u64)cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - *target_cputime64 += (__force u64)cputime; - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * This is for guest only now. - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - account_guest_time(p, cputime, cputime_scaled); -} - -/* - * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64)cputime; -} - -/* - * Account for idle time. 
- * @cputime: the cpu time spent in idle wait - */ -static void account_idle_times(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64)cputime; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - -void account_process_tick(struct task_struct *p, int user_tick) -{ -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. - * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - account_idle_times(jiffies_to_cputime(ticks)); -} -#endif - -static inline void grq_iso_lock(void) - __acquires(grq.iso_lock) -{ - raw_spin_lock(&grq.iso_lock); -} - -static inline void grq_iso_unlock(void) - __releases(grq.iso_lock) -{ - raw_spin_unlock(&grq.iso_lock); -} - -/* - * Functions to test for when SCHED_ISO tasks have used their allocated - * quota as real time scheduling and convert them back to SCHED_NORMAL. - * Where possible, the data is tested lockless, to avoid grabbing iso_lock - * because the occasional inaccurate result won't matter. However the - * tick data is only ever modified under lock. iso_refractory is only simply - * set to 0 or 1 so it's not worth grabbing the lock yet again for that. - */ -static bool set_iso_refractory(void) -{ - grq.iso_refractory = true; - return grq.iso_refractory; -} - -static bool clear_iso_refractory(void) -{ - grq.iso_refractory = false; - return grq.iso_refractory; -} - -/* - * Test if SCHED_ISO tasks have run longer than their alloted period as RT - * tasks and set the refractory flag if necessary. There is 10% hysteresis - * for unsetting the flag. 
115/128 is ~90/100 as a fast shift instead of a - * slow division. - */ -static bool test_ret_isorefractory(struct rq *rq) -{ - if (likely(!grq.iso_refractory)) { - if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu) - return set_iso_refractory(); - } else { - if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) - return clear_iso_refractory(); - } - return grq.iso_refractory; -} - -static void iso_tick(void) -{ - grq_iso_lock(); - grq.iso_ticks += 100; - grq_iso_unlock(); -} - -/* No SCHED_ISO task was running so decrease rq->iso_ticks */ -static inline void no_iso_tick(void) -{ - if (grq.iso_ticks) { - grq_iso_lock(); - grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; - if (unlikely(grq.iso_refractory && grq.iso_ticks < - ISO_PERIOD * (sched_iso_cpu * 115 / 128))) - clear_iso_refractory(); - grq_iso_unlock(); - } -} - -/* This manages tasks that have run out of timeslice during a scheduler_tick */ -static void task_running_tick(struct rq *rq) -{ - struct task_struct *p; - - /* - * If a SCHED_ISO task is running we increment the iso_ticks. In - * order to prevent SCHED_ISO tasks from causing starvation in the - * presence of true RT tasks we account those as iso_ticks as well. - */ - if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) { - if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128) - iso_tick(); - } else - no_iso_tick(); - - if (iso_queue(rq)) { - if (unlikely(test_ret_isorefractory(rq))) { - if (rq_running_iso(rq)) { - /* - * SCHED_ISO task is running as RT and limit - * has been hit. Force it to reschedule as - * SCHED_NORMAL by zeroing its time_slice - */ - rq->rq_time_slice = 0; - } - } - } - - /* SCHED_FIFO tasks never run out of timeslice. */ - if (rq->rq_policy == SCHED_FIFO) - return; - /* - * Tasks that were scheduled in the first half of a tick are not - * allowed to run into the 2nd half of the next tick if they will - * run out of time slice in the interim. 
Otherwise, if they have - * less than RESCHED_US μs of time slice left they will be rescheduled. - */ - if (rq->dither) { - if (rq->rq_time_slice > HALF_JIFFY_US) - return; - else - rq->rq_time_slice = 0; - } else if (rq->rq_time_slice >= RESCHED_US) - return; - - /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ - p = rq->curr; - - grq_lock(); - requeue_task(p); - __set_tsk_resched(p); - grq_unlock(); -} - -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. The data modified is all - * local to struct rq so we don't need to grab grq lock. - */ -void scheduler_tick(void) -{ - int cpu __maybe_unused = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - sched_clock_tick(); - /* grq lock not grabbed, so only update rq clock */ - update_rq_clock(rq); - update_cpu_clock_tick(rq, rq->curr); - if (!rq_idle(rq)) - task_running_tick(rq); - else - no_iso_tick(); - rq->last_tick = rq->clock; - perf_event_task_tick(); -} - -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) -void preempt_count_add(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; -#endif - __preempt_count_add(val); -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Spinlock count overflowing soon? 
- */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); -#endif - if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } -} -EXPORT_SYMBOL(preempt_count_add); -NOKPROBE_SYMBOL(preempt_count_add); - -void preempt_count_sub(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? - */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; -#endif - - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - __preempt_count_sub(val); -} -EXPORT_SYMBOL(preempt_count_sub); -NOKPROBE_SYMBOL(preempt_count_sub); -#endif - -/* - * Deadline is "now" in niffies + (offset by priority). Setting the deadline - * is the key to everything. It distributes cpu fairly amongst tasks of the - * same nice value, it proportions cpu according to nice level, it means the - * task that last woke up the longest ago has the earliest deadline, thus - * ensuring that interactive tasks get low latency on wake up. The CPU - * proportion works out to the square of the virtual deadline difference, so - * this equation will give nice 19 3% CPU compared to nice 0. 
- */ -static inline u64 prio_deadline_diff(int user_prio) -{ - return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -} - -static inline u64 task_deadline_diff(struct task_struct *p) -{ - return prio_deadline_diff(TASK_USER_PRIO(p)); -} - -static inline u64 static_deadline_diff(int static_prio) -{ - return prio_deadline_diff(USER_PRIO(static_prio)); -} - -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - -static inline int ms_longest_deadline_diff(void) -{ - return NS_TO_MS(longest_deadline_diff()); -} - -/* - * The time_slice is only refilled when it is empty and that is when we set a - * new deadline. - */ -static void time_slice_expired(struct task_struct *p) -{ - p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); -#ifdef CONFIG_SMT_NICE - if (!p->mm) - p->smt_bias = 0; - else if (rt_task(p)) - p->smt_bias = 1 << 30; - else if (task_running_iso(p)) - p->smt_bias = 1 << 29; - else if (idleprio_task(p)) { - if (task_running_idle(p)) - p->smt_bias = 0; - else - p->smt_bias = 1; - } else if (--p->smt_bias < 1) - p->smt_bias = MAX_PRIO - p->static_prio; -#endif -} - -/* - * Timeslices below RESCHED_US are considered as good as expired as there's no - * point rescheduling when there's so little time left. SCHED_BATCH tasks - * have been flagged be not latency sensitive and likely to be fully CPU - * bound so every time they're rescheduled they have their time_slice - * refilled, but get a new later deadline to have little effect on - * SCHED_NORMAL tasks. - - */ -static inline void check_deadline(struct task_struct *p) -{ - if (p->time_slice < RESCHED_US || batch_task(p)) - time_slice_expired(p); -} - -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) - -/* - * Scheduler queue bitmap specific find next bit. 
- */ -static inline unsigned long -next_sched_bit(const unsigned long *addr, unsigned long offset) -{ - const unsigned long *p; - unsigned long result; - unsigned long size; - unsigned long tmp; - - size = PRIO_LIMIT; - if (offset >= size) - return size; - - p = addr + BITOP_WORD(offset); - result = offset & ~(BITS_PER_LONG-1); - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = *p; - -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __ffs(tmp); -} - -/* - * O(n) lookup of all tasks in the global runqueue. The real brainfuck - * of lock contention and O(n). It's not really O(n) as only the queued, - * but not running tasks are scanned, and is O(n) queued in the worst case - * scenario only because the right task can be found before scanning all of - * them. - * Tasks are selected in this order: - * Real time tasks are selected purely by their static priority and in the - * order they were queued, so the lowest value idx, and the first queued task - * of that priority value is chosen. - * If no real time tasks are found, the SCHED_ISO priority is checked, and - * all SCHED_ISO tasks have the same priority value, so they're selected by - * the earliest deadline value. - * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the - * earliest deadline. - * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are - * selected by the earliest deadline. 
- */ -static inline struct -task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -{ - struct task_struct *edt = NULL; - unsigned long idx = -1; - - do { - struct list_head *queue; - struct task_struct *p; - u64 earliest_deadline; - - idx = next_sched_bit(grq.prio_bitmap, ++idx); - if (idx >= PRIO_LIMIT) - return idle; - queue = grq.queue + idx; - - if (idx < MAX_RT_PRIO) { - /* We found an rt task */ - list_for_each_entry(p, queue, run_list) { - /* Make sure cpu affinity is ok */ - if (needs_other_cpu(p, cpu)) - continue; - edt = p; - goto out_take; - } - /* - * None of the RT tasks at this priority can run on - * this cpu - */ - continue; - } - - /* - * No rt tasks. Find the earliest deadline task. Now we're in - * O(n) territory. - */ - earliest_deadline = ~0ULL; - list_for_each_entry(p, queue, run_list) { - u64 dl; - - /* Make sure cpu affinity is ok */ - if (needs_other_cpu(p, cpu)) - continue; - -#ifdef CONFIG_SMT_NICE - if (!smt_should_schedule(p, cpu)) - continue; -#endif - /* - * Soft affinity happens here by not scheduling a task - * with its sticky flag set that ran on a different CPU - * last when the CPU is scaling, or by greatly biasing - * against its deadline when not, based on cpu cache - * locality. 
- */ - if (task_sticky(p) && task_rq(p) != rq) { - if (scaling_rq(rq)) - continue; - dl = p->deadline << locality_diff(p, rq); - } else - dl = p->deadline; - - if (deadline_before(dl, earliest_deadline)) { - earliest_deadline = dl; - edt = p; - } - } - } while (!edt); - -out_take: - take_task(cpu, edt); - return edt; -} - - -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) -{ - if (oops_in_progress) - return; - - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); - - debug_show_held_locks(prev); - print_modules(); - if (irqs_disabled()) - print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif - dump_stack(); - add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -} - -/* - * Various schedule()-time debugging checks and statistics: - */ -static inline void schedule_debug(struct task_struct *prev) -{ -#ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(unlikely(task_stack_end_corrupted(prev))); -#endif - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path. Otherwise whine - * if we are scheduling when we should not. - */ - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) - __schedule_bug(prev); - rcu_sleep_check(); - - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - - schedstat_inc(this_rq(), sched_count); -} - -/* - * The currently running task's information is all stored in rq local data - * which is only modified by the local CPU, thereby allowing the data to be - * changed without grabbing the grq lock. 
- */ -static inline void set_rq_task(struct rq *rq, struct task_struct *p) -{ - rq->rq_time_slice = p->time_slice; - rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran = rq->clock_task; - rq->rq_policy = p->policy; - rq->rq_prio = p->prio; -#ifdef CONFIG_SMT_NICE - rq->rq_mm = p->mm; - rq->rq_smt_bias = p->smt_bias; -#endif - if (p != rq->idle) - rq->rq_running = true; - else - rq->rq_running = false; -} - -static void reset_rq_task(struct rq *rq, struct task_struct *p) -{ - rq->rq_policy = p->policy; - rq->rq_prio = p->prio; -#ifdef CONFIG_SMT_NICE - rq->rq_smt_bias = p->smt_bias; -#endif -} - -#ifdef CONFIG_SMT_NICE -/* Iterate over smt siblings when we've scheduled a process on cpu and decide - * whether they should continue running or be descheduled. */ -static void check_smt_siblings(int cpu) -{ - int other_cpu; - - for_each_cpu(other_cpu, thread_cpumask(cpu)) { - struct task_struct *p; - struct rq *rq; - - if (other_cpu == cpu) - continue; - rq = cpu_rq(other_cpu); - if (rq_idle(rq)) - continue; - if (!rq->online) - continue; - p = rq->curr; - if (!smt_should_schedule(p, cpu)) { - set_tsk_need_resched(p); - smp_send_reschedule(other_cpu); - } - } -} - -static void wake_smt_siblings(int cpu) -{ - int other_cpu; - - if (!queued_notrunning()) - return; - - for_each_cpu(other_cpu, thread_cpumask(cpu)) { - struct rq *rq; - - if (other_cpu == cpu) - continue; - rq = cpu_rq(other_cpu); - if (rq_idle(rq)) { - struct task_struct *p = rq->curr; - - set_tsk_need_resched(p); - smp_send_reschedule(other_cpu); - } - } -} -#else -static void check_smt_siblings(int __maybe_unused cpu) {} -static void wake_smt_siblings(int __maybe_unused cpu) {} -#endif - -/* - * schedule() is the main scheduler function. - * - * The main means of driving the scheduler and thus entering this function are: - * - * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. - * - * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return - * paths. 
For example, see arch/x86/entry_64.S. - * - * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). - * - * 3. Wakeups don't really cause entry into schedule(). They add a - * task to the run-queue and that's it. - * - * Now, if the new task added to the run-queue preempts the current - * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets - * called on the nearest possible occasion: - * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): - * - * - in syscall or exception context, at the next outmost - * preempt_enable(). (this might be as soon as the wake_up()'s - * spin_unlock()!) - * - * - in IRQ context, return from interrupt-handler to - * preemptible context - * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) - * then at the next: - * - * - cond_resched() call - * - explicit schedule() call - * - return from syscall or exception to user-space - * - return from interrupt-handler to user-space - * - * WARNING: all callers must re-check need_resched() afterward and reschedule - * accordingly in case an event triggered the need for rescheduling (such as - * an interrupt waking up a task) while preemption was disabled in __schedule(). - */ -static void __sched __schedule(void) -{ - struct task_struct *prev, *next, *idle; - unsigned long *switch_count; - bool deactivate; - struct rq *rq; - int cpu; - -need_resched: - deactivate = false; - preempt_disable(); - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - rcu_note_context_switch(); - prev = rq->curr; - - schedule_debug(prev); - - /* - * Make sure that signal_pending_state()->signal_pending() below - * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). 
- */ - smp_mb__before_spinlock(); - grq_lock_irq(); - - switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) { - prev->state = TASK_RUNNING; - } else { - deactivate = true; - prev->on_rq = 0; - - /* - * If a worker is going to sleep, notify and - * ask workqueue whether it wants to wake up a - * task to maintain concurrency. If so, wake - * up the task. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev, cpu); - if (to_wakeup) { - /* This shouldn't happen, but does */ - if (unlikely(to_wakeup == prev)) - deactivate = false; - else - try_to_wake_up_local(to_wakeup); - } - } - } - switch_count = &prev->nvcsw; - } - - /* - * If we are going to sleep and we have plugged IO queued, make - * sure to submit it to avoid deadlocks. - */ - if (unlikely(deactivate && blk_needs_flush_plug(prev))) { - grq_unlock_irq(); - preempt_enable_no_resched(); - blk_schedule_flush_plug(prev); - goto need_resched; - } - - update_clocks(rq); - update_cpu_clock_switch(rq, prev); - if (rq->clock - rq->last_tick > HALF_JIFFY_NS) - rq->dither = false; - else - rq->dither = true; - - clear_tsk_need_resched(prev); - clear_preempt_need_resched(); - - idle = rq->idle; - if (idle != prev) { - /* Update all the information stored on struct rq */ - prev->time_slice = rq->rq_time_slice; - prev->deadline = rq->rq_deadline; - check_deadline(prev); - prev->last_ran = rq->clock_task; - - /* Task changed affinity off this CPU */ - if (likely(!needs_other_cpu(prev, cpu))) { - if (!deactivate) { - if (!queued_notrunning()) { - /* - * We now know prev is the only thing that is - * awaiting CPU so we can bypass rechecking for - * the earliest deadline task and just run it - * again. 
- */ - set_rq_task(rq, prev); - check_smt_siblings(cpu); - grq_unlock_irq(); - goto rerun_prev_unlocked; - } else - swap_sticky(rq, cpu, prev); - } - } - return_task(prev, rq, deactivate); - } - - if (unlikely(!queued_notrunning())) { - /* - * This CPU is now truly idle as opposed to when idle is - * scheduled as a high priority task in its own right. - */ - next = idle; - schedstat_inc(rq, sched_goidle); - set_cpuidle_map(cpu); - } else { - next = earliest_deadline_task(rq, cpu, idle); - if (likely(next->prio != PRIO_LIMIT)) - clear_cpuidle_map(cpu); - else - set_cpuidle_map(cpu); - } - - if (likely(prev != next)) { - /* - * Don't reschedule an idle task or deactivated tasks - */ - if (prev != idle && !deactivate) - resched_suitable_idle(prev); - /* - * Don't stick tasks when a real time task is going to run as - * they may literally get stuck. - */ - if (rt_task(next)) - unstick_task(rq, prev); - set_rq_task(rq, next); - if (next != idle) - check_smt_siblings(cpu); - else - wake_smt_siblings(cpu); - grq.nr_switches++; - prev->on_cpu = false; - next->on_cpu = true; - rq->curr = next; - ++*switch_count; - - rq = context_switch(rq, prev, next); /* unlocks the grq */ - cpu = cpu_of(rq); - idle = rq->idle; - } else { - check_smt_siblings(cpu); - grq_unlock_irq(); - } - -rerun_prev_unlocked: - sched_preempt_enable_no_resched(); -} - -asmlinkage __visible void __sched schedule(void) -{ - do { - __schedule(); - } while (need_resched()); -} - -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_CONTEXT_TRACKING -asmlinkage __visible void __sched schedule_user(void) -{ - /* - * If we come here after a random call to set_need_resched(), - * or we have been woken up remotely but the IPI has not yet arrived, - * we haven't yet exited the RCU idle mode. Do it here manually until - * we find a better solution. - * - * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != IN_USER, but that will trigger - * too frequently to make sense yet. 
- */ - enum ctx_state prev_state = exception_enter(); - schedule(); - exception_exit(prev_state); -} -#endif - -/** - * schedule_preempt_disabled - called with preemption disabled - * - * Returns with preemption disabled. Note: preempt_count must be 1 - */ -void __sched schedule_preempt_disabled(void) -{ - sched_preempt_enable_no_resched(); - schedule(); - preempt_disable(); -} - -static void __sched notrace preempt_schedule_common(void) -{ - do { - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (need_resched()); -} - -#ifdef CONFIG_PREEMPT -/* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage __visible void __sched notrace preempt_schedule(void) -{ - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (likely(!preemptible())) - return; - - preempt_schedule_common(); -} -NOKPROBE_SYMBOL(preempt_schedule); -EXPORT_SYMBOL(preempt_schedule); - -#ifdef CONFIG_CONTEXT_TRACKING -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. 
- */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - do { - __preempt_count_add(PREEMPT_ACTIVE); - /* - * Needs preempt disabled in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - prev_ctx = exception_enter(); - __schedule(); - exception_exit(prev_ctx); - - __preempt_count_sub(PREEMPT_ACTIVE); - barrier(); - } while (need_resched()); -} -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_CONTEXT_TRACKING */ - -#endif /* CONFIG_PREEMPT */ - -/* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. - */ -asmlinkage __visible void __sched preempt_schedule_irq(void) -{ - enum ctx_state prev_state; - - /* Catch callers which need to be fixed */ - BUG_ON(preempt_count() || !irqs_disabled()); - - prev_state = exception_enter(); - - do { - __preempt_count_add(PREEMPT_ACTIVE); - local_irq_enable(); - schedule(); - local_irq_disable(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (need_resched()); - - exception_exit(prev_state); -} - -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, - void *key) -{ - return try_to_wake_up(curr->private, mode, wake_flags); -} -EXPORT_SYMBOL(default_wake_function); - -#ifdef CONFIG_RT_MUTEXES - -/* - * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) - * - * This function changes the 'effective' priority of a task. It does - * not touch ->normal_prio like __setscheduler(). - * - * Used by the rt_mutex code to implement priority inheritance - * logic. 
Call site only calls if the priority of the task changed. - */ -void rt_mutex_setprio(struct task_struct *p, int prio) -{ - unsigned long flags; - int queued, oldprio; - struct rq *rq; - - BUG_ON(prio < 0 || prio > MAX_PRIO); - - rq = task_grq_lock(p, &flags); - - /* - * Idle task boosting is a nono in general. There is one - * exception, when PREEMPT_RT and NOHZ is active: - * - * The idle task calls get_next_timer_interrupt() and holds - * the timer wheel base->lock on the CPU and another CPU wants - * to access the timer (probably to cancel it). We can safely - * ignore the boosting request, as the idle CPU runs this code - * with interrupts disabled and will complete the lock - * protected section without being interrupted. So there is no - * real need to boost. - */ - if (unlikely(p == rq->idle)) { - WARN_ON(p != rq->curr); - WARN_ON(p->pi_blocked_on); - goto out_unlock; - } - - trace_sched_pi_setprio(p, prio); - oldprio = p->prio; - queued = task_queued(p); - if (queued) - dequeue_task(p); - p->prio = prio; - if (task_running(p) && prio > oldprio) - resched_task(p); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - -out_unlock: - task_grq_unlock(&flags); -} - -#endif - -/* - * Adjust the deadline for when the priority is to change, before it's - * changed. - */ -static inline void adjust_deadline(struct task_struct *p, int new_prio) -{ - p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); -} - -void set_user_nice(struct task_struct *p, long nice) -{ - int queued, new_static, old_static; - unsigned long flags; - struct rq *rq; - - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) - return; - new_static = NICE_TO_PRIO(nice); - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. 
- */ - rq = time_task_grq_lock(p, &flags); - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL/SCHED_BATCH: - */ - if (has_rt_policy(p)) { - p->static_prio = new_static; - goto out_unlock; - } - queued = task_queued(p); - if (queued) - dequeue_task(p); - - adjust_deadline(p, new_static); - old_static = p->static_prio; - p->static_prio = new_static; - p->prio = effective_prio(p); - - if (queued) { - enqueue_task(p, rq); - if (new_static < old_static) - try_preempt(p, rq); - } else if (task_running(p)) { - reset_rq_task(rq, p); - if (old_static < new_static) - resched_task(p); - } -out_unlock: - task_grq_unlock(&flags); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = nice_to_rlimit(nice); - - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || - capable(CAP_SYS_NICE)); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. 
- */ - - increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); - nice = task_nice(current) + increment; - - nice = clamp_val(nice, MIN_NICE, MAX_NICE); - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -100. Normal tasks are centered around 1, value goes - * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). - */ -int task_prio(const struct task_struct *p) -{ - int delta, prio = p->prio - MAX_RT_PRIO; - - /* rt tasks and iso tasks */ - if (prio <= 0) - goto out; - - /* Convert to ms to avoid overflows */ - delta = NS_TO_MS(p->deadline - grq.niffies); - delta = delta * 40 / ms_longest_deadline_diff(); - if (delta > 0 && delta <= 80) - prio += delta; - if (idleprio_task(p)) - prio += 40; -out: - return prio; -} - -/** - * idle_cpu - is a given cpu idle currently? - * @cpu: the processor in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int idle_cpu(int cpu) -{ - return cpu_curr(cpu) == cpu_rq(cpu)->idle; -} - -/** - * idle_task - return the idle task for a given cpu. - * @cpu: the processor in question. - * - * Return: The idle task for the cpu @cpu. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - * - * The task of @pid, if found. %NULL otherwise. - */ -static inline struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -/* Actually do priority change: must hold grq lock. 
*/ -static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, - int prio, bool keep_boost) -{ - int oldrtprio, oldprio; - - p->policy = policy; - oldrtprio = p->rt_priority; - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - oldprio = p->prio; - /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). - */ - if (keep_boost) - p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); - else - p->prio = p->normal_prio; - if (task_running(p)) { - reset_rq_task(rq, p); - /* Resched only if we might now be preempted */ - if (p->prio > oldprio || p->rt_priority > oldrtprio) - resched_task(p); - } -} - -/* - * check the target process has a UID that matches the current process's - */ -static bool check_same_owner(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred; - bool match; - - rcu_read_lock(); - pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; -} - -static int __sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user) -{ - struct sched_param zero_param = { .sched_priority = 0 }; - int queued, retval, oldpolicy = -1; - unsigned long flags, rlim_rtprio = 0; - int reset_on_fork; - struct rq *rq; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); - - if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { - unsigned long lflags; - - if (!lock_task_sighand(p, &lflags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &lflags); - if (rlim_rtprio) - goto recheck; - /* - * If the caller requested an RT policy without having the - * necessary rights, we downgrade the policy to SCHED_ISO. - * We also set the parameter to zero to pass the checks. 
- */ - policy = SCHED_ISO; - param = &zero_param; - } -recheck: - /* double check policy once rq lock held */ - if (policy < 0) { - reset_on_fork = p->sched_reset_on_fork; - policy = oldpolicy = p->policy; - } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; - - if (!SCHED_RANGE(policy)) - return -EINVAL; - } - - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and - * SCHED_BATCH is 0. - */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) - return -EINVAL; - if (is_rt_policy(policy) != (param->sched_priority != 0)) - return -EINVAL; - - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (is_rt_policy(policy)) { - unsigned long rlim_rtprio = - task_rlimit(p, RLIMIT_RTPRIO); - - /* can't set/change the rt policy */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; - } else { - switch (p->policy) { - /* - * Can only downgrade policies but not back to - * SCHED_NORMAL - */ - case SCHED_ISO: - if (policy == SCHED_ISO) - goto out; - if (policy == SCHED_NORMAL) - return -EPERM; - break; - case SCHED_BATCH: - if (policy == SCHED_BATCH) - goto out; - if (policy != SCHED_IDLEPRIO) - return -EPERM; - break; - case SCHED_IDLEPRIO: - if (policy == SCHED_IDLEPRIO) - goto out; - return -EPERM; - default: - break; - } - } - - /* can't change other user's priorities */ - if (!check_same_owner(p)) - return -EPERM; - - /* Normal users shall not reset the sched_reset_on_fork flag */ - if (p->sched_reset_on_fork && !reset_on_fork) - return -EPERM; - } - - if (user) { - retval = security_task_setscheduler(p); - if (retval) - return retval; - } - - /* - * make sure no PI-waiters 
arrive (or leave) while we are - * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* - * To be able to change p->policy safely, the grunqueue lock must be - * held. - */ - rq = __task_grq_lock(p); - - /* - * Changing the policy of the stop threads its a very bad idea - */ - if (p == rq->stop) { - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - return -EINVAL; - } - - /* - * If not changing anything there's no need to proceed further: - */ - if (unlikely(policy == p->policy && (!is_rt_policy(policy) || - param->sched_priority == p->rt_priority))) { - - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - return 0; - } - - /* recheck policy now with rq lock held */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - goto recheck; - } - update_clocks(rq); - p->sched_reset_on_fork = reset_on_fork; - - queued = task_queued(p); - if (queued) - dequeue_task(p); - __setscheduler(p, rq, policy, param->sched_priority, true); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - - rt_mutex_adjust_pi(p); -out: - return 0; -} - -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - * - * NOTE that the task may be already dead. 
- */ -int sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, true); -} - -EXPORT_SYMBOL_GPL(sched_setscheduler); - -int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -{ - const struct sched_param param = { .sched_priority = attr->sched_priority }; - int policy = attr->sched_policy; - - return __sched_setscheduler(p, policy, &param, true); -} -EXPORT_SYMBOL_GPL(sched_setattr); - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. - * - * Return: 0 on success. An error code otherwise. - */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, false); -} - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); - rcu_read_unlock(); - - return retval; -} - -/* - * Mimics kernel/events/core.c perf_copy_attr().
- */ -static int sched_copy_attr(struct sched_attr __user *uattr, - struct sched_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * XXX: do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? - */ - attr->sched_nice = clamp(attr->sched_nice, -20, 19); - - /* sched/core.c uses zero here but we already know ret is zero */ - return 0; - -err_size: - put_user(sizeof(*attr), &uattr->size); - return -E2BIG; -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * - * Return: 0 on success. An error code otherwise. - * @param: structure containing the new RT priority. 
- */ -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, - struct sched_param __user *param) -{ - /* negative values for policy are not valid */ - if (policy < 0) - return -EINVAL; - - return do_sched_setscheduler(pid, policy, param); -} - -/* - * sched_setparam() passes in -1 for its policy, to let the functions - * it calls know not to change it. - */ -#define SETPARAM_POLICY -1 - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -{ - return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -} - -/** - * sys_sched_setattr - same as above, but with extended sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - */ -SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, flags) -{ - struct sched_attr attr; - struct task_struct *p; - int retval; - - if (!uattr || pid < 0 || flags) - return -EINVAL; - - retval = sched_copy_attr(uattr, &attr); - if (retval) - return retval; - - if ((int)attr.sched_policy < 0) - return -EINVAL; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setattr(p, &attr); - rcu_read_unlock(); - - return retval; -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - * - * Return: On success, the policy of the thread. Otherwise, a negative error - * code. 
- */ -SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -{ - struct task_struct *p; - int retval = -EINVAL; - - if (pid < 0) - goto out_nounlock; - - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy; - } - rcu_read_unlock(); - -out_nounlock: - return retval; -} - -/** - * sys_sched_getscheduler - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - * - * Return: On success, 0 and the RT priority is in @param. Otherwise, an error - * code. - */ -SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -{ - struct sched_param lp = { .sched_priority = 0 }; - struct task_struct *p; - int retval = -EINVAL; - - if (!param || pid < 0) - goto out_nounlock; - - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - if (has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - -out_nounlock: - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -static int sched_read_attr(struct sched_attr __user *uattr, - struct sched_attr *attr, - unsigned int usize) -{ - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, usize)) - return -EFAULT; - - /* - * If we're handed a smaller struct than we know of, - * ensure all the unknown bits are 0 - i.e. old - * user-space does not get uncomplete information. 
- */ - if (usize < sizeof(*attr)) { - unsigned char *addr; - unsigned char *end; - - addr = (void *)attr + usize; - end = (void *)attr + sizeof(*attr); - - for (; addr < end; addr++) { - if (*addr) - return -EFBIG; - } - - attr->size = usize; - } - - ret = copy_to_user(uattr, attr, attr->size); - if (ret) - return -EFAULT; - - /* sched/core.c uses zero here but we already know ret is zero */ - return ret; -} - -/** - * sys_sched_getattr - similar to sched_getparam, but with sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @size: sizeof(attr) for fwd/bwd comp. - * @flags: for future extension. - */ -SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size, unsigned int, flags) -{ - struct sched_attr attr = { - .size = sizeof(struct sched_attr), - }; - struct task_struct *p; - int retval; - - if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0 || flags) - return -EINVAL; - - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - attr.sched_policy = p->policy; - if (rt_task(p)) - attr.sched_priority = p->rt_priority; - else - attr.sched_nice = task_nice(p); - - rcu_read_unlock(); - - retval = sched_read_attr(uattr, &attr, size); - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; - int retval; - - get_online_cpus(); - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); - put_online_cpus(); - return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - 
goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; - if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - goto out_unlock; - } - rcu_read_unlock(); - } - - retval = security_task_setscheduler(p); - if (retval) - goto out_unlock; - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); -again: - retval = set_cpus_allowed_ptr(p, new_mask); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_unlock: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); -out_put_task: - put_task_struct(p); - put_online_cpus(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) -{ - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - * - * Return: 0 on success. An error code otherwise. 
- */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; -} - -long sched_getaffinity(pid_t pid, cpumask_t *mask) -{ - struct task_struct *p; - unsigned long flags; - int retval; - - get_online_cpus(); - rcu_read_lock(); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - grq_lock_irqsave(&flags); - cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); - grq_unlock_irqrestore(&flags); - -out_unlock: - rcu_read_unlock(); - put_online_cpus(); - - return retval; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(unsigned long)-1)) - return -EINVAL; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); - - if (copy_to_user(user_mask_ptr, mask, retlen)) - ret = -EFAULT; - else - ret = retlen; - } - free_cpumask_var(mask); - - return ret; -} - -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. It does this by - * scheduling away the current task. 
If it still has the earliest deadline - * it will be scheduled again as the next task. - * - * Return: 0. - */ -SYSCALL_DEFINE0(sched_yield) -{ - struct task_struct *p; - - p = current; - grq_lock_irq(); - schedstat_inc(task_rq(p), yld_count); - requeue_task(p); - - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); - sched_preempt_enable_no_resched(); - - schedule(); - - return 0; -} - -int __sched _cond_resched(void) -{ - if (should_resched()) { - preempt_schedule_common(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(_cond_resched); - -/* - * __cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). - */ -int __cond_resched_lock(spinlock_t *lock) -{ - int resched = should_resched(); - int ret = 0; - - lockdep_assert_held(lock); - - if (spin_needbreak(lock) || resched) { - spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else - cpu_relax(); - ret = 1; - spin_lock(lock); - } - return ret; -} -EXPORT_SYMBOL(__cond_resched_lock); - -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched()) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - -/** - * yield - yield the current processor to other threads. - * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. 
- * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! - * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} -EXPORT_SYMBOL(yield); - -/** - * yield_to - yield the current processor to another thread in - * your thread group, or accelerate that thread toward the - * processor it's on. - * @p: target task - * @preempt: whether task preemption is allowed or not - * - * It's the caller's job to ensure that the target task struct - * can't go away on us before we can do any checks. - * - * Return: - * true (>0) if we indeed boosted the target task. - * false (0) if we failed to boost the target. - * -ESRCH if there's no task to yield to. - */ -int __sched yield_to(struct task_struct *p, bool preempt) -{ - struct rq *rq, *p_rq; - unsigned long flags; - int yielded = 0; - - rq = this_rq(); - grq_lock_irqsave(&flags); - if (task_running(p) || p->state) { - yielded = -ESRCH; - goto out_unlock; - } - - p_rq = task_rq(p); - yielded = 1; - if (p->deadline > rq->rq_deadline) - p->deadline = rq->rq_deadline; - p->time_slice += rq->rq_time_slice; - rq->rq_time_slice = 0; - if (p->time_slice > timeslice()) - p->time_slice = timeslice(); - if (preempt && rq != p_rq) - resched_curr(p_rq); -out_unlock: - grq_unlock_irqrestore(&flags); - - if (yielded > 0) - schedule(); - return yielded; -} -EXPORT_SYMBOL_GPL(yield_to); - -/* - * This task is about to go to sleep on IO. Increment rq->nr_iowait so - * that process accounting knows that this is a task in IO wait state. 
- * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) - */ - -long __sched io_schedule_timeout(long timeout) -{ - int old_iowait = current->in_iowait; - struct rq *rq; - long ret; - - current->in_iowait = 1; - blk_schedule_flush_plug(current); - - delayacct_blkio_start(); - rq = raw_rq(); - atomic_inc(&rq->nr_iowait); - ret = schedule_timeout(timeout); - current->in_iowait = old_iowait; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); - - return ret; -} -EXPORT_SYMBOL(io_schedule_timeout); - -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the maximum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_ISO: - case SCHED_IDLEPRIO: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the minimum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_ISO: - case SCHED_IDLEPRIO: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * - * Return: On success, 0 and the timeslice is in @interval. 
Otherwise, - * an error code. - */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) -{ - struct task_struct *p; - unsigned int time_slice; - unsigned long flags; - int retval; - struct timespec t; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - grq_lock_irqsave(&flags); - time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); - - rcu_read_unlock(); - t = ns_to_timespec(time_slice); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - -void sched_show_task(struct task_struct *p) -{ - unsigned long free = 0; - int ppid; - unsigned long state = p->state; - - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - free = stack_not_used(p); -#endif - ppid = 0; - rcu_read_lock(); - if (pid_alive(p)) - ppid = task_pid_nr(rcu_dereference(p->real_parent)); - rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); - - print_worker_info(KERN_INFO, p); - show_stack(p, NULL); -} - -void show_state_filter(unsigned long state_filter) -{ - struct task_struct *g, *p; - -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif - rcu_read_lock(); - for_each_process_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take a lot of time: - */ - touch_nmi_watchdog(); - if (!state_filter || (p->state & state_filter)) - sched_show_task(p); - } - - touch_all_softlockup_watchdogs(); - - rcu_read_unlock(); - /* - * Only show locks if all tasks are dumped: - */ - if (!state_filter) - debug_show_all_locks(); -} - -void dump_cpu_task(int cpu) -{ - pr_info("Task dump for CPU %d:\n", cpu); - sched_show_task(cpu_curr(cpu)); -} - -#ifdef CONFIG_SMP -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - cpumask_copy(tsk_cpus_allowed(p), new_mask); -} -#endif - -/** - * init_idle - set up an idle thread for a given CPU - * @idle: task in question - * @cpu: cpu the idle task belongs to - * - * NOTE: this function does not set the idle thread's NEED_RESCHED - * flag, to make booting more robust. 
- */ -void init_idle(struct task_struct *idle, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - time_grq_lock(rq, &flags); - idle->last_ran = rq->clock_task; - idle->state = TASK_RUNNING; - /* Setting prio to illegal value shouldn't matter when never queued */ - idle->prio = PRIO_LIMIT; -#ifdef CONFIG_SMT_NICE - idle->smt_bias = 0; -#endif - set_rq_task(rq, idle); - do_set_cpus_allowed(idle, get_cpu_mask(cpu)); - /* Silence PROVE_RCU */ - rcu_read_lock(); - set_task_cpu(idle, cpu); - rcu_read_unlock(); - rq->curr = rq->idle = idle; - idle->on_cpu = 1; - grq_unlock_irqrestore(&flags); - - /* Set the preempt count _outside_ the spinlocks! */ - init_idle_preempt_count(idle, cpu); - - ftrace_graph_init_idle_task(idle, cpu); -#if defined(CONFIG_SMP) - sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif -} - -int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, - const struct cpumask __maybe_unused *trial) -{ - return 1; -} - -int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) -{ - int ret = 0; - - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. 
- */ - if (p->flags & PF_NO_SETAFFINITY) - ret = -EINVAL; - - return ret; -} - -void resched_cpu(int cpu) -{ - unsigned long flags; - - grq_lock_irqsave(&flags); - resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); -} - -#ifdef CONFIG_SMP -#ifdef CONFIG_NO_HZ_COMMON -void nohz_balance_enter_idle(int cpu) -{ -} - -void select_nohz_load_balancer(int stop_tick) -{ -} - -void set_cpu_sd_state_idle(void) {} -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu: The cpu whose lowest level of sched domain is to - * be returned. - * @flag: The flag to check for the lowest sched_domain - * for the given cpu. - * - * Returns the lowest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) - if (sd && (sd->flags & flag)) - break; - - return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu: The cpu whose domains we're iterating over. - * @sd: variable holding the value of the power_savings_sd - * for cpu. - * @flag: The flag to filter the sched_domains to be iterated. - * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. - */ -#define for_each_flag_domain(cpu, sd, flag) \ - for (sd = lowest_flag_domain(cpu, flag); \ - (sd && (sd->flags & flag)); sd = sd->parent) - -#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ - -/* - * In the semi idle case, use the nearest busy cpu for migrating timers - * from an idle cpu. This is good for power-savings. - * - * We don't do similar optimization for completely idle system, as - * selecting an idle cpu will add more delays to the timers than intended - * (as that cpu's timer base may not be uptodate wrt jiffies etc). 
- */ -int get_nohz_timer_target(int pinned) -{ - int cpu = smp_processor_id(); - int i; - struct sched_domain *sd; - - if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) - return cpu; - - rcu_read_lock(); - for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { - cpu = i; - goto unlock; - } - } - } -unlock: - rcu_read_unlock(); - return cpu; -} - -/* - * When add_timer_on() enqueues a timer into the timer wheel of an - * idle CPU then this timer might expire before the next timer event - * which is scheduled to wake up that CPU. In case of a completely - * idle system the next event might even be infinite time into the - * future. wake_up_idle_cpu() ensures that the CPU is woken up and - * leaves the inner idle loop so the newly added timer is taken into - * account when the CPU goes back to idle and evaluates the timer - * wheel for the next timer event. - */ -void wake_up_idle_cpu(int cpu) -{ - if (cpu == smp_processor_id()) - return; - - set_tsk_need_resched(cpu_rq(cpu)->idle); - smp_send_reschedule(cpu); -} - -void wake_up_nohz_cpu(int cpu) -{ - wake_up_idle_cpu(cpu); -} -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. 
- */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - bool running_wrong = false; - bool queued = false; - unsigned long flags; - struct rq *rq; - int ret = 0; - - rq = task_grq_lock(p, &flags); - - if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) - goto out; - - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; - } - - queued = task_queued(p); - - do_set_cpus_allowed(p, new_mask); - - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; - - if (task_running(p)) { - /* Task is running on the wrong cpu now, reschedule it. */ - if (rq == this_rq()) { - set_tsk_need_resched(p); - running_wrong = true; - } else - resched_task(p); - } else - set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask)); - -out: - if (queued) - try_preempt(p, rq); - task_grq_unlock(&flags); - - if (running_wrong) - preempt_schedule_common(); - - return ret; -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -#ifdef CONFIG_HOTPLUG_CPU -extern struct task_struct *cpu_stopper_task; -/* Run through task list and find tasks affined to the dead cpu, then remove - * that cpu from the list, enable cpu0 and set the zerobound flag. */ -static void bind_zero(int src_cpu) -{ - struct task_struct *p, *t, *stopper; - int bound = 0; - - if (src_cpu == 0) - return; - - stopper = per_cpu(cpu_stopper_task, src_cpu); - do_each_thread(t, p) { - if (p != stopper && cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) { - cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p)); - cpumask_set_cpu(0, tsk_cpus_allowed(p)); - p->zerobound = true; - bound++; - } - clear_sticky(p); - } while_each_thread(t, p); - - if (bound) { - printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", - bound, src_cpu); - } -} - -/* Find processes with the zerobound flag and reenable their affinity for the - * CPU coming alive. 
*/ -static void unbind_zero(int src_cpu) -{ - int unbound = 0, zerobound = 0; - struct task_struct *p, *t; - - if (src_cpu == 0) - return; - - do_each_thread(t, p) { - if (!p->mm) - p->zerobound = false; - if (p->zerobound) { - unbound++; - cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p)); - /* Once every CPU affinity has been re-enabled, remove - * the zerobound flag */ - if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) { - p->zerobound = false; - zerobound++; - } - } - } while_each_thread(t, p); - - if (unbound) { - printk(KERN_INFO "Added affinity for %d processes to cpu %d\n", - unbound, src_cpu); - } - if (zerobound) { - printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n", - zerobound); - } -} - -/* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. - */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); - finish_arch_post_lock_switch(); - } - mmdrop(mm); -} -#else /* CONFIG_HOTPLUG_CPU */ -static void unbind_zero(int src_cpu) {} -#endif /* CONFIG_HOTPLUG_CPU */ - -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param stop_param = { .sched_priority = STOP_PRIO }; - struct sched_param start_param = { .sched_priority = 0 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. - */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling policy so that - * it can die in pieces. 
- */ - sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); - } -} - - -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - mode_t mode, proc_handler *proc_handler) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - 
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; 
- entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif - -static void set_rq_online(struct rq *rq) -{ - if (!rq->online) { - cpumask_set_cpu(cpu_of(rq), rq->rd->online); - rq->online = true; - } -} - -static void set_rq_offline(struct rq *rq) -{ - if (rq->online) { - cpumask_clear_cpu(cpu_of(rq), rq->rd->online); - rq->online = false; - } -} - -/* - * migration_call - callback that gets triggered when a CPU is added. - */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - unsigned long flags; - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_HOTPLUG_CPU - struct task_struct *idle = rq->idle; -#endif - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - return NOTIFY_OK; - case CPU_UP_PREPARE: - break; - - case CPU_ONLINE: - /* Update our root-domain */ - grq_lock_irqsave(&flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - unbind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - grq_lock_irq(); - set_rq_task(rq, idle); - update_clocks(rq); - grq_unlock_irq(); - break; - - case CPU_DYING: - /* Update our root-domain */ - grq_lock_irqsave(&flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - bind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); - break; -#endif - } - return NOTIFY_OK; -} - -/* - * Register at 
high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. - */ -static struct notifier_block migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static int sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - set_cpu_active((long)hcpu, true); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialise migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP - -static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ - -#ifdef CONFIG_SCHED_DEBUG - -static __read_mostly int sched_debug_enabled; - -static int __init sched_debug_setup(char *str) -{ - sched_debug_enabled = 1; - - return 0; -} -early_param("sched_debug", sched_debug_setup); - -static inline bool sched_debug(void) -{ - return sched_debug_enabled; -} - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - struct cpumask *groupmask) -{ - cpumask_clear(groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if 
(!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %*pbl level %s\n", - cpumask_pr_args(sched_domain_span(sd)), sd->name); - - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - - printk(KERN_CONT "\n"); - - if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && - !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - int level = 0; - - if (!sched_debug_enabled) - return; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } -} -#else /* !CONFIG_SCHED_DEBUG */ -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_AFFINE)) - return 0; - - return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) - return 0; - - if (~cflags & pflags) - return 0; - - return 1; -} - -static void free_rootdomain(struct rcu_head *rcu) -{ - struct root_domain 
*rd = container_of(rcu, struct root_domain, rcu); - - cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); - free_cpumask_var(rd->online); - free_cpumask_var(rd->span); - kfree(rd); -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - struct root_domain *old_rd = NULL; - unsigned long flags; - - grq_lock_irqsave(&flags); - - if (rq->rd) { - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - - /* - * If we dont want to free the old_rd yet then - * set old_rd to NULL to skip the freeing later - * in this function: - */ - if (!atomic_dec_and_test(&old_rd->refcount)) - old_rd = NULL; - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - - grq_unlock_irqrestore(&flags); - - if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); -} - -static int init_rootdomain(struct root_domain *rd) -{ - memset(rd, 0, sizeof(*rd)); - - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) - goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) - goto free_online; - - if (cpupri_init(&rd->cpupri) != 0) - goto free_rto_mask; - return 0; - -free_rto_mask: - free_cpumask_var(rd->rto_mask); -free_online: - free_cpumask_var(rd->online); -free_span: - free_cpumask_var(rd->span); -out: - return -ENOMEM; -} - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain); - - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - if (init_rootdomain(rd) != 0) { - kfree(rd); - return NULL; - } - - return rd; -} - -static void free_sched_domain(struct rcu_head *rcu) -{ - struct sched_domain *sd = container_of(rcu, struct sched_domain, 
rcu); - - kfree(sd); -} - -static void destroy_sched_domain(struct sched_domain *sd, int cpu) -{ - call_rcu(&sd->rcu, free_sched_domain); -} - -static void destroy_sched_domains(struct sched_domain *sd, int cpu) -{ - for (; sd; sd = sd->parent) - destroy_sched_domain(sd, cpu); -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. - */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - /* - * Transfer SD_PREFER_SIBLING down in case of a - * degenerate parent; the spans match for this - * so the property transfers. - */ - if (parent->flags & SD_PREFER_SIBLING) - tmp->flags |= SD_PREFER_SIBLING; - destroy_sched_domain(parent, cpu); - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - tmp = sd; - sd = sd->parent; - destroy_sched_domain(tmp, cpu); - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); -} - -/* cpus with isolated domains */ -cpumask_var_t cpu_isolated_map; - -/* Setup the mask of cpus configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); - return 1; -} - -__setup("isolcpus=", isolated_cpu_setup); - -struct s_data { - struct sched_domain ** __percpu sd; - struct root_domain *rd; -}; - -enum s_alloc { - sa_rootdomain, - sa_sd, - sa_sd_storage, - sa_none, -}; - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure 
in build_sched_domains() - */ - -static int default_relax_domain_level = -1; -int sched_domain_level_max; - -static int __init setup_relax_domain_level(char *str) -{ - if (kstrtoint(str, 0, &default_relax_domain_level)) - pr_warn("Unable to set relax_domain_level\n"); - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* turn off idle balance on this domain */ - sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* turn on idle balance on this domain */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } -} - -static void __sdt_free(const struct cpumask *cpu_map); -static int __sdt_alloc(const struct cpumask *cpu_map); - -static void __free_domain_allocs(struct s_data *d, enum s_alloc what, - const struct cpumask *cpu_map) -{ - switch (what) { - case sa_rootdomain: - if (!atomic_read(&d->rd->refcount)) - free_rootdomain(&d->rd->rcu); /* fall through */ - case sa_sd: - free_percpu(d->sd); /* fall through */ - case sa_sd_storage: - __sdt_free(cpu_map); /* fall through */ - case sa_none: - break; - } -} - -static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, - const struct cpumask *cpu_map) -{ - memset(d, 0, sizeof(*d)); - - if (__sdt_alloc(cpu_map)) - return sa_sd_storage; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) - return sa_sd_storage; - d->rd = alloc_rootdomain(); - if (!d->rd) - return sa_sd; - return sa_rootdomain; -} - -/* - * NULL the sd_data elements we've used to build the sched_domain - * structure so that the subsequent __free_domain_allocs() - * will not free the data we're using. 
- */ -static void claim_allocations(int cpu, struct sched_domain *sd) -{ - struct sd_data *sdd = sd->private; - - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; -} - -#ifdef CONFIG_NUMA -static int sched_domains_numa_levels; -static int *sched_domains_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; -#endif - -/* - * SD_flags allowed in topology descriptions. - * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * - * Odd one out: - * SD_ASYM_PACKING - describes SMT quirks - */ -#define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ - SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) - -static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) -{ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int sd_weight, sd_flags = 0; - -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... 
- */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); - - if (tl->sd_flags) - sd_flags = (*tl->sd_flags)(); - if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; - - *sd = (struct sched_domain){ - .min_interval = sd_weight, - .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, - - .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, - - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE - | 1*SD_BALANCE_EXEC - | 1*SD_BALANCE_FORK - | 0*SD_BALANCE_WAKE - | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUCAPACITY - | 0*SD_SHARE_PKG_RESOURCES - | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING - | 0*SD_NUMA - | sd_flags - , - - .last_balance = jiffies, - .balance_interval = sd_weight, - .smt_gain = 0, - .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, -#ifdef CONFIG_SCHED_DEBUG - .name = tl->name, -#endif - }; - - /* - * Convert topological properties into behaviour. - */ - - if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; - sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ - - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->imbalance_pct = 117; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - -#ifdef CONFIG_NUMA - } else if (sd->flags & SD_NUMA) { - sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; - - sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { - sd->flags &= ~(SD_BALANCE_EXEC | - SD_BALANCE_FORK | - SD_WAKE_AFFINE); - } - -#endif - } else { - sd->flags |= SD_PREFER_SIBLING; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; - } - - sd->private = &tl->data; - - return sd; -} - -/* - * Topology list, bottom-up. 
- */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - -struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) - -void set_sched_topology(struct sched_domain_topology_level *tl) -{ - sched_domain_topology = tl; -} - -#ifdef CONFIG_NUMA - -static const struct cpumask *sd_numa_mask(int cpu) -{ - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; -} - -static void sched_numa_warn(const char *str) -{ - static int done = false; - int i,j; - - if (done) - return; - - done = true; - - printk(KERN_WARNING "ERROR: %s\n\n", str); - - for (i = 0; i < nr_node_ids; i++) { - printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); - printk(KERN_CONT "\n"); - } - printk(KERN_WARNING "\n"); -} - -static bool find_numa_distance(int distance) -{ - int i; - - if (distance == node_distance(0, 0)) - return true; - - for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; - } - - return false; -} - -static void sched_init_numa(void) -{ - int next_distance, curr_distance = node_distance(0, 0); - struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the - * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. 
- */ - next_distance = curr_distance; - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); - } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; - } - - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; - } - /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). - * - * The sched_domains_numa_distance[] array includes the actual distance - * numbers. - */ - - /* - * Here, we should temporarily reset sched_domains_numa_levels to 0. - * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be - * dangerous when we use it to iterate array sched_domains_numa_masks[][] - * in other functions. - * - * We reset it to 'level' at the end of this function. - */ - sched_domains_numa_levels = 0; - - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); - if (!sched_domains_numa_masks) - return; - - /* - * Now for each level, construct a mask per node which contains all - * cpus of nodes that are that many hops away from us. 
- */ - for (i = 0; i < level; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) - return; - - for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!mask) - return; - - sched_domains_numa_masks[i][j] = mask; - - for (k = 0; k < nr_node_ids; k++) { - if (node_distance(j, k) > sched_domains_numa_distance[i]) - continue; - - cpumask_or(mask, mask, cpumask_of_node(k)); - } - } - } - - /* Compute default topology size */ - for (i = 0; sched_domain_topology[i].mask; i++); - - tl = kzalloc((i + level + 1) * - sizeof(struct sched_domain_topology_level), GFP_KERNEL); - if (!tl) - return; - - /* - * Copy the default topology bits.. - */ - for (i = 0; sched_domain_topology[i].mask; i++) - tl[i] = sched_domain_topology[i]; - - /* - * .. and append 'j' levels of NUMA goodness. - */ - for (j = 0; j < level; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; - } - - sched_domain_topology = tl; - - sched_domains_numa_levels = level; -} - -static void sched_domains_numa_masks_set(int cpu) -{ - int i, j; - int node = cpu_to_node(cpu); - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) { - if (node_distance(j, node) <= sched_domains_numa_distance[i]) - cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); - } - } -} - -static void sched_domains_numa_masks_clear(int cpu) -{ - int i, j; - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); - } -} - -/* - * Update sched_domains_numa_masks[level][node] array when new cpus - * are onlined. 
- */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - sched_domains_numa_masks_set(cpu); - break; - - case CPU_DEAD: - sched_domains_numa_masks_clear(cpu); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} -#else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - return 0; -} -#endif /* CONFIG_NUMA */ - -static int __sdt_alloc(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - sdd->sd = alloc_percpu(struct sched_domain *); - if (!sdd->sd) - return -ENOMEM; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sd) - return -ENOMEM; - - *per_cpu_ptr(sdd->sd, j) = sd; - } - } - - return 0; -} - -static void __sdt_free(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - - if (sdd->sd) { - sd = *per_cpu_ptr(sdd->sd, j); - kfree(*per_cpu_ptr(sdd->sd, j)); - } - } - free_percpu(sdd->sd); - sdd->sd = NULL; - } -} - -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) -{ - struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; - - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - if (child) { - sd->level = child->level + 1; - sched_domain_level_max = max(sched_domain_level_max, sd->level); - child->parent = sd; - sd->child = child; - - if 
(!cpumask_subset(sched_domain_span(child), - sched_domain_span(sd))) { - pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG - pr_err(" the %s domain not a subset of the %s domain\n", - child->name, sd->name); -#endif - /* Fixup, ensure @sd has at least @child cpus. */ - cpumask_or(sched_domain_span(sd), - sched_domain_span(sd), - sched_domain_span(child)); - } - - } - set_domain_attribute(sd, attr); - - return sd; -} - -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) -{ - enum s_alloc alloc_state; - struct sched_domain *sd; - struct s_data d; - int i, ret = -ENOMEM; - - alloc_state = __visit_domain_allocation_hell(&d, cpu_map); - if (alloc_state != sa_rootdomain) - goto error; - - /* Set up domains for cpus specified by the cpu_map. */ - for_each_cpu(i, cpu_map) { - struct sched_domain_topology_level *tl; - - sd = NULL; - for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); - if (tl == sched_domain_topology) - *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; - } - } - - /* Calculate CPU capacity for physical packages and nodes */ - for (i = nr_cpumask_bits-1; i >= 0; i--) { - if (!cpumask_test_cpu(i, cpu_map)) - continue; - - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); - } - } - - /* Attach the domains */ - rcu_read_lock(); - for_each_cpu(i, cpu_map) { - sd = *per_cpu_ptr(d.sd, i); - cpu_attach_domain(sd, d.rd, i); - } - rcu_read_unlock(); - - ret = 0; -error: - __free_domain_allocs(&d, alloc_state, cpu_map); - return ret; -} - -static cpumask_var_t *doms_cur; /* current sched domains */ -static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - /* attribues of custom 
domains in 'doms_cur' */ - -/* - * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask) fails, then fallback to a single sched domain, - * as determined by the single cpumask fallback_doms. - */ -static cpumask_var_t fallback_doms; - -/* - * arch_update_cpu_topology lets virtualized architectures update the - * cpu core maps. It is supposed to return 1 if the topology changed - * or 0 if it stayed the same. - */ -int __weak arch_update_cpu_topology(void) -{ - return 0; -} - -cpumask_var_t *alloc_sched_domains(unsigned int ndoms) -{ - int i; - cpumask_var_t *doms; - - doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); - if (!doms) - return NULL; - for (i = 0; i < ndoms; i++) { - if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { - free_sched_domains(doms, i); - return NULL; - } - } - return doms; -} - -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) -{ - unsigned int i; - for (i = 0; i < ndoms; i++) - free_cpumask_var(doms[i]); - kfree(doms); -} - -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. 
- */ -static int init_sched_domains(const struct cpumask *cpu_map) -{ - int err; - - arch_update_cpu_topology(); - ndoms_cur = 1; - doms_cur = alloc_sched_domains(ndoms_cur); - if (!doms_cur) - doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); - - return err; -} - -/* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain - */ -static void detach_destroy_domains(const struct cpumask *cpu_map) -{ - int i; - - rcu_read_lock(); - for_each_cpu(i, cpu_map) - cpu_attach_domain(NULL, &def_root_domain, i); - rcu_read_unlock(); -} - -/* handle null as "default" */ -static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, - struct sched_domain_attr *new, int idx_new) -{ - struct sched_domain_attr tmp; - - /* fast path */ - if (!new && !cur) - return 1; - - tmp = SD_ATTR_INIT; - return !memcmp(cur ? (cur + idx_cur) : &tmp, - new ? (new + idx_new) : &tmp, - sizeof(struct sched_domain_attr)); -} - -/* - * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares - * doms_new[] to the current sched domain partitioning, doms_cur[]. - * It destroys each deleted domain and builds each new domain. - * - * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the - * current 'doms_cur' domains and in the new 'doms_new', we can leave - * it as it is. - * - * The passed in 'doms_new' should be allocated using - * alloc_sched_domains. This routine takes ownership of it and will - * free_sched_domains it when done with it. 
If the caller failed the - * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. - * - * If doms_new == NULL it will be replaced with cpu_online_mask. - * ndoms_new == 0 is a special case for destroying existing domains, - * and it will not create the default domain. - * - * Call with hotplug lock held - */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - int i, j, n; - int new_topology; - - mutex_lock(&sched_domains_mutex); - - /* always unregister in case we don't destroy any domains */ - unregister_sched_domain_sysctl(); - - /* Let architecture update cpu core mappings. */ - new_topology = arch_update_cpu_topology(); - - n = doms_new ? ndoms_new : 0; - - /* Destroy deleted domains */ - for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) - goto match1; - } - /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur[i]); -match1: - ; - } - - n = ndoms_cur; - if (doms_new == NULL) { - n = 0; - doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); - } - - /* Build new domains */ - for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) - goto match2; - } - /* no match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? 
dattr_new + i : NULL); -match2: - ; - } - - /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) - free_sched_domains(doms_cur, ndoms_cur); - kfree(dattr_cur); /* kfree(NULL) is safe */ - doms_cur = doms_new; - dattr_cur = dattr_new; - ndoms_cur = ndoms_new; - - register_sched_domain_sysctl(); - - mutex_unlock(&sched_domains_mutex); -} - -static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ - -/* - * Update cpusets according to cpu_active mask. If cpusets are - * disabled, cpuset_update_active_cpus() becomes a simple wrapper - * around partition_sched_domains(). - * - * If we come here as part of a suspend/resume, don't touch cpusets because we - * want to restore it back to its original state upon resume anyway. - */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - switch (action) { - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED_FROZEN: - - /* - * num_cpus_frozen tracks how many CPUs are involved in suspend - * resume sequence. As long as this is not the last online - * operation in the resume sequence, just build a single sched - * domain, ignoring cpusets. - */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); - break; - } - - /* - * This is the last CPU online operation. So fall through and - * restore the original sched domains by considering the - * cpuset configurations. 
- */ - - case CPU_ONLINE: - cpuset_update_active_cpus(true); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - switch (action) { - case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(false); - break; - case CPU_DOWN_PREPARE_FROZEN: - num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) -/* - * Cheaper version of the below functions in case support for SMT and MC is - * compiled in but CPUs have no siblings. - */ -static bool sole_cpu_idle(int cpu) -{ - return rq_idle(cpu_rq(cpu)); -} -#endif -#ifdef CONFIG_SCHED_SMT -static const cpumask_t *thread_cpumask(int cpu) -{ - return topology_thread_cpumask(cpu); -} -/* All this CPU's SMT siblings are idle */ -static bool siblings_cpu_idle(int cpu) -{ - return cpumask_subset(thread_cpumask(cpu), &grq.cpu_idle_map); -} -#endif -#ifdef CONFIG_SCHED_MC -static const cpumask_t *core_cpumask(int cpu) -{ - return topology_core_cpumask(cpu); -} -/* All this CPU's shared cache siblings are idle */ -static bool cache_cpu_idle(int cpu) -{ - return cpumask_subset(core_cpumask(cpu), &grq.cpu_idle_map); -} -#endif - -enum sched_domain_level { - SD_LV_NONE = 0, - SD_LV_SIBLING, - SD_LV_MC, - SD_LV_BOOK, - SD_LV_CPU, - SD_LV_NODE, - SD_LV_ALLNODES, - SD_LV_MAX -}; - -void __init sched_init_smp(void) -{ - struct sched_domain *sd; - int cpu, other_cpu; - - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - - sched_init_numa(); - - /* - * There's no userspace yet to cause hotplug operations; hence all the - * cpu masks are stable and all blatant races in the below code cannot - * happen. 
- */ - mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); - mutex_unlock(&sched_domains_mutex); - - hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) - BUG(); - free_cpumask_var(non_isolated_cpus); - - grq_lock_irq(); - /* - * Set up the relative cache distance of each online cpu from each - * other in a simple array for quick lookup. Locality is determined - * by the closest sched_domain that CPUs are separated by. CPUs with - * shared cache in SMT and MC are treated as local. Separate CPUs - * (within the same package or physically) within the same node are - * treated as not local. CPUs not even in the same domain (different - * nodes) are treated as very distant. - */ - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - - /* First check if this cpu is in the same node */ - for_each_domain(cpu, sd) { - if (sd->level > SD_LV_NODE) - continue; - /* Set locality to local node if not already found lower */ - for_each_cpu(other_cpu, sched_domain_span(sd)) { - if (rq->cpu_locality[other_cpu] > 3) - rq->cpu_locality[other_cpu] = 3; - } - } - - /* - * Each runqueue has its own function in case it doesn't have - * siblings of its own allowing mixed topologies. 
- */ -#ifdef CONFIG_SCHED_MC - for_each_cpu(other_cpu, core_cpumask(cpu)) { - if (rq->cpu_locality[other_cpu] > 2) - rq->cpu_locality[other_cpu] = 2; - } - if (cpumask_weight(core_cpumask(cpu)) > 1) - rq->cache_idle = cache_cpu_idle; -#endif -#ifdef CONFIG_SCHED_SMT - for_each_cpu(other_cpu, thread_cpumask(cpu)) - rq->cpu_locality[other_cpu] = 1; - if (cpumask_weight(thread_cpumask(cpu)) > 1) - rq->siblings_idle = siblings_cpu_idle; -#endif - } - grq_unlock_irq(); - - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - for_each_online_cpu(other_cpu) { - if (other_cpu <= cpu) - continue; - printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); - } - } -} -#else -void __init sched_init_smp(void) -{ -} -#endif /* CONFIG_SMP */ - -unsigned int sysctl_timer_migration = 1; - -int in_sched_functions(unsigned long addr) -{ - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -void __init sched_init(void) -{ -#ifdef CONFIG_SMP - int cpu_ids; -#endif - int i; - struct rq *rq; - - prio_ratios[0] = 128; - for (i = 1 ; i < NICE_WIDTH ; i++) - prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - - raw_spin_lock_init(&grq.lock); - grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; - grq.niffies = 0; - grq.last_jiffy = jiffies; - raw_spin_lock_init(&grq.iso_lock); - grq.iso_ticks = 0; - grq.iso_refractory = false; - grq.noc = 1; -#ifdef CONFIG_SMP - init_defrootdomain(); - grq.qnr = grq.idle_cpus = 0; - cpumask_clear(&grq.cpu_idle_map); -#else - uprq = &per_cpu(runqueues, 0); -#endif - for_each_possible_cpu(i) { - rq = cpu_rq(i); - rq->grq_lock = &grq.lock; - rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = - rq->iowait_pc = rq->idle_pc = 0; - rq->dither = false; -#ifdef CONFIG_SMP - rq->sticky_task = NULL; - rq->last_niffy = 0; - rq->sd = NULL; - rq->rd = NULL; - rq->online = false; - rq->cpu = i; - rq_attach_root(rq, 
&def_root_domain); -#endif - atomic_set(&rq->nr_iowait, 0); - } - -#ifdef CONFIG_SMP - cpu_ids = i; - /* - * Set the base locality for cpu cache distance calculation to - * "distant" (3). Make sure the distance from a CPU to itself is 0. - */ - for_each_possible_cpu(i) { - int j; - - rq = cpu_rq(i); -#ifdef CONFIG_SCHED_SMT - rq->siblings_idle = sole_cpu_idle; -#endif -#ifdef CONFIG_SCHED_MC - rq->cache_idle = sole_cpu_idle; -#endif - rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); - for_each_possible_cpu(j) { - if (i == j) - rq->cpu_locality[j] = 0; - else - rq->cpu_locality[j] = 4; - } - } -#endif - - for (i = 0; i < PRIO_LIMIT; i++) - INIT_LIST_HEAD(grq.queue + i); - /* delimiter for bitsearch */ - __set_bit(PRIO_LIMIT, grq.prio_bitmap); - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - - /* - * The boot idle thread does lazy MMU switching as well: - */ - atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current); - - /* - * Make us the idle thread. Technically, schedule() should not be - * called from this thread, however somewhere below it might be, - * but because we are the idle thread, we just pick up running again - * when this runqueue becomes "idle". 
- */ - init_idle(current, smp_processor_id()); - -#ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); - idle_thread_set_boot_cpu(); -#endif /* SMP */ -} - -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); - - return (nested == preempt_offset); -} - -void __might_sleep(const char *file, int line, int preempt_offset) -{ - /* - * Blocking primitives will set (and therefore destroy) current->state, - * since we will exit with TASK_RUNNING make sure we enter with it, - * otherwise we will destroy state. - */ - WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, - "do not call blocking ops when !TASK_RUNNING; " - "state=%lx set at [<%p>] %pS\n", - current->state, - (void *)current->task_state_change, - (void *)current->task_state_change); - - ___might_sleep(file, line, preempt_offset); -} -EXPORT_SYMBOL(__might_sleep); - -void ___might_sleep(const char *file, int line, int preempt_offset) -{ - static unsigned long prev_jiffy; /* ratelimiting */ - - rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) - return; - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); - - if (task_stack_end_corrupted(current)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); - - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif - dump_stack(); -} -EXPORT_SYMBOL(___might_sleep); -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -void normalize_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; - int queued; - - read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (!rt_task(p) && !iso_task(p)) - continue; - - rq = task_grq_lock(p, &flags); - queued = task_queued(p); - if (queued) - dequeue_task(p); - __setscheduler(p, rq, SCHED_NORMAL, 0, false); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - - task_grq_unlock(&flags); - } - read_unlock(&tasklist_lock); -} -#endif /* CONFIG_MAGIC_SYSRQ */ - -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -/* - * These functions are only useful for the IA64 MCA handling, or kdb. - * - * They can only be called when the whole system has been - * stopped - every CPU needs to be quiescent, and no scheduling - * activity can take place. Using them for anything else would - * be a serious bug, and as a result, they aren't even visible - * under any other configuration. 
- */ - -/** - * curr_task - return the current task for a given cpu. - * @cpu: the processor in question. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - * - * Return: The current task for @cpu. - */ -struct task_struct *curr_task(int cpu) -{ - return cpu_curr(cpu); -} - -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ - -#ifdef CONFIG_IA64 -/** - * set_curr_task - set the current task for a given cpu. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function - * must be called with all CPU's synchronised, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
- */ -void set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} - -void vtime_account_system_irqsafe(struct task_struct *tsk) -{ - unsigned long flags; - - local_irq_save(flags); - vtime_account_system(tsk); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_system(prev); - - vtime_account_user(prev); - arch_vtime_task_switch(prev); -} -#endif - -#else -/* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) { - u64 tmp = rtime; rtime = stime; stime = tmp; - } - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. 
- */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return (__force cputime_t) scaled; -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. - */ -static void cputime_adjust(struct task_cputime *curr, - struct cputime *prev, - cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, stime, utime, total; - - stime = curr->stime; - total = stime + curr->utime; - - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. - * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. - * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ - rtime = nsecs_to_cputime(curr->sum_exec_runtime); - - /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. - */ - if (prev->stime + prev->utime >= rtime) - goto out; - - if (total) { - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; - } else { - stime = rtime; - utime = 0; - } - - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); - -out: - *ut = prev->utime; - *st = prev->stime; -} - -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime = { - .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); - cputime_adjust(&cputime, &p->prev_cputime, ut, st); -} - -/* - * Must be called with siglock held. 
- */ -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); -} -#endif - -void init_idle_bootup_task(struct task_struct *idle) -{} - -#ifdef CONFIG_SCHED_DEBUG -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{} - -void proc_sched_set_task(struct task_struct *p) -{} -#endif - -#ifdef CONFIG_SMP -#define SCHED_LOAD_SHIFT (10) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return SCHED_LOAD_SCALE; -} - -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) -{ - unsigned long weight = cpumask_weight(sched_domain_span(sd)); - unsigned long smt_gain = sd->smt_gain; - - smt_gain /= weight; - - return smt_gain; -} -#endif diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h deleted file mode 100644 index 876969fff..000000000 --- a/kernel/sched/bfs_sched.h +++ /dev/null @@ -1,172 +0,0 @@ -#include <linux/sched.h> -#include <linux/cpuidle.h> - -#ifndef BFS_SCHED_H -#define BFS_SCHED_H - -/* - * This is the main, per-CPU runqueue data structure. - * This data should only be modified by the local cpu. 
- */ -struct rq { - struct task_struct *curr, *idle, *stop; - struct mm_struct *prev_mm; - - /* Pointer to grq spinlock */ - raw_spinlock_t *grq_lock; - - /* Stored data about rq->curr to work outside grq lock */ - u64 rq_deadline; - unsigned int rq_policy; - int rq_time_slice; - u64 rq_last_ran; - int rq_prio; - bool rq_running; /* There is a task running */ - int soft_affined; /* Running or queued tasks with this set as their rq */ -#ifdef CONFIG_SMT_NICE - struct mm_struct *rq_mm; - int rq_smt_bias; /* Policy/nice level bias across smt siblings */ -#endif - /* Accurate timekeeping data */ - u64 timekeep_clock; - unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc, - iowait_pc, idle_pc; - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - int cpu; /* cpu of this runqueue */ - bool online; - bool scaling; /* This CPU is managed by a scaling CPU freq governor */ - struct task_struct *sticky_task; - - struct root_domain *rd; - struct sched_domain *sd; - int *cpu_locality; /* CPU relative cache distance */ -#ifdef CONFIG_SCHED_SMT - bool (*siblings_idle)(int cpu); - /* See if all smt siblings are idle */ -#endif /* CONFIG_SCHED_SMT */ -#ifdef CONFIG_SCHED_MC - bool (*cache_idle)(int cpu); - /* See if all cache siblings are idle */ -#endif /* CONFIG_SCHED_MC */ - u64 last_niffy; /* Last time this RQ updated grq.niffies */ -#endif /* CONFIG_SMP */ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -#ifdef CONFIG_PARAVIRT - u64 prev_steal_time; -#endif /* CONFIG_PARAVIRT */ -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; -#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ - - u64 clock, old_clock, last_tick; - u64 clock_task; - bool dither; - -#ifdef CONFIG_SCHEDSTATS - - /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ - - /* sys_sched_yield() stats */ - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; -#endif /* CONFIG_SCHEDSTATS */ -#ifdef CONFIG_CPU_IDLE - /* Must be inspected within a rcu lock section */ - struct cpuidle_state *idle_state; -#endif -}; - -#ifdef CONFIG_SMP -struct rq *cpu_rq(int cpu); -#endif - -#ifndef CONFIG_SMP -extern struct rq *uprq; -#define cpu_rq(cpu) (uprq) -#define this_rq() (uprq) -#define raw_rq() (uprq) -#define task_rq(p) (uprq) -#define cpu_curr(cpu) ((uprq)->curr) -#else /* CONFIG_SMP */ -DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#define this_rq() this_cpu_ptr(&runqueues) -#define raw_rq() raw_cpu_ptr(&runqueues) -#endif /* CONFIG_SMP */ - -static inline u64 __rq_clock_broken(struct rq *rq) -{ - return ACCESS_ONCE(rq->clock); -} - -static inline u64 rq_clock(struct rq *rq) -{ - lockdep_assert_held(rq->grq_lock); - return rq->clock; -} - -static inline u64 rq_clock_task(struct rq *rq) -{ - lockdep_assert_held(rq->grq_lock); - return rq->clock_task; -} - -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. - * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. 
- */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) - -static inline void sched_ttwu_pending(void) { } - -static inline int task_on_rq_queued(struct task_struct *p) -{ - return p->on_rq; -} - -#ifdef CONFIG_CPU_IDLE -static inline void idle_set_state(struct rq *rq, - struct cpuidle_state *idle_state) -{ - rq->idle_state = idle_state; -} - -static inline struct cpuidle_state *idle_get_state(struct rq *rq) -{ - WARN_ON(!rcu_read_lock_held()); - return rq->idle_state; -} -#else -static inline void idle_set_state(struct rq *rq, - struct cpuidle_state *idle_state) -{ -} - -static inline struct cpuidle_state *idle_get_state(struct rq *rq) -{ - return NULL; -} -#endif -#endif /* BFS_SCHED_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 123673291..78b4bad10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,26 +90,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) -{ - unsigned long delta; - ktime_t soft, hard, now; - - for (;;) { - if (hrtimer_active(period_timer)) - break; - - now = hrtimer_cb_get_time(period_timer); - hrtimer_forward(period_timer, now, period); - - soft = hrtimer_get_softexpires(period_timer); - hard = hrtimer_get_expires(period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } -} - DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) #ifdef CONFIG_SMP -static int __hrtick_restart(struct rq *rq) +static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = hrtimer_get_softexpires(timer); - return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); + hrtimer_start_expires(timer, 
HRTIMER_MODE_ABS_PINNED); } /* @@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense. Rely on vruntime for fairness. */ delay = max_t(u64, delay, 10000LL); - __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED); } static inline void init_hrtick(void) @@ -511,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); for (;;) { if (!(val & _TIF_POLLING_NRFLAG)) @@ -541,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + struct wake_q_node *node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means + * its already queued (either by us or someone else) and will get the + * wakeup due to that. + * + * This cmpxchg() implies a full barrier, which pairs with the write + * barrier implied by the wakeup in wake_up_list(). + */ + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + return; + + get_task_struct(task); + + /* + * The head is context local, there can be no concurrency. + */ + *head->lastp = node; + head->lastp = &node->next; +} + +void wake_up_q(struct wake_q_head *head) +{ + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + + task = container_of(node, struct task_struct, wake_q); + BUG_ON(!task); + /* task can safely be re-inserted now */ + node = node->next; + task->wake_q.next = NULL; + + /* + * wake_up_process() implies a wmb() to pair with the queueing + * in wake_q_add() so as not to miss wakeups. 
+ */ + wake_up_process(task); + put_task_struct(task); + } +} + /* * resched_curr - mark rq's current task 'to be rescheduled now'. * @@ -593,13 +618,12 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(int pinned) +int get_nohz_timer_target(void) { - int cpu = smp_processor_id(); - int i; + int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + if (!idle_cpu(cpu)) return cpu; rcu_read_lock(); @@ -976,7 +1000,11 @@ inline int task_curr(const struct task_struct *p) } /* - * Can drop rq->lock because from sched_class::switched_from() methods drop it. + * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, + * use the balance_callback list if you want balancing. + * + * this means any call to check_class_changed() must be followed by a call to + * balance_callback(). */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, @@ -985,7 +1013,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); - /* Possble rq->lock 'hole'. */ + p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1017,6 +1045,177 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. 
+ * 4) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. + */ + +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. + */ +static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +{ + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + +struct migration_arg { + struct task_struct *task; + int dest_cpu; +}; + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) +{ + if (unlikely(!cpu_active(dest_cpu))) + return rq; + + /* Affinity changed (again). */ + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return rq; + + rq = move_queued_task(rq, p, dest_cpu); + + return rq; +} + +/* + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. + */ +static int migration_cpu_stop(void *data) +{ + struct migration_arg *arg = data; + struct task_struct *p = arg->task; + struct rq *rq = this_rq(); + + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. 
+ */ + local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); + + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because + * we're holding p->pi_lock. + */ + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + raw_spin_unlock(&rq->lock); + raw_spin_unlock(&p->pi_lock); + + local_irq_enable(); + return 0; +} + +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); +} + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + unsigned long flags; + struct rq *rq; + unsigned int dest_cpu; + int ret = 0; + + rq = task_rq_lock(p, &flags); + + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; + + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; + goto out; + } + + do_set_cpus_allowed(p, new_mask); + + /* Can the task run on the task's current CPU? 
If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (task_running(rq, p) || p->state == TASK_WAKING) { + struct migration_arg arg = { p, dest_cpu }; + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, p, &flags); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + return 0; + } else if (task_on_rq_queued(p)) { + /* + * OK, since we're going to drop the lock immediately + * afterwards anyway. + */ + lockdep_unpin_lock(&rq->lock); + rq = move_queued_task(rq, p, dest_cpu); + lockdep_pin_lock(&rq->lock); + } +out: + task_rq_unlock(rq, p, &flags); + + return ret; +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -1049,7 +1248,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); + perf_event_task_migrate(p); } __set_task_cpu(p, new_cpu); @@ -1157,13 +1356,6 @@ out: return ret; } -struct migration_arg { - struct task_struct *task; - int dest_cpu; -}; - -static int migration_cpu_stop(void *data); - /* * wait_task_inactive - wait for a thread to unschedule. 
* @@ -1296,9 +1488,7 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); -#endif /* CONFIG_SMP */ -#ifdef CONFIG_SMP /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ @@ -1378,6 +1568,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { + lockdep_assert_held(&p->pi_lock); + if (p->nr_cpus_allowed > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); @@ -1403,7 +1595,7 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } -#endif +#endif /* CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -1466,8 +1658,15 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) p->state = TASK_RUNNING; #ifdef CONFIG_SMP - if (p->sched_class->task_woken) + if (p->sched_class->task_woken) { + /* + * Our task @p is fully woken up and running; so its safe to + * drop the rq->lock, hereafter rq is only used for statistics. 
+ */ + lockdep_unpin_lock(&rq->lock); p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); + } if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; @@ -1486,6 +1685,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) { + lockdep_assert_held(&rq->lock); + #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; @@ -1530,6 +1731,7 @@ void sched_ttwu_pending(void) return; raw_spin_lock_irqsave(&rq->lock, flags); + lockdep_pin_lock(&rq->lock); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1537,6 +1739,7 @@ void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } + lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1633,7 +1836,9 @@ static void ttwu_queue(struct task_struct *p, int cpu) #endif raw_spin_lock(&rq->lock); + lockdep_pin_lock(&rq->lock); ttwu_do_activate(rq, p, 0); + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); } @@ -1728,9 +1933,17 @@ static void try_to_wake_up_local(struct task_struct *p) lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task. 
+ */ + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); + lockdep_pin_lock(&rq->lock); } if (!(p->state & TASK_NORMAL)) @@ -1951,7 +2164,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) set_task_cpu(p, cpu); raw_spin_unlock_irqrestore(&p->pi_lock, flags); -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif @@ -2105,12 +2318,29 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; + +void preempt_notifier_inc(void) +{ + static_key_slow_inc(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_inc); + +void preempt_notifier_dec(void) +{ + static_key_slow_dec(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_dec); + /** * preempt_notifier_register - tell me when current is being preempted & rescheduled * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) { + if (!static_key_false(&preempt_notifier_key)) + WARN(1, "registering preempt_notifier while notifiers disabled\n"); + hlist_add_head(&notifier->link, &current->preempt_notifiers); } EXPORT_SYMBOL_GPL(preempt_notifier_register); @@ -2119,7 +2349,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register); * preempt_notifier_unregister - no longer interested in preemption notifications * @notifier: notifier struct to unregister * - * This is safe to call from within a preemption notifier. + * This is *not* safe to call from within a preemption notifier.
*/ void preempt_notifier_unregister(struct preempt_notifier *notifier) { @@ -2127,7 +2357,7 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier) } EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) { struct preempt_notifier *notifier; @@ -2135,9 +2365,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr) notifier->ops->sched_in(notifier, raw_smp_processor_id()); } +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + if (static_key_false(&preempt_notifier_key)) + __fire_sched_in_preempt_notifiers(curr); +} + static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) +__fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) { struct preempt_notifier *notifier; @@ -2145,13 +2381,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, notifier->ops->sched_out(notifier, next); } +static __always_inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + if (static_key_false(&preempt_notifier_key)) + __fire_sched_out_preempt_notifiers(curr, next); +} + #else /* !CONFIG_PREEMPT_NOTIFIERS */ -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { } -static void +static inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { @@ -2252,23 +2496,35 @@ static struct rq *finish_task_switch(struct task_struct *prev) #ifdef CONFIG_SMP /* rq->lock is NOT held, but preemption is disabled */ -static inline void post_schedule(struct rq *rq) +static void __balance_callback(struct rq *rq) { - if (rq->post_schedule) { - unsigned long flags; + struct callback_head *head, *next; + void (*func)(struct rq *rq); + 
unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->curr->sched_class->post_schedule) - rq->curr->sched_class->post_schedule(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); + head = rq->balance_callback; + rq->balance_callback = NULL; + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; - rq->post_schedule = 0; + func(rq); } + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +static inline void balance_callback(struct rq *rq) +{ + if (unlikely(rq->balance_callback)) + __balance_callback(rq); } #else -static inline void post_schedule(struct rq *rq) +static inline void balance_callback(struct rq *rq) { } @@ -2286,7 +2542,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* finish_task_switch() drops rq->lock and enables preemtion */ preempt_disable(); rq = finish_task_switch(prev); - post_schedule(rq); + balance_callback(rq); preempt_enable(); if (current->set_child_tid) @@ -2330,9 +2586,9 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ + lockdep_unpin_lock(&rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. 
*/ switch_to(prev, next, prev); barrier(); @@ -2397,9 +2653,9 @@ unsigned long nr_iowait_cpu(int cpu) void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { - struct rq *this = this_rq(); - *nr_waiters = atomic_read(&this->nr_iowait); - *load = this->cpu_load[0]; + struct rq *rq = this_rq(); + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq->load.weight; } #ifdef CONFIG_SMP @@ -2497,6 +2753,7 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -2525,7 +2782,7 @@ void scheduler_tick(void) u64 scheduler_tick_max_deferment(void) { struct rq *rq = this_rq(); - unsigned long next, now = ACCESS_ONCE(jiffies); + unsigned long next, now = READ_ONCE(jiffies); next = rq->last_sched_tick + HZ; @@ -2726,9 +2983,7 @@ again: * - return from syscall or exception to user-space * - return from interrupt-handler to user-space * - * WARNING: all callers must re-check need_resched() afterward and reschedule - * accordingly in case an event triggered the need for rescheduling (such as - * an interrupt waking up a task) while preemption was disabled in __schedule(). + * WARNING: must be called with preemption disabled! 
*/ static void __sched __schedule(void) { @@ -2737,7 +2992,6 @@ static void __sched __schedule(void) struct rq *rq; int cpu; - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_note_context_switch(); @@ -2755,6 +3009,7 @@ static void __sched __schedule(void) */ smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -2797,12 +3052,12 @@ static void __sched __schedule(void) rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); - } else + } else { + lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); + } - post_schedule(rq); - - sched_preempt_enable_no_resched(); + balance_callback(rq); } static inline void sched_submit_work(struct task_struct *tsk) @@ -2823,7 +3078,9 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { + preempt_disable(); __schedule(); + sched_preempt_enable_no_resched(); } while (need_resched()); } EXPORT_SYMBOL(schedule); @@ -2862,15 +3119,14 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - __preempt_count_add(PREEMPT_ACTIVE); + preempt_active_enter(); __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); + preempt_active_exit(); /* * Check again in case we missed a preemption opportunity * between schedule and now. 
*/ - barrier(); } while (need_resched()); } @@ -2894,9 +3150,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); -#ifdef CONFIG_CONTEXT_TRACKING /** - * preempt_schedule_context - preempt_schedule called by tracing + * preempt_schedule_notrace - preempt_schedule called by tracing * * The tracing infrastructure uses preempt_enable_notrace to prevent * recursion and tracing preempt enabling caused by the tracing @@ -2909,7 +3164,7 @@ EXPORT_SYMBOL(preempt_schedule); * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) { enum ctx_state prev_ctx; @@ -2917,7 +3172,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void) return; do { - __preempt_count_add(PREEMPT_ACTIVE); + /* + * Use raw __preempt_count() ops that don't call function. + * We can't call functions before disabling preemption which + * disarm preemption tracing recursions.
+ */ + __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); + barrier(); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -2927,12 +3188,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void) __schedule(); exception_exit(prev_ctx); - __preempt_count_sub(PREEMPT_ACTIVE); barrier(); + __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); } while (need_resched()); } -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_CONTEXT_TRACKING */ +EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #endif /* CONFIG_PREEMPT */ @@ -2952,17 +3212,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - __preempt_count_add(PREEMPT_ACTIVE); + preempt_active_enter(); local_irq_enable(); __schedule(); local_irq_disable(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); + preempt_active_exit(); } while (need_resched()); exception_exit(prev_state); @@ -3040,7 +3294,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - p->dl.dl_throttled = 0; enqueue_flag = ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; @@ -3068,7 +3321,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: + preempt_disable(); /* avoid rq from going away on us */ __task_rq_unlock(rq); + + balance_callback(rq); + preempt_enable(); } #endif @@ -3406,7 +3663,7 @@ static bool dl_param_changed(struct task_struct *p, static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, - bool user) + bool user, bool pi) { int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; @@ -3592,18 +3849,20 @@ change: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; - /* - * Take priority boosted tasks into account. If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; + if (pi) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + if (new_effective_prio == oldprio) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } } queued = task_on_rq_queued(p); @@ -3614,7 +3873,7 @@ change: put_prev_task(rq, p); prev_class = p->sched_class; - __setscheduler(rq, p, attr, true); + __setscheduler(rq, p, attr, pi); if (running) p->sched_class->set_curr_task(rq); @@ -3627,9 +3886,17 @@ change: } check_class_changed(rq, p, prev_class, oldprio); + preempt_disable(); /* avoid rq from going away on us */ task_rq_unlock(rq, p, &flags); - rt_mutex_adjust_pi(p); + if (pi) + rt_mutex_adjust_pi(p); + + /* + * Run balance callbacks after we've adjusted the PI chain. + */ + balance_callback(rq); + preempt_enable(); return 0; } @@ -3650,7 +3917,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy, attr.sched_policy = policy; } - return __sched_setscheduler(p, &attr, check); + return __sched_setscheduler(p, &attr, check, true); } /** * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 
@@ -3671,7 +3938,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { - return __sched_setscheduler(p, attr, true); + return __sched_setscheduler(p, attr, true, true); } EXPORT_SYMBOL_GPL(sched_setattr); @@ -4719,149 +4986,6 @@ out: } #ifdef CONFIG_SMP -/* - * move_queued_task - move a queued task to new rq. - * - * Returns (locked) new rq. Old rq's lock is released. - */ -static struct rq *move_queued_task(struct task_struct *p, int new_cpu) -{ - struct rq *rq = task_rq(p); - - lockdep_assert_held(&rq->lock); - - dequeue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); - - rq = cpu_rq(new_cpu); - - raw_spin_lock(&rq->lock); - BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; - enqueue_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - - return rq; -} - -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - - cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); -} - -/* - * This is how migration works: - * - * 1) we invoke migration_cpu_stop() on the target CPU using - * stop_one_cpu(). - * 2) stopper starts to run (implicitly forcing the migrated thread - * off the CPU) - * 3) it checks whether the migrated task is still in the wrong runqueue. - * 4) if it's in the wrong runqueue then the migration thread removes - * it and puts it into the right queue. - * 5) stopper completes and stop_one_cpu() returns and the migration - * is done. - */ - -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. 
The - * call is not atomic; no spinlocks may be held. - */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - unsigned long flags; - struct rq *rq; - unsigned int dest_cpu; - int ret = 0; - - rq = task_rq_lock(p, &flags); - - if (cpumask_equal(&p->cpus_allowed, new_mask)) - goto out; - - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; - } - - do_set_cpus_allowed(p, new_mask); - - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; - - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - return 0; - } else if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -out: - task_rq_unlock(rq, p, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -/* - * Move (not current) task off this cpu, onto dest cpu. We're doing - * this because either it can't run here any more (set_cpus_allowed() - * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_exec). - * - * So we race with normal scheduler movements, but that's OK, as long - * as the task is no longer on this CPU. - * - * Returns non-zero if task was successfully migrated. - */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) -{ - struct rq *rq; - int ret = 0; - - if (unlikely(!cpu_active(dest_cpu))) - return ret; - - rq = cpu_rq(src_cpu); - - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - /* Already moved. */ - if (task_cpu(p) != src_cpu) - goto done; - - /* Affinity changed (again). 
*/ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - goto fail; - - /* - * If we're not on a rq, the next wake-up will ensure we're - * placed properly. - */ - if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -done: - ret = 1; -fail: - raw_spin_unlock(&rq->lock); - raw_spin_unlock(&p->pi_lock); - return ret; -} #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ @@ -4909,35 +5033,9 @@ void sched_setnuma(struct task_struct *p, int nid) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } -#endif - -/* - * migration_cpu_stop - this will be executed by a highprio stopper thread - * and performs thread migration by bumping thread off CPU then - * 'pushing' onto another runqueue. - */ -static int migration_cpu_stop(void *data) -{ - struct migration_arg *arg = data; - - /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. - */ - local_irq_disable(); - /* - * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed - * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. - */ - sched_ttwu_pending(); - __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); - local_irq_enable(); - return 0; -} +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_HOTPLUG_CPU - /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -4993,9 +5091,9 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. 
*/ -static void migrate_tasks(unsigned int dead_cpu) +static void migrate_tasks(struct rq *dead_rq) { - struct rq *rq = cpu_rq(dead_cpu); + struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; @@ -5017,7 +5115,7 @@ static void migrate_tasks(unsigned int dead_cpu) */ update_rq_clock(rq); - for ( ; ; ) { + for (;;) { /* * There's this thread running, bail when that's the only * remaining thread. @@ -5025,22 +5123,29 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; + /* + * Ensure rq->lock covers the entire task selection + * until the migration. + */ + lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_cpu, next); - raw_spin_unlock(&rq->lock); - - __migrate_task(next, dead_cpu, dest_cpu); - - raw_spin_lock(&rq->lock); + dest_cpu = select_fallback_rq(dead_rq->cpu, next); + + lockdep_unpin_lock(&rq->lock); + rq = __migrate_task(rq, next, dest_cpu); + if (rq != dead_rq) { + raw_spin_unlock(&rq->lock); + rq = dead_rq; + raw_spin_lock(&rq->lock); + } } rq->stop = stop; } - #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5219,7 +5324,7 @@ static void register_sched_domain_sysctl(void) static void unregister_sched_domain_sysctl(void) { } -#endif +#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ static void set_rq_online(struct rq *rq) { @@ -5288,7 +5393,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(cpu); + migrate_tasks(rq); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; @@ -5314,7 +5419,7 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; -static void __cpuinit 
set_cpu_rq_start_time(void) +static void set_cpu_rq_start_time(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); @@ -5366,9 +5471,6 @@ static int __init migration_init(void) return 0; } early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ @@ -6594,7 +6696,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) struct sched_group *sg; struct sched_group_capacity *sgc; - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sd) return -ENOMEM; @@ -7032,6 +7134,9 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + /* nohz_full won't take effect without isolating the cpus. */ + tick_nohz_full_add_cpus_to(cpu_isolated_map); + sched_init_numa(); /* @@ -7068,8 +7173,6 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ -const_debug unsigned int sysctl_timer_migration = 1; - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -7199,7 +7302,7 @@ void __init sched_init(void) rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->post_schedule = 0; + rq->balance_callback = NULL; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -7329,32 +7432,12 @@ EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ -static void normalize_task(struct rq *rq, struct task_struct *p) +void normalize_rt_tasks(void) { - const struct sched_class *prev_class = p->sched_class; + struct task_struct *g, *p; struct sched_attr attr = { .sched_policy = SCHED_NORMAL, }; - int old_prio = p->prio; - int queued; - - queued = task_on_rq_queued(p); - if (queued) - dequeue_task(rq, p, 0); - __setscheduler(rq, p, &attr, false); - if (queued) { - enqueue_task(rq, p, 0); - resched_curr(rq); - } - - 
check_class_changed(rq, p, prev_class, old_prio); -} - -void normalize_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; read_lock(&tasklist_lock); for_each_process_thread(g, p) { @@ -7381,9 +7464,7 @@ void normalize_rt_tasks(void) continue; } - rq = task_rq_lock(p, &flags); - normalize_task(rq, p); - task_rq_unlock(rq, p, &flags); + __sched_setscheduler(p, &attr, false, false); } read_unlock(&tasklist_lock); } @@ -7734,11 +7815,11 @@ static long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; - rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); @@ -8105,10 +8186,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) __refill_cfs_bandwidth_runtime(cfs_b); /* restart the period timer (if active) to handle new period expiry */ - if (runtime_enabled && cfs_b->timer_active) { - /* force a reprogram */ - __start_cfs_bandwidth(cfs_b, true); - } + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); raw_spin_unlock_irq(&cfs_b->lock); for_each_online_cpu(i) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8394b1ee6..f5a64ffad 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new) { cputime_t old; - while (new > (old = ACCESS_ONCE(*counter))) + while (new > (old = READ_ONCE(*counter))) cmpxchg_cputime(counter, old, new); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5e9514508..0a17af356 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -213,14 +213,28 @@ static inline bool need_pull_dl_task(struct rq *rq, struct 
task_struct *prev) return dl_task(prev); } -static inline void set_post_schedule(struct rq *rq) +static DEFINE_PER_CPU(struct callback_head, dl_push_head); +static DEFINE_PER_CPU(struct callback_head, dl_pull_head); + +static void push_dl_tasks(struct rq *); +static void pull_dl_task(struct rq *); + +static inline void queue_push_tasks(struct rq *rq) +{ + if (!has_pushable_dl_tasks(rq)) + return; + + queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); +} + +static inline void queue_pull_task(struct rq *rq) { - rq->post_schedule = has_pushable_dl_tasks(rq); + queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); } static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); -static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; bool fallback = false; @@ -254,14 +268,19 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) double_lock_balance(rq, later_rq); } + /* + * By now the task is replenished and enqueued; migrate it. 
+ */ deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); - activate_task(later_rq, p, ENQUEUE_REPLENISH); + activate_task(later_rq, p, 0); if (!fallback) resched_curr(later_rq); - double_unlock_balance(rq, later_rq); + double_unlock_balance(later_rq, rq); + + return later_rq; } #else @@ -291,12 +310,15 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) return false; } -static inline int pull_dl_task(struct rq *rq) +static inline void pull_dl_task(struct rq *rq) { - return 0; } -static inline void set_post_schedule(struct rq *rq) +static inline void queue_push_tasks(struct rq *rq) +{ +} + +static inline void queue_pull_task(struct rq *rq) { } #endif /* CONFIG_SMP */ @@ -498,24 +520,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) +static int start_dl_timer(struct task_struct *p) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->dl_timer; + struct rq *rq = task_rq(p); ktime_t now, act; - ktime_t soft, hard; - unsigned long range; s64 delta; - if (boosted) - return 0; + lockdep_assert_held(&rq->lock); + /* * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. 
*/ act = ns_to_ktime(dl_se->deadline); - now = hrtimer_cb_get_time(&dl_se->dl_timer); + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -527,15 +548,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) if (ktime_us_delta(act, now) < 0) return 0; - hrtimer_set_expires(&dl_se->dl_timer, act); - - soft = hrtimer_get_softexpires(&dl_se->dl_timer); - hard = hrtimer_get_expires(&dl_se->dl_timer); - range = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&dl_se->dl_timer, soft, - range, HRTIMER_MODE_ABS, 0); + /* + * !enqueued will guarantee another callback; even if one is already in + * progress. This ensures a balanced {get,put}_task_struct(). + * + * The race against __run_timer() clearing the enqueued state is + * harmless because we're holding task_rq()->lock, therefore the timer + * expiring after we've done the check will wait on its task_rq_lock() + * and observe our state. + */ + if (!hrtimer_is_queued(timer)) { + get_task_struct(p); + hrtimer_start(timer, act, HRTIMER_MODE_ABS); + } - return hrtimer_active(&dl_se->dl_timer); + return 1; } /* @@ -563,35 +590,40 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) rq = task_rq_lock(p, &flags); /* - * We need to take care of several possible races here: - * - * - the task might have changed its scheduling policy - * to something different than SCHED_DEADLINE - * - the task might have changed its reservation parameters - * (through sched_setattr()) - * - the task might have been boosted by someone else and - * might be in the boosting/deboosting path + * The task might have changed its scheduling policy to something + * different than SCHED_DEADLINE (through switched_from_dl()).
+ */ + if (!dl_task(p)) { + __dl_clear_params(p); + goto unlock; + } + + /* + * This is possible if switched_from_dl() raced against a running + * callback that took the above !dl_task() path and we've since then + * switched back into SCHED_DEADLINE. * - * In all this cases we bail out, as the task is already - * in the runqueue or is going to be enqueued back anyway. + * There's nothing to do except drop our task reference. */ - if (!dl_task(p) || dl_se->dl_new || - dl_se->dl_boosted || !dl_se->dl_throttled) + if (dl_se->dl_new) goto unlock; - sched_clock_tick(); - update_rq_clock(rq); + /* + * The task might have been boosted by someone else and might be in the + * boosting/deboosting path, it's not throttled. + */ + if (dl_se->dl_boosted) + goto unlock; -#ifdef CONFIG_SMP /* - * If we find that the rq the task was on is no longer - * available, we need to select a new rq. + * Spurious timer due to start_dl_timer() race; or we already received + * a replenishment from rt_mutex_setprio(). */ - if (unlikely(!rq->online)) { - dl_task_offline_migration(rq, p); + if (!dl_se->dl_throttled) goto unlock; - } -#endif + + sched_clock_tick(); + update_rq_clock(rq); /* * If the throttle happened during sched-out; like: @@ -617,17 +649,38 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); + #ifdef CONFIG_SMP /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. + * Perform balancing operations here; after the replenishments. We + * cannot drop rq->lock before this, otherwise the assertion in + * start_dl_timer() about not missing updates is not true. + * + * If we find that the rq the task was on is no longer available, we + * need to select a new rq. + * + * XXX figure out if select_task_rq_dl() deals with offline cpus.
+ */ + if (unlikely(!rq->online)) + rq = dl_task_offline_migration(rq, p); + + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. */ if (has_pushable_dl_tasks(rq)) push_dl_task(rq); #endif + unlock: task_rq_unlock(rq, p, &flags); + /* + * This can free the task_struct, including this hrtimer, do not touch + * anything related to that after this. + */ + put_task_struct(p); + return HRTIMER_NORESTART; } @@ -640,7 +693,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) } static -int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) +int dl_runtime_exceeded(struct sched_dl_entity *dl_se) { return (dl_se->runtime <= 0); } @@ -684,10 +737,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(rq, dl_se)) { + if (dl_runtime_exceeded(dl_se)) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) + if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -995,7 +1048,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If we are dealing with a -deadline task, we must @@ -1012,7 +1065,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) (p->nr_cpus_allowed > 1)) { int target = find_later_rq(p); - if (target != -1) + if (target != -1 && + dl_time_before(p->dl.deadline, + cpu_rq(target)->dl.earliest_dl.curr)) cpu = target; } rcu_read_unlock(); @@ -1042,8 +1097,6 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_curr(rq); } -static int pull_dl_task(struct rq *this_rq); - #endif /* CONFIG_SMP */ /* @@ -1100,7 +1153,15 @@ 
struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; if (need_pull_dl_task(rq, prev)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we're + * being very careful to re-start the picking loop. + */ + lockdep_unpin_lock(&rq->lock); pull_dl_task(rq); + lockdep_pin_lock(&rq->lock); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1134,7 +1195,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); - set_post_schedule(rq); + queue_push_tasks(rq); return p; } @@ -1171,7 +1232,6 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { - struct hrtimer *timer = &p->dl.dl_timer; struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); /* @@ -1181,8 +1241,6 @@ static void task_dead_dl(struct task_struct *p) /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); - - hrtimer_cancel(timer); } static void set_curr_task_dl(struct rq *rq) @@ -1230,6 +1288,32 @@ next_node: return NULL; } +/* + * Return the earliest pushable rq's task, which is suitable to be executed + * on the CPU, NULL otherwise: + */ +static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) +{ + struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost; + struct task_struct *p = NULL; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + +next_node: + if (next_node) { + p = rb_entry(next_node, struct task_struct, pushable_dl_tasks); + + if (pick_dl_task(rq, p, cpu)) + return p; + + next_node = rb_next(next_node); + goto next_node; + } + + return NULL; +} + static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) @@ 
-1333,6 +1417,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); + if (!dl_time_before(task->dl.deadline, + later_rq->dl.earliest_dl.curr)) { + /* + * Target rq has tasks of equal or earlier deadline, + * retrying does not release any lock and is unlikely + * to yield a different result. + */ + later_rq = NULL; + break; + } + /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || @@ -1473,15 +1568,16 @@ static void push_dl_tasks(struct rq *rq) ; } -static int pull_dl_task(struct rq *this_rq) +static void pull_dl_task(struct rq *this_rq) { - int this_cpu = this_rq->cpu, ret = 0, cpu; + int this_cpu = this_rq->cpu, cpu; struct task_struct *p; + bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; if (likely(!dl_overloaded(this_rq))) - return 0; + return; /* * Match the barrier from dl_set_overloaded; this guarantees that if we @@ -1514,7 +1610,7 @@ static int pull_dl_task(struct rq *this_rq) if (src_rq->dl.dl_nr_running <= 1) goto skip; - p = pick_next_earliest_dl_task(src_rq, this_cpu); + p = pick_earliest_pushable_dl_task(src_rq, this_cpu); /* * We found a task to be pulled if: @@ -1536,7 +1632,7 @@ static int pull_dl_task(struct rq *this_rq) src_rq->curr->dl.deadline)) goto skip; - ret = 1; + resched = true; deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); @@ -1549,12 +1645,8 @@ skip: double_unlock_balance(this_rq, src_rq); } - return ret; -} - -static void post_schedule_dl(struct rq *rq) -{ - push_dl_tasks(rq); + if (resched) + resched_curr(this_rq); } /* @@ -1659,7 +1751,7 @@ static void rq_offline_dl(struct rq *rq) cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } -void init_sched_dl_class(void) +void __init init_sched_dl_class(void) { unsigned int i; @@ -1670,37 +1762,16 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ -/* - * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. 
- */ -static void cancel_dl_timer(struct rq *rq, struct task_struct *p) -{ - struct hrtimer *dl_timer = &p->dl.dl_timer; - - /* Nobody will change task's class if pi_lock is held */ - lockdep_assert_held(&p->pi_lock); - - if (hrtimer_active(dl_timer)) { - int ret = hrtimer_try_to_cancel(dl_timer); - - if (unlikely(ret == -1)) { - /* - * Note, p may migrate OR new deadline tasks - * may appear in rq when we are unlocking it. - * A caller of us must be fine with that. - */ - raw_spin_unlock(&rq->lock); - hrtimer_cancel(dl_timer); - raw_spin_lock(&rq->lock); - } - } -} - static void switched_from_dl(struct rq *rq, struct task_struct *p) { - /* XXX we should retain the bw until 0-lag */ - cancel_dl_timer(rq, p); - __dl_clear_params(p); + /* + * Start the deadline timer; if we switch back to dl before this we'll + * continue consuming our current CBS slice. If we stay outside of + * SCHED_DEADLINE until the deadline passes, the timer will reset the + * task. + */ + if (!start_dl_timer(p)) + __dl_clear_params(p); /* * Since this might be the only -deadline task on the rq, @@ -1710,8 +1781,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) return; - if (pull_dl_task(rq)) - resched_curr(rq); + queue_pull_task(rq); } /* @@ -1720,21 +1790,16 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - int check_resched = 1; - if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && - push_dl_task(rq) && rq != task_rq(p)) - /* Only reschedule if pushing failed */ - check_resched = 0; -#endif /* CONFIG_SMP */ - if (check_resched) { - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); - } + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) + queue_push_tasks(rq); +#else + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + 
resched_curr(rq); +#endif } } @@ -1754,15 +1819,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, * or lowering its prio, so... */ if (!rq->dl.overloaded) - pull_dl_task(rq); + queue_pull_task(rq); /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this * runqueue. */ - if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && - rq->curr == p) + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) resched_curr(rq); #else /* @@ -1792,7 +1856,6 @@ const struct sched_class dl_sched_class = { .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, - .post_schedule = post_schedule_dl, .task_woken = task_woken_dl, #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a245c1fc6..4222ec50a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -132,15 +132,17 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); #ifdef CONFIG_SCHEDSTATS SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.statistics.wait_sum), SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.statistics.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", - 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + 0LL, 0L, + SPLIT_NS(p->se.sum_exec_runtime), + 0LL, 0L); #endif #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d", task_node(p)); + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); @@ -156,7 +158,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" " task PID tree-key switches prio" - " exec-runtime sum-exec sum-sleep\n" + " wait-time sum-exec sum-sleep\n" "------------------------------------------------------" 
"----------------------------------------------------\n"); @@ -230,8 +232,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif #endif #ifdef CONFIG_CFS_BANDWIDTH - SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", - cfs_rq->tg->cfs_bandwidth.timer_active); SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); SEQ_printf(m, " .%-30s: %d\n", "throttle_count", @@ -517,11 +517,21 @@ __initcall(init_sched_debug_procfs); SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#ifdef CONFIG_NUMA_BALANCING +void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf) +{ + SEQ_printf(m, "numa_faults node=%d ", node); + SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); + SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); +} +#endif + + static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING struct mempolicy *pol; - int node, i; if (p->mm) P(mm->numa_scan_seq); @@ -533,26 +543,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) mpol_get(pol); task_unlock(p); - SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); - - for_each_online_node(node) { - for (i = 0; i < 2; i++) { - unsigned long nr_faults = -1; - int cpu_current, home_node; - - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; - - cpu_current = !i ? 
(task_node(p) == node) : - (pol && node_isset(node, pol->v.nodes)); - - home_node = (p->numa_preferred_nid == node); - - SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", - i, node, cpu_current, home_node, nr_faults); - } - } - + P(numa_pages_migrated); + P(numa_preferred_nid); + P(total_numa_faults); + SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", + task_node(p), task_numa_group_id(p)); + show_numa_stats(p, m); mpol_put(pol); #endif } @@ -582,6 +578,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS + PN(se.statistics.sum_sleep_runtime); PN(se.statistics.wait_start); PN(se.statistics.sleep_start); PN(se.statistics.block_start); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 936664319..134314406 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -166,9 +166,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ -static int get_update_sysctl_factor(void) +static unsigned int get_update_sysctl_factor(void) { - unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { @@ -601,7 +601,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int factor = get_update_sysctl_factor(); + unsigned int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -859,7 +859,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { - unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); + unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; @@ -1223,11 +1223,9 @@ static void 
task_numa_assign(struct task_numa_env *env, static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { + long imb, old_imb; + long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; - long orig_src_load; - long load_a, load_b; - long moved_load; - long imb; /* * The load is corrected for the CPU capacity available on each node. @@ -1240,39 +1238,30 @@ static bool load_too_imbalanced(long src_load, long dst_load, dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - load_a = dst_load; - load_b = src_load; - if (load_a < load_b) - swap(load_a, load_b); + if (dst_load < src_load) + swap(dst_load, src_load); /* Is the difference below the threshold? */ - imb = load_a * src_capacity * 100 - - load_b * dst_capacity * env->imbalance_pct; + imb = dst_load * src_capacity * 100 - + src_load * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. - * Allow a move that brings us closer to a balanced situation, - * without moving things past the point of balance. + * Compare it with the old imbalance. */ orig_src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; - /* - * In a task swap, there will be one load moving from src to dst, - * and another moving back. This is the net sum of both moves. - * A simple task move will always have a positive value. - * Allow the move if it brings the system closer to a balanced - * situation, without crossing over the balance point. - */ - moved_load = orig_src_load - src_load; + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); - if (moved_load > 0) - /* Moving src -> dst. Did we overshoot balance? */ - return src_load * dst_capacity < dst_load * src_capacity; - else - /* Moving dst -> src. Did we overshoot balance? 
*/ - return dst_load * src_capacity < src_load * dst_capacity; + old_imb = orig_dst_load * src_capacity * 100 - + orig_src_load * dst_capacity * env->imbalance_pct; + + /* Would this change make things worse? */ + return (imb > old_imb); } /* @@ -1434,6 +1423,30 @@ static void task_numa_find_cpu(struct task_numa_env *env, } } +/* Only move tasks to a NUMA node less busy than the current node. */ +static bool numa_has_capacity(struct task_numa_env *env) +{ + struct numa_stats *src = &env->src_stats; + struct numa_stats *dst = &env->dst_stats; + + if (src->has_free_capacity && !dst->has_free_capacity) + return false; + + /* + * Only consider a task move if the source has a higher load + * than the destination, corrected for CPU capacity on each node. + * + * src->load dst->load + * --------------------- vs --------------------- + * src->compute_capacity dst->compute_capacity + */ + if (src->load * dst->compute_capacity > + dst->load * src->compute_capacity) + return true; + + return false; +} + static int task_numa_migrate(struct task_struct *p) { struct task_numa_env env = { @@ -1488,7 +1501,8 @@ static int task_numa_migrate(struct task_struct *p) update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ - task_numa_find_cpu(&env, taskimp, groupimp); + if (numa_has_capacity(&env)) + task_numa_find_cpu(&env, taskimp, groupimp); /* * Look at other nodes in these cases: @@ -1519,7 +1533,8 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); - task_numa_find_cpu(&env, taskimp, groupimp); + if (numa_has_capacity(&env)) + task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -1819,7 +1834,12 @@ static void task_numa_placement(struct task_struct *p) u64 runtime, period; spinlock_t *group_lock = NULL; - seq = ACCESS_ONCE(p->mm->numa_scan_seq); + /* + * The p->mm->numa_scan_seq field gets updated without + * exclusive access. 
Use READ_ONCE() here to ensure + * that the field is read in a single access: + */ + seq = READ_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; @@ -1963,7 +1983,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, } rcu_read_lock(); - tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + tsk = READ_ONCE(cpu_rq(cpu)->curr); if (!cpupid_match_pid(tsk, cpupid)) goto no_join; @@ -2132,7 +2152,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) static void reset_ptenuma_scan(struct task_struct *p) { - ACCESS_ONCE(p->mm->numa_scan_seq)++; + /* + * We only did a read acquisition of the mmap sem, so + * p->mm->numa_scan_seq is written to without exclusive access + * and the update is not guaranteed to be atomic. That's not + * much of an issue though, since this is just used for + * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not + * expensive, to avoid any form of compiler optimizations: + */ + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); p->mm->numa_scan_offset = 0; } @@ -3501,16 +3529,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { - /* - * If the bandwidth pool has become inactive, then at least one - * period must have elapsed since the last consumption. - * Refresh the global state and ensure bandwidth timer becomes - * active. 
- */ - if (!cfs_b->timer_active) { - __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b, false); - } + start_cfs_bandwidth(cfs_b); if (cfs_b->runtime > 0) { amount = min(cfs_b->runtime, min_amount); @@ -3659,6 +3678,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, dequeue = 1; + bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -3688,13 +3708,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); + empty = list_empty(&cfs_b->throttled_cfs_rq); + /* * Add to the _head_ of the list, so that an already-started * distribute_cfs_runtime will not see us */ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b, false); + + /* + * If we're the first throttled task, make sure the bandwidth + * timer is running. + */ + if (empty) + start_cfs_bandwidth(cfs_b); + raw_spin_unlock(&cfs_b->lock); } @@ -3809,13 +3837,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (cfs_b->idle && !throttled) goto out_deactivate; - /* - * if we have relooped after returning idle once, we need to update our - * status as actually running, so that other cpus doing - * __start_cfs_bandwidth will stop trying to cancel us. - */ - cfs_b->timer_active = 1; - __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -3860,7 +3881,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) return 0; out_deactivate: - cfs_b->timer_active = 0; return 1; } @@ -3875,7 +3895,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; * Are we near the end of the current quota period? * * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the - * hrtimer base being cleared by __hrtimer_start_range_ns. 
In the case of + * hrtimer base being cleared by hrtimer_start. In the case of * migrate_hrtimers, base is never cleared, so we are fine. */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) @@ -3903,8 +3923,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) if (runtime_refresh_within(cfs_b, min_left)) return; - start_bandwidth_timer(&cfs_b->slack_timer, - ns_to_ktime(cfs_bandwidth_slack_period)); + hrtimer_start(&cfs_b->slack_timer, + ns_to_ktime(cfs_bandwidth_slack_period), + HRTIMER_MODE_REL); } /* we know any runtime found here is valid as update_curr() precedes return */ @@ -4024,6 +4045,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); return HRTIMER_NORESTART; @@ -4033,20 +4055,19 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); - ktime_t now; int overrun; int idle = 0; raw_spin_lock(&cfs_b->lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, cfs_b->period); - + overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; idle = do_sched_cfs_period_timer(cfs_b, overrun); } + if (idle) + cfs_b->period_active = 0; raw_spin_unlock(&cfs_b->lock); return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; @@ -4060,7 +4081,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period = ns_to_ktime(default_cfs_period()); INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; @@ -4072,28 +4093,15 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) INIT_LIST_HEAD(&cfs_rq->throttled_list); } -/* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) +void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - /* - * The timer may be active because we're trying to set a new bandwidth - * period or because we're racing with the tear-down path - * (timer_active==0 becomes visible before the hrtimer call-back - * terminates). 
In either case we ensure that it's re-programmed - */ - while (unlikely(hrtimer_active(&cfs_b->period_timer)) && - hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { - /* bounce the lock to allow do_sched_cfs_period_timer to run */ - raw_spin_unlock(&cfs_b->lock); - cpu_relax(); - raw_spin_lock(&cfs_b->lock); - /* if someone else restarted the timer then we're done */ - if (!force && cfs_b->timer_active) - return; - } + lockdep_assert_held(&cfs_b->lock); - cfs_b->timer_active = 1; - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); + if (!cfs_b->period_active) { + cfs_b->period_active = 1; + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); + } } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -4348,6 +4356,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* + * per rq 'load' arrray crap; XXX kill this. + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. 
+ * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. 
+ */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +static void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. 
+ */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = this_rq->cfs.runnable_load_avg; + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); +} + /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { @@ -4400,7 +4591,7 @@ static unsigned long capacity_orig_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); + unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -5151,18 +5342,21 @@ again: * entity, update_curr() will update its vruntime, otherwise * forget we've ever seen it. 
*/ - if (curr && curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; - /* - * This call to check_cfs_rq_runtime() will do the throttle and - * dequeue its entity in the parent(s). Therefore the 'simple' - * nr_running test will indeed be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto simple; + /* + * This call to check_cfs_rq_runtime() will do the + * throttle and dequeue its entity in the parent(s). + * Therefore the 'simple' nr_running test will indeed + * be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + } se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); @@ -5223,7 +5417,15 @@ simple: return p; idle: + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + lockdep_unpin_lock(&rq->lock); new_tasks = idle_balance(rq); + lockdep_pin_lock(&rq->lock); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -5492,10 +5694,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env) } #ifdef CONFIG_NUMA_BALANCING -/* Returns true if the destination node has incurred more faults */ +/* + * Returns true if the destination node is the preferred node. + * Needs to match fbq_classify_rq(): if there is a runnable task + * that is not on its preferred node, we should identify it. 
+ */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_faults, dst_faults; int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || @@ -5509,29 +5716,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - if (numa_group) { - /* Task is already in the group's interleave set. */ - if (node_isset(src_nid, numa_group->active_nodes)) - return false; - - /* Task is moving into the group's interleave set. */ - if (node_isset(dst_nid, numa_group->active_nodes)) - return true; - - return group_faults(p, dst_nid) > group_faults(p, src_nid); - } - /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) return true; - return task_faults(p, dst_nid) > task_faults(p, src_nid); + /* Migrating away from the preferred node is bad. */ + if (src_nid == p->numa_preferred_nid) + return false; + + if (numa_group) { + src_faults = group_faults(p, src_nid); + dst_faults = group_faults(p, dst_nid); + } else { + src_faults = task_faults(p, src_nid); + dst_faults = task_faults(p, dst_nid); + } + + return dst_faults > src_faults; } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_faults, dst_faults; int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5546,23 +5754,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - if (numa_group) { - /* Task is moving within/into the group's interleave set. */ - if (node_isset(dst_nid, numa_group->active_nodes)) - return false; + /* Migrating away from the preferred node is bad. */ + if (src_nid == p->numa_preferred_nid) + return true; - /* Task is moving out of the group's interleave set. 
*/ - if (node_isset(src_nid, numa_group->active_nodes)) - return true; + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) + return false; - return group_faults(p, dst_nid) < group_faults(p, src_nid); + if (numa_group) { + src_faults = group_faults(p, src_nid); + dst_faults = group_faults(p, dst_nid); + } else { + src_faults = task_faults(p, src_nid); + dst_faults = task_faults(p, dst_nid); } - /* Migrating away from the preferred node is always bad. */ - if (src_nid == p->numa_preferred_nid) - return true; - - return task_faults(p, dst_nid) < task_faults(p, src_nid); + return dst_faults < src_faults; } #else @@ -6062,8 +6270,8 @@ static unsigned long scale_rt_capacity(int cpu) * Since we're reading these variables without serialization make sure * we read them once before doing sanity checks on them. */ - age_stamp = ACCESS_ONCE(rq->age_stamp); - avg = ACCESS_ONCE(rq->rt_avg); + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); delta = __rq_clock_broken(rq) - age_stamp; if (unlikely(delta < 0)) @@ -7251,9 +7459,6 @@ static int idle_balance(struct rq *this_rq) goto out; } - /* - * Drop the rq->lock, but keep IRQ/preempt disabled. 
- */ raw_spin_unlock(&this_rq->lock); update_blocked_averages(this_cpu); @@ -8293,7 +8498,27 @@ void print_cfs_stats(struct seq_file *m, int cpu) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } -#endif + +#ifdef CONFIG_NUMA_BALANCING +void show_numa_stats(struct task_struct *p, struct seq_file *m) +{ + int node; + unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; + + for_each_online_node(node) { + if (p->numa_faults) { + tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; + tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + if (p->numa_group) { + gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], + gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + print_numa_stats(m, node, tsf, tpf, gsf, gpf); + } +} +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 70e698d02..594275ed2 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -13,11 +13,16 @@ #include <trace/events/power.h> -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" -#else #include "sched.h" -#endif + +/** + * sched_idle_set_state - Record idle state for the current CPU. + * @idle_state: State to record. + */ +void sched_idle_set_state(struct cpuidle_state *idle_state) +{ + idle_set_state(this_rq(), idle_state); +} static int __read_mostly cpu_idle_force_poll; @@ -72,6 +77,46 @@ void __weak arch_cpu_idle(void) } /** + * default_idle_call - Default CPU idle routine. + * + * To use when the cpuidle framework cannot be used. + */ +void default_idle_call(void) +{ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); +} + +static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, + int next_state) +{ + /* Fall back to the default arch idle method on errors. 
*/ + if (next_state < 0) { + default_idle_call(); + return next_state; + } + + /* + * The idle task must be scheduled, it is pointless to go to idle, just + * update no idle residency and return. + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + local_irq_enable(); + return -EBUSY; + } + + /* + * Enter the idle state previously returned by the governor decision. + * This function will block until an interrupt occurs and will take + * care of re-enabling the local interrupts + */ + return cpuidle_enter(drv, dev, next_state); +} + +/** * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here @@ -85,7 +130,6 @@ static void cpuidle_idle_call(void) struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; - bool reflect; /* * Check if the idle task must be rescheduled. If it is the @@ -109,8 +153,10 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); - if (cpuidle_not_available(drv, dev)) - goto use_default; + if (cpuidle_not_available(drv, dev)) { + default_idle_call(); + goto exit_idle; + } /* * Suspend-to-idle ("freeze") is a system state in which all user space @@ -128,52 +174,19 @@ static void cpuidle_idle_call(void) goto exit_idle; } - reflect = false; next_state = cpuidle_find_deepest_state(drv, dev); + call_cpuidle(drv, dev, next_state); } else { - reflect = true; /* * Ask the cpuidle framework to choose a convenient idle state. */ next_state = cpuidle_select(drv, dev); - } - /* Fall back to the default arch idle method on errors. 
*/ - if (next_state < 0) - goto use_default; - - /* - * The idle task must be scheduled, it is pointless to - * go to idle, just update no idle residency and get - * out of this function - */ - if (current_clr_polling_and_test()) { - dev->last_residency = 0; - entered_state = next_state; - local_irq_enable(); - goto exit_idle; - } - - /* Take note of the planned idle state. */ - idle_set_state(this_rq(), &drv->states[next_state]); - - /* - * Enter the idle state previously returned by the governor decision. - * This function will block until an interrupt occurs and will take - * care of re-enabling the local interrupts - */ - entered_state = cpuidle_enter(drv, dev, next_state); - - /* The cpu is no longer idle or about to enter idle. */ - idle_set_state(this_rq(), NULL); - - if (entered_state == -EBUSY) - goto use_default; - - /* - * Give the governor an opportunity to reflect on the outcome - */ - if (reflect) + entered_state = call_cpuidle(drv, dev, next_state); + /* + * Give the governor an opportunity to reflect on the outcome + */ cpuidle_reflect(dev, entered_state); + } exit_idle: __current_set_polling(); @@ -186,19 +199,6 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); - return; - -use_default: - /* - * We can't use the cpuidle framework, let's use the default - * idle routine. - */ - if (current_clr_polling_and_test()) - local_irq_enable(); - else - arch_cpu_idle(); - - goto exit_idle; } DEFINE_PER_CPU(bool, cpu_dead_idle); diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c index 8ecd552fe..ef7159012 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/loadavg.c @@ -1,7 +1,9 @@ /* - * kernel/sched/proc.c + * kernel/sched/loadavg.c * - * Kernel load calculations, forked from sched/core.c + * This file contains the magic bits required to compute the global loadavg + * figure. Its a silly number but people think its important. We go through + * great pains to make it work on big machines and tickless kernels. 
*/ #include <linux/export.h> @@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq) long nr_active, delta = 0; nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; @@ -186,6 +188,7 @@ void calc_load_enter_idle(void) delta = calc_load_fold_active(this_rq); if (delta) { int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); } } @@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) { unsigned long result = 1UL << frac_bits; - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; } return result; @@ -285,7 +290,6 @@ static unsigned long calc_load_n(unsigned long load, unsigned long exp, unsigned long active, unsigned int n) { - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); } @@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { } /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. + * + * Called from the global timer code. */ void calc_global_load(unsigned long ticks) { @@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks) } /* - * Called from update_cpu_load() to periodically update this CPU's + * Called from scheduler_tick() to periodically update this CPU's * active count. 
*/ -static void calc_load_account_active(struct rq *this_rq) +void calc_global_load_tick(struct rq *this_rq) { long delta; @@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq) this_rq->calc_load_update += LOAD_FREQ; } - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. 
The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. 
- */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_SMP -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->cfs.runnable_load_avg; -} -#else -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->load.weight; -} -#endif - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = get_rq_runnable_load(this_rq); - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. 
- */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ */ - -/* - * Called from scheduler_tick() - */ -void update_cpu_load_active(struct rq *this_rq) -{ - unsigned long load = get_rq_runnable_load(this_rq); - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); - - calc_load_account_active(this_rq); -} diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 575da76a3..0d193a243 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -18,19 +18,22 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; int idle = 0; + int overrun; + raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - + overrun = hrtimer_forward_now(timer, rt_b->rt_period); if (!overrun) break; + raw_spin_unlock(&rt_b->rt_runtime_lock); idle = do_sched_rt_period_timer(rt_b, overrun); + raw_spin_lock(&rt_b->rt_runtime_lock); } + if (idle) + rt_b->rt_period_active = 0; + raw_spin_unlock(&rt_b->rt_runtime_lock); return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -52,11 +55,12 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + if (!rt_b->rt_period_active) { + rt_b->rt_period_active = 1; + hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); + } raw_spin_unlock(&rt_b->rt_runtime_lock); } @@ -256,7 +260,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP -static int pull_rt_task(struct rq *this_rq); +static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { @@ -350,13 +354,23 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } -static inline void set_post_schedule(struct rq *rq) +static DEFINE_PER_CPU(struct callback_head, rt_push_head); +static DEFINE_PER_CPU(struct callback_head, rt_pull_head); + +static void push_rt_tasks(struct rq *); +static void pull_rt_task(struct rq *); + +static inline void queue_push_tasks(struct rq *rq) { - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); + if (!has_pushable_tasks(rq)) + return; + + queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); +} + +static inline void queue_pull_task(struct rq *rq) +{ + queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); } static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) @@ -408,12 +422,11 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) return false; } -static inline int pull_rt_task(struct rq *this_rq) 
+static inline void pull_rt_task(struct rq *this_rq) { - return 0; } -static inline void set_post_schedule(struct rq *rq) +static inline void queue_push_tasks(struct rq *rq) { } #endif /* CONFIG_SMP */ @@ -1323,7 +1336,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If the current task on @p's runqueue is an RT task, then @@ -1465,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct rt_rq *rt_rq = &rq->rt; if (need_pull_rt_task(rq, prev)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we're + * being very careful to re-start the picking loop. + */ + lockdep_unpin_lock(&rq->lock); pull_rt_task(rq); + lockdep_pin_lock(&rq->lock); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -1493,7 +1514,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); - set_post_schedule(rq); + queue_push_tasks(rq); return p; } @@ -1948,14 +1969,15 @@ static void push_irq_work_func(struct irq_work *work) } #endif /* HAVE_RT_PUSH_IPI */ -static int pull_rt_task(struct rq *this_rq) +static void pull_rt_task(struct rq *this_rq) { - int this_cpu = this_rq->cpu, ret = 0, cpu; + int this_cpu = this_rq->cpu, cpu; + bool resched = false; struct task_struct *p; struct rq *src_rq; if (likely(!rt_overloaded(this_rq))) - return 0; + return; /* * Match the barrier from rt_set_overloaded; this guarantees that if we @@ -1966,7 +1988,7 @@ static int pull_rt_task(struct rq *this_rq) #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); - return 0; + return; } #endif @@ 
-2019,7 +2041,7 @@ static int pull_rt_task(struct rq *this_rq) if (p->prio < src_rq->curr->prio) goto skip; - ret = 1; + resched = true; deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); @@ -2035,12 +2057,8 @@ skip: double_unlock_balance(this_rq, src_rq); } - return ret; -} - -static void post_schedule_rt(struct rq *rq) -{ - push_rt_tasks(rq); + if (resched) + resched_curr(this_rq); } /* @@ -2136,8 +2154,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; - if (pull_rt_task(rq)) - resched_curr(rq); + queue_pull_task(rq); } void __init init_sched_rt_class(void) @@ -2158,8 +2175,6 @@ void __init init_sched_rt_class(void) */ static void switched_to_rt(struct rq *rq, struct task_struct *p) { - int check_resched = 1; - /* * If we are already running, then there's nothing * that needs to be done. But if we are not running @@ -2169,13 +2184,12 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && - /* Don't resched if we changed runqueues */ - push_rt_task(rq) && rq != task_rq(p)) - check_resched = 0; -#endif /* CONFIG_SMP */ - if (check_resched && p->prio < rq->curr->prio) + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + queue_push_tasks(rq); +#else + if (p->prio < rq->curr->prio) resched_curr(rq); +#endif /* CONFIG_SMP */ } } @@ -2196,14 +2210,13 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * may need to pull tasks to this runqueue. */ if (oldprio < p->prio) - pull_rt_task(rq); + queue_pull_task(rq); + /* * If there's a higher priority task waiting to run - * then reschedule. Note, the above pull_rt_task - * can release the rq lock and p could migrate. - * Only reschedule if p is still on the same runqueue. + * then reschedule. 
*/ - if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) + if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); #else /* For UP simply resched on drop of prio */ @@ -2314,7 +2327,6 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0e129993..84d48790b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); + +#ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); +#else +static inline void update_cpu_load_active(struct rq *this_rq) { } +#endif /* * Helpers for converting nanosecond timing to jiffy resolution @@ -131,6 +137,7 @@ struct rt_bandwidth { ktime_t rt_period; u64 rt_runtime; struct hrtimer rt_period_timer; + unsigned int rt_period_active; }; void __dl_clear_params(struct task_struct *p); @@ -215,7 +222,7 @@ struct cfs_bandwidth { s64 hierarchical_quota; u64 runtime_expires; - int idle, timer_active; + int idle, period_active; struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; @@ -306,7 +313,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); @@ -617,9 +624,10 @@ struct rq { unsigned 
long cpu_capacity; unsigned long cpu_capacity_orig; + struct callback_head *balance_callback; + unsigned char idle_balance; /* For active balancing */ - int post_schedule; int active_balance; int push_cpu; struct cpu_stop_work active_balance_work; @@ -707,7 +715,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static inline u64 __rq_clock_broken(struct rq *rq) { - return ACCESS_ONCE(rq->clock); + return READ_ONCE(rq->clock); } static inline u64 rq_clock(struct rq *rq) @@ -760,6 +768,21 @@ extern int migrate_swap(struct task_struct *, struct task_struct *); #ifdef CONFIG_SMP +static inline void +queue_balance_callback(struct rq *rq, + struct callback_head *head, + void (*func)(struct rq *rq)) +{ + lockdep_assert_held(&rq->lock); + + if (unlikely(head->next)) + return; + + head->func = (void (*)(struct callback_head *))func; + head->next = rq->balance_callback; + rq->balance_callback = head; +} + extern void sched_ttwu_pending(void); #define rcu_dereference_check_sched_domain(p) \ @@ -1185,7 +1208,6 @@ struct sched_class { int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); - void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); @@ -1284,7 +1306,6 @@ extern void update_max_interval(void); extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void init_sched_dl_class(void); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -1298,8 +1319,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void update_idle_cpu_load(struct rq *this_rq); - extern void init_task_runnable_average(struct task_struct *p); static inline void add_nr_running(struct rq *rq, unsigned count) @@ -1406,8 +1425,6 @@ static 
inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); - /* * __task_rq_lock - lock the rq @p resides on. */ @@ -1421,8 +1438,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + lockdep_pin_lock(&rq->lock); return rq; + } raw_spin_unlock(&rq->lock); while (unlikely(task_on_rq_migrating(p))) @@ -1459,8 +1478,10 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag * If we observe the new cpu in task_rq_lock, the acquire will * pair with the WMB to ensure we must then also see migrating. */ - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + lockdep_pin_lock(&rq->lock); return rq; + } raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); @@ -1472,6 +1493,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag static inline void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); } @@ -1480,6 +1502,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) __releases(p->pi_lock) { + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } @@ -1666,9 +1689,22 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_SCHED_DEBUG extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern 
void print_dl_stats(struct seq_file *m, int cpu); +extern void +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); + +#ifdef CONFIG_NUMA_BALANCING +extern void +show_numa_stats(struct task_struct *p, struct seq_file *m); +extern void +print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf); +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 7466a0bb2..87e2c9f0c 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -4,11 +4,7 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> -#ifndef CONFIG_SCHED_BFS #include "sched.h" -#else -#include "bfs_sched.h" -#endif /* * bump this up when changing the output format or the meaning of an existing diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4ab704339..b0fbc7632 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_set(var, val) do { } while (0) #endif -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { t->sched_info.last_queued = 0; @@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq, #define sched_info_depart(rq, t) do { } while (0) #define sched_info_arrive(rq, next) do { } while (0) #define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +#endif /* CONFIG_SCHED_INFO */ /* * The following are functions that support scheduler-internal time accounting. @@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + /* Check if cputimer isn't running. 
This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) return false; /* @@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.utime += cputime; - raw_spin_unlock(&cputimer->lock); + atomic64_add(cputime, &cputimer->cputime_atomic.utime); } /** @@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.stime += cputime; - raw_spin_unlock(&cputimer->lock); + atomic64_add(cputime, &cputimer->cputime_atomic.stime); } /** @@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.sum_exec_runtime += ns; - raw_spin_unlock(&cputimer->lock); + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 852143a79..052e02672 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss * an event. */ - set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ return timeout; } @@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) * doesn't imply write barrier and the users expects write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() - * and is paired with set_mb() in wait_woken(). + * and is paired with smp_store_mb() in wait_woken(). 
*/ smp_wmb(); /* C */ wait->flags |= WQ_FLAG_WOKEN; @@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io); __sched int bit_wait_timeout(struct wait_bit_key *word) { - unsigned long now = ACCESS_ONCE(jiffies); + unsigned long now = READ_ONCE(jiffies); if (signal_pending_state(current->state, current)) return 1; if (time_after_eq(now, word->timeout)) @@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); __sched int bit_wait_io_timeout(struct wait_bit_key *word) { - unsigned long now = ACCESS_ONCE(jiffies); + unsigned long now = READ_ONCE(jiffies); if (signal_pending_state(current->state, current)) return 1; if (time_after_eq(now, word->timeout)) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4f4402894..245df6b32 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -346,16 +346,13 @@ static inline void seccomp_sync_threads(void) */ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { - struct seccomp_filter *filter; - unsigned long fp_size; - struct sock_filter *fp; - int new_len; - long ret; + struct seccomp_filter *sfilter; + int ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); - fp_size = fprog->len * sizeof(struct sock_filter); /* * Installing a seccomp filter requires that the task has @@ -368,60 +365,21 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return ERR_PTR(-EACCES); - fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); - if (!fp) - return ERR_PTR(-ENOMEM); - - /* Copy the instructions from fprog. 
*/ - ret = -EFAULT; - if (copy_from_user(fp, fprog->filter, fp_size)) - goto free_prog; - - /* Check and rewrite the fprog via the skb checker */ - ret = bpf_check_classic(fp, fprog->len); - if (ret) - goto free_prog; - - /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(fp, fprog->len); - if (ret) - goto free_prog; - - /* Convert 'sock_filter' insns to 'bpf_insn' insns */ - ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); - if (ret) - goto free_prog; - /* Allocate a new seccomp_filter */ - ret = -ENOMEM; - filter = kzalloc(sizeof(struct seccomp_filter), - GFP_KERNEL|__GFP_NOWARN); - if (!filter) - goto free_prog; - - filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); - if (!filter->prog) - goto free_filter; - - ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); - if (ret) - goto free_filter_prog; - - kfree(fp); - atomic_set(&filter->usage, 1); - filter->prog->len = new_len; + sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); + if (!sfilter) + return ERR_PTR(-ENOMEM); - bpf_prog_select_runtime(filter->prog); + ret = bpf_prog_create_from_user(&sfilter->prog, fprog, + seccomp_check_filter); + if (ret < 0) { + kfree(sfilter); + return ERR_PTR(ret); + } - return filter; + atomic_set(&sfilter->usage, 1); -free_filter_prog: - __bpf_prog_free(filter->prog); -free_filter: - kfree(filter); -free_prog: - kfree(fp); - return ERR_PTR(ret); + return sfilter; } /** diff --git a/kernel/signal.c b/kernel/signal.c index 0206be728..0f6bbbe77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig) * RETURNS: * %true if @mask is set, %false if made noop because @task was dying. 
*/ -bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) +bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); @@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) * CONTEXT: * Must be called with @task->sighand->siglock held. */ -void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) +void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~JOBCTL_PENDING_MASK); @@ -414,21 +414,16 @@ void flush_sigqueue(struct sigpending *queue) } /* - * Flush all pending signals for a task. + * Flush all pending signals for this kthread. */ -void __flush_signals(struct task_struct *t) -{ - clear_tsk_thread_flag(t, TIF_SIGPENDING); - flush_sigqueue(&t->pending); - flush_sigqueue(&t->signal->shared_pending); -} - void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - __flush_signals(t); + clear_tsk_thread_flag(t, TIF_SIGPENDING); + flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); } @@ -2000,7 +1995,7 @@ static bool do_signal_stop(int signr) struct signal_struct *sig = current->signal; if (!(current->jobctl & JOBCTL_STOP_PENDING)) { - unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; + unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; /* signr will be recorded in task->jobctl for retries */ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index bdcc6c018..7c434c39f 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -173,7 +173,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) if (tsk) return 0; - td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu)); + td = kzalloc_node(sizeof(*td), GFP_KERNEL, 
cpu_to_node(cpu)); if (!td) return -ENOMEM; td->cpu = cpu; @@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu) mutex_lock(&smpboot_threads_lock); list_for_each_entry(cur, &hotplug_threads, list) - smpboot_unpark_thread(cur, cpu); + if (cpumask_test_cpu(cpu, cur->cpumask)) + smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); } @@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) { unsigned int cpu; + /* Unpark any threads that were voluntarily parked. */ + for_each_cpu_not(cpu, ht->cpumask) { + if (cpu_online(cpu)) { + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + if (tsk) + kthread_unpark(tsk); + } + } + /* We need to destroy also the parked threads of offline cpus */ for_each_possible_cpu(cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); @@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; + if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) + return -ENOMEM; + cpumask_copy(plug_thread->cpumask, cpu_possible_mask); + get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { @@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); + free_cpumask_var(plug_thread->cpumask); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); +/** + * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked + * @plug_thread: Hotplug thread descriptor + * @new: Revised mask to use + * + * The cpumask field in the smp_hotplug_thread must not be updated directly + * by the client, but only by calling this function. + * This function can only be called on a registered smp_hotplug_thread. 
+ */ +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) +{ + struct cpumask *old = plug_thread->cpumask; + cpumask_var_t tmp; + unsigned int cpu; + + if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) + return -ENOMEM; + + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + + /* Park threads that were exclusively enabled on the old mask. */ + cpumask_andnot(tmp, old, new); + for_each_cpu_and(cpu, tmp, cpu_online_mask) + smpboot_park_thread(plug_thread, cpu); + + /* Unpark threads that are exclusively enabled on the new mask. */ + cpumask_andnot(tmp, new, old); + for_each_cpu_and(cpu, tmp, cpu_online_mask) + smpboot_unpark_thread(plug_thread, cpu); + + cpumask_copy(old, new); + + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); + + free_cpumask_var(tmp); + + return 0; +} +EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); + static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 263b0e1ad..fd643d8c4 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -41,8 +41,7 @@ struct cpu_stopper { }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); -DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); - +static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; /* @@ -212,25 +211,6 @@ static int multi_cpu_stop(void *data) return err; } -struct irq_cpu_stop_queue_work_info { - int cpu1; - int cpu2; - struct cpu_stop_work *work1; - struct cpu_stop_work *work2; -}; - -/* - * This function is always run with irqs and preemption disabled. - * This guarantees that both work1 and work2 get queued, before - * our local migrate thread gets the chance to preempt us. 
- */ -static void irq_cpu_stop_queue_work(void *arg) -{ - struct irq_cpu_stop_queue_work_info *info = arg; - cpu_stop_queue_work(info->cpu1, info->work1); - cpu_stop_queue_work(info->cpu2, info->work2); -} - /** * stop_two_cpus - stops two cpus * @cpu1: the cpu to stop @@ -246,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * { struct cpu_stop_done done; struct cpu_stop_work work1, work2; - struct irq_cpu_stop_queue_work_info call_args; struct multi_stop_data msdata; preempt_disable(); @@ -263,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * .done = &done }; - call_args = (struct irq_cpu_stop_queue_work_info){ - .cpu1 = cpu1, - .cpu2 = cpu2, - .work1 = &work1, - .work2 = &work2, - }; - cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); @@ -286,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * return -ENOENT; } - lg_local_lock(&stop_cpus_lock); - /* - * Queuing needs to be done by the lowest numbered CPU, to ensure - * that works are always queued in the same order on every CPU. - * This prevents deadlocks. 
- */ - smp_call_function_single(min(cpu1, cpu2), - &irq_cpu_stop_queue_work, - &call_args, 1); - lg_local_unlock(&stop_cpus_lock); + lg_double_lock(&stop_cpus_lock, cpu1, cpu2); + cpu_stop_queue_work(cpu1, &work1); + cpu_stop_queue_work(cpu2, &work2); + lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + preempt_enable(); wait_for_completion(&done.completion); diff --git a/kernel/sys.c b/kernel/sys.c index a4e372b79..259fda25e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -92,10 +92,10 @@ # define SET_TSC_CTL(a) (-EINVAL) #endif #ifndef MPX_ENABLE_MANAGEMENT -# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) +# define MPX_ENABLE_MANAGEMENT() (-EINVAL) #endif #ifndef MPX_DISABLE_MANAGEMENT -# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) +# define MPX_DISABLE_MANAGEMENT() (-EINVAL) #endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) @@ -1722,7 +1722,6 @@ exit_err: goto exit; } -#ifdef CONFIG_CHECKPOINT_RESTORE /* * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. @@ -1818,6 +1817,7 @@ out: return error; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; @@ -1902,10 +1902,41 @@ out: } #endif /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + /* + * This doesn't move the auxiliary vector itself since it's pinned to + * mm_struct, but it permits filling the vector with new values. It's + * up to the caller to provide sane values here, otherwise userspace + * tools which use this vector might be unhappy. 
+ */ + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (len > sizeof(user_auxv)) + return -EINVAL; + + if (copy_from_user(user_auxv, (const void __user *)addr, len)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, len); + task_unlock(current); + + return 0; +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; + struct prctl_mm_map prctl_map; struct vm_area_struct *vma; int error; @@ -1925,6 +1956,9 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (opt == PR_SET_MM_AUXV) + return prctl_set_auxv(mm, addr, arg4); + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; @@ -1933,42 +1967,64 @@ static int prctl_set_mm(int opt, unsigned long addr, down_read(&mm->mmap_sem); vma = find_vma(mm, addr); + prctl_map.start_code = mm->start_code; + prctl_map.end_code = mm->end_code; + prctl_map.start_data = mm->start_data; + prctl_map.end_data = mm->end_data; + prctl_map.start_brk = mm->start_brk; + prctl_map.brk = mm->brk; + prctl_map.start_stack = mm->start_stack; + prctl_map.arg_start = mm->arg_start; + prctl_map.arg_end = mm->arg_end; + prctl_map.env_start = mm->env_start; + prctl_map.env_end = mm->env_end; + prctl_map.auxv = NULL; + prctl_map.auxv_size = 0; + prctl_map.exe_fd = -1; + switch (opt) { case PR_SET_MM_START_CODE: - mm->start_code = addr; + prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE: - mm->end_code = addr; + prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA: - mm->start_data = addr; + prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA: - mm->end_data = addr; + prctl_map.end_data = addr; + break; + case PR_SET_MM_START_STACK: + 
prctl_map.start_stack = addr; break; - case PR_SET_MM_START_BRK: - if (addr <= mm->end_data) - goto out; - - if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, - mm->end_data, mm->start_data)) - goto out; - - mm->start_brk = addr; + prctl_map.start_brk = addr; break; - case PR_SET_MM_BRK: - if (addr <= mm->end_data) - goto out; - - if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, - mm->end_data, mm->start_data)) - goto out; - - mm->brk = addr; + prctl_map.brk = addr; break; + case PR_SET_MM_ARG_START: + prctl_map.arg_start = addr; + break; + case PR_SET_MM_ARG_END: + prctl_map.arg_end = addr; + break; + case PR_SET_MM_ENV_START: + prctl_map.env_start = addr; + break; + case PR_SET_MM_ENV_END: + prctl_map.env_end = addr; + break; + default: + goto out; + } + + error = validate_prctl_map(&prctl_map); + if (error) + goto out; + switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can @@ -1985,52 +2041,20 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EFAULT; goto out; } - if (opt == PR_SET_MM_START_STACK) - mm->start_stack = addr; - else if (opt == PR_SET_MM_ARG_START) - mm->arg_start = addr; - else if (opt == PR_SET_MM_ARG_END) - mm->arg_end = addr; - else if (opt == PR_SET_MM_ENV_START) - mm->env_start = addr; - else if (opt == PR_SET_MM_ENV_END) - mm->env_end = addr; - break; - - /* - * This doesn't move auxiliary vector itself - * since it's pinned to mm_struct, but allow - * to fill vector with new values. It's up - * to a caller to provide sane values here - * otherwise user space tools which use this - * vector might be unhappy. 
- */ - case PR_SET_MM_AUXV: { - unsigned long user_auxv[AT_VECTOR_SIZE]; - - if (arg4 > sizeof(user_auxv)) - goto out; - up_read(&mm->mmap_sem); - - if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) - return -EFAULT; - - /* Make sure the last entry is always AT_NULL */ - user_auxv[AT_VECTOR_SIZE - 2] = 0; - user_auxv[AT_VECTOR_SIZE - 1] = 0; - - BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); - - task_lock(current); - memcpy(mm->saved_auxv, user_auxv, arg4); - task_unlock(current); - - return 0; - } - default: - goto out; } + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + error = 0; out: up_read(&mm->mmap_sem); @@ -2230,12 +2254,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_MPX_ENABLE_MANAGEMENT: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = MPX_ENABLE_MANAGEMENT(me); + error = MPX_ENABLE_MANAGEMENT(); break; case PR_MPX_DISABLE_MANAGEMENT: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = MPX_DISABLE_MANAGEMENT(me); + error = MPX_DISABLE_MANAGEMENT(); break; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7f45887fa..19b62b522 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,12 +124,7 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; -static int __maybe_unused one_hundred = 100; -#ifdef CONFIG_SCHED_BFS -extern int rr_interval; -extern int sched_iso_cpu; -static int __read_mostly one_thousand = 1000; -#endif +static int one_hundred = 100; #ifdef 
CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -264,7 +259,7 @@ static struct ctl_table sysctl_base_table[] = { { } }; -#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS) +#ifdef CONFIG_SCHED_DEBUG static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ @@ -281,7 +276,6 @@ static int max_extfrag_threshold = 1000; #endif static struct ctl_table kern_table[] = { -#ifndef CONFIG_SCHED_BFS { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, @@ -355,15 +349,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -448,7 +433,6 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif -#endif /* !CONFIG_SCHED_BFS */ #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -888,6 +872,13 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, + { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), @@ -979,26 +970,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SCHED_BFS - { - .procname = "rr_interval", - .data = &rr_interval, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one_thousand, - }, - { - .procname = "iso_cpu", - .data = &sched_iso_cpu, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one_hundred, - }, -#endif #if 
defined(CONFIG_S390) && defined(CONFIG_SMP) { .procname = "spin_retry", @@ -1159,6 +1130,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + }, +#endif { } }; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 7ceb68656..579ce1b92 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -89,7 +89,7 @@ config NO_HZ_IDLE config NO_HZ_FULL bool "Full dynticks system (tickless)" # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP # RCU_USER_QS dependency diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 01f031241..49eca0bee 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -12,20 +12,3 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o - -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 1b001ed1e..7fbba635a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init); * @alarm: ptr to alarm to set * @start: time to run the alarm */ -int alarm_start(struct alarm *alarm, ktime_t start) +void 
alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret; spin_lock_irqsave(&base->lock, flags); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); - ret = hrtimer_start(&alarm->timer, alarm->node.expires, - HRTIMER_MODE_ABS); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - return ret; } EXPORT_SYMBOL_GPL(alarm_start); @@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start); * @alarm: ptr to alarm to set * @start: time relative to now to run the alarm */ -int alarm_start_relative(struct alarm *alarm, ktime_t start) +void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; start = ktime_add(start, base->gettime()); - return alarm_start(alarm, start); + alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -495,12 +492,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, */ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) { - clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; - if (!alarmtimer_get_rtcdev()) return -EINVAL; - return hrtimer_get_res(baseid, tp); + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; } /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 637a09461..50eb107f1 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,8 +94,8 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); -static int __clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state) +static int __clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) { /* Transition with legacy set_mode() callback */ if (dev->set_mode) { @@ -120,19 +120,37 @@ static int __clockevents_set_state(struct 
clock_event_device *dev, /* The clockevent device is getting replaced. Shut it down. */ case CLOCK_EVT_STATE_SHUTDOWN: - return dev->set_state_shutdown(dev); + if (dev->set_state_shutdown) + return dev->set_state_shutdown(dev); + return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; - return dev->set_state_periodic(dev); + if (dev->set_state_periodic) + return dev->set_state_periodic(dev); + return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; - return dev->set_state_oneshot(dev); + if (dev->set_state_oneshot) + return dev->set_state_oneshot(dev); + return 0; + + case CLOCK_EVT_STATE_ONESHOT_STOPPED: + /* Core internal bug */ + if (WARN_ONCE(!clockevent_state_oneshot(dev), + "Current state: %d\n", + clockevent_get_state(dev))) + return -EINVAL; + + if (dev->set_state_oneshot_stopped) + return dev->set_state_oneshot_stopped(dev); + else + return -ENOSYS; default: return -ENOSYS; @@ -140,26 +158,26 @@ static int __clockevents_set_state(struct clock_event_device *dev, } /** - * clockevents_set_state - set the operating state of a clock event device + * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! 
*/ -void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state) +void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) { - if (dev->state != state) { - if (__clockevents_set_state(dev, state)) + if (clockevent_get_state(dev) != state) { + if (__clockevents_switch_state(dev, state)) return; - dev->state = state; + clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ - if (state == CLOCK_EVT_STATE_ONESHOT) { + if (clockevent_state_oneshot(dev)) { if (unlikely(!dev->mult)) { dev->mult = 1; WARN_ON(1); @@ -174,7 +192,7 @@ void clockevents_set_state(struct clock_event_device *dev, */ void clockevents_shutdown(struct clock_event_device *dev) { - clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event.tv64 = KTIME_MAX; } @@ -248,7 +266,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; dev->retries++; @@ -285,7 +303,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; dev->retries++; @@ -317,9 +335,13 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, dev->next_event = expires; - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; + /* We must be in ONESHOT state here */ + WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", + clockevent_get_state(dev)); + /* Shortcut for clockevent devices that can deal with ktime. 
*/ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); @@ -362,7 +384,7 @@ static int clockevents_replace(struct clock_event_device *ced) struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { - if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) + if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) @@ -388,7 +410,7 @@ static int clockevents_replace(struct clock_event_device *ced) static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ - if (ced->state == CLOCK_EVT_STATE_DETACHED) { + if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } @@ -445,7 +467,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev) if (dev->set_mode) { /* We shouldn't be supporting new modes now */ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || - dev->set_state_shutdown || dev->tick_resume); + dev->set_state_shutdown || dev->tick_resume || + dev->set_state_oneshot_stopped); BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); return 0; @@ -454,18 +477,6 @@ static int clockevents_sanity_check(struct clock_event_device *dev) if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; - /* New state-specific callbacks */ - if (!dev->set_state_shutdown) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && - !dev->set_state_periodic) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && - !dev->set_state_oneshot) - return -EINVAL; - return 0; } @@ -480,7 +491,7 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(clockevents_sanity_check(dev)); /* Initialize state to DETACHED */ - dev->state = CLOCK_EVT_STATE_DETACHED; + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); @@ -545,11 +556,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 
freq) { clockevents_config(dev, freq); - if (dev->state == CLOCK_EVT_STATE_ONESHOT) + if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); - if (dev->state == CLOCK_EVT_STATE_PERIODIC) - return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + if (clockevent_state_periodic(dev)) + return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } @@ -603,13 +614,13 @@ void clockevents_exchange_device(struct clock_event_device *old, */ if (old) { module_put(old->owner); - clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); + clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_del(&old->list); list_add(&old->list, &clockevents_released); } if (new) { - BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); + BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } @@ -622,7 +633,7 @@ void clockevents_suspend(void) struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) - if (dev->suspend) + if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } @@ -634,7 +645,7 @@ void clockevents_resume(void) struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) - if (dev->resume) + if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } @@ -665,7 +676,7 @@ void tick_cleanup_dead_cpu(int cpu) if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { - BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); + BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 15facb1b9..841b72f72 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,6 +23,8 @@ * o Allow clocksource drivers to be unregistered */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/device.h> #include <linux/clocksource.h> #include <linux/init.h> @@ -216,10 +218,11 @@ 
static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. */ if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); - pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", + cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, csnow, cslast, cs->mask); __clocksource_unstable(cs); continue; @@ -567,9 +570,8 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - printk(KERN_WARNING "Override clocksource %s is not " - "HRT compatible. Cannot switch while in " - "HRT/NOHZ mode\n", cs->name); + pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); override_name[0] = 0; } else /* Override clocksource can be used. */ @@ -708,8 +710,8 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocksource_update_max_deferment(cs); - pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", - cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); + pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); @@ -1008,12 +1010,10 @@ __setup("clocksource=", boot_override_clocksource); static int __init boot_override_clock(char* str) { if (!strcmp(str, "pmtmr")) { - printk("Warning: clock=pmtmr is deprecated. 
" - "Use clocksource=acpi_pm.\n"); + pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n"); return boot_override_clocksource("acpi_pm"); } - printk("Warning! clock= boot option is deprecated. " - "Use clocksource=xyz\n"); + pr_warn("clock= boot option is deprecated - use clocksource=xyz\n"); return boot_override_clocksource(str); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 93ef7190b..5c7ae4b64 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -66,33 +66,29 @@ */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, - .resolution = KTIME_LOW_RES, }, } }; @@ -109,27 +105,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) return hrtimer_clock_to_base_table[clock_id]; } - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. 
- */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, mono, boot, tai; - ktime_t off_real, off_boot, off_tai; - - mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); - boot = ktime_add(mono, off_boot); - xtim = ktime_add(mono, off_real); - tai = ktime_add(mono, off_tai); - - base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; - base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -137,6 +112,18 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) #ifdef CONFIG_SMP /* + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() + * such that hrtimer_callback_running() can unconditionally dereference + * timer->base->cpu_base + */ +static struct hrtimer_cpu_base migration_cpu_base = { + .seq = SEQCNT_ZERO(migration_cpu_base), + .clock_base = { { .cpu_base = &migration_cpu_base, }, }, +}; + +#define migration_base migration_cpu_base.clock_base[0] + +/* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. @@ -145,8 +132,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * possible to set timer->base = &migration_base and drop the lock: the timer + * remains locked. 
*/ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, @@ -156,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; - if (likely(base != NULL)) { + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; @@ -190,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&hrtimer_bases); + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +} +#else +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + return this_cpu_ptr(&hrtimer_bases); +} +#endif + /* * Switch the timer base to the current CPU when possible. 
*/ @@ -197,14 +202,13 @@ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { + struct hrtimer_cpu_base *new_cpu_base, *this_base; struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = get_nohz_timer_target(pinned); int basenum = base->index; + this_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_base, pinned); again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { @@ -220,22 +224,24 @@ again: if (unlikely(hrtimer_callback_running(timer))) return base; - /* See the comment in lock_timer_base() */ - timer->base = NULL; + /* See the comment in lock_hrtimer_base() */ + timer->base = &migration_base; raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_base; goto again; } } @@ -443,24 +449,35 @@ static inline void debug_deactivate(struct hrtimer *timer) } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *timer) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + cpu_base->next_timer = timer; +#endif +} + static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = 
KTIME_MAX }; - int i; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + hrtimer_update_next_timer(cpu_base, NULL); + for (; active; base++, active >>= 1) { struct timerqueue_node *next; struct hrtimer *timer; - next = timerqueue_getnext(&base->active); - if (!next) + if (!(active & 0x01)) continue; + next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires.tv64 < expires_next.tv64) + if (expires.tv64 < expires_next.tv64) { expires_next = expires; + hrtimer_update_next_timer(cpu_base, timer); + } } /* * clock_was_set() might have changed base->offset of any of @@ -473,6 +490,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) } #endif +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + return ktime_get_update_offsets_now(&base->clock_was_set_seq, + offs_real, offs_boot, offs_tai); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -480,6 +507,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) * High resolution timer enabled ? */ static int hrtimer_hres_enabled __read_mostly = 1; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode @@ -508,9 +537,14 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? 
*/ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return cpu_base->hres_active; +} + static inline int hrtimer_hres_active(void) { - return __this_cpu_read(hrtimer_bases.hres_active); + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } /* @@ -521,7 +555,12 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - ktime_t expires_next = __hrtimer_get_next_event(cpu_base); + ktime_t expires_next; + + if (!cpu_base->hres_active) + return; + + expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -545,63 +584,53 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if (cpu_base->hang_detected) return; - if (cpu_base->expires_next.tv64 != KTIME_MAX) - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(cpu_base->expires_next, 1); } /* - * Shared reprogramming for clock_realtime and clock_monotonic - * * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * - * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming - * and no expiry check happens. The timer gets enqueued into the rbtree. The - * reprogramming and expiry check is done in the hrtimer_interrupt or in the - * softirq. 
- * * Called with interrupts disabled and base->cpu_base.lock held */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static void hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - int res; WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. */ - if (hrtimer_callback_running(timer)) - return 0; + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; /* * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. + * expiry time which is less than base->offset. Set it to 0. */ if (expires.tv64 < 0) - return -ETIME; + expires.tv64 = 0; if (expires.tv64 >= cpu_base->expires_next.tv64) - return 0; + return; - /* - * When the target cpu of the timer is currently executing - * hrtimer_interrupt(), then we do not touch the clock event - * device. hrtimer_interrupt() will reevaluate all clock bases - * before reprogramming the device. 
- */ - if (cpu_base->in_hrtirq) - return 0; + /* Update the pointer to the next expiring timer */ + cpu_base->next_timer = timer; /* * If a hang was detected in the last timer interrupt then we @@ -610,15 +639,14 @@ static int hrtimer_reprogram(struct hrtimer *timer, * to make progress. */ if (cpu_base->hang_detected) - return 0; + return; /* - * Clockevents returns -ETIME, when the event was in the past. + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. */ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - cpu_base->expires_next = expires; - return res; + cpu_base->expires_next = expires; + tick_program_event(expires, 1); } /* @@ -630,15 +658,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) -{ - ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; - ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); -} - /* * Retrigger next event is called after clock was set * @@ -648,7 +667,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!hrtimer_hres_active()) + if (!base->hres_active) return; raw_spin_lock(&base->lock); @@ -662,29 +681,19 @@ static void retrigger_next_event(void *arg) */ static int hrtimer_switch_to_hres(void) { - int i, cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", base->cpu); 
return 0; } base->hres_active = 1; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - base->clock_base[i].resolution = KTIME_HIGH_RES; + hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); - local_irq_restore(flags); return 1; } @@ -706,6 +715,7 @@ void clock_was_set_delayed(void) #else +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } @@ -803,6 +813,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { @@ -814,8 +832,11 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta.tv64 < 0) return 0; - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; + if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + return 0; + + if (interval.tv64 < hrtimer_resolution) + interval.tv64 = hrtimer_resolution; if (unlikely(delta.tv64 >= interval.tv64)) { s64 incr = ktime_to_ns(interval); @@ -849,16 +870,11 @@ static int enqueue_hrtimer(struct hrtimer *timer, { debug_activate(timer); - timerqueue_add(&base->active, &timer->node); base->cpu_base->active_bases |= 1 << base->index; - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. 
- */ - timer->state |= HRTIMER_STATE_ENQUEUED; + timer->state = HRTIMER_STATE_ENQUEUED; - return (&timer->node == base->active.next); + return timerqueue_add(&base->active, &timer->node); } /* @@ -875,39 +891,38 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { - struct timerqueue_node *next_timer; - if (!(timer->state & HRTIMER_STATE_ENQUEUED)) - goto out; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + unsigned int state = timer->state; + + timer->state = newstate; + if (!(state & HRTIMER_STATE_ENQUEUED)) + return; + + if (!timerqueue_del(&base->active, &timer->node)) + cpu_base->active_bases &= ~(1 << base->index); - next_timer = timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); - if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (base->cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(base->cpu_base, 1); - } + /* + * Note: If reprogram is false we do not update + * cpu_base->next_timer. This happens when we remove the first + * timer on a remote cpu. No harm as we never dereference + * cpu_base->next_timer. So the worst thing what can happen is + * an superflous call to hrtimer_force_reprogram() on the + * remote cpu later on if the same timer gets enqueued again. 
+ */ + if (reprogram && timer == cpu_base->next_timer) + hrtimer_force_reprogram(cpu_base, 1); #endif - } - if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); -out: - timer->state = newstate; } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state; + unsigned long state = timer->state; int reprogram; /* @@ -921,30 +936,35 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. - */ - state = timer->state & HRTIMER_STATE_CALLBACK; + + if (!restart) + state = HRTIMER_STATE_INACTIVE; + __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } -int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode, - int wakeup) +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret, leftmost; + int leftmost; base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); + remove_hrtimer(timer, base, true); if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, base->get_time()); @@ -956,7 +976,7 @@ int 
__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); #endif } @@ -968,85 +988,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); leftmost = enqueue_hrtimer(timer, new_base); - - if (!leftmost) { - unlock_hrtimer_base(timer, &flags); - return ret; - } + if (!leftmost) + goto unlock; if (!hrtimer_is_hres_active(timer)) { /* * Kick to reschedule the next tick to handle the new timer * on dynticks target. */ - wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && - hrtimer_reprogram(timer, new_base)) { - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (wakeup) { - /* - * We need to drop cpu_base->lock to avoid a - * lock ordering issue vs. rq->lock. 
- */ - raw_spin_unlock(&new_base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - } + if (new_base->cpu_base->nohz_active) + wake_up_nohz_cpu(new_base->cpu_base->cpu); + } else { + hrtimer_reprogram(timer, new_base); } - +unlock: unlock_hrtimer_base(timer, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); - -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** - * hrtimer_start - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); -} -EXPORT_SYMBOL_GPL(hrtimer_start); - - -/** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * @@ -1062,10 +1022,19 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) unsigned long flags; int ret = -1; + /* + * Check lockless first. If the timer is not active (neither + * enqueued nor running the callback, nothing to do here. The + * base lock does not serialize against a concurrent enqueue, + * so we can avoid taking it. 
+ */ + if (!hrtimer_active(timer)) + return 0; + base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base); + ret = remove_hrtimer(timer, base, false); unlock_hrtimer_base(timer, &flags); @@ -1115,26 +1084,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); /** * hrtimer_get_next_event - get the time until next expiry event * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. + * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -ktime_t hrtimer_get_next_event(void) +u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t mindelta = { .tv64 = KTIME_MAX }; + u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) - mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), - ktime_get()); + if (!__hrtimer_hres_active(cpu_base)) + expires = __hrtimer_get_next_event(cpu_base).tv64; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; + return expires; } #endif @@ -1176,37 +1141,73 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution +/* + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. + * It is important for this function to not return a false negative. 
*/ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) +bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_cpu_base *cpu_base; - int base = hrtimer_clockid_to_base(which_clock); + unsigned int seq; - cpu_base = raw_cpu_ptr(&hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); + do { + cpu_base = READ_ONCE(timer->base->cpu_base); + seq = raw_read_seqcount_begin(&cpu_base->seq); - return 0; + if (timer->state != HRTIMER_STATE_INACTIVE || + cpu_base->running == timer) + return true; + + } while (read_seqcount_retry(&cpu_base->seq, seq) || + cpu_base != READ_ONCE(timer->base->cpu_base)); + + return false; } -EXPORT_SYMBOL_GPL(hrtimer_get_res); +EXPORT_SYMBOL_GPL(hrtimer_active); -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +/* + * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 + * distinct sections: + * + * - queued: the timer is queued + * - callback: the timer is being ran + * - post: the timer is inactive or (re)queued + * + * On the read side we ensure we observe timer->state and cpu_base->running + * from the same section, if anything changed while we looked at it, we retry. + * This includes timer->base changing because sequence numbers alone are + * insufficient for that. + * + * The sequence numbers are required because otherwise we could still observe + * a false negative if the read side got smeared over multiple consequtive + * __run_hrtimer() invocations. 
+ */ + +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t *now) { - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; enum hrtimer_restart (*fn)(struct hrtimer *); int restart; - WARN_ON(!irqs_disabled()); + lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + cpu_base->running = timer; + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&cpu_base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1222,58 +1223,43 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) raw_spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * Note: We clear the running state after enqueue_hrtimer and * we do not reprogramm the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() + * + * Note: Because we dropped the cpu_base->lock above, + * hrtimer_start_range_ns() can have popped in and enqueued the timer + * for us already. */ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + if (restart != HRTIMER_NORESTART && + !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base); - } - WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. 
+ */ + raw_write_seqcount_barrier(&cpu_base->seq); - timer->state &= ~HRTIMER_STATE_CALLBACK; + WARN_ON_ONCE(cpu_base->running != timer); + cpu_base->running = NULL; } -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); - entry_time = now = hrtimer_update_base(cpu_base); -retry: - cpu_base->in_hrtirq = 1; - /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. 
- */ - cpu_base->expires_next.tv64 = KTIME_MAX; + struct hrtimer_clock_base *base = cpu_base->clock_base; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - struct hrtimer_clock_base *base; + for (; active; base++, active >>= 1) { struct timerqueue_node *node; ktime_t basenow; - if (!(cpu_base->active_bases & (1 << i))) + if (!(active & 0x01)) continue; - base = cpu_base->clock_base + i; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { @@ -1296,9 +1282,42 @@ retry: if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - __run_hrtimer(timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow); } } +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + __hrtimer_run_queues(cpu_base, now); + /* Reevaluate the clock bases for the next expiry */ expires_next = __hrtimer_get_next_event(cpu_base); /* @@ -1310,8 +1329,7 @@ retry: raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? 
*/ - if (expires_next.tv64 == KTIME_MAX || - !tick_program_event(expires_next, 0)) { + if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } @@ -1344,8 +1362,8 @@ retry: cpu_base->hang_detected = 1; raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); - if (delta.tv64 > cpu_base->max_hang_time.tv64) - cpu_base->max_hang_time = delta; + if ((unsigned int)delta.tv64 > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta.tv64; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. @@ -1363,7 +1381,7 @@ retry: * local version of hrtimer_peek_ahead_timers() called with interrupts * disabled. */ -static void __hrtimer_peek_ahead_timers(void) +static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1375,29 +1393,6 @@ static void __hrtimer_peek_ahead_timers(void) hrtimer_interrupt(td->evtdev); } -/** - * hrtimer_peek_ahead_timers -- run soft-expired timers now - * - * hrtimer_peek_ahead_timers will peek at the timer queue of - * the current cpu and check if there are any timers for which - * the soft expires time has passed. If any such timers exist, - * they are run immediately and then removed from the timer queue. - * - */ -void hrtimer_peek_ahead_timers(void) -{ - unsigned long flags; - - local_irq_save(flags); - __hrtimer_peek_ahead_timers(); - local_irq_restore(flags); -} - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } @@ -1405,66 +1400,32 @@ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. 
+ * Called from run_local_timers in hardirq context every jiffy */ -void hrtimer_run_pending(void) +void hrtimer_run_queues(void) { - if (hrtimer_hres_active()) + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t now; + + if (__hrtimer_hres_active(cpu_base)) return; /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. + * This _is_ ugly: We have to check periodically, whether we + * can switch to highres and / or nohz mode. The clocksource + * switch happens with xtime_lock held. Notification from + * there only sets the check bit in the tick_oneshot code, + * otherwise we might deadlock vs. xtime_lock. */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); -} - -/* - * Called from hardirq context every jiffy - */ -void hrtimer_run_queues(void) -{ - struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1; - - if (hrtimer_hres_active()) return; - - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - if (!timerqueue_getnext(&base->active)) - continue; - - if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - raw_spin_lock(&cpu_base->lock); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= - hrtimer_get_expires_tv64(timer)) - break; - - __run_hrtimer(timer, &base->softirq_time); - } - raw_spin_unlock(&cpu_base->lock); } + + raw_spin_lock(&cpu_base->lock); + now = 
hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now); + raw_spin_unlock(&cpu_base->lock); } /* @@ -1497,8 +1458,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t->timer, mode); - if (!hrtimer_active(&t->timer)) - t->task = NULL; if (likely(t->task)) freezable_schedule(); @@ -1642,11 +1601,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, debug_deactivate(timer); /* - * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -1657,9 +1616,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * event device. 
*/ enqueue_hrtimer(timer, new_base); - - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; } } @@ -1731,9 +1687,6 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** @@ -1772,8 +1725,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, hrtimer_init_sleeper(&t, current); hrtimer_start_expires(&t.timer, mode); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7a6810030..fb4d98c7f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -35,6 +35,7 @@ unsigned long tick_nsec; static u64 tick_length; static u64 tick_length_base; +#define SECS_PER_DAY 86400 #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -76,6 +77,9 @@ static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ +static time64_t ntp_next_leap_sec = TIME64_MAX; + #ifdef CONFIG_NTP_PPS /* @@ -349,6 +353,7 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ pps_clear(); } @@ -359,6 +364,21 @@ u64 ntp_tick_length(void) return tick_length; } +/** + * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * + * Provides the time of the next leapsecond against CLOCK_REALTIME in + * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. 
+ */ +ktime_t ntp_get_next_leap(void) +{ + ktime_t ret; + + if ((time_state == TIME_INS) && (time_status & STA_INS)) + return ktime_set(ntp_next_leap_sec, 0); + ret.tv64 = KTIME_MAX; + return ret; +} /* * this routine handles the overflow of the microsecond field @@ -382,15 +402,21 @@ int second_overflow(unsigned long secs) */ switch (time_state) { case TIME_OK: - if (time_status & STA_INS) + if (time_status & STA_INS) { time_state = TIME_INS; - else if (time_status & STA_DEL) + ntp_next_leap_sec = secs + SECS_PER_DAY - + (secs % SECS_PER_DAY); + } else if (time_status & STA_DEL) { time_state = TIME_DEL; + ntp_next_leap_sec = secs + SECS_PER_DAY - + ((secs+1) % SECS_PER_DAY); + } break; case TIME_INS: - if (!(time_status & STA_INS)) + if (!(time_status & STA_INS)) { + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - else if (secs % 86400 == 0) { + } else if (secs % SECS_PER_DAY == 0) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -398,19 +424,21 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if (!(time_status & STA_DEL)) + if (!(time_status & STA_DEL)) { + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - else if ((secs + 1) % 86400 == 0) { + } else if ((secs + 1) % SECS_PER_DAY == 0) { leap = 1; + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; break; - case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; @@ -547,6 +575,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + ntp_next_leap_sec = TIME64_MAX; /* restart PPS frequency calibration */ pps_reset_freq_interval(); } @@ -711,6 +740,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) if (!(time_status & STA_NANO)) 
txc->time.tv_usec /= NSEC_PER_USEC; + /* Handle leapsec adjustments */ + if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { + if ((time_state == TIME_INS) && (time_status & STA_INS)) { + result = TIME_OOP; + txc->tai++; + txc->time.tv_sec--; + } + if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + result = TIME_WAIT; + txc->tai--; + txc->time.tv_sec++; + } + if ((time_state == TIME_OOP) && + (ts->tv_sec == ntp_next_leap_sec)) { + result = TIME_WAIT; + } + } + return result; } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index bbd102ad9..65430504c 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -5,6 +5,7 @@ extern void ntp_init(void); extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); +extern ktime_t ntp_get_next_leap(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0ac829b48..892e3dae0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +/* + * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg + * to avoid race conditions with concurrent updates to cputime. 
+ */ +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) { - if (b->utime > a->utime) - a->utime = b->utime; + u64 curr_cputime; +retry: + curr_cputime = atomic64_read(cputime); + if (sum_cputime > curr_cputime) { + if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) + goto retry; + } +} - if (b->stime > a->stime) - a->stime = b->stime; +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) +{ + __update_gt_cputime(&cputime_atomic->utime, sum->utime); + __update_gt_cputime(&cputime_atomic->stime, sum->stime); + __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); +} - if (b->sum_exec_runtime > a->sum_exec_runtime) - a->sum_exec_runtime = b->sum_exec_runtime; +/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ +static inline void sample_cputime_atomic(struct task_cputime *times, + struct task_cputime_atomic *atomic_times) +{ + times->utime = atomic64_read(&atomic_times->utime); + times->stime = atomic64_read(&atomic_times->stime); + times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); } void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct task_cputime sum; - unsigned long flags; - if (!cputimer->running) { + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) { /* * The POSIX timer interface allows for absolute time expiry * values through the TIMER_ABSTIME flag, therefore we have - * to synchronize the timer to the clock every time we start - * it. + * to synchronize the timer to the clock every time we start it. 
*/ thread_group_cputime(tsk, &sum); - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 1; - update_gt_cputime(&cputimer->cputime, &sum); - } else - raw_spin_lock_irqsave(&cputimer->lock, flags); - *times = cputimer->cputime; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); + update_gt_cputime(&cputimer->cputime_atomic, &sum); + + /* + * We're setting cputimer->running without a lock. Ensure + * this only gets written to in one operation. We set + * running after update_gt_cputime() as a small optimization, + * but barriers are not required because update_gt_cputime() + * can handle concurrent updates. + */ + WRITE_ONCE(cputimer->running, 1); + } + sample_cputime_atomic(times, &cputimer->cputime_atomic); } /* @@ -425,7 +448,7 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - add_device_randomness((const void*) &tsk_seruntime(tsk), + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers); @@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) if (!task_cputime_zero(&tsk->cputime_expires)) return false; - if (tsk->signal->cputimer.running) + /* Check if cputimer is running. This is accessed without locking. */ + if (READ_ONCE(tsk->signal->cputimer.running)) return false; return true; @@ -847,18 +871,18 @@ static void check_thread_timers(struct task_struct *tsk, tsk_expires->virt_exp = expires_to_cputime(expires); tsk_expires->sched_exp = check_timers_list(++timers, firing, - tsk_seruntime(tsk)); + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. 
*/ - soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && - tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -866,7 +890,7 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { /* * At the soft limit, send a SIGXCPU every second. */ @@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk, } } -static void stop_process_timers(struct signal_struct *sig) +static inline void stop_process_timers(struct signal_struct *sig) { struct thread_group_cputimer *cputimer = &sig->cputimer; - unsigned long flags; - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 0; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); + /* Turn off cputimer->running. This is done without locking. 
*/ + WRITE_ONCE(cputimer->running, 0); } static u32 onecputick; @@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); cputime_t x; if (psecs >= hard) { /* @@ -1103,7 +1125,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) struct task_cputime task_sample = { .utime = utime, .stime = stime, - .sum_exec_runtime = tsk_seruntime(tsk) + .sum_exec_runtime = tsk->se.sum_exec_runtime }; if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) @@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) } sig = tsk->signal; - if (sig->cputimer.running) { + /* Check if cputimer is running. This is accessed without locking. */ + if (READ_ONCE(sig->cputimer.running)) { struct task_cputime group_sample; - raw_spin_lock(&sig->cputimer.lock); - group_sample = sig->cputimer.cputime; - raw_spin_unlock(&sig->cputimer.lock); + sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) * If there are any active process wide timers (POSIX 1.b, itimers, * RLIMIT_CPU) cputimer must be running. 
*/ - if (tsk->signal->cputimer.running) + if (READ_ONCE(tsk->signal->cputimer.running)) check_process_timers(tsk, &firing); /* diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31ea01f42..31d11ac9f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, @@ -290,7 +297,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -300,7 +307,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_monotonic_raw, }; struct k_clock clock_realtime_coarse = { @@ -312,7 +319,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_coarse, }; struct k_clock clock_tai = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_tai, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -322,7 +329,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_boottime = { - .clock_getres = hrtimer_get_res, + .clock_getres = 
posix_get_hrtimer_res, .clock_get = posix_get_boottime, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 6aac4beed..3e7db49a2 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -22,6 +22,7 @@ static void bc_set_mode(enum clock_event_mode mode, struct clock_event_device *bc) { switch (mode) { + case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: /* * Note, we cannot cancel the timer here as we might @@ -66,9 +67,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * hrtimer_{start/cancel} functions call into tracing, * calls to these functions must be bound within RCU_NONIDLE. */ - RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? - !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : - 0); + RCU_NONIDLE({ + bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; + if (bc_moved) + hrtimer_start(&bctimer, expires, + HRTIMER_MODE_ABS_PINNED);}); if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); @@ -99,10 +102,13 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + switch (ce_broadcast_hrtimer.mode) { + case CLOCK_EVT_MODE_ONESHOT: + if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) + return HRTIMER_RESTART; + default: return HRTIMER_NORESTART; - - return HRTIMER_RESTART; + } } void tick_setup_hrtimer_broadcast(void) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7e8ca4f44..f6aae7977 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -159,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret; + int ret = 0; 
raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -221,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) * If we kept the cpu in the broadcast mask, * tell the caller to leave the per cpu device * in shutdown state. The periodic interrupt - * is delivered by the broadcast device. + * is delivered by the broadcast device, if + * the broadcast device exists and is not + * hrtimer based. */ - ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); break; default: - /* Nothing to do */ - ret = 0; break; } } @@ -255,18 +256,32 @@ int tick_receive_broadcast(void) /* * Broadcast the event to the cpus, which are set in the mask (mangled). */ -static void tick_do_broadcast(struct cpumask *mask) +static bool tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; + bool local = false; /* * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; + cpumask_clear_cpu(cpu, mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->event_handler(td->evtdev); + /* + * We only run the local handler, if the broadcast + * device is not hrtimer based. Otherwise we run into + * a hrtimer recursion. 
+ * + * local timer_interrupt() + * local_handler() + * expire_hrtimers() + * bc_handler() + * local_handler() + * expire_hrtimers() + */ + local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); } if (!cpumask_empty(mask)) { @@ -279,16 +294,17 @@ static void tick_do_broadcast(struct cpumask *mask) td = &per_cpu(tick_cpu_device, cpumask_first(mask)); td->evtdev->broadcast(mask); } + return local; } /* * Periodic broadcast: * - invoke the broadcast handlers */ -static void tick_do_periodic_broadcast(void) +static bool tick_do_periodic_broadcast(void) { cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); - tick_do_broadcast(tmpmask); + return tick_do_broadcast(tmpmask); } /* @@ -296,34 +312,33 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { - ktime_t next; + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool bc_local; raw_spin_lock(&tick_broadcast_lock); - tick_do_periodic_broadcast(); + /* Handle spurious interrupts gracefully */ + if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { + raw_spin_unlock(&tick_broadcast_lock); + return; + } - /* - * The device is in periodic mode. No reprogramming necessary: - */ - if (dev->state == CLOCK_EVT_STATE_PERIODIC) - goto unlock; + bc_local = tick_do_periodic_broadcast(); - /* - * Setup the next period for devices, which do not have - * periodic mode. We read dev->next_event first and add to it - * when the event already expired. clockevents_program_event() - * sets dev->next_event only when the event is really - * programmed to the device. 
- */ - for (next = dev->next_event; ;) { - next = ktime_add(next, tick_period); + if (clockevent_state_oneshot(dev)) { + ktime_t next = ktime_add(dev->next_event, tick_period); - if (!clockevents_program_event(dev, next, false)) - goto unlock; - tick_do_periodic_broadcast(); + clockevents_program_event(dev, next, true); } -unlock: raw_spin_unlock(&tick_broadcast_lock); + + /* + * We run the handler of the local cpu after dropping + * tick_broadcast_lock because the handler might deadlock when + * trying to switch to oneshot mode. + */ + if (bc_local) + td->evtdev->event_handler(td->evtdev); } /** @@ -366,8 +381,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) + /* + * Only shutdown the cpu local device, if: + * + * - the broadcast device exists + * - the broadcast device is not a hrtimer based one + * - the broadcast device is in periodic mode to + * avoid a hickup during switch to oneshot mode + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && + tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } break; @@ -386,14 +409,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) break; } - if (cpumask_empty(tick_broadcast_mask)) { - if (!bc_stopped) - clockevents_shutdown(bc); - } else if (bc_stopped) { - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - tick_broadcast_start_periodic(bc); - else - tick_broadcast_setup_oneshot(bc); + if (bc) { + if (cpumask_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } } raw_spin_unlock(&tick_broadcast_lock); } @@ -532,23 +557,19 @@ static void tick_broadcast_set_affinity(struct 
clock_event_device *bc, irq_set_affinity(bc->irq, bc->cpumask); } -static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, - ktime_t expires, int force) +static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires) { - int ret; - - if (bc->state != CLOCK_EVT_STATE_ONESHOT) - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + if (!clockevent_state_oneshot(bc)) + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - ret = clockevents_program_event(bc, expires, force); - if (!ret) - tick_broadcast_set_affinity(bc, cpumask_of(cpu)); - return ret; + clockevents_program_event(bc, expires, 1); + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); } static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); } /* @@ -566,7 +587,7 @@ void tick_check_oneshot_broadcast_this_cpu(void) * switched over, leave the device alone. */ if (td->mode == TICKDEV_MODE_ONESHOT) { - clockevents_set_state(td->evtdev, + clockevents_switch_state(td->evtdev, CLOCK_EVT_STATE_ONESHOT); } } @@ -580,9 +601,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) struct tick_device *td; ktime_t now, next_event; int cpu, next_cpu = 0; + bool bc_local; raw_spin_lock(&tick_broadcast_lock); -again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; cpumask_clear(tmpmask); @@ -624,7 +645,7 @@ again: /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(tmpmask); + bc_local = tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -636,15 +657,15 @@ again: * - There are pending events on sleeping CPUs which were not * in the event mask */ - if (next_event.tv64 != KTIME_MAX) { - /* - * Rearm the broadcast device. 
If event expired, - * repeat the above - */ - if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) - goto again; - } + if (next_event.tv64 != KTIME_MAX) + tick_broadcast_set_event(dev, next_cpu, next_event); + raw_spin_unlock(&tick_broadcast_lock); + + if (bc_local) { + td = this_cpu_ptr(&tick_cpu_device); + td->evtdev->event_handler(td->evtdev); + } } static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) @@ -670,77 +691,88 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, if (dev->next_event.tv64 < bc->next_event.tv64) return; } - clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -/** - * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode - * @state: The target state (enter/exit) - * - * The system enters/leaves a state, where affected devices might stop - * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. - * - * Called with interrupts disabled, so clockevents_lock is not - * required here because the local clock event device cannot go away - * under us. - */ -int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; - struct tick_device *td; int cpu, ret = 0; ktime_t now; /* - * Periodic mode does not care about the enter/exit of power - * states + * If there is no broadcast device, tell the caller not to go + * into deep idle. */ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return 0; + if (!tick_broadcast_device.evtdev) + return -EBUSY; - /* - * We are called with preemtion disabled from the depth of the - * idle code, so we can't be moved away. 
- */ - td = this_cpu_ptr(&tick_cpu_device); - dev = td->evtdev; - - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + dev = this_cpu_ptr(&tick_cpu_device)->evtdev; raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; cpu = smp_processor_id(); if (state == TICK_BROADCAST_ENTER) { + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we do not add + * the CPU to the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + goto out; + + /* + * If the broadcast device is in periodic mode, we + * return. + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + /* If it is a hrtimer based broadcast, return busy */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) + ret = -EBUSY; + goto out; + } + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + + /* Conditionally shut down the local timer. */ broadcast_shutdown_local(bc, dev); + /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and * if the cpu local event is earlier than the * broadcast event. If the current CPU is in * the force mask, then we are going to be - * woken by the IPI right away. + * woken by the IPI right away; we return + * busy, so the CPU does not try to go deep + * idle. */ - if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && - dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(bc, cpu, dev->next_event, 1); + if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { + ret = -EBUSY; + } else if (dev->next_event.tv64 < bc->next_event.tv64) { + tick_broadcast_set_event(bc, cpu, dev->next_event); + /* + * In case of hrtimer broadcasts the + * programming might have moved the + * timer to this cpu. If yes, remove + * us from the broadcast mask and + * return busy. 
+ */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) { + cpumask_clear_cpu(cpu, + tick_broadcast_oneshot_mask); + } + } } - /* - * If the current CPU owns the hrtimer broadcast - * mechanism, it cannot go deep idle and we remove the - * CPU from the broadcast mask. We don't have to go - * through the EXIT path as the local timer is not - * shutdown. - */ - ret = broadcast_needs_cpu(bc, cpu); - if (ret) - cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -807,7 +839,6 @@ out: raw_spin_unlock(&tick_broadcast_lock); return ret; } -EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Reset the one shot broadcast for a cpu @@ -842,7 +873,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; + int was_periodic = clockevent_state_periodic(bc); bc->event_handler = tick_handle_oneshot_broadcast; @@ -858,10 +889,10 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(bc, cpu, tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period); } else bc->next_event.tv64 = KTIME_MAX; } else { @@ -949,6 +980,16 @@ bool tick_broadcast_oneshot_available(void) return bc ? 
bc->features & CLOCK_EVT_FEAT_ONESHOT : false; } +#else +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return -EBUSY; + + return 0; +} #endif void __init tick_broadcast_init(void) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 3ae6afa1e..f8bf47571 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -19,6 +19,7 @@ #include <linux/profile.h> #include <linux/sched.h> #include <linux/module.h> +#include <trace/events/power.h> #include <asm/irq_regs.h> @@ -102,7 +103,17 @@ void tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); - if (dev->state != CLOCK_EVT_STATE_ONESHOT) +#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) + /* + * The cpu might have transitioned to HIGHRES or NOHZ mode via + * update_process_times() -> run_local_timers() -> + * hrtimer_run_queues(). 
+ */ + if (dev->event_handler != tick_handle_periodic) + return; +#endif + + if (!clockevent_state_oneshot(dev)) return; for (;;) { /* @@ -140,7 +151,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { - clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned long seq; ktime_t next; @@ -150,7 +161,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) next = tick_next_period; } while (read_seqretry(&jiffies_lock, seq)); - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) @@ -332,6 +343,28 @@ out_bc: tick_install_broadcast_device(newdev); } +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + return __tick_broadcast_oneshot_control(state); +} +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); + #ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. @@ -367,7 +400,7 @@ void tick_shutdown(unsigned int cpu) * Prevent that the clock events layer tries to call * the set mode function! 
*/ - dev->state = CLOCK_EVT_STATE_DETACHED; + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; @@ -440,6 +473,7 @@ void tick_resume(void) tick_resume_local(); } +#ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); static unsigned int tick_freeze_depth; @@ -457,10 +491,13 @@ void tick_freeze(void) raw_spin_lock(&tick_freeze_lock); tick_freeze_depth++; - if (tick_freeze_depth == num_online_cpus()) + if (tick_freeze_depth == num_online_cpus()) { + trace_suspend_resume(TPS("timekeeping_freeze"), + smp_processor_id(), true); timekeeping_suspend(); - else + } else { tick_suspend_local(); + } raw_spin_unlock(&tick_freeze_lock); } @@ -478,15 +515,19 @@ void tick_unfreeze(void) { raw_spin_lock(&tick_freeze_lock); - if (tick_freeze_depth == num_online_cpus()) + if (tick_freeze_depth == num_online_cpus()) { timekeeping_resume(); - else + trace_suspend_resume(TPS("timekeeping_freeze"), + smp_processor_id(), false); + } else { tick_resume_local(); + } tick_freeze_depth--; raw_spin_unlock(&tick_freeze_lock); } +#endif /* CONFIG_SUSPEND */ /** * tick_init - initialize the tick control diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b64fdd805..966a5a6fd 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev) +{ + return dev->state_use_accessors; +} + +static inline void clockevent_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + dev->state_use_accessors = state; +} + extern void clockevents_shutdown(struct clock_event_device *dev); extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device 
*new); -extern void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state); +extern void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); @@ -137,3 +148,19 @@ extern void tick_nohz_init(void); # else static inline void tick_nohz_init(void) { } #endif + +#ifdef CONFIG_NO_HZ_COMMON +extern unsigned long tick_nohz_active; +#else +#define tick_nohz_active (0) +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void timers_update_migration(bool update_nohz); +#else +static inline void timers_update_migration(bool update_nohz) { } +#endif + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 67a64b167..b51344652 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + if (unlikely(expires.tv64 == KTIME_MAX)) { + /* + * We don't need the clock event device any more, stop it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + return 0; + } + + if (unlikely(clockevent_state_oneshot_stopped(dev))) { + /* + * We need the clock event again, configure it in ONESHOT mode + * before using it. 
+ */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + } + return clockevents_program_event(dev, expires, force); } @@ -38,7 +54,7 @@ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } @@ -50,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, ktime_t next_event) { newdev->event_handler = handler; - clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } @@ -81,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 914259128..c792429e9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -399,7 +399,7 @@ void __init tick_nohz_init(void) * NO HZ enabled ? 
*/ static int tick_nohz_enabled __read_mostly = 1; -int tick_nohz_active __read_mostly; +unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -565,156 +565,144 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, ret = { .tv64 = 0 }; - unsigned long rcu_delta_jiffies; struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - u64 time_delta; - - time_delta = timekeeping_max_deferment(); + u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; + unsigned long seq, basejiff; + ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&jiffies_lock); - last_update = last_jiffies_update; - last_jiffies = jiffies; + basemono = last_jiffies_update.tv64; + basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); + ts->last_jiffies = basejiff; - if (rcu_needs_cpu(&rcu_delta_jiffies) || + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || irq_work_needs_cpu()) { - next_jiffies = last_jiffies + 1; - delta_jiffies = 1; + next_tick = basemono + TICK_NSEC; } else { - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - if (rcu_delta_jiffies < delta_jiffies) { - next_jiffies = last_jiffies + 
rcu_delta_jiffies; - delta_jiffies = rcu_delta_jiffies; - } + /* + * Get the next pending timer. If high resolution + * timers are enabled this only takes the timer wheel + * timers into account. If high resolution timers are + * disabled this also looks at the next expiring + * hrtimer. + */ + next_tmr = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tmr; + /* Take the next rcu event into account */ + next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; } /* - * Do not stop the tick, if we are only one off (or less) - * or if the cpu is required for RCU: + * If the tick is due in the next period, keep it ticking or + * restart it proper. */ - if (!ts->tick_stopped && delta_jiffies <= 1) - goto out; - - /* Schedule the tick, if we are at least one jiffie off */ - if ((long)delta_jiffies >= 1) { - - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. Keep track of the fact that it was the one - * which had the do_timer() duty last. If this cpu is - * the one which had the do_timer() duty last, we - * limit the sleep time to the timekeeping - * max_deferement value which we retrieved - * above. Otherwise we can sleep as long as we want. - */ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->do_timer_last = 1; - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - time_delta = KTIME_MAX; - ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - time_delta = KTIME_MAX; + delta = next_tick - basemono; + if (delta <= (u64)TICK_NSEC) { + tick.tv64 = 0; + if (!ts->tick_stopped) + goto out; + if (delta == 0) { + /* Tick is stopped, but required now. 
Enforce it */ + tick_nohz_restart(ts, now); + goto out; } + } + + /* + * If this cpu is the one which updates jiffies, then give up + * the assignment and let it be taken by the cpu which runs + * the tick timer next, which might be this cpu as well. If we + * don't drop this here the jiffies might be stale and + * do_timer() never invoked. Keep track of the fact that it + * was the one which had the do_timer() duty last. If this cpu + * is the one which had the do_timer() duty last, we limit the + * sleep time to the timekeeping max_deferement value. + * Otherwise we can sleep as long as we want. + */ + delta = timekeeping_max_deferment(); + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + delta = KTIME_MAX; + } #ifdef CONFIG_NO_HZ_FULL - if (!ts->inidle) { - time_delta = min(time_delta, - scheduler_tick_max_deferment()); - } + /* Limit the tick delta to the maximum scheduler deferment */ + if (!ts->inidle) + delta = min(delta, scheduler_tick_max_deferment()); #endif - /* - * calculate the expiry time for the next timer wheel - * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals - * that there is no timer pending or at least extremely - * far into the future (12 days for HZ=1000). In this - * case we set the expiry to the end of time. - */ - if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { - /* - * Calculate the time delta for the next timer event. - * If the time delta exceeds the maximum time delta - * permitted by the current clocksource then adjust - * the time delta accordingly to ensure the - * clocksource does not wrap. 
- */ - time_delta = min_t(u64, time_delta, - tick_period.tv64 * delta_jiffies); - } - - if (time_delta < KTIME_MAX) - expires = ktime_add_ns(last_update, time_delta); - else - expires.tv64 = KTIME_MAX; - - /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) - goto out; + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; - ret = expires; + expires = min_t(u64, expires, next_tick); + tick.tv64 = expires; - /* - * nohz_stop_sched_tick can be called several times before - * the nohz_restart_sched_tick is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. - */ - if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && (expires == dev->next_event.tv64)) + goto out; - ts->last_tick = hrtimer_get_expires(&ts->sched_timer); - ts->tick_stopped = 1; - trace_tick_stop(1, " "); - } + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + nohz_balance_enter_idle(cpu); + calc_load_enter_idle(); - /* - * If the expiration time == KTIME_MAX, then - * in this case we simply stop the tick timer. 
- */ - if (unlikely(expires.tv64 == KTIME_MAX)) { - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_cancel(&ts->sched_timer); - goto out; - } + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; + trace_tick_stop(1, " "); + } - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - goto out; - } else if (!tick_program_event(expires, 0)) - goto out; - /* - * We are past the event already. So we crossed a - * jiffie boundary. Update jiffies and raise the - * softirq. - */ - tick_do_update_jiffies64(ktime_get()); + /* + * If the expiration time == KTIME_MAX, then we simply stop + * the tick timer. + */ + if (unlikely(expires == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + goto out; } - raise_softirq_irqoff(TIMER_SOFTIRQ); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(tick, 1); out: - ts->next_jiffies = next_jiffies; - ts->last_jiffies = last_jiffies; + /* Update the estimated sleep length */ ts->sleep_length = ktime_sub(dev->next_event, now); - - return ret; + return tick; } static void tick_nohz_full_stop_tick(struct tick_sched *ts) @@ -876,32 +864,6 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } -static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) -{ - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event( - 
hrtimer_get_expires(&ts->sched_timer), 0)) - break; - } - /* Reread time and update jiffies */ - now = ktime_get(); - tick_do_update_jiffies64(now); - } -} - static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -972,12 +934,6 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } -static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) -{ - hrtimer_forward(&ts->sched_timer, now, tick_period); - return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); -} - /* * The nohz low res interrupt handler */ @@ -996,10 +952,18 @@ static void tick_nohz_handler(struct clock_event_device *dev) if (unlikely(ts->tick_stopped)) return; - while (tick_nohz_reprogram(ts, now)) { - now = ktime_get(); - tick_do_update_jiffies64(now); - } + hrtimer_forward(&ts->sched_timer, now, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +{ + if (!tick_nohz_enabled) + return; + ts->nohz_mode = mode; + /* One update is enough */ + if (!test_and_set_bit(0, &tick_nohz_active)) + timers_update_migration(true); } /** @@ -1013,13 +977,8 @@ static void tick_nohz_switch_to_nohz(void) if (!tick_nohz_enabled) return; - local_irq_disable(); - if (tick_switch_to_oneshot(tick_nohz_handler)) { - local_irq_enable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) return; - } - tick_nohz_active = 1; - ts->nohz_mode = NOHZ_MODE_LOWRES; /* * Recycle the hrtimer in ts, so we can share the @@ -1029,13 +988,10 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - for (;;) { - hrtimer_set_expires(&ts->sched_timer, next); - if (!tick_program_event(next, 0)) - break; - next = ktime_add(next, tick_period); - } - local_irq_enable(); + hrtimer_forward_now(&ts->sched_timer, tick_period); + hrtimer_set_expires(&ts->sched_timer, next); + tick_program_event(next, 1); + tick_nohz_activate(ts, 
NOHZ_MODE_LOWRES); } /* @@ -1087,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -1167,22 +1124,9 @@ void tick_setup_sched_timer(void) hrtimer_add_expires_ns(&ts->sched_timer, offset); } - for (;;) { - hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - now = ktime_get(); - } - -#ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) { - ts->nohz_mode = NOHZ_MODE_HIGHRES; - tick_nohz_active = 1; - } -#endif + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ @@ -1227,7 +1171,7 @@ void tick_oneshot_notify(void) * Called cyclic from the hrtimer softirq (driven by the timer * softirq) allow_nohz signals, that we can switch into low-res nohz * mode, because high resolution timers are disabled (either compile - * or runtime). + * or runtime). Called with interrupts disabled. 
*/ int tick_check_oneshot_change(int allow_nohz) { diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 28b5da3e1..a4a8d4e9b 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -57,7 +57,7 @@ struct tick_sched { ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; - unsigned long next_jiffies; + u64 next_timer; ktime_t idle_expires; int do_timer_last; }; @@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu); static inline void tick_cancel_sched_timer(int cpu) { } #endif +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int +__tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + return -EBUSY; +} +#endif + #endif diff --git a/kernel/time/time.c b/kernel/time/time.c index 2c85b7724..85d5bb1d6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -41,7 +41,7 @@ #include <asm/uaccess.h> #include <asm/unistd.h> -#include "timeconst.h" +#include <generated/timeconst.h> #include "timekeeping.h" /* @@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) return error; if (tz) { + /* Verify we're witin the +-15 hrs range */ + if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) + return -EINVAL; + sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { @@ -483,9 +487,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec) } EXPORT_SYMBOL(ns_to_timespec64); #endif -/* - * When we convert to jiffies then we interpret incoming values - * the following way: +/** + * msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * @@ -493,66 +499,36 @@ EXPORT_SYMBOL(ns_to_timespec64); * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. 
* * - all other values are converted to jiffies by either multiplying - * the input value by a factor or dividing it with a factor - * - * We must also be careful about 32-bit overflows. + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() + * + * msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the _msecs_to_jiffies helpers are the HZ dependent conversion + * routines found in include/linux/jiffies.h */ -unsigned long msecs_to_jiffies(const unsigned int m) +unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ if ((int)m < 0) return MAX_JIFFY_OFFSET; - -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - /* - * HZ is equal to or smaller than 1000, and 1000 is a nice - * round multiple of HZ, divide with the factor between them, - * but round upwards: - */ - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - /* - * HZ is larger than 1000, and HZ is a nice round multiple of - * 1000 - simply multiply with the factor between them. - * - * But first make sure the multiplication result cannot - * overflow: - */ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return m * (HZ / MSEC_PER_SEC); -#else - /* - * Generic case - multiply, round and divide. 
But first - * check that if we are doing a net multiplication, that - * we wouldn't overflow: - */ - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; -#endif + return _msecs_to_jiffies(m); } -EXPORT_SYMBOL(msecs_to_jiffies); +EXPORT_SYMBOL(__msecs_to_jiffies); -unsigned long usecs_to_jiffies(const unsigned int u) +unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) - >> USEC_TO_HZ_SHR32; -#endif + return _usecs_to_jiffies(u); } -EXPORT_SYMBOL(usecs_to_jiffies); +EXPORT_SYMBOL(__usecs_to_jiffies); /* * The TICK_NSEC - 1 rounds up the value to the next resolution. 
Note diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index 511bdf2ca..c7388dee8 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -50,7 +50,7 @@ define timeconst(hz) { print "#include <linux/types.h>\n\n" print "#if HZ != ", hz, "\n" - print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n" print "#endif\n\n" if (hz < 2) { @@ -105,4 +105,5 @@ define timeconst(hz) { halt } +hz = read(); timeconst(hz) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 946acb721..bca3667a2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ -/* - * These simple flag variables are managed - * without locks, which is racy, but ok since - * we don't really care about being super - * precise about how many events were seen, - * just that a problem was observed. 
- */ -static int timekeeping_underflow_seen; -static int timekeeping_overflow_seen; - -/* last_warning is only modified under the timekeeping lock */ -static long timekeeping_last_warning; static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { @@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) } } - if (timekeeping_underflow_seen) { - if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + if (tk->underflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); - timekeeping_last_warning = jiffies; + tk->last_warning = jiffies; } - timekeeping_underflow_seen = 0; + tk->underflow_seen = 0; } - if (timekeeping_overflow_seen) { - if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + if (tk->overflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); - timekeeping_last_warning = jiffies; + tk->last_warning = jiffies; } - timekeeping_overflow_seen = 0; + tk->overflow_seen = 0; } } static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) { + struct timekeeper *tk = &tk_core.timekeeper; cycle_t now, last, mask, max, delta; unsigned int seq; @@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) * mask-relative negative values. 
*/ if (unlikely((~delta & mask) < (mask >> 3))) { - timekeeping_underflow_seen = 1; + tk->underflow_seen = 1; delta = 0; } /* Cap delta value to the max_cycles values to avoid mult overflows */ if (unlikely(delta > max)) { - timekeeping_overflow_seen = 1; + tk->overflow_seen = 1; delta = tkr->clock->max_cycles; } @@ -330,32 +319,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * - * So we handle this differently than the other timekeeping accessor - * functions which retry when the sequence count has changed. The - * update side does: - * - * smp_wmb(); <- Ensure that the last base[1] update is visible - * tkf->seq++; - * smp_wmb(); <- Ensure that the seqcount update is visible - * update(tkf->base[0], tkr); - * smp_wmb(); <- Ensure that the base[0] update is visible - * tkf->seq++; - * smp_wmb(); <- Ensure that the seqcount update is visible - * update(tkf->base[1], tkr); - * - * The reader side does: - * - * do { - * seq = tkf->seq; - * smp_rmb(); - * idx = seq & 0x01; - * now = now(tkf->base[idx]); - * smp_rmb(); - * } while (seq != tkf->seq) - * - * As long as we update base[0] readers are forced off to - * base[1]. Once base[0] is updated readers are redirected to base[0] - * and the base[1] update takes place. + * Employ the latch technique; see @raw_write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. 
In the worst case this can result is a @@ -418,7 +382,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) u64 now; do { - seq = raw_read_seqcount(&tkf->seq); + seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); } while (read_seqcount_retry(&tkf->seq, seq)); @@ -551,6 +515,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* + * tk_update_leap_state - helper to update the next_leap_ktime + */ +static inline void tk_update_leap_state(struct timekeeper *tk) +{ + tk->next_leap_ktime = ntp_get_next_leap(); + if (tk->next_leap_ktime.tv64 != KTIME_MAX) + /* Convert to monotonic time */ + tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); +} + +/* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) @@ -591,17 +566,25 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) ntp_clear(); } + tk_update_leap_state(tk); tk_update_ktime_data(tk); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + + if (action & TK_CLOCK_WAS_SET) + tk->clock_was_set_seq++; + /* + * The mirroring of the data to the shadow-timekeeper needs + * to happen last here to ensure we don't over-write the + * timekeeper structure on the next update with stale data + */ if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); - - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } /** @@ -699,6 +682,23 @@ ktime_t ktime_get(void) } EXPORT_SYMBOL_GPL(ktime_get); +u32 ktime_get_resolution_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u32 nsecs; + + 
WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return nsecs; +} +EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); + static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, @@ -1179,28 +1179,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock - Return time of the system start. + * read_boot_clock64 - Return time of the system start. * * Weak dummy function for arches that do not yet support it. * Function to read the exact time the system has been started. - * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -void __weak read_boot_clock(struct timespec *ts) +void __weak read_boot_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } -void __weak read_boot_clock64(struct timespec64 *ts64) -{ - struct timespec ts; - - read_boot_clock(&ts); - *ts64 = timespec_to_timespec64(ts); -} - /* Flag for if timekeeping_resume() has injected sleeptime */ static bool sleeptime_injected; @@ -1836,8 +1828,9 @@ void update_wall_time(void) * memcpy under the tk_core.seq against one before we start * updating. */ + timekeeping_update(tk, clock_set); memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, clock_set); + /* The memcpy must come last. Do not put anything here! 
*/ write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1926,47 +1919,20 @@ void do_timer(unsigned long ticks) } /** - * ktime_get_update_offsets_tick - hrtimer helper - * @offs_real: pointer to storage for monotonic -> realtime offset - * @offs_boot: pointer to storage for monotonic -> boottime offset - * @offs_tai: pointer to storage for monotonic -> clock tai offset - * - * Returns monotonic time at last tick and various offsets - */ -ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; - ktime_t base; - u64 nsecs; - - do { - seq = read_seqcount_begin(&tk_core.seq); - - base = tk->tkr_mono.base; - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return ktime_add_ns(base, nsecs); -} - -#ifdef CONFIG_HIGH_RES_TIMERS -/** * ktime_get_update_offsets_now - hrtimer helper + * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * - * Returns current monotonic time and updates the offsets + * Returns current monotonic time and updates the offsets if the + * sequence number in @cwsseq and timekeeper.clock_was_set_seq are + * different. 
+ * * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) +ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -1978,15 +1944,23 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); + base = ktime_add_ns(base, nsecs); + + if (*cwsseq != tk->clock_was_set_seq) { + *cwsseq = tk->clock_was_set_seq; + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } + + /* Handle leapsecond insertion adjustments */ + if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64)) + *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; } while (read_seqcount_retry(&tk_core.seq, seq)); - return ktime_add_ns(base, nsecs); + return base; } -#endif /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function @@ -2027,6 +2001,8 @@ int do_adjtimex(struct timex *txc) __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } + tk_update_leap_state(tk); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index ead8794b9..704f595ce 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -3,19 +3,16 @@ /* * Internal interfaces for kernel/time/ */ -extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); -extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, + ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t 
*offs_tai); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); extern s32 timekeeping_get_tai_offset(void); extern void timekeeping_set_tai_offset(s32 tai_offset); -extern void timekeeping_clocktai(struct timespec *ts); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2ece3aa50..84190f02b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -49,6 +49,8 @@ #include <asm/timex.h> #include <asm/io.h> +#include "tick-internal.h" + #define CREATE_TRACE_POINTS #include <trace/events/timer.h> @@ -68,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64); #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { - struct list_head vec[TVN_SIZE]; + struct hlist_head vec[TVN_SIZE]; }; struct tvec_root { - struct list_head vec[TVR_SIZE]; + struct hlist_head vec[TVR_SIZE]; }; struct tvec_base { @@ -83,6 +85,8 @@ struct tvec_base { unsigned long active_timers; unsigned long all_timers; int cpu; + bool migration_enabled; + bool nohz_active; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -90,43 +94,60 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; -/* - * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've - * made NULL special, hint: lock_timer_base()) and we cannot get a compile time - * pointer to per-cpu entries because we don't know where we'll map the section, - * even for the boot cpu. - * - * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the - * rest of them. 
- */ -struct tvec_base boot_tvec_bases; -EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; +static DEFINE_PER_CPU(struct tvec_base, tvec_bases); + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +unsigned int sysctl_timer_migration = 1; -/* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(struct tvec_base *base) +void timers_update_migration(bool update_nohz) { - return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); + bool on = sysctl_timer_migration && tick_nohz_active; + unsigned int cpu; + + /* Avoid the loop, if nothing to update */ + if (this_cpu_read(tvec_bases.migration_enabled) == on) + return; + + for_each_possible_cpu(cpu) { + per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(hrtimer_bases.migration_enabled, cpu) = on; + if (!update_nohz) + continue; + per_cpu(tvec_bases.nohz_active, cpu) = true; + per_cpu(hrtimer_bases.nohz_active, cpu) = true; + } } -static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(false); + mutex_unlock(&mutex); + return ret; } -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) { - return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&tvec_bases); + return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); } - -static inline void -timer_set_base(struct timer_list *timer, struct tvec_base *new_base) +#else +static inline struct tvec_base 
*get_target_base(struct tvec_base *base, + int pinned) { - unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; - - timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); + return this_cpu_ptr(&tvec_bases); } +#endif static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) @@ -349,26 +370,12 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); -/* - * If the list is empty, catch up ->timer_jiffies to the current time. - * The caller must hold the tvec_base lock. Returns true if the list - * was empty and therefore ->timer_jiffies was updated. - */ -static bool catchup_timer_jiffies(struct tvec_base *base) -{ - if (!base->all_timers) { - base->timer_jiffies = jiffies; - return true; - } - return false; -} - static void __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; + struct hlist_head *vec; if (idx < TVR_SIZE) { int i = expires & TVR_MASK; @@ -401,25 +408,25 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); + + hlist_add_head(&timer->entry, vec); } static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { - (void)catchup_timer_jiffies(base); + /* Advance base->jiffies, if the base is empty */ + if (!base->all_timers++) + base->timer_jiffies = jiffies; + __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer */ - if (!tbase_get_deferrable(timer->base)) { + if (!(timer->flags & TIMER_DEFERRABLE)) { if (!base->active_timers++ || time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; } - base->all_timers++; /* * Check whether the other CPU is in dynticks mode and needs @@ 
-434,8 +441,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * require special care against races with idle_cpu(), lets deal * with that later. */ - if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) - wake_up_nohz_cpu(base->cpu); + if (base->nohz_active) { + if (!(timer->flags & TIMER_DEFERRABLE) || + tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); + } } #ifdef CONFIG_TIMER_STATS @@ -451,15 +461,12 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) static void timer_stats_account_timer(struct timer_list *timer) { - unsigned int flag = 0; - if (likely(!timer->start_site)) return; - if (unlikely(tbase_get_deferrable(timer->base))) - flag |= TIMER_STATS_FLAG_DEFERRABLE; timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, flag); + timer->function, timer->start_comm, + timer->flags); } #else @@ -516,8 +523,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (timer->entry.next == NULL && - timer->entry.prev == TIMER_ENTRY_STATIC) { + if (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC) { debug_object_init(timer, &timer_debug_descr); debug_object_activate(timer, &timer_debug_descr); return 0; @@ -563,7 +570,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.prev == TIMER_ENTRY_STATIC) { + if (timer->entry.next == TIMER_ENTRY_STATIC) { /* * This is not really a fixup. The timer was * statically initialized. 
We just make sure that it @@ -648,7 +655,7 @@ static inline void debug_activate(struct timer_list *timer, unsigned long expires) { debug_timer_activate(timer); - trace_timer_start(timer, expires); + trace_timer_start(timer, expires, timer->flags); } static inline void debug_deactivate(struct timer_list *timer) @@ -665,10 +672,8 @@ static inline void debug_assert_init(struct timer_list *timer) static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key) { - struct tvec_base *base = raw_cpu_read(tvec_bases); - - timer->entry.next = NULL; - timer->base = (void *)((unsigned long)base | flags); + timer->entry.pprev = NULL; + timer->flags = flags | raw_smp_processor_id(); timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -699,24 +704,23 @@ EXPORT_SYMBOL(init_timer_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { - struct list_head *entry = &timer->entry; + struct hlist_node *entry = &timer->entry; debug_deactivate(timer); - __list_del(entry->prev, entry->next); + __hlist_del(entry); if (clear_pending) - entry->next = NULL; - entry->prev = LIST_POISON2; + entry->pprev = NULL; + entry->next = LIST_POISON2; } static inline void detach_expired_timer(struct timer_list *timer, struct tvec_base *base) { detach_timer(timer, true); - if (!tbase_get_deferrable(timer->base)) + if (!(timer->flags & TIMER_DEFERRABLE)) base->active_timers--; base->all_timers--; - (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -726,13 +730,14 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, return 0; detach_timer(timer, clear_pending); - if (!tbase_get_deferrable(timer->base)) { + if (!(timer->flags & TIMER_DEFERRABLE)) { base->active_timers--; if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } - base->all_timers--; - (void)catchup_timer_jiffies(base); 
+ /* If this was the last timer, advance base->jiffies */ + if (!--base->all_timers) + base->timer_jiffies = jiffies; return 1; } @@ -744,24 +749,22 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, * So __run_timers/migrate_timers can safely modify all timers which could * be found on ->tvX lists. * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * When the timer's base is locked and removed from the list, the + * TIMER_MIGRATING flag is set, FIXME */ static struct tvec_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { - struct tvec_base *base; - for (;;) { - struct tvec_base *prelock_base = timer->base; - base = tbase_get_base(prelock_base); - if (likely(base != NULL)) { + u32 tf = timer->flags; + struct tvec_base *base; + + if (!(tf & TIMER_MIGRATING)) { + base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); spin_lock_irqsave(&base->lock, *flags); - if (likely(prelock_base == timer->base)) + if (timer->flags == tf) return base; - /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); @@ -770,11 +773,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) + bool pending_only, int pinned) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0 , cpu; + int ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -787,8 +790,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = get_nohz_timer_target(pinned); - new_base = per_cpu(tvec_bases, cpu); + new_base = get_target_base(base, pinned); if (base != new_base) { /* @@ -800,11 +802,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, */ if 
(likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); + timer->flags |= TIMER_MIGRATING; + spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer_set_base(timer, base); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | base->cpu); } } @@ -966,13 +970,13 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); + struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); @@ -1037,8 +1041,6 @@ int try_to_del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); - /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1093,7 +1095,7 @@ int del_timer_sync(struct timer_list *timer) * don't use it in hardirq context, because it * could lead to deadlock. 
*/ - WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); + WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1107,17 +1109,17 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(struct tvec_base *base, struct tvec *tv, int index) { /* cascade all the timers from tv up one level */ - struct timer_list *timer, *tmp; - struct list_head tv_list; + struct timer_list *timer; + struct hlist_node *tmp; + struct hlist_head tv_list; - list_replace_init(tv->vec + index, &tv_list); + hlist_move_list(tv->vec + index, &tv_list); /* * We are removing _all_ timers from the list, so we * don't have to detach them individually. */ - list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(tbase_get_base(timer->base) != base); + hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { /* No accounting, while moving them */ __internal_add_timer(base, timer); } @@ -1182,14 +1184,18 @@ static inline void __run_timers(struct tvec_base *base) struct timer_list *timer; spin_lock_irq(&base->lock); - if (catchup_timer_jiffies(base)) { - spin_unlock_irq(&base->lock); - return; - } + while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list; - struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; + struct hlist_head work_list; + struct hlist_head *head = &work_list; + int index; + + if (!base->all_timers) { + base->timer_jiffies = jiffies; + break; + } + + index = base->timer_jiffies & TVR_MASK; /* * Cascade timers: @@ -1200,16 +1206,16 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, head); - while (!list_empty(head)) { + hlist_move_list(base->tv1.vec + index, head); + while (!hlist_empty(head)) { void (*fn)(unsigned long); unsigned long data; bool irqsafe; - timer = list_first_entry(head, struct 
timer_list,entry); + timer = hlist_entry(head->first, struct timer_list, entry); fn = timer->function; data = timer->data; - irqsafe = tbase_get_irqsafe(timer->base); + irqsafe = timer->flags & TIMER_IRQSAFE; timer_stats_account_timer(timer); @@ -1248,8 +1254,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base) /* Look for timer events in tv1. */ index = slot = timer_jiffies & TVR_MASK; do { - list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) + hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (nte->flags & TIMER_DEFERRABLE) continue; found = 1; @@ -1279,8 +1285,8 @@ cascade: index = slot = timer_jiffies & TVN_MASK; do { - list_for_each_entry(nte, varp->vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) + hlist_for_each_entry(nte, varp->vec + slot, entry) { + if (nte->flags & TIMER_DEFERRABLE) continue; found = 1; @@ -1311,54 +1317,48 @@ cascade: * Check, if the next hrtimer event is before the next timer wheel * event: */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; - - if (hr_delta.tv64 == KTIME_MAX) - return expires; + u64 nextevt = hrtimer_get_next_event(); /* - * Expired timer available, let it expire in the next tick + * If high resolution timers are enabled + * hrtimer_get_next_event() returns KTIME_MAX. */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); + if (expires <= nextevt) + return expires; /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): + * If the next timer is already expired, return the tick base + * time so the tick is fired immediately. 
*/ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; + if (nextevt <= basem) + return basem; /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to + * make sure that this tick really expires the timer to avoid + * a ping pong of the nohz stop code. + * + * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; + return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } /** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) + * get_next_timer_interrupt - return the time (clock mono) of the next timer + * @basej: base time jiffies + * @basem: base time clock monotonic + * + * Returns the tick aligned clock monotonic time of the next pending + * timer or KTIME_MAX if no timer is pending. */ -unsigned long get_next_timer_interrupt(unsigned long now) +u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires = now + NEXT_TIMER_MAX_DELTA; + struct tvec_base *base = this_cpu_ptr(&tvec_bases); + u64 expires = KTIME_MAX; + unsigned long nextevt; /* * Pretend that there is no timer pending if the cpu is offline. 
@@ -1371,14 +1371,15 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + nextevt = base->next_timer; + if (time_before_eq(nextevt, basej)) + expires = basem; + else + expires = basem + (nextevt - basej) * TICK_NSEC; } spin_unlock(&base->lock); - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); + return cmp_next_hrtimer_event(basem, expires); } #endif @@ -1407,9 +1408,7 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __this_cpu_read(tvec_bases); - - hrtimer_run_pending(); + struct tvec_base *base = this_cpu_ptr(&tvec_bases); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); @@ -1545,15 +1544,16 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) EXPORT_SYMBOL(schedule_timeout_uninterruptible); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { struct timer_list *timer; + int cpu = new_base->cpu; - while (!list_empty(head)) { - timer = list_first_entry(head, struct timer_list, entry); + while (!hlist_empty(head)) { + timer = hlist_entry(head->first, struct timer_list, entry); /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); - timer_set_base(timer, new_base); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } @@ -1565,8 +1565,8 @@ static void migrate_timers(int cpu) int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); + old_base = per_cpu_ptr(&tvec_bases, cpu); + new_base = get_cpu_ptr(&tvec_bases); /* * The caller is globally serialized and nobody 
else * takes two locks at once, deadlock is not possible. @@ -1590,7 +1590,7 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); + put_cpu_ptr(&tvec_bases); } static int timer_cpu_notify(struct notifier_block *self, @@ -1616,52 +1616,27 @@ static inline void timer_register_cpu_notifier(void) static inline void timer_register_cpu_notifier(void) { } #endif /* CONFIG_HOTPLUG_CPU */ -static void __init init_timer_cpu(struct tvec_base *base, int cpu) +static void __init init_timer_cpu(int cpu) { - int j; - - BUG_ON(base != tbase_get_base(base)); + struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); base->cpu = cpu; - per_cpu(tvec_bases, cpu) = base; spin_lock_init(&base->lock); - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; } static void __init init_timer_cpus(void) { - struct tvec_base *base; - int local_cpu = smp_processor_id(); int cpu; - for_each_possible_cpu(cpu) { - if (cpu == local_cpu) - base = &boot_tvec_bases; -#ifdef CONFIG_SMP - else - base = per_cpu_ptr(&__tvec_bases, cpu); -#endif - - init_timer_cpu(base, cpu); - } + for_each_possible_cpu(cpu) + init_timer_cpu(cpu); } void __init init_timers(void) { - /* ensure there are enough low bits for flags in timer->base pointer */ - BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); - init_timer_cpus(); init_timer_stats(); timer_register_cpu_notifier(); @@ -1697,14 +1672,14 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static int __sched do_usleep_range(unsigned long min, unsigned long max) +static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; unsigned long 
delta; kmin = ktime_set(0, min * NSEC_PER_USEC); delta = (max - min) * NSEC_PER_USEC; - return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); + schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } /** @@ -1712,7 +1687,7 @@ static int __sched do_usleep_range(unsigned long min, unsigned long max) * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep */ -void usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range(unsigned long min, unsigned long max) { __set_current_state(TASK_UNINTERRUPTIBLE); do_usleep_range(min, max); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index e878c2e0b..a4536e1e3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -29,19 +29,24 @@ struct timer_list_iter { typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); -DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); - /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): */ -#define SEQ_printf(m, x...) \ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) +__printf(2, 3) +static void SEQ_printf(struct seq_file *m, const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + + if (m) + seq_vprintf(m, fmt, args); + else + vprintk(fmt, args); + + va_end(args); +} static void print_name_offset(struct seq_file *m, void *sym) { @@ -120,10 +125,10 @@ static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .base: %pK\n", base); - SEQ_printf(m, " .index: %d\n", - base->index); - SEQ_printf(m, " .resolution: %Lu nsecs\n", - (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .index: %d\n", base->index); + + SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution); + SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); SEQ_printf(m, "\n"); @@ -158,7 +163,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(nr_events); P(nr_retries); P(nr_hangs); - P_ns(max_hang_time); + P(max_hang_time); #endif #undef P #undef P_ns @@ -184,7 +189,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_sleeptime); P_ns(iowait_sleeptime); P(last_jiffies); - P(next_jiffies); + P(next_timer); P_ns(idle_expires); SEQ_printf(m, "jiffies: %Lu\n", (unsigned long long)jiffies); @@ -251,6 +256,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, "\n"); } + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } + if (dev->tick_resume) { SEQ_printf(m, " resume: "); print_name_offset(m, dev->tick_resume); @@ -269,11 +280,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) { #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); - SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_mask())[0]); + SEQ_printf(m, "tick_broadcast_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_mask())); #ifdef CONFIG_TICK_ONESHOT - SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - 
cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); + SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_oneshot_mask())); #endif SEQ_printf(m, "\n"); #endif @@ -282,7 +293,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) static inline void timer_list_header(struct seq_file *m, u64 now) { - SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "Timer List Version: v0.8\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f213..1adecb4b8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -68,7 +68,7 @@ struct entry { * Number of timeout events: */ unsigned long count; - unsigned int timer_flag; + u32 flags; /* * We save the command-line string to preserve @@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) * @startf: pointer to the function which did the timer setup * @timerf: pointer to the timer callback function of the timer * @comm: name of the process which set up the timer + * @tflags: The flags field of the timer * * When the timer is already registered, then the event counter is * incremented. Otherwise the timer is registered in a free slot. 
*/ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag) + void *timerf, char *comm, u32 tflags) { /* * It doesn't matter which lock we take: @@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.start_func = startf; input.expire_func = timerf; input.pid = pid; - input.timer_flag = timer_flag; + input.flags = tflags; raw_spin_lock_irqsave(lock, flags); if (!timer_stats_active) @@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v) for (i = 0; i < nr_entries; i++) { entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + if (entry->flags & TIMER_DEFERRABLE) { seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); } else { diff --git a/kernel/torture.c b/kernel/torture.c index dd70993c2..3e4840633 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void); */ void torture_shutdown_absorb(const char *title) { - while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_notice("torture thread %s parking due to system shutdown\n", title); schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); @@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { mutex_lock(&fullstop_mutex); - if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) { + if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) { VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected"); - ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN; + WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN); } else { pr_warn("Concurrent rmmod and shutdown illegal!\n"); } @@ -523,13 +523,13 @@ static int stutter; */ void stutter_wait(const char *title) { - while (ACCESS_ONCE(stutter_pause_test) || - (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { + while (READ_ONCE(stutter_pause_test) || + 
(torture_runnable && !READ_ONCE(*torture_runnable))) { if (stutter_pause_test) - if (ACCESS_ONCE(stutter_pause_test) == 1) + if (READ_ONCE(stutter_pause_test) == 1) schedule_timeout_interruptible(1); else - while (ACCESS_ONCE(stutter_pause_test)) + while (READ_ONCE(stutter_pause_test)) cond_resched(); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); @@ -549,14 +549,14 @@ static int torture_stutter(void *arg) if (!torture_must_stop()) { if (stutter > 1) { schedule_timeout_interruptible(stutter - 1); - ACCESS_ONCE(stutter_pause_test) = 2; + WRITE_ONCE(stutter_pause_test, 2); } schedule_timeout_interruptible(1); - ACCESS_ONCE(stutter_pause_test) = 1; + WRITE_ONCE(stutter_pause_test, 1); } if (!torture_must_stop()) schedule_timeout_interruptible(stutter); - ACCESS_ONCE(stutter_pause_test) = 0; + WRITE_ONCE(stutter_pause_test, 0); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -642,13 +642,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); bool torture_cleanup_begin(void) { mutex_lock(&fullstop_mutex); - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_warn("Concurrent rmmod and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); schedule_timeout_uninterruptible(10); return true; } - ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; + WRITE_ONCE(fullstop, FULLSTOP_RMMOD); mutex_unlock(&fullstop_mutex); torture_shutdown_cleanup(); torture_shuffle_cleanup(); @@ -681,7 +681,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop); */ bool torture_must_stop_irq(void) { - return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; + return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_must_stop_irq); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 483cecfa5..b3e6b39b6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -439,7 +439,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, { 
struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; - int ret, i; + int ret; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -451,9 +451,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * some device names have larger paths - convert the slashes * to underscores for this to work as expected */ - for (i = 0; i < strlen(buts->name); i++) - if (buts->name[i] == '/') - buts->name[i] = '_'; + strreplace(buts->name, '/', '_'); bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) @@ -1450,14 +1448,14 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { - if (!register_ftrace_event(&trace_blk_event)) { + if (!register_trace_event(&trace_blk_event)) { pr_warning("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { pr_warning("Warning: could not register the block tracer\n"); - unregister_ftrace_event(&trace_blk_event); + unregister_trace_event(&trace_blk_event); return 1; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2d56ce501..88a041ade 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -79,18 +79,6 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - /* NMI safe access to clock monotonic */ - return ktime_get_mono_fast_ns(); -} - -static const struct bpf_func_proto bpf_ktime_get_ns_proto = { - .func = bpf_ktime_get_ns, - .gpl_only = true, - .ret_type = RET_INTEGER, -}; - /* * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed @@ -159,6 +147,17 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_STACK_SIZE, }; +const struct bpf_func_proto *bpf_get_trace_printk_proto(void) +{ + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); 
+ + return &bpf_trace_printk_proto; +} + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -172,15 +171,18 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; - + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; case BPF_FUNC_trace_printk: - /* - * this program might be calling bpf_trace_printk, - * so allocate per-cpu printk buffers - */ - trace_printk_init_buffers(); - - return &bpf_trace_printk_proto; + return bpf_get_trace_printk_proto(); + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return NULL; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0315d4317..6260717c1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/trace_seq.h> @@ -115,63 +115,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s) * */ -/* - * A fast way to enable or disable all ring buffers is to - * call tracing_on or tracing_off. Turning off the ring buffers - * prevents all ring buffers from being recorded to. - * Turning this switch on, makes it OK to write to the - * ring buffer, if the ring buffer is enabled itself. - * - * There's three layers that must be on in order to write - * to the ring buffer. - * - * 1) This global flag must be set. - * 2) The ring buffer must be enabled for recording. - * 3) The per cpu buffer must be enabled for recording. 
- * - * In case of an anomaly, this global flag has a bit set that - * will permantly disable all ring buffers. - */ - -/* - * Global flag to disable all recording to ring buffers - * This has two bits: ON, DISABLED - * - * ON DISABLED - * ---- ---------- - * 0 0 : ring buffers are off - * 1 0 : ring buffers are on - * X 1 : ring buffers are permanently disabled - */ - -enum { - RB_BUFFERS_ON_BIT = 0, - RB_BUFFERS_DISABLED_BIT = 1, -}; - -enum { - RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, - RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, -}; - -static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; - /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) -/** - * tracing_off_permanent - permanently disable ring buffers - * - * This function, once called, will disable all ring buffers - * permanently. - */ -void tracing_off_permanent(void) -{ - set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); -} - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -452,6 +400,23 @@ struct rb_irq_work { }; /* + * Used for which event context the event is in. + * NMI = 0 + * IRQ = 1 + * SOFTIRQ = 2 + * NORMAL = 3 + * + * See trace_recursive_lock() comment below for more details. + */ +enum { + RB_CTX_NMI, + RB_CTX_IRQ, + RB_CTX_SOFTIRQ, + RB_CTX_NORMAL, + RB_CTX_MAX +}; + +/* * head_page == tail_page && head == tail then buffer is empty. 
*/ struct ring_buffer_per_cpu { @@ -462,6 +427,7 @@ struct ring_buffer_per_cpu { arch_spinlock_t lock; struct lock_class_key lock_key; unsigned int nr_pages; + unsigned int current_context; struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ @@ -2224,7 +2190,7 @@ static unsigned rb_calculate_event_length(unsigned length) /* zero length can cause confusions */ if (!length) - length = 1; + length++; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); @@ -2636,8 +2602,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } -#ifdef CONFIG_TRACING - /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified @@ -2675,44 +2639,38 @@ rb_reserve_next_event(struct ring_buffer *buffer, * just so happens that it is the same bit corresponding to * the current context. */ -static DEFINE_PER_CPU(unsigned int, current_context); -static __always_inline int trace_recursive_lock(void) +static __always_inline int +trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned int val = __this_cpu_read(current_context); + unsigned int val = cpu_buffer->current_context; int bit; if (in_interrupt()) { if (in_nmi()) - bit = 0; + bit = RB_CTX_NMI; else if (in_irq()) - bit = 1; + bit = RB_CTX_IRQ; else - bit = 2; + bit = RB_CTX_SOFTIRQ; } else - bit = 3; + bit = RB_CTX_NORMAL; if (unlikely(val & (1 << bit))) return 1; val |= (1 << bit); - __this_cpu_write(current_context, val); + cpu_buffer->current_context = val; return 0; } -static __always_inline void trace_recursive_unlock(void) +static __always_inline void +trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } -#else - -#define trace_recursive_lock() (0) -#define 
trace_recursive_unlock() do { } while (0) - -#endif - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2735,41 +2693,37 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) struct ring_buffer_event *event; int cpu; - if (ring_buffer_flags != RB_BUFFERS_ON) - return NULL; - /* If we are tracing schedule, we don't want to recurse */ preempt_disable_notrace(); - if (atomic_read(&buffer->record_disabled)) - goto out_nocheck; - - if (trace_recursive_lock()) - goto out_nocheck; + if (unlikely(atomic_read(&buffer->record_disabled))) + goto out; cpu = raw_smp_processor_id(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask))) goto out; cpu_buffer = buffer->buffers[cpu]; - if (atomic_read(&cpu_buffer->record_disabled)) + if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; - if (length > BUF_MAX_DATA_SIZE) + if (unlikely(length > BUF_MAX_DATA_SIZE)) + goto out; + + if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) - goto out; + goto out_unlock; return event; + out_unlock: + trace_recursive_unlock(cpu_buffer); out: - trace_recursive_unlock(); - - out_nocheck: preempt_enable_notrace(); return NULL; } @@ -2859,7 +2813,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_wakeups(buffer, cpu_buffer); - trace_recursive_unlock(); + trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); @@ -2970,7 +2924,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, out: rb_end_commit(cpu_buffer); - trace_recursive_unlock(); + trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); @@ -3000,9 +2954,6 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu; - if (ring_buffer_flags != RB_BUFFERS_ON) - return -EBUSY; - preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) @@ -3021,9 +2972,12 @@ 
int ring_buffer_write(struct ring_buffer *buffer, if (length > BUF_MAX_DATA_SIZE) goto out; + if (unlikely(trace_recursive_lock(cpu_buffer))) + goto out; + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) - goto out; + goto out_unlock; body = rb_event_data(event); @@ -3034,6 +2988,10 @@ int ring_buffer_write(struct ring_buffer *buffer, rb_wakeups(buffer, cpu_buffer); ret = 0; + + out_unlock: + trace_recursive_unlock(cpu_buffer); + out: preempt_enable_notrace(); @@ -3860,19 +3818,36 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); -static inline int rb_ok_to_lock(void) +static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer) { + if (likely(!in_nmi())) { + raw_spin_lock(&cpu_buffer->reader_lock); + return true; + } + /* * If an NMI die dumps out the content of the ring buffer - * do not grab locks. We also permanently disable the ring - * buffer too. A one time deal is all you get from reading - * the ring buffer from an NMI. + * trylock must be used to prevent a deadlock if the NMI + * preempted a task that holds the ring buffer locks. If + * we get the lock then all is fine, if not, then continue + * to do the read, but this can corrupt the ring buffer, + * so it must be permanently disabled from future writes. + * Reading from NMI is a oneshot deal. 
*/ - if (likely(!in_nmi())) - return 1; + if (raw_spin_trylock(&cpu_buffer->reader_lock)) + return true; - tracing_off_permanent(); - return 0; + /* Continue without locking, but disable the ring buffer */ + atomic_inc(&cpu_buffer->record_disabled); + return false; +} + +static inline void +rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked) +{ + if (likely(locked)) + raw_spin_unlock(&cpu_buffer->reader_lock); + return; } /** @@ -3892,21 +3867,18 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; - int dolock; + bool dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - dolock = rb_ok_to_lock(); again: local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); + dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); + rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ -3959,9 +3931,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; - int dolock; - - dolock = rb_ok_to_lock(); + bool dolock; again: /* might be called in atomic */ @@ -3972,8 +3942,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); + dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { @@ -3981,8 +3950,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, rb_advance_reader(cpu_buffer); } - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); + 
rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); out: @@ -4263,21 +4231,17 @@ int ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int dolock; + bool dolock; int cpu; int ret; - dolock = rb_ok_to_lock(); - /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); + dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); + rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (!ret) @@ -4297,21 +4261,17 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int dolock; + bool dolock; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; - dolock = rb_ok_to_lock(); - cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); + dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); + rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); return ret; @@ -4349,9 +4309,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, ret = -EAGAIN; - if (ring_buffer_flags != RB_BUFFERS_ON) - goto out; - if (atomic_read(&buffer_a->record_disabled)) goto out; diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 1b28df2d9..a1503a027 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -32,11 +32,11 @@ static struct task_struct *producer; static struct task_struct *consumer; static unsigned long read; -static int disable_reader; +static unsigned int disable_reader; module_param(disable_reader, uint, 0644); MODULE_PARM_DESC(disable_reader, "only run 
producer"); -static int write_iteration = 50; +static unsigned int write_iteration = 50; module_param(write_iteration, uint, 0644); MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); @@ -46,16 +46,16 @@ static int consumer_nice = MAX_NICE; static int producer_fifo = -1; static int consumer_fifo = -1; -module_param(producer_nice, uint, 0644); +module_param(producer_nice, int, 0644); MODULE_PARM_DESC(producer_nice, "nice prio for producer"); -module_param(consumer_nice, uint, 0644); +module_param(consumer_nice, int, 0644); MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); -module_param(producer_fifo, uint, 0644); +module_param(producer_fifo, int, 0644); MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); -module_param(consumer_fifo, uint, 0644); +module_param(consumer_fifo, int, 0644); MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); static int read_events; @@ -263,6 +263,8 @@ static void ring_buffer_producer(void) if (cnt % wakeup_interval) cond_resched(); #endif + if (kthread_should_stop()) + kill_test = 1; } while (ktime_before(end_time, timeout) && !kill_test); trace_printk("End ring buffer hammer\n"); @@ -285,7 +287,7 @@ static void ring_buffer_producer(void) entries = ring_buffer_entries(buffer); overruns = ring_buffer_overruns(buffer); - if (kill_test) + if (kill_test && !kthread_should_stop()) trace_printk("ERROR!\n"); if (!disable_reader) { @@ -379,7 +381,7 @@ static int ring_buffer_consumer_thread(void *arg) } __set_current_state(TASK_RUNNING); - if (kill_test) + if (!kthread_should_stop()) wait_to_die(); return 0; @@ -399,13 +401,16 @@ static int ring_buffer_producer_thread(void *arg) } ring_buffer_producer(); + if (kill_test) + goto out_kill; trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ * SLEEP_TIME); } - if (kill_test) +out_kill: + if (!kthread_should_stop()) wait_to_die(); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 
05330494a..abcbf7ff8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -297,11 +297,11 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_check_discard(struct ftrace_event_file *file, void *rec, +int filter_check_discard(struct trace_event_file *file, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && + if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && !filter_match_preds(file->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; @@ -311,7 +311,7 @@ int filter_check_discard(struct ftrace_event_file *file, void *rec, } EXPORT_SYMBOL_GPL(filter_check_discard); -int call_filter_check_discard(struct ftrace_event_call *call, void *rec, +int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { @@ -876,6 +876,7 @@ static struct { { trace_clock_jiffies, "uptime", 0 }, { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, + { ktime_get_raw_fast_ns, "mono_raw", 1 }, ARCH_TRACE_CLOCKS }; @@ -1693,13 +1694,13 @@ static struct ring_buffer *temp_buffer; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, - struct ftrace_event_file *ftrace_file, + struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc) { struct ring_buffer_event *entry; - *current_rb = ftrace_file->tr->trace_buffer.buffer; + *current_rb = trace_file->tr->trace_buffer.buffer; entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); /* @@ -1708,7 +1709,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, * to store the trace event for the tigger to use. It's recusive * safe and will not be recorded anywhere. 
*/ - if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { + if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); @@ -1760,7 +1761,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_function; + struct trace_event_call *call = &event_function; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -1795,7 +1796,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { - struct ftrace_event_call *call = &event_kernel_stack; + struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -1923,7 +1924,7 @@ static DEFINE_PER_CPU(int, user_stack_count); void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_user_stack; + struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -2129,7 +2130,7 @@ static void trace_printk_start_stop_comm(int enabled) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - struct ftrace_event_call *call = &event_bprint; + struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_array *tr = &global_trace; @@ -2187,7 +2188,7 @@ static int __trace_array_vprintk(struct ring_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { - struct ftrace_event_call *call = &event_print; + struct trace_event_call *call = &event_print; struct ring_buffer_event *event; int len = 0, size, pc; struct print_entry *entry; diff --git a/kernel/trace/trace.h 
b/kernel/trace/trace.h index 921691c5c..74bde8160 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -12,7 +12,7 @@ #include <linux/ftrace.h> #include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/compiler.h> #include <linux/trace_seq.h> @@ -211,8 +211,8 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; - struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; + struct trace_event_file __rcu *enter_syscall_files[NR_syscalls]; + struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; @@ -859,7 +859,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops); #define ftrace_destroy_filter_files(ops) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ -int ftrace_event_is_function(struct ftrace_event_call *call); +int ftrace_event_is_function(struct trace_event_call *call); /* * struct trace_parser - servers for reading the user input separated by spaces @@ -993,7 +993,7 @@ struct event_subsystem { int ref_count; }; -struct ftrace_subsystem_dir { +struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; @@ -1053,30 +1053,30 @@ struct filter_pred { extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_file *file, +extern void print_event_filter(struct trace_event_file *file, struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_file *file, +extern int apply_event_filter(struct trace_event_file *file, char *filter_string); -extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, +extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, char *filter_string); extern void 
print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -extern int create_event_filter(struct ftrace_event_call *call, +extern int create_event_filter(struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); struct ftrace_event_field * -trace_find_event_field(struct ftrace_event_call *call, char *name); +trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); -extern struct ftrace_event_file *find_event_file(struct trace_array *tr, - const char *system, - const char *event); +extern struct trace_event_file *find_event_file(struct trace_array *tr, + const char *system, + const char *event); static inline void *event_file_data(struct file *filp) { @@ -1181,7 +1181,7 @@ struct event_trigger_ops { * commands need to do this if they themselves log to the trace * buffer (see the @post_trigger() member below). @trigger_type * values are defined by adding new values to the trigger_type - * enum in include/linux/ftrace_event.h. + * enum in include/linux/trace_events.h. 
* * @post_trigger: A flag that says whether or not this command needs * to have its action delayed until after the current event has @@ -1243,23 +1243,23 @@ struct event_command { enum event_trigger_type trigger_type; bool post_trigger; int (*func)(struct event_command *cmd_ops, - struct ftrace_event_file *file, + struct trace_event_file *file, char *glob, char *cmd, char *params); int (*reg)(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file); + struct trace_event_file *file); void (*unreg)(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file); + struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, - struct ftrace_event_file *file); + struct trace_event_file *file); struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; -extern int trace_event_enable_disable(struct ftrace_event_file *file, +extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); @@ -1287,7 +1287,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ - extern struct ftrace_event_call \ + extern struct trace_event_call \ __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ @@ -1296,7 +1296,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) -int perf_ftrace_event_register(struct ftrace_event_call *call, +int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 
1879980f0..e2e12ad31 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -29,7 +29,7 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { - struct ftrace_event_call *call = &event_branch; + struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct trace_array_cpu *data; struct ring_buffer_event *event; @@ -194,7 +194,7 @@ __init static int init_branch_tracer(void) { int ret; - ret = register_ftrace_event(&trace_branch_event); + ret = register_trace_event(&trace_branch_event); if (!ret) { printk(KERN_WARNING "Warning: could not register " "branch events\n"); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 57b67b1f2..0f06532a7 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -56,6 +56,7 @@ u64 notrace trace_clock(void) { return local_clock(); } +EXPORT_SYMBOL_GPL(trace_clock); /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. 
@@ -68,6 +69,7 @@ u64 notrace trace_clock_jiffies(void) { return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } +EXPORT_SYMBOL_GPL(trace_clock_jiffies); /* * trace_clock_global(): special globally coherent trace clock @@ -123,6 +125,7 @@ u64 notrace trace_clock_global(void) return now; } +EXPORT_SYMBOL_GPL(trace_clock_global); static atomic64_t trace_counter; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 6fa484de2..abfc903e7 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -21,7 +21,7 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; -static int perf_trace_event_perm(struct ftrace_event_call *tp_event, +static int perf_trace_event_perm(struct trace_event_call *tp_event, struct perf_event *p_event) { if (tp_event->perf_perm) { @@ -83,7 +83,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return 0; } -static int perf_trace_event_reg(struct ftrace_event_call *tp_event, +static int perf_trace_event_reg(struct trace_event_call *tp_event, struct perf_event *p_event) { struct hlist_head __percpu *list; @@ -143,7 +143,7 @@ fail: static void perf_trace_event_unreg(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; int i; if (--tp_event->perf_refcount > 0) @@ -172,17 +172,17 @@ out: static int perf_trace_event_open(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); } static void perf_trace_event_close(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, 
p_event); } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, +static int perf_trace_event_init(struct trace_event_call *tp_event, struct perf_event *p_event) { int ret; @@ -206,7 +206,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { - struct ftrace_event_call *tp_event; + struct trace_event_call *tp_event; u64 event_id = p_event->attr.config; int ret = -EINVAL; @@ -236,7 +236,7 @@ void perf_trace_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; struct hlist_head __percpu *pcpu_list; struct hlist_head *list; @@ -255,7 +255,7 @@ int perf_trace_add(struct perf_event *p_event, int flags) void perf_trace_del(struct perf_event *p_event, int flags) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; hlist_del_rcu(&p_event->hlist_entry); tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } @@ -357,7 +357,7 @@ static void perf_ftrace_function_disable(struct perf_event *event) ftrace_function_local_disable(&event->ftrace_ops); } -int perf_ftrace_event_register(struct ftrace_event_call *call, +int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { switch (type) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c4de47fc5..404a372ad 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -61,14 +61,14 @@ static int system_refcount_dec(struct event_subsystem *system) #define do_for_each_event_file_safe(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ - struct ftrace_event_file *___n; \ + struct trace_event_file *___n; \ list_for_each_entry_safe(file, ___n, &tr->events, list) #define while_for_each_event_file() \ } static struct list_head * 
-trace_get_fields(struct ftrace_event_call *event_call) +trace_get_fields(struct trace_event_call *event_call) { if (!event_call->class->get_fields) return &event_call->class->fields; @@ -89,7 +89,7 @@ __find_event_field(struct list_head *head, char *name) } struct ftrace_event_field * -trace_find_event_field(struct ftrace_event_call *call, char *name) +trace_find_event_field(struct trace_event_call *call, char *name) { struct ftrace_event_field *field; struct list_head *head; @@ -129,7 +129,7 @@ static int __trace_define_field(struct list_head *head, const char *type, return 0; } -int trace_define_field(struct ftrace_event_call *call, const char *type, +int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { @@ -166,7 +166,7 @@ static int trace_define_common_fields(void) return ret; } -static void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct trace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; @@ -178,11 +178,11 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -int trace_event_raw_init(struct ftrace_event_call *call) +int trace_event_raw_init(struct trace_event_call *call) { int id; - id = register_ftrace_event(&call->event); + id = register_trace_event(&call->event); if (!id) return -ENODEV; @@ -190,18 +190,18 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, - struct ftrace_event_file *ftrace_file, - unsigned long len) +void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, + struct trace_event_file *trace_file, + unsigned long len) { - struct ftrace_event_call *event_call = ftrace_file->event_call; + struct trace_event_call *event_call = trace_file->event_call; local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); - 
fbuffer->ftrace_file = ftrace_file; + fbuffer->trace_file = trace_file; fbuffer->event = - trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, + trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, event_call->event.type, len, fbuffer->flags, fbuffer->pc); if (!fbuffer->event) @@ -210,13 +210,13 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, fbuffer->entry = ring_buffer_event_data(fbuffer->event); return fbuffer->entry; } -EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); +EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); static DEFINE_SPINLOCK(tracepoint_iter_lock); -static void output_printk(struct ftrace_event_buffer *fbuffer) +static void output_printk(struct trace_event_buffer *fbuffer) { - struct ftrace_event_call *event_call; + struct trace_event_call *event_call; struct trace_event *event; unsigned long flags; struct trace_iterator *iter = tracepoint_print_iter; @@ -224,12 +224,12 @@ static void output_printk(struct ftrace_event_buffer *fbuffer) if (!iter) return; - event_call = fbuffer->ftrace_file->event_call; + event_call = fbuffer->trace_file->event_call; if (!event_call || !event_call->event.funcs || !event_call->event.funcs->trace) return; - event = &fbuffer->ftrace_file->event_call->event; + event = &fbuffer->trace_file->event_call->event; spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); @@ -241,21 +241,21 @@ static void output_printk(struct ftrace_event_buffer *fbuffer) spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } -void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { if (tracepoint_printk) output_printk(fbuffer); - event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, + event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc); } -EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); 
+EXPORT_SYMBOL_GPL(trace_event_buffer_commit); -int ftrace_event_reg(struct ftrace_event_call *call, - enum trace_reg type, void *data) +int trace_event_reg(struct trace_event_call *call, + enum trace_reg type, void *data) { - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { @@ -288,34 +288,34 @@ int ftrace_event_reg(struct ftrace_event_call *call, } return 0; } -EXPORT_SYMBOL_GPL(ftrace_event_reg); +EXPORT_SYMBOL_GPL(trace_event_reg); void trace_event_enable_cmd_record(bool enable) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_array *tr; mutex_lock(&event_mutex); do_for_each_event_file(tr, file) { - if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) + if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); - set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); - clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int __ftrace_event_enable_disable(struct ftrace_event_file *file, +static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; int ret = 0; int disable; @@ -337,24 +337,24 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) break; - disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; - clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; + clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); } else - disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + 
disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); - if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { - clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); - if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { + if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { + clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); + if (file->flags & EVENT_FILE_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ - if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) - set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + if (file->flags & EVENT_FILE_FL_SOFT_MODE) + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else - clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* @@ -366,31 +366,31 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, * it still seems to be disabled. */ if (!soft_disable) - clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else { if (atomic_inc_return(&file->sm_ref) > 1) break; - set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); } - if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + if (!(file->flags & EVENT_FILE_FL_ENABLED)) { /* Keep the event disabled, when going to SOFT_MODE. 
*/ if (soft_disable) - set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " - "%s\n", ftrace_event_name(call)); + "%s\n", trace_event_name(call)); break; } - set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ call->flags |= TRACE_EVENT_FL_WAS_ENABLED; @@ -401,13 +401,13 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, return ret; } -int trace_event_enable_disable(struct ftrace_event_file *file, +int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { return __ftrace_event_enable_disable(file, enable, soft_disable); } -static int ftrace_event_enable_disable(struct ftrace_event_file *file, +static int ftrace_event_enable_disable(struct trace_event_file *file, int enable) { return __ftrace_event_enable_disable(file, enable, 0); @@ -415,7 +415,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file, static void ftrace_clear_events(struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { @@ -449,14 +449,14 @@ static void __get_system(struct event_subsystem *system) system_refcount_inc(system); } -static void __get_system_dir(struct ftrace_subsystem_dir *dir) +static void __get_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); dir->ref_count++; __get_system(dir->subsystem); } -static void __put_system_dir(struct ftrace_subsystem_dir *dir) +static void 
__put_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ @@ -467,14 +467,14 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir) kfree(dir); } -static void put_system(struct ftrace_subsystem_dir *dir) +static void put_system(struct trace_subsystem_dir *dir) { mutex_lock(&event_mutex); __put_system_dir(dir); mutex_unlock(&event_mutex); } -static void remove_subsystem(struct ftrace_subsystem_dir *dir) +static void remove_subsystem(struct trace_subsystem_dir *dir) { if (!dir) return; @@ -486,7 +486,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) } } -static void remove_event_file_dir(struct ftrace_event_file *file) +static void remove_event_file_dir(struct trace_event_file *file) { struct dentry *dir = file->dir; struct dentry *child; @@ -515,15 +515,15 @@ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, const char *sub, const char *event, int set) { - struct ftrace_event_file *file; - struct ftrace_event_call *call; + struct trace_event_file *file; + struct trace_event_call *call; const char *name; int ret = -EINVAL; list_for_each_entry(file, &tr->events, list) { call = file->event_call; - name = ftrace_event_name(call); + name = trace_event_name(call); if (!name || !call->class || !call->class->reg) continue; @@ -671,8 +671,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_file *file = v; - struct ftrace_event_call *call; + struct trace_event_file *file = v; + struct trace_event_call *call; struct trace_array *tr = m->private; (*pos)++; @@ -692,13 +692,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - 
file = list_entry(&tr->events, struct ftrace_event_file, list); + file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = t_next(m, file, &l); if (!file) @@ -710,13 +710,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_file *file = v; + struct trace_event_file *file = v; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { - if (file->flags & FTRACE_EVENT_FL_ENABLED) + if (file->flags & EVENT_FILE_FL_ENABLED) return file; } @@ -725,13 +725,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - file = list_entry(&tr->events, struct ftrace_event_file, list); + file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = s_next(m, file, &l); if (!file) @@ -742,12 +742,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) static int t_show(struct seq_file *m, void *v) { - struct ftrace_event_file *file = v; - struct ftrace_event_call *call = file->event_call; + struct trace_event_file *file = v; + struct trace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); - seq_printf(m, "%s\n", ftrace_event_name(call)); + seq_printf(m, "%s\n", trace_event_name(call)); return 0; } @@ -761,7 +761,7 @@ static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct trace_event_file *file; unsigned long flags; char buf[4] = "0"; @@ -774,12 +774,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (!file) return -ENODEV; - if (flags & FTRACE_EVENT_FL_ENABLED && - !(flags & 
FTRACE_EVENT_FL_SOFT_DISABLED)) + if (flags & EVENT_FILE_FL_ENABLED && + !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); - if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || - flags & FTRACE_EVENT_FL_SOFT_MODE) + if (flags & EVENT_FILE_FL_SOFT_DISABLED || + flags & EVENT_FILE_FL_SOFT_MODE) strcat(buf, "*"); strcat(buf, "\n"); @@ -791,7 +791,7 @@ static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct trace_event_file *file; unsigned long val; int ret; @@ -828,10 +828,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct ftrace_subsystem_dir *dir = filp->private_data; + struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; - struct ftrace_event_call *call; - struct ftrace_event_file *file; + struct trace_event_call *call; + struct trace_event_file *file; struct trace_array *tr = dir->tr; char buf[2]; int set = 0; @@ -840,7 +840,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; - if (!ftrace_event_name(call) || !call->class || !call->class->reg) + if (!trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) @@ -851,7 +851,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); + set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED)); /* * If we have a mixture, no need to look further. 
@@ -873,7 +873,7 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_subsystem_dir *dir = filp->private_data; + struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; @@ -917,7 +917,7 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = event_file_data(m->private); + struct trace_event_call *call = event_file_data(m->private); struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; @@ -949,13 +949,13 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) static int f_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = event_file_data(m->private); + struct trace_event_call *call = event_file_data(m->private); struct ftrace_event_field *field; const char *array_descriptor; switch ((unsigned long)v) { case FORMAT_HEADER: - seq_printf(m, "name: %s\n", ftrace_event_name(call)); + seq_printf(m, "name: %s\n", trace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_puts(m, "format:\n"); return 0; @@ -1062,7 +1062,7 @@ static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_seq *s; int r = -ENODEV; @@ -1095,7 +1095,7 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct trace_event_file *file; char *buf; int err = -ENODEV; @@ -1132,7 +1132,7 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct event_subsystem *system = NULL; - struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ + struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */ 
struct trace_array *tr; int ret; @@ -1181,7 +1181,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) static int system_tr_open(struct inode *inode, struct file *filp) { - struct ftrace_subsystem_dir *dir; + struct trace_subsystem_dir *dir; struct trace_array *tr = inode->i_private; int ret; @@ -1214,7 +1214,7 @@ static int system_tr_open(struct inode *inode, struct file *filp) static int subsystem_release(struct inode *inode, struct file *file) { - struct ftrace_subsystem_dir *dir = file->private_data; + struct trace_subsystem_dir *dir = file->private_data; trace_array_put(dir->tr); @@ -1235,7 +1235,7 @@ static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_subsystem_dir *dir = filp->private_data; + struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; @@ -1262,7 +1262,7 @@ static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_subsystem_dir *dir = filp->private_data; + struct trace_subsystem_dir *dir = filp->private_data; char *buf; int err; @@ -1497,9 +1497,9 @@ create_new_subsystem(const char *name) static struct dentry * event_subsystem_dir(struct trace_array *tr, const char *name, - struct ftrace_event_file *file, struct dentry *parent) + struct trace_event_file *file, struct dentry *parent) { - struct ftrace_subsystem_dir *dir; + struct trace_subsystem_dir *dir; struct event_subsystem *system; struct dentry *entry; @@ -1571,9 +1571,9 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } static int -event_create_dir(struct dentry *parent, struct ftrace_event_file *file) +event_create_dir(struct dentry *parent, struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; struct list_head *head; struct dentry 
*d_events; @@ -1591,7 +1591,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) } else d_events = parent; - name = ftrace_event_name(call); + name = trace_event_name(call); file->dir = tracefs_create_dir(name, d_events); if (!file->dir) { pr_warn("Could not create tracefs '%s' directory\n", name); @@ -1634,9 +1634,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) return 0; } -static void remove_event_from_tracers(struct ftrace_event_call *call) +static void remove_event_from_tracers(struct trace_event_call *call) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_array *tr; do_for_each_event_file_safe(tr, file) { @@ -1654,10 +1654,10 @@ static void remove_event_from_tracers(struct ftrace_event_call *call) } while_for_each_event_file(); } -static void event_remove(struct ftrace_event_call *call) +static void event_remove(struct trace_event_call *call) { struct trace_array *tr; - struct ftrace_event_file *file; + struct trace_event_file *file; do_for_each_event_file(tr, file) { if (file->event_call != call) @@ -1673,17 +1673,17 @@ static void event_remove(struct ftrace_event_call *call) } while_for_each_event_file(); if (call->event.funcs) - __unregister_ftrace_event(&call->event); + __unregister_trace_event(&call->event); remove_event_from_tracers(call); list_del(&call->list); } -static int event_init(struct ftrace_event_call *call) +static int event_init(struct trace_event_call *call) { int ret = 0; const char *name; - name = ftrace_event_name(call); + name = trace_event_name(call); if (WARN_ON(!name)) return -EINVAL; @@ -1697,7 +1697,7 @@ static int event_init(struct ftrace_event_call *call) } static int -__register_event(struct ftrace_event_call *call, struct module *mod) +__register_event(struct trace_event_call *call, struct module *mod) { int ret; @@ -1733,7 +1733,7 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) return ptr + elen; } -static void 
update_event_printk(struct ftrace_event_call *call, +static void update_event_printk(struct trace_event_call *call, struct trace_enum_map *map) { char *ptr; @@ -1811,7 +1811,7 @@ static void update_event_printk(struct ftrace_event_call *call, void trace_event_enum_update(struct trace_enum_map **map, int len) { - struct ftrace_event_call *call, *p; + struct trace_event_call *call, *p; const char *last_system = NULL; int last_i; int i; @@ -1836,11 +1836,11 @@ void trace_event_enum_update(struct trace_enum_map **map, int len) up_write(&trace_event_sem); } -static struct ftrace_event_file * -trace_create_new_event(struct ftrace_event_call *call, +static struct trace_event_file * +trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) @@ -1858,9 +1858,9 @@ trace_create_new_event(struct ftrace_event_call *call, /* Add an event to a trace directory */ static int -__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) +__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; file = trace_create_new_event(call, tr); if (!file) @@ -1875,10 +1875,10 @@ __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) * the filesystem is initialized. 
*/ static __init int -__trace_early_add_new_event(struct ftrace_event_call *call, +__trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; file = trace_create_new_event(call, tr); if (!file) @@ -1888,10 +1888,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call, } struct ftrace_module_file_ops; -static void __add_event_to_tracers(struct ftrace_event_call *call); +static void __add_event_to_tracers(struct trace_event_call *call); /* Add an additional event_call dynamically */ -int trace_add_event_call(struct ftrace_event_call *call) +int trace_add_event_call(struct trace_event_call *call) { int ret; mutex_lock(&trace_types_lock); @@ -1910,7 +1910,7 @@ int trace_add_event_call(struct ftrace_event_call *call) * Must be called under locking of trace_types_lock, event_mutex and * trace_event_sem. */ -static void __trace_remove_event_call(struct ftrace_event_call *call) +static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); @@ -1918,10 +1918,10 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) call->filter = NULL; } -static int probe_remove_event_call(struct ftrace_event_call *call) +static int probe_remove_event_call(struct trace_event_call *call) { struct trace_array *tr; - struct ftrace_event_file *file; + struct trace_event_file *file; #ifdef CONFIG_PERF_EVENTS if (call->perf_refcount) @@ -1932,10 +1932,10 @@ static int probe_remove_event_call(struct ftrace_event_call *call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) - * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress + * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress * TRACE_REG_UNREGISTER. 
*/ - if (file->flags & FTRACE_EVENT_FL_ENABLED) + if (file->flags & EVENT_FILE_FL_ENABLED) return -EBUSY; /* * The do_for_each_event_file_safe() is @@ -1952,7 +1952,7 @@ static int probe_remove_event_call(struct ftrace_event_call *call) } /* Remove an event_call */ -int trace_remove_event_call(struct ftrace_event_call *call) +int trace_remove_event_call(struct trace_event_call *call) { int ret; @@ -1976,7 +1976,7 @@ int trace_remove_event_call(struct ftrace_event_call *call) static void trace_module_add_events(struct module *mod) { - struct ftrace_event_call **call, **start, **end; + struct trace_event_call **call, **start, **end; if (!mod->num_trace_events) return; @@ -1999,7 +1999,7 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { - struct ftrace_event_call *call, *p; + struct trace_event_call *call, *p; bool clear_trace = false; down_write(&trace_event_sem); @@ -2055,28 +2055,28 @@ static struct notifier_block trace_module_nb = { static void __trace_add_event_dirs(struct trace_array *tr) { - struct ftrace_event_call *call; + struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create directory for event %s\n", - ftrace_event_name(call)); + trace_event_name(call)); } } -struct ftrace_event_file * +struct trace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { - struct ftrace_event_file *file; - struct ftrace_event_call *call; + struct trace_event_file *file; + struct trace_event_call *call; const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; - name = ftrace_event_name(call); + name = trace_event_name(call); if (!name || !call->class || !call->class->reg) continue; @@ -2098,7 +2098,7 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) #define DISABLE_EVENT_STR "disable_event" 
struct event_probe_data { - struct ftrace_event_file *file; + struct trace_event_file *file; unsigned long count; int ref; bool enable; @@ -2114,9 +2114,9 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) return; if (data->enable) - clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else - set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); } static void @@ -2132,7 +2132,7 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data return; /* Skip if the event is in a state we want to switch to */ - if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; if (data->count != -1) @@ -2152,7 +2152,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%s:%s:%s", data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, data->file->event_call->class->system, - ftrace_event_name(data->file->event_call)); + trace_event_name(data->file->event_call)); if (data->count == -1) seq_puts(m, ":unlimited\n"); @@ -2226,7 +2226,7 @@ event_enable_func(struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { struct trace_array *tr = top_trace_array(); - struct ftrace_event_file *file; + struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; const char *system; @@ -2358,7 +2358,7 @@ static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* - * The top level array has already had its ftrace_event_file + * The top level array has already had its trace_event_file * descriptors created in order to allow for early events to * be recorded. 
This function is called after the tracefs has been * initialized, and we now have to create the files associated @@ -2367,7 +2367,7 @@ static inline int register_event_cmds(void) { return 0; } static __init void __trace_early_add_event_dirs(struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; int ret; @@ -2375,7 +2375,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warn("Could not create directory for event %s\n", - ftrace_event_name(file->event_call)); + trace_event_name(file->event_call)); } } @@ -2388,7 +2388,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) static __init void __trace_early_add_events(struct trace_array *tr) { - struct ftrace_event_call *call; + struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { @@ -2399,7 +2399,7 @@ __trace_early_add_events(struct trace_array *tr) ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create early event %s\n", - ftrace_event_name(call)); + trace_event_name(call)); } } @@ -2407,13 +2407,13 @@ __trace_early_add_events(struct trace_array *tr) static void __trace_remove_event_dirs(struct trace_array *tr) { - struct ftrace_event_file *file, *next; + struct trace_event_file *file, *next; list_for_each_entry_safe(file, next, &tr->events, list) remove_event_file_dir(file); } -static void __add_event_to_tracers(struct ftrace_event_call *call) +static void __add_event_to_tracers(struct trace_event_call *call) { struct trace_array *tr; @@ -2421,8 +2421,8 @@ static void __add_event_to_tracers(struct ftrace_event_call *call) __trace_add_new_event(call, tr); } -extern struct ftrace_event_call *__start_ftrace_events[]; -extern struct ftrace_event_call *__stop_ftrace_events[]; +extern struct trace_event_call *__start_ftrace_events[]; +extern struct trace_event_call *__stop_ftrace_events[]; static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; 
@@ -2557,7 +2557,7 @@ int event_trace_del_tracer(struct trace_array *tr) static __init int event_trace_memsetup(void) { field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); - file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); + file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC); return 0; } @@ -2593,7 +2593,7 @@ early_enable_events(struct trace_array *tr, bool disable_first) static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); - struct ftrace_event_call **iter, *call; + struct trace_event_call **iter, *call; int ret; if (!tr) @@ -2754,9 +2754,9 @@ static __init void event_test_stuff(void) */ static __init void event_trace_self_tests(void) { - struct ftrace_subsystem_dir *dir; - struct ftrace_event_file *file; - struct ftrace_event_call *call; + struct trace_subsystem_dir *dir; + struct trace_event_file *file; + struct trace_event_call *call; struct event_subsystem *system; struct trace_array *tr; int ret; @@ -2787,13 +2787,13 @@ static __init void event_trace_self_tests(void) continue; #endif - pr_info("Testing event %s: ", ftrace_event_name(call)); + pr_info("Testing event %s: ", trace_event_name(call)); /* * If an event is already enabled, someone is using * it and the self test should not be on. 
*/ - if (file->flags & FTRACE_EVENT_FL_ENABLED) { + if (file->flags & EVENT_FILE_FL_ENABLED) { pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 52adf02d7..d81d6f302 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -643,7 +643,7 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } -static inline struct event_filter *event_filter(struct ftrace_event_file *file) +static inline struct event_filter *event_filter(struct trace_event_file *file) { if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) return file->event_call->filter; @@ -652,7 +652,7 @@ static inline struct event_filter *event_filter(struct ftrace_event_file *file) } /* caller must hold event_mutex */ -void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) +void print_event_filter(struct trace_event_file *file, struct trace_seq *s) { struct event_filter *filter = event_filter(file); @@ -780,14 +780,14 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void filter_disable(struct ftrace_event_file *file) +static void filter_disable(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) call->flags &= ~TRACE_EVENT_FL_FILTERED; else - file->flags &= ~FTRACE_EVENT_FL_FILTERED; + file->flags &= ~EVENT_FILE_FL_FILTERED; } static void __free_filter(struct event_filter *filter) @@ -837,9 +837,9 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static inline void __remove_filter(struct ftrace_event_file *file) +static inline void __remove_filter(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; 
filter_disable(file); if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) @@ -848,10 +848,10 @@ static inline void __remove_filter(struct ftrace_event_file *file) remove_filter_string(file->filter); } -static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, +static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -860,9 +860,9 @@ static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, } } -static inline void __free_subsystem_filter(struct ftrace_event_file *file) +static inline void __free_subsystem_filter(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { __free_filter(call->filter); @@ -873,10 +873,10 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file) } } -static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir, +static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -1342,7 +1342,7 @@ parse_operand: } static struct filter_pred *create_pred(struct filter_parse_state *ps, - struct ftrace_event_call *call, + struct trace_event_call *call, int op, char *operand1, char *operand2) { struct ftrace_event_field *field; @@ -1564,7 +1564,7 @@ static int fold_pred_tree(struct event_filter *filter, filter->preds); } -static int replace_preds(struct ftrace_event_call *call, +static int replace_preds(struct trace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, bool dry_run) @@ -1677,20 +1677,20 @@ fail: return err; } -static inline void event_set_filtered_flag(struct 
ftrace_event_file *file) +static inline void event_set_filtered_flag(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) call->flags |= TRACE_EVENT_FL_FILTERED; else - file->flags |= FTRACE_EVENT_FL_FILTERED; + file->flags |= EVENT_FILE_FL_FILTERED; } -static inline void event_set_filter(struct ftrace_event_file *file, +static inline void event_set_filter(struct trace_event_file *file, struct event_filter *filter) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) rcu_assign_pointer(call->filter, filter); @@ -1698,9 +1698,9 @@ static inline void event_set_filter(struct ftrace_event_file *file, rcu_assign_pointer(file->filter, filter); } -static inline void event_clear_filter(struct ftrace_event_file *file) +static inline void event_clear_filter(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) RCU_INIT_POINTER(call->filter, NULL); @@ -1709,33 +1709,33 @@ static inline void event_clear_filter(struct ftrace_event_file *file) } static inline void -event_set_no_set_filter_flag(struct ftrace_event_file *file) +event_set_no_set_filter_flag(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; else - file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; + file->flags |= EVENT_FILE_FL_NO_SET_FILTER; } static inline void -event_clear_no_set_filter_flag(struct ftrace_event_file *file) +event_clear_no_set_filter_flag(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = 
file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; else - file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; + file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; } static inline bool -event_no_set_filter_flag(struct ftrace_event_file *file) +event_no_set_filter_flag(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; - if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) + if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) return true; if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && @@ -1750,12 +1750,12 @@ struct filter_list { struct event_filter *filter; }; -static int replace_system_preds(struct ftrace_subsystem_dir *dir, +static int replace_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct filter_list *filter_item; struct filter_list *tmp; LIST_HEAD(filter_list); @@ -1899,8 +1899,8 @@ static void create_filter_finish(struct filter_parse_state *ps) } /** - * create_filter - create a filter for a ftrace_event_call - * @call: ftrace_event_call to create a filter for + * create_filter - create a filter for a trace_event_call + * @call: trace_event_call to create a filter for * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) @@ -1914,7 +1914,7 @@ static void create_filter_finish(struct filter_parse_state *ps) * information if @set_str is %true and the caller is responsible for * freeing it. 
*/ -static int create_filter(struct ftrace_event_call *call, +static int create_filter(struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { @@ -1934,7 +1934,7 @@ static int create_filter(struct ftrace_event_call *call, return err; } -int create_event_filter(struct ftrace_event_call *call, +int create_event_filter(struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { @@ -1950,7 +1950,7 @@ int create_event_filter(struct ftrace_event_call *call, * Identical to create_filter() except that it creates a subsystem filter * and always remembers @filter_str. */ -static int create_system_filter(struct ftrace_subsystem_dir *dir, +static int create_system_filter(struct trace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { @@ -1976,9 +1976,9 @@ static int create_system_filter(struct ftrace_subsystem_dir *dir, } /* caller must hold event_mutex */ -int apply_event_filter(struct ftrace_event_file *file, char *filter_string) +int apply_event_filter(struct trace_event_file *file, char *filter_string) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; struct event_filter *filter; int err; @@ -2027,7 +2027,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string) return err; } -int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, +int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, char *filter_string) { struct event_subsystem *system = dir->subsystem; @@ -2090,7 +2090,7 @@ struct function_filter_data { static char ** ftrace_function_filter_re(char *buf, int len, int *count) { - char *str, *sep, **re; + char *str, **re; str = kstrndup(buf, len, GFP_KERNEL); if (!str) @@ -2100,8 +2100,7 @@ ftrace_function_filter_re(char *buf, int len, int *count) * The argv_split function takes white space * as a separator, so convert ',' into spaces. 
*/ - while ((sep = strchr(str, ','))) - *sep = ' '; + strreplace(str, ',', ' '); re = argv_split(GFP_KERNEL, str, count); kfree(str); @@ -2227,7 +2226,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, { int err; struct event_filter *filter; - struct ftrace_event_call *call; + struct trace_event_call *call; mutex_lock(&event_mutex); @@ -2283,7 +2282,7 @@ out_unlock: static struct test_filter_data_t { char *filter; - struct ftrace_raw_ftrace_test_filter rec; + struct trace_event_raw_ftrace_test_filter rec; int match; char *not_visited; } test_filter_data[] = { diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 8712df9de..42a4009fd 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -40,7 +40,7 @@ trigger_data_free(struct event_trigger_data *data) /** * event_triggers_call - Call triggers associated with a trace event - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * @rec: The trace entry for the event, NULL for unconditional invocation * * For each trigger associated with an event, invoke the trigger @@ -63,7 +63,7 @@ trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. 
*/ enum event_trigger_type -event_triggers_call(struct ftrace_event_file *file, void *rec) +event_triggers_call(struct trace_event_file *file, void *rec) { struct event_trigger_data *data; enum event_trigger_type tt = ETT_NONE; @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); /** * event_triggers_post_call - Call 'post_triggers' for a trace event - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke * * For each trigger associated with an event, invoke the trigger @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * Called from tracepoint handlers (with rcu_read_lock_sched() held). */ void -event_triggers_post_call(struct ftrace_event_file *file, +event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt) { struct event_trigger_data *data; @@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(event_triggers_post_call); static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) { - struct ftrace_event_file *event_file = event_file_data(m->private); + struct trace_event_file *event_file = event_file_data(m->private); if (t == SHOW_AVAILABLE_TRIGGERS) return NULL; @@ -129,7 +129,7 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) static void *trigger_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_file *event_file; + struct trace_event_file *event_file; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); @@ -201,7 +201,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) return ret; } -static int trigger_process_regex(struct ftrace_event_file *file, char *buff) +static int trigger_process_regex(struct trace_event_file *file, char *buff) { char *command, *next = buff; struct event_command *p; @@ -227,7 +227,7 @@ static ssize_t event_trigger_regex_write(struct file *file, const char __user *ubuf, size_t cnt, 
loff_t *ppos) { - struct ftrace_event_file *event_file; + struct trace_event_file *event_file; ssize_t ret; char *buf; @@ -430,7 +430,7 @@ event_trigger_free(struct event_trigger_ops *ops, trigger_data_free(data); } -static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, +static int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable) { int ret = 0; @@ -438,12 +438,12 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, if (trigger_enable) { if (atomic_inc_return(&file->tm_ref) > 1) return ret; - set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + set_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags); ret = trace_event_enable_disable(file, 1, 1); } else { if (atomic_dec_return(&file->tm_ref) > 0) return ret; - clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags); ret = trace_event_enable_disable(file, 0, 1); } @@ -466,7 +466,7 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, void clear_event_triggers(struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { struct event_trigger_data *data; @@ -480,7 +480,7 @@ clear_event_triggers(struct trace_array *tr) /** * update_cond_flag - Set or reset the TRIGGER_COND bit - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * * If an event has triggers and any of those triggers has a filter or * a post_trigger, trigger invocation needs to be deferred until after @@ -488,7 +488,7 @@ clear_event_triggers(struct trace_array *tr) * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be * cleared. 
*/ -static void update_cond_flag(struct ftrace_event_file *file) +static void update_cond_flag(struct trace_event_file *file) { struct event_trigger_data *data; bool set_cond = false; @@ -501,9 +501,9 @@ static void update_cond_flag(struct ftrace_event_file *file) } if (set_cond) - set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + set_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags); else - clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags); } /** @@ -511,7 +511,7 @@ static void update_cond_flag(struct ftrace_event_file *file) * @glob: The raw string used to register the trigger * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data to associate with the trigger - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * * Common implementation for event trigger registration. * @@ -522,7 +522,7 @@ static void update_cond_flag(struct ftrace_event_file *file) */ static int register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct event_trigger_data *test; int ret = 0; @@ -557,7 +557,7 @@ out: * @glob: The raw string used to register the trigger * @ops: The trigger ops associated with the trigger * @test: Trigger-specific data used to find the trigger to remove - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * * Common implementation for event trigger unregistration. 
* @@ -566,7 +566,7 @@ out: */ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -588,7 +588,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, /** * event_trigger_callback - Generic event_command @func implementation * @cmd_ops: The command ops, used for trigger registration - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger * @cmd: The cmd portion of the string used to register the trigger * @param: The params portion of the string used to register the trigger @@ -603,7 +603,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, */ static int event_trigger_callback(struct event_command *cmd_ops, - struct ftrace_event_file *file, + struct trace_event_file *file, char *glob, char *cmd, char *param) { struct event_trigger_data *trigger_data; @@ -688,7 +688,7 @@ event_trigger_callback(struct event_command *cmd_ops, * set_trigger_filter - Generic event_command @set_filter implementation * @filter_str: The filter string for the trigger, NULL to remove filter * @trigger_data: Trigger-specific data - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * * Common implementation for event command filter parsing and filter * instantiation. 
@@ -702,7 +702,7 @@ event_trigger_callback(struct event_command *cmd_ops, */ static int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct event_trigger_data *data = trigger_data; struct event_filter *filter = NULL, *tmp; @@ -900,7 +900,7 @@ snapshot_count_trigger(struct event_trigger_data *data) static int register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file) + struct trace_event_file *file) { int ret = register_trigger(glob, ops, data, file); @@ -968,7 +968,7 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } * Skip 3: * stacktrace_trigger() * event_triggers_post_call() - * ftrace_raw_event_xxx() + * trace_event_raw_event_xxx() */ #define STACK_SKIP 3 @@ -1053,7 +1053,7 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) #define DISABLE_EVENT_STR "disable_event" struct enable_trigger_data { - struct ftrace_event_file *file; + struct trace_event_file *file; bool enable; }; @@ -1063,9 +1063,9 @@ event_enable_trigger(struct event_trigger_data *data) struct enable_trigger_data *enable_data = data->private_data; if (enable_data->enable) - clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); else - set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); } static void @@ -1077,7 +1077,7 @@ event_enable_count_trigger(struct event_trigger_data *data) return; /* Skip if the event is in a state we want to switch to */ - if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; if (data->count != -1) @@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct 
seq_file *m, struct event_trigger_ops *ops, seq_printf(m, "%s:%s:%s", enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, enable_data->file->event_call->class->system, - ftrace_event_name(enable_data->file->event_call)); + trace_event_name(enable_data->file->event_call)); if (data->count == -1) seq_puts(m, ":unlimited"); @@ -1159,10 +1159,10 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { static int event_enable_trigger_func(struct event_command *cmd_ops, - struct ftrace_event_file *file, + struct trace_event_file *file, char *glob, char *cmd, char *param) { - struct ftrace_event_file *event_enable_file; + struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; struct event_trigger_data *trigger_data; struct event_trigger_ops *trigger_ops; @@ -1294,7 +1294,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, static int event_enable_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct enable_trigger_data *enable_data = data->private_data; struct enable_trigger_data *test_enable_data; @@ -1331,7 +1331,7 @@ out: static void event_enable_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; struct enable_trigger_data *enable_data; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 174a6a711..adabf7da9 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -125,7 +125,7 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ static int __init \ -ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ +ftrace_define_fields_##name(struct trace_event_call 
*event_call) \ { \ struct struct_name field; \ int ret; \ @@ -163,14 +163,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct ftrace_event_class __refdata event_class_ftrace_##call = { \ +struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ .reg = regfn, \ }; \ \ -struct ftrace_event_call __used event_##call = { \ +struct trace_event_call __used event_##call = { \ .class = &event_class_ftrace_##call, \ { \ .name = #call, \ @@ -179,7 +179,7 @@ struct ftrace_event_call __used event_##call = { \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ -struct ftrace_event_call __used \ +struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY @@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; FTRACE_ENTRY_REG(call, struct_name, etype, \ PARAMS(tstruct), PARAMS(print), filter, NULL) -int ftrace_event_is_function(struct ftrace_event_call *call) +int ftrace_event_is_function(struct trace_event_call *call) { return call == &event_function; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a51e79688..8968bf720 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -278,7 +278,7 @@ int __trace_graph_entry(struct trace_array *tr, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_funcgraph_entry; + struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; @@ -393,7 +393,7 @@ void __trace_graph_return(struct trace_array *tr, unsigned long flags, 
int pc) { - struct ftrace_event_call *call = &event_funcgraph_exit; + struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; @@ -1454,12 +1454,12 @@ static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); - if (!register_ftrace_event(&graph_trace_entry_event)) { + if (!register_trace_event(&graph_trace_entry_event)) { pr_warning("Warning: could not register graph trace events\n"); return 1; } - if (!register_ftrace_event(&graph_trace_ret_event)) { + if (!register_trace_event(&graph_trace_ret_event)) { pr_warning("Warning: could not register graph trace events\n"); return 1; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d0ce590f0..b7d0cdd99 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -348,7 +348,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, struct trace_kprobe *tk; list_for_each_entry(tk, &probe_list, list) - if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 && + if (strcmp(trace_event_name(&tk->tp.call), event) == 0 && strcmp(tk->tp.call.class->system, group) == 0) return tk; return NULL; @@ -359,7 +359,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, * if the file is NULL, enable "perf" handler, or enable "trace" handler. */ static int -enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) +enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { int ret = 0; @@ -394,7 +394,7 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) * if the file is NULL, disable "perf" handler, or disable "trace" handler. 
*/ static int -disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) +disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { struct event_file_link *link = NULL; int wait = 0; @@ -523,7 +523,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call), + old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call), tk->tp.call.class->system); if (old_tk) { ret = unregister_trace_kprobe(old_tk); @@ -572,7 +572,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - ftrace_event_name(&tk->tp.call), + trace_event_name(&tk->tp.call), mod->name, ret); } } @@ -829,7 +829,7 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tk->tp.call.class->system, - ftrace_event_name(&tk->tp.call)); + trace_event_name(&tk->tp.call)); if (!tk->symbol) seq_printf(m, " 0x%p", tk->rp.kp.addr); @@ -888,7 +888,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) struct trace_kprobe *tk = v; seq_printf(m, " %-44s %15lu %15lu\n", - ftrace_event_name(&tk->tp.call), tk->nhit, + trace_event_name(&tk->tp.call), tk->nhit, tk->rp.kp.nmissed); return 0; @@ -917,18 +917,18 @@ static const struct file_operations kprobe_profile_ops = { /* Kprobe handler */ static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, - struct ftrace_event_file *ftrace_file) + struct trace_event_file *trace_file) { struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; - WARN_ON(call != ftrace_file->event_call); + WARN_ON(call != 
trace_file->event_call); - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; local_save_flags(irq_flags); @@ -937,7 +937,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, dsize = __get_data_size(&tk->tp, regs); size = sizeof(*entry) + tk->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, call->event.type, size, irq_flags, pc); if (!event) @@ -947,7 +947,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry->ip = (unsigned long)tk->rp.kp.addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); } @@ -965,18 +965,18 @@ NOKPROBE_SYMBOL(kprobe_trace_func); static nokprobe_inline void __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, - struct ftrace_event_file *ftrace_file) + struct trace_event_file *trace_file) { struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; - WARN_ON(call != ftrace_file->event_call); + WARN_ON(call != trace_file->event_call); - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; local_save_flags(irq_flags); @@ -985,7 +985,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, dsize = __get_data_size(&tk->tp, regs); size = sizeof(*entry) + tk->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, call->event.type, size, irq_flags, pc); if (!event) @@ -996,7 +996,7 @@ 
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); } @@ -1025,7 +1025,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); + trace_seq_printf(s, "%s: (", trace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) goto out; @@ -1056,7 +1056,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); + trace_seq_printf(s, "%s: (", trace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) goto out; @@ -1081,7 +1081,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, } -static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +static int kprobe_event_define_fields(struct trace_event_call *event_call) { int ret, i; struct kprobe_trace_entry_head field; @@ -1104,7 +1104,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +static int kretprobe_event_define_fields(struct trace_event_call *event_call) { int ret, i; struct kretprobe_trace_entry_head field; @@ -1134,7 +1134,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { - struct ftrace_event_call *call = &tk->tp.call; + 
struct trace_event_call *call = &tk->tp.call; struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; @@ -1169,7 +1169,7 @@ static void kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; @@ -1206,11 +1206,11 @@ NOKPROBE_SYMBOL(kretprobe_perf_func); * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe * lockless, but we can't race with this __init function. */ -static int kprobe_register(struct ftrace_event_call *event, +static int kprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { struct trace_kprobe *tk = (struct trace_kprobe *)event->data; - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: @@ -1276,10 +1276,10 @@ static struct trace_event_functions kprobe_funcs = { static int register_kprobe_event(struct trace_kprobe *tk) { - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; int ret; - /* Initialize ftrace_event_call */ + /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; @@ -1290,7 +1290,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) } if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; - ret = register_ftrace_event(&call->event); + ret = register_trace_event(&call->event); if (!ret) { kfree(call->print_fmt); return -ENODEV; @@ -1301,9 +1301,9 @@ static int register_kprobe_event(struct trace_kprobe *tk) ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", - ftrace_event_name(call)); + trace_event_name(call)); kfree(call->print_fmt); - 
unregister_ftrace_event(&call->event); + unregister_trace_event(&call->event); } return ret; } @@ -1364,10 +1364,10 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, return a1 + a2 + a3 + a4 + a5 + a6; } -static struct ftrace_event_file * +static struct trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) if (file->event_call == &tk->tp.call) @@ -1385,7 +1385,7 @@ static __init int kprobe_trace_self_tests_init(void) int ret, warn = 0; int (*target)(int, int, int, int, int, int); struct trace_kprobe *tk; - struct ftrace_event_file *file; + struct trace_event_file *file; if (tracing_is_disabled()) return -ENODEV; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 7a9ba62e9..638e110c5 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -298,7 +298,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { - struct ftrace_event_call *call = &event_mmiotrace_rw; + struct trace_event_call *call = &event_mmiotrace_rw; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -328,7 +328,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { - struct ftrace_event_call *call = &event_mmiotrace_map; + struct trace_event_call *call = &event_mmiotrace_map; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 25a086bcb..dfab25372 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -60,9 +60,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) } const char * 
-ftrace_print_flags_seq(struct trace_seq *p, const char *delim, - unsigned long flags, - const struct trace_print_flags *flag_array) +trace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array) { unsigned long mask; const char *str; @@ -95,11 +95,11 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return ret; } -EXPORT_SYMBOL(ftrace_print_flags_seq); +EXPORT_SYMBOL(trace_print_flags_seq); const char * -ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) +trace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) { int i; const char *ret = trace_seq_buffer_ptr(p); @@ -120,11 +120,11 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, return ret; } -EXPORT_SYMBOL(ftrace_print_symbols_seq); +EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 const char * -ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, +trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { int i; @@ -146,12 +146,12 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, return ret; } -EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); +EXPORT_SYMBOL(trace_print_symbols_seq_u64); #endif const char * -ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, - unsigned int bitmask_size) +trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) { const char *ret = trace_seq_buffer_ptr(p); @@ -160,10 +160,10 @@ ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, return ret; } -EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); +EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); const char * -ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +trace_print_hex_seq(struct trace_seq *p, const unsigned 
char *buf, int buf_len) { int i; const char *ret = trace_seq_buffer_ptr(p); @@ -175,11 +175,11 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) return ret; } -EXPORT_SYMBOL(ftrace_print_hex_seq); +EXPORT_SYMBOL(trace_print_hex_seq); const char * -ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count, - size_t el_size) +trace_print_array_seq(struct trace_seq *p, const void *buf, int count, + size_t el_size) { const char *ret = trace_seq_buffer_ptr(p); const char *prefix = ""; @@ -220,17 +220,17 @@ ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count, return ret; } -EXPORT_SYMBOL(ftrace_print_array_seq); +EXPORT_SYMBOL(trace_print_array_seq); -int ftrace_raw_output_prep(struct trace_iterator *iter, - struct trace_event *trace_event) +int trace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *trace_event) { - struct ftrace_event_call *event; + struct trace_event_call *event; struct trace_seq *s = &iter->seq; struct trace_seq *p = &iter->tmp_seq; struct trace_entry *entry; - event = container_of(trace_event, struct ftrace_event_call, event); + event = container_of(trace_event, struct trace_event_call, event); entry = iter->ent; if (entry->type != event->event.type) { @@ -239,14 +239,14 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } trace_seq_init(p); - trace_seq_printf(s, "%s: ", ftrace_event_name(event)); + trace_seq_printf(s, "%s: ", trace_event_name(event)); return trace_handle_return(s); } -EXPORT_SYMBOL(ftrace_raw_output_prep); +EXPORT_SYMBOL(trace_raw_output_prep); -static int ftrace_output_raw(struct trace_iterator *iter, char *name, - char *fmt, va_list ap) +static int trace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; @@ -256,18 +256,18 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name, return trace_handle_return(s); } -int ftrace_output_call(struct trace_iterator *iter, 
char *name, char *fmt, ...) +int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); - ret = ftrace_output_raw(iter, name, fmt, ap); + ret = trace_output_raw(iter, name, fmt, ap); va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(ftrace_output_call); +EXPORT_SYMBOL_GPL(trace_output_call); #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) @@ -675,7 +675,7 @@ static int trace_search_list(struct list_head **list) } /* Did we used up all 65 thousand events??? */ - if ((last + 1) > FTRACE_MAX_EVENT) + if ((last + 1) > TRACE_EVENT_TYPE_MAX) return 0; *list = &e->list; @@ -693,7 +693,7 @@ void trace_event_read_unlock(void) } /** - * register_ftrace_event - register output for an event type + * register_trace_event - register output for an event type * @event: the event type to register * * Event types are stored in a hash and this hash is used to @@ -707,7 +707,7 @@ void trace_event_read_unlock(void) * * Returns the event type number or zero on error. */ -int register_ftrace_event(struct trace_event *event) +int register_trace_event(struct trace_event *event) { unsigned key; int ret = 0; @@ -725,7 +725,7 @@ int register_ftrace_event(struct trace_event *event) if (!event->type) { struct list_head *list = NULL; - if (next_event_type > FTRACE_MAX_EVENT) { + if (next_event_type > TRACE_EVENT_TYPE_MAX) { event->type = trace_search_list(&list); if (!event->type) @@ -771,12 +771,12 @@ int register_ftrace_event(struct trace_event *event) return ret; } -EXPORT_SYMBOL_GPL(register_ftrace_event); +EXPORT_SYMBOL_GPL(register_trace_event); /* * Used by module code with the trace_event_sem held for write. 
*/ -int __unregister_ftrace_event(struct trace_event *event) +int __unregister_trace_event(struct trace_event *event) { hlist_del(&event->node); list_del(&event->list); @@ -784,18 +784,18 @@ int __unregister_ftrace_event(struct trace_event *event) } /** - * unregister_ftrace_event - remove a no longer used event + * unregister_trace_event - remove a no longer used event * @event: the event to remove */ -int unregister_ftrace_event(struct trace_event *event) +int unregister_trace_event(struct trace_event *event) { down_write(&trace_event_sem); - __unregister_ftrace_event(event); + __unregister_trace_event(event); up_write(&trace_event_sem); return 0; } -EXPORT_SYMBOL_GPL(unregister_ftrace_event); +EXPORT_SYMBOL_GPL(unregister_trace_event); /* * Standard events @@ -1243,7 +1243,7 @@ __init static int init_events(void) for (i = 0; events[i]; i++) { event = events[i]; - ret = register_ftrace_event(event); + ret = register_trace_event(event); if (!ret) { printk(KERN_WARNING "event %d failed to register\n", event->type); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 8ef2c40ef..4cbfe85b9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -32,7 +32,7 @@ extern int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ -extern int __unregister_ftrace_event(struct trace_event *event); +extern int __unregister_trace_event(struct trace_event *event); extern struct rw_semaphore trace_event_sem; #define SEQ_PUT_FIELD(s, x) \ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index ab283e146..b98dee914 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -272,8 +272,8 @@ struct probe_arg { struct trace_probe { unsigned int flags; /* For TP_FLAG_* */ - struct ftrace_event_class class; - struct ftrace_event_call call; + struct trace_event_class class; + struct trace_event_call call; struct list_head files; ssize_t size; /* trace entry 
size */ unsigned int nr_args; @@ -281,7 +281,7 @@ struct trace_probe { }; struct event_file_link { - struct ftrace_event_file *file; + struct trace_event_file *file; struct list_head list; }; @@ -314,7 +314,7 @@ static inline int is_good_name(const char *name) } static inline struct event_file_link * -find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) { struct event_file_link *link; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index d6e100372..9b33dd117 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -369,7 +369,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_context_switch; + struct trace_event_call *call = &event_context_switch; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -397,7 +397,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_wakeup; + struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; struct ring_buffer *buffer = tr->trace_buffer.buffer; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 287cf721c..b0f86ea77 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1039,15 +1039,10 @@ static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ static const struct sched_attr attr = { -#ifdef CONFIG_SCHED_BFS - /* No deadline on BFS, use RR */ - .sched_policy = SCHED_RR, -#else .sched_policy = SCHED_DEADLINE, .sched_runtime = 100000ULL, .sched_deadline = 10000000ULL, .sched_period = 10000000ULL -#endif }; struct wakeup_test_data *x = data; diff --git 
a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f97f6e3a6..7d567a4b9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -13,13 +13,13 @@ static DEFINE_MUTEX(syscall_trace_lock); -static int syscall_enter_register(struct ftrace_event_call *event, +static int syscall_enter_register(struct trace_event_call *event, enum trace_reg type, void *data); -static int syscall_exit_register(struct ftrace_event_call *event, +static int syscall_exit_register(struct trace_event_call *event, enum trace_reg type, void *data); static struct list_head * -syscall_get_enter_fields(struct ftrace_event_call *call) +syscall_get_enter_fields(struct trace_event_call *call) { struct syscall_metadata *entry = call->data; @@ -219,7 +219,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) return pos; } -static int __init set_syscall_print_fmt(struct ftrace_event_call *call) +static int __init set_syscall_print_fmt(struct trace_event_call *call) { char *print_fmt; int len; @@ -244,7 +244,7 @@ static int __init set_syscall_print_fmt(struct ftrace_event_call *call) return 0; } -static void __init free_syscall_print_fmt(struct ftrace_event_call *call) +static void __init free_syscall_print_fmt(struct trace_event_call *call) { struct syscall_metadata *entry = call->data; @@ -252,7 +252,7 @@ static void __init free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -static int __init syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -275,7 +275,7 @@ static int __init syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -static int __init syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct trace_event_call *call) { struct syscall_trace_exit trace; int 
ret; @@ -293,7 +293,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call) static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; - struct ftrace_event_file *ftrace_file; + struct trace_event_file *trace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -308,11 +308,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ - ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); - if (!ftrace_file) + trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + if (!trace_file) return; - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -334,14 +334,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + event_trigger_unlock_commit(trace_file, buffer, event, entry, irq_flags, pc); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { struct trace_array *tr = data; - struct ftrace_event_file *ftrace_file; + struct trace_event_file *trace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -355,11 +355,11 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ - ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); - if (!ftrace_file) + trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + if (!trace_file) return; - if (ftrace_trigger_soft_disabled(ftrace_file)) + if 
(trace_trigger_soft_disabled(trace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -380,12 +380,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + event_trigger_unlock_commit(trace_file, buffer, event, entry, irq_flags, pc); } -static int reg_event_syscall_enter(struct ftrace_event_file *file, - struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct trace_event_file *file, + struct trace_event_call *call) { struct trace_array *tr = file->tr; int ret = 0; @@ -405,8 +405,8 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file, return ret; } -static void unreg_event_syscall_enter(struct ftrace_event_file *file, - struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct trace_event_file *file, + struct trace_event_call *call) { struct trace_array *tr = file->tr; int num; @@ -422,8 +422,8 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, mutex_unlock(&syscall_trace_lock); } -static int reg_event_syscall_exit(struct ftrace_event_file *file, - struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct trace_event_file *file, + struct trace_event_call *call) { struct trace_array *tr = file->tr; int ret = 0; @@ -443,8 +443,8 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file, return ret; } -static void unreg_event_syscall_exit(struct ftrace_event_file *file, - struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct trace_event_file *file, + struct trace_event_call *call) { struct trace_array *tr = file->tr; int num; @@ -460,7 +460,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, mutex_unlock(&syscall_trace_lock); } -static int __init init_syscall_trace(struct ftrace_event_call *call) +static int __init init_syscall_trace(struct 
trace_event_call *call) { int id; int num; @@ -493,7 +493,7 @@ struct trace_event_functions exit_syscall_print_funcs = { .trace = print_syscall_exit, }; -struct ftrace_event_class __refdata event_class_syscall_enter = { +struct trace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, .define_fields = syscall_enter_define_fields, @@ -501,7 +501,7 @@ struct ftrace_event_class __refdata event_class_syscall_enter = { .raw_init = init_syscall_trace, }; -struct ftrace_event_class __refdata event_class_syscall_exit = { +struct trace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, .define_fields = syscall_exit_define_fields, @@ -584,7 +584,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -static int perf_sysenter_enable(struct ftrace_event_call *call) +static int perf_sysenter_enable(struct trace_event_call *call) { int ret = 0; int num; @@ -605,7 +605,7 @@ static int perf_sysenter_enable(struct ftrace_event_call *call) return ret; } -static void perf_sysenter_disable(struct ftrace_event_call *call) +static void perf_sysenter_disable(struct trace_event_call *call) { int num; @@ -656,7 +656,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -static int perf_sysexit_enable(struct ftrace_event_call *call) +static int perf_sysexit_enable(struct trace_event_call *call) { int ret = 0; int num; @@ -677,7 +677,7 @@ static int perf_sysexit_enable(struct ftrace_event_call *call) return ret; } -static void perf_sysexit_disable(struct ftrace_event_call *call) +static void perf_sysexit_disable(struct trace_event_call *call) { int num; @@ -693,10 +693,10 @@ static void perf_sysexit_disable(struct ftrace_event_call *call) #endif /* CONFIG_PERF_EVENTS */ -static int 
syscall_enter_register(struct ftrace_event_call *event, +static int syscall_enter_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: @@ -721,10 +721,10 @@ static int syscall_enter_register(struct ftrace_event_call *event, return 0; } -static int syscall_exit_register(struct ftrace_event_call *event, +static int syscall_exit_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 6dd022c7b..aa1ea7b36 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -293,7 +293,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; list_for_each_entry(tu, &uprobe_list, list) - if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 && + if (strcmp(trace_event_name(&tu->tp.call), event) == 0 && strcmp(tu->tp.call.class->system, group) == 0) return tu; @@ -323,7 +323,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(ftrace_event_name(&tu->tp.call), + old_tu = find_probe_event(trace_event_name(&tu->tp.call), tu->tp.call.class->system); if (old_tu) { /* delete old event */ @@ -600,7 +600,7 @@ static int probes_seq_show(struct seq_file *m, void *v) int i; seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, - ftrace_event_name(&tu->tp.call)); + trace_event_name(&tu->tp.call)); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); for (i = 0; i < tu->tp.nr_args; i++) @@ -651,7 +651,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) struct trace_uprobe *tu = v; seq_printf(m, " %s %-44s %15lu\n", tu->filename, - 
ftrace_event_name(&tu->tp.call), tu->nhit); + trace_event_name(&tu->tp.call), tu->nhit); return 0; } @@ -770,26 +770,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize, - struct ftrace_event_file *ftrace_file) + struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; void *data; int size, esize; - struct ftrace_event_call *call = &tu->tp.call; + struct trace_event_call *call = &tu->tp.call; - WARN_ON(call != ftrace_file->event_call); + WARN_ON(call != trace_file->event_call); if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) return; - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, call->event.type, size, 0, 0); if (!event) return; @@ -806,7 +806,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); + event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0); } /* uprobe handler */ @@ -853,12 +853,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e if (is_ret_probe(tu)) { trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", - ftrace_event_name(&tu->tp.call), + trace_event_name(&tu->tp.call), entry->vaddr[1], entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, true); } else { trace_seq_printf(s, "%s: (0x%lx)", - ftrace_event_name(&tu->tp.call), + trace_event_name(&tu->tp.call), entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, false); } @@ -881,7 +881,7 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, 
struct mm_struct *mm); static int -probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, +probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, filter_func_t filter) { bool enabled = trace_probe_is_enabled(&tu->tp); @@ -938,7 +938,7 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, } static void -probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) +probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) { if (!trace_probe_is_enabled(&tu->tp)) return; @@ -967,7 +967,7 @@ probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) uprobe_buffer_disable(); } -static int uprobe_event_define_fields(struct ftrace_event_call *event_call) +static int uprobe_event_define_fields(struct trace_event_call *event_call) { int ret, i, size; struct uprobe_trace_entry_head field; @@ -1093,7 +1093,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize) { - struct ftrace_event_call *call = &tu->tp.call; + struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; void *data; @@ -1159,11 +1159,11 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, #endif /* CONFIG_PERF_EVENTS */ static int -trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, +trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { struct trace_uprobe *tu = event->data; - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: @@ -1272,10 +1272,10 @@ static struct trace_event_functions uprobe_funcs = { static int register_uprobe_event(struct trace_uprobe *tu) { - struct ftrace_event_call *call = &tu->tp.call; + struct trace_event_call *call = &tu->tp.call; int ret; - /* Initialize ftrace_event_call */ + 
/* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; @@ -1283,7 +1283,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; - ret = register_ftrace_event(&call->event); + ret = register_trace_event(&call->event); if (!ret) { kfree(call->print_fmt); return -ENODEV; @@ -1295,9 +1295,9 @@ static int register_uprobe_event(struct trace_uprobe *tu) if (ret) { pr_info("Failed to register uprobe event: %s\n", - ftrace_event_name(call)); + trace_event_name(call)); kfree(call->print_fmt); - unregister_ftrace_event(&call->event); + unregister_trace_event(&call->event); } return ret; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 581a68a04..a6ffa43f2 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -19,6 +19,7 @@ #include <linux/sysctl.h> #include <linux/smpboot.h> #include <linux/sched/rt.h> +#include <linux/tick.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace; #else #define sysctl_softlockup_all_cpu_backtrace 0 #endif +static struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); + +/* Helper for online, unparked cpus. */ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void) * do we care if a 0 races with a timestamp? 
* all it means is the softlock check starts one cycle later */ - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; } @@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void) goto unlock; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) watchdog_nmi_enable(cpu); put_online_cpus(); @@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void) goto unlock; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) watchdog_nmi_disable(cpu); put_online_cpus(); @@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void) int cpu; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) update_watchdog(cpu); put_online_cpus(); } @@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void) err = smpboot_register_percpu_thread(&watchdog_threads); if (err) pr_err("Failed to create watchdog threads, disabled\n"); - else + else { + if (smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask)) + pr_err("Failed to set cpumask for watchdog threads\n"); watchdog_running = 1; + } } else { /* * Enable/disable the lockup detectors or @@ -879,12 +890,58 @@ out: mutex_unlock(&watchdog_proc_mutex); return err; } + +/* + * The cpumask is the mask of possible cpus that the watchdog can run + * on, not the mask of cpus it is actually running on. This allows the + * user to specify a mask that will include cpus that have not yet + * been brought online, if desired. + */ +int proc_watchdog_cpumask(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + mutex_lock(&watchdog_proc_mutex); + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); + if (!err && write) { + /* Remove impossible cpus to keep sysctl output cleaner. 
*/ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, + cpu_possible_mask); + + if (watchdog_running) { + /* + * Failure would be due to being unable to allocate + * a temporary cpumask, so we are likely not in a + * position to do much else to make things better. + */ + if (smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask) != 0) + pr_err("cpumask update failed\n"); + } + } + mutex_unlock(&watchdog_proc_mutex); + return err; +} + #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { set_sample_period(); +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) { + if (!cpumask_empty(tick_nohz_full_mask)) + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, + tick_nohz_full_mask); + } else + cpumask_copy(&watchdog_cpumask, cpu_possible_mask); +#else + cpumask_copy(&watchdog_cpumask, cpu_possible_mask); +#endif + if (watchdog_enabled) watchdog_enable_all_cpus(); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 586ad9130..a413acb59 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -127,6 +127,11 @@ enum { * * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. * + * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. + * + * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or + * sched-RCU for reads. + * * WQ: wq->mutex protected. * * WR: wq->mutex protected for writes. Sched-RCU protected for reads. 
@@ -247,8 +252,8 @@ struct workqueue_struct { int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* WQ: saved pwq max_active */ - struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ - struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */ + struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ + struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -268,7 +273,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ - struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */ + struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ }; static struct kmem_cache *pwq_cache; @@ -280,12 +285,7 @@ static bool wq_disable_numa; module_param_named(disable_numa, wq_disable_numa, bool, 0444); /* see the comment above the definition of WQ_POWER_EFFICIENT */ -#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT -static bool wq_power_efficient = true; -#else -static bool wq_power_efficient; -#endif - +static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ @@ -299,6 +299,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? 
*/ +static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ + /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); @@ -330,8 +332,6 @@ struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); -static void copy_workqueue_attrs(struct workqueue_attrs *to, - const struct workqueue_attrs *from); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #define CREATE_TRACE_POINTS @@ -347,6 +347,12 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); lockdep_is_held(&wq->mutex), \ "sched RCU or wq->mutex should be held") +#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq->mutex) || \ + lockdep_is_held(&wq_pool_mutex), \ + "sched RCU, wq->mutex or wq_pool_mutex should be held") + #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -551,7 +557,8 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called either with pwq_lock held or sched RCU read locked. + * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU + * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. 
* @@ -560,7 +567,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { - assert_rcu_or_wq_mutex(wq); + assert_rcu_or_wq_mutex_or_pool_mutex(wq); return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } @@ -976,7 +983,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled * @head: target list to append @work to - * @nextp: out paramter for nested worklist walking + * @nextp: out parameter for nested worklist walking * * Schedule linked works starting from @work to @head. Work series to * be scheduled starts at @work and includes any consecutive work with @@ -2607,7 +2614,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL_GPL(flush_workqueue); +EXPORT_SYMBOL(flush_workqueue); /** * drain_workqueue - drain a workqueue @@ -2616,7 +2623,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue); * Wait until the workqueue becomes empty. While draining is in progress, * only chain queueing is allowed. IOW, only currently pending or running * work items on @wq can queue further work items on it. @wq is flushed - * repeatedly until it becomes empty. The number of flushing is detemined + * repeatedly until it becomes empty. The number of flushing is determined * by the depth of chaining and should be relatively short. Whine if it * takes too long. */ @@ -2947,36 +2954,6 @@ int schedule_on_each_cpu(work_func_t func) } /** - * flush_scheduled_work - ensure that any scheduled work has run to completion. - * - * Forces execution of the kernel-global workqueue and blocks until its - * completion. - * - * Think twice before calling this function! It's very easy to get into - * trouble if you don't take great care. 
Either of the following situations - * will lead to deadlock: - * - * One of the work items currently on the workqueue needs to acquire - * a lock held by your code or its caller. - * - * Your code is running in the context of a work routine. - * - * They will be detected by lockdep when they occur, but the first might not - * occur very often. It depends on what work items are on the workqueue and - * what locks they need, which you have no control over. - * - * In most situations flushing the entire workqueue is overkill; you merely - * need to know that a particular work item isn't queued and isn't running. - * In such cases you should use cancel_delayed_work_sync() or - * cancel_work_sync() instead. - */ -void flush_scheduled_work(void) -{ - flush_workqueue(system_wq); -} -EXPORT_SYMBOL(flush_scheduled_work); - -/** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must @@ -3081,7 +3058,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * - * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. * * Return: 0 on success, -errno on failure. 
Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called @@ -3425,20 +3402,9 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } -/* undo alloc_unbound_pwq(), used only in the error path */ -static void free_unbound_pwq(struct pool_workqueue *pwq) -{ - lockdep_assert_held(&wq_pool_mutex); - - if (pwq) { - put_unbound_pool(pwq->pool); - kmem_cache_free(pwq_cache, pwq); - } -} - /** - * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node - * @attrs: the wq_attrs of interest + * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of the default pwq of the target workqueue * @node: the target NUMA node * @cpu_going_down: if >= 0, the CPU to consider as offline * @cpumask: outarg, the resulting cpumask @@ -3488,6 +3454,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, { struct pool_workqueue *old_pwq; + lockdep_assert_held(&wq_pool_mutex); lockdep_assert_held(&wq->mutex); /* link_pwq() can handle duplicate calls */ @@ -3498,46 +3465,59 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, return old_pwq; } -/** - * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue - * @wq: the target workqueue - * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() - * - * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA - * machines, this function maps a separate pwq to each NUMA node with - * possibles CPUs in @attrs->cpumask so that work items are affine to the - * NUMA node it was issued on. Older pwqs are released as in-flight work - * items finish. Note that a work item which repeatedly requeues itself - * back-to-back will stay on its current pwq. - * - * Performs GFP_KERNEL allocations. - * - * Return: 0 on success and -errno on failure. 
- */ -int apply_workqueue_attrs(struct workqueue_struct *wq, - const struct workqueue_attrs *attrs) +/* context to store the prepared attrs & pwqs before applying */ +struct apply_wqattrs_ctx { + struct workqueue_struct *wq; /* target workqueue */ + struct workqueue_attrs *attrs; /* attrs to apply */ + struct list_head list; /* queued for batching commit */ + struct pool_workqueue *dfl_pwq; + struct pool_workqueue *pwq_tbl[]; +}; + +/* free the resources after success or abort */ +static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) +{ + if (ctx) { + int node; + + for_each_node(node) + put_pwq_unlocked(ctx->pwq_tbl[node]); + put_pwq_unlocked(ctx->dfl_pwq); + + free_workqueue_attrs(ctx->attrs); + + kfree(ctx); + } +} + +/* allocate the attrs and pwqs for later installation */ +static struct apply_wqattrs_ctx * +apply_wqattrs_prepare(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) { + struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs, *tmp_attrs; - struct pool_workqueue **pwq_tbl, *dfl_pwq; - int node, ret; + int node; - /* only unbound workqueues can change attributes */ - if (WARN_ON(!(wq->flags & WQ_UNBOUND))) - return -EINVAL; + lockdep_assert_held(&wq_pool_mutex); - /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]), + GFP_KERNEL); - pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!pwq_tbl || !new_attrs || !tmp_attrs) - goto enomem; + if (!ctx || !new_attrs || !tmp_attrs) + goto out_free; - /* make a copy of @attrs and sanitize it */ + /* + * Calculate the attrs of the default pwq. + * If the user configured cpumask doesn't overlap with the + * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. 
+ */ copy_workqueue_attrs(new_attrs, attrs); - cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); + if (unlikely(cpumask_empty(new_attrs->cpumask))) + cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); /* * We may create multiple pwqs with differing cpumasks. Make a @@ -3547,75 +3527,129 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, copy_workqueue_attrs(tmp_attrs, new_attrs); /* - * CPUs should stay stable across pwq creations and installations. - * Pin CPUs, determine the target cpumask for each node and create - * pwqs accordingly. - */ - get_online_cpus(); - - mutex_lock(&wq_pool_mutex); - - /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ - dfl_pwq = alloc_unbound_pwq(wq, new_attrs); - if (!dfl_pwq) - goto enomem_pwq; + ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); + if (!ctx->dfl_pwq) + goto out_free; for_each_node(node) { - if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { - pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); - if (!pwq_tbl[node]) - goto enomem_pwq; + if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { + ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); + if (!ctx->pwq_tbl[node]) + goto out_free; } else { - dfl_pwq->refcnt++; - pwq_tbl[node] = dfl_pwq; + ctx->dfl_pwq->refcnt++; + ctx->pwq_tbl[node] = ctx->dfl_pwq; } } - mutex_unlock(&wq_pool_mutex); + /* save the user configured attrs and sanitize it. 
*/ + copy_workqueue_attrs(new_attrs, attrs); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + ctx->attrs = new_attrs; + + ctx->wq = wq; + free_workqueue_attrs(tmp_attrs); + return ctx; + +out_free: + free_workqueue_attrs(tmp_attrs); + free_workqueue_attrs(new_attrs); + apply_wqattrs_cleanup(ctx); + return NULL; +} + +/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ +static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) +{ + int node; /* all pwqs have been created successfully, let's install'em */ - mutex_lock(&wq->mutex); + mutex_lock(&ctx->wq->mutex); - copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ for_each_node(node) - pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]); + ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, + ctx->pwq_tbl[node]); /* @dfl_pwq might not have been used, ensure it's linked */ - link_pwq(dfl_pwq); - swap(wq->dfl_pwq, dfl_pwq); + link_pwq(ctx->dfl_pwq); + swap(ctx->wq->dfl_pwq, ctx->dfl_pwq); - mutex_unlock(&wq->mutex); + mutex_unlock(&ctx->wq->mutex); +} - /* put the old pwqs */ - for_each_node(node) - put_pwq_unlocked(pwq_tbl[node]); - put_pwq_unlocked(dfl_pwq); +static void apply_wqattrs_lock(void) +{ + /* CPUs should stay stable across pwq creations and installations */ + get_online_cpus(); + mutex_lock(&wq_pool_mutex); +} +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); put_online_cpus(); - ret = 0; - /* fall through */ -out_free: - free_workqueue_attrs(tmp_attrs); - free_workqueue_attrs(new_attrs); - kfree(pwq_tbl); +} + +static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct apply_wqattrs_ctx *ctx; + int ret = -ENOMEM; + + /* only unbound workqueues can change attributes */ + if (WARN_ON(!(wq->flags & WQ_UNBOUND))) + return -EINVAL; + + /* 
creating multiple pwqs breaks ordering guarantee */ + if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) + return -EINVAL; + + ctx = apply_wqattrs_prepare(wq, attrs); + + /* the ctx has been prepared successfully, let's commit it */ + if (ctx) { + apply_wqattrs_commit(ctx); + ret = 0; + } + + apply_wqattrs_cleanup(ctx); + return ret; +} -enomem_pwq: - free_unbound_pwq(dfl_pwq); - for_each_node(node) - if (pwq_tbl && pwq_tbl[node] != dfl_pwq) - free_unbound_pwq(pwq_tbl[node]); - mutex_unlock(&wq_pool_mutex); - put_online_cpus(); -enomem: - ret = -ENOMEM; - goto out_free; +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA + * machines, this function maps a separate pwq to each NUMA node with + * possibles CPUs in @attrs->cpumask so that work items are affine to the + * NUMA node it was issued on. Older pwqs are released as in-flight work + * items finish. Note that a work item which repeatedly requeues itself + * back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations. + * + * Return: 0 on success and -errno on failure. 
+ */ +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + int ret; + + apply_wqattrs_lock(); + ret = apply_workqueue_attrs_locked(wq, attrs); + apply_wqattrs_unlock(); + + return ret; } /** @@ -3651,7 +3685,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, lockdep_assert_held(&wq_pool_mutex); - if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND)) + if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || + wq->unbound_attrs->no_numa) return; /* @@ -3662,48 +3697,37 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, target_attrs = wq_update_unbound_numa_attrs_buf; cpumask = target_attrs->cpumask; - mutex_lock(&wq->mutex); - if (wq->unbound_attrs->no_numa) - goto out_unlock; - copy_workqueue_attrs(target_attrs, wq->unbound_attrs); pwq = unbound_pwq_by_node(wq, node); /* * Let's determine what needs to be done. If the target cpumask is - * different from wq's, we need to compare it to @pwq's and create - * a new one if they don't match. If the target cpumask equals - * wq's, the default pwq should be used. + * different from the default pwq's, we need to compare it to @pwq's + * and create a new one if they don't match. If the target cpumask + * equals the default pwq's, the default pwq should be used. */ - if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { + if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) - goto out_unlock; + return; } else { goto use_dfl_pwq; } - mutex_unlock(&wq->mutex); - /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", wq->name); - mutex_lock(&wq->mutex); goto use_dfl_pwq; } - /* - * Install the new pwq. 
As this function is called only from CPU - * hotplug callbacks and applying a new attrs is wrapped with - * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed - * inbetween. - */ + /* Install the new pwq. */ mutex_lock(&wq->mutex); old_pwq = numa_pwq_tbl_install(wq, node, pwq); goto out_unlock; use_dfl_pwq: + mutex_lock(&wq->mutex); spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); spin_unlock_irq(&wq->dfl_pwq->pool->lock); @@ -4385,7 +4409,7 @@ static void rebind_workers(struct worker_pool *pool) /* * Restore CPU affinity of all workers. As all idle workers should * be on the run-queue of the associated CPU before any local - * wake-ups for concurrency management happen, restore CPU affinty + * wake-ups for concurrency management happen, restore CPU affinity * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ @@ -4698,6 +4722,82 @@ out_unlock: } #endif /* CONFIG_FREEZER */ +static int workqueue_apply_unbound_cpumask(void) +{ + LIST_HEAD(ctxs); + int ret = 0; + struct workqueue_struct *wq; + struct apply_wqattrs_ctx *ctx, *n; + + lockdep_assert_held(&wq_pool_mutex); + + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_UNBOUND)) + continue; + /* creating multiple pwqs breaks ordering guarantee */ + if (wq->flags & __WQ_ORDERED) + continue; + + ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs); + if (!ctx) { + ret = -ENOMEM; + break; + } + + list_add_tail(&ctx->list, &ctxs); + } + + list_for_each_entry_safe(ctx, n, &ctxs, list) { + if (!ret) + apply_wqattrs_commit(ctx); + apply_wqattrs_cleanup(ctx); + } + + return ret; +} + +/** + * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask + * @cpumask: the cpumask to set + * + * The low-level workqueues cpumask is a global cpumask that limits + * the affinity of all unbound workqueues. This function check the @cpumask + * and apply it to all unbound workqueues and updates all pwqs of them. 
+ * + * Retun: 0 - Success + * -EINVAL - Invalid @cpumask + * -ENOMEM - Failed to allocate memory for attrs or pwqs. + */ +int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +{ + int ret = -EINVAL; + cpumask_var_t saved_cpumask; + + if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_and(cpumask, cpumask, cpu_possible_mask); + if (!cpumask_empty(cpumask)) { + apply_wqattrs_lock(); + + /* save the old wq_unbound_cpumask. */ + cpumask_copy(saved_cpumask, wq_unbound_cpumask); + + /* update wq_unbound_cpumask at first and apply it to wqs. */ + cpumask_copy(wq_unbound_cpumask, cpumask); + ret = workqueue_apply_unbound_cpumask(); + + /* restore the wq_unbound_cpumask when failed. */ + if (ret < 0) + cpumask_copy(wq_unbound_cpumask, saved_cpumask); + + apply_wqattrs_unlock(); + } + + free_cpumask_var(saved_cpumask); + return ret; +} + #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via @@ -4802,13 +4902,13 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) { struct workqueue_attrs *attrs; + lockdep_assert_held(&wq_pool_mutex); + attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!attrs) return NULL; - mutex_lock(&wq->mutex); copy_workqueue_attrs(attrs, wq->unbound_attrs); - mutex_unlock(&wq->mutex); return attrs; } @@ -4817,18 +4917,22 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int ret; + int ret = -ENOMEM; + + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) - return -ENOMEM; + goto out_unlock; if (sscanf(buf, "%d", &attrs->nice) == 1 && attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) - ret = apply_workqueue_attrs(wq, attrs); + ret = apply_workqueue_attrs_locked(wq, attrs); else ret = -EINVAL; +out_unlock: + apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } @@ -4852,16 +4956,20 @@ static ssize_t 
wq_cpumask_store(struct device *dev, { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int ret; + int ret = -ENOMEM; + + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) - return -ENOMEM; + goto out_unlock; ret = cpumask_parse(buf, attrs->cpumask); if (!ret) - ret = apply_workqueue_attrs(wq, attrs); + ret = apply_workqueue_attrs_locked(wq, attrs); +out_unlock: + apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } @@ -4885,18 +4993,22 @@ static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int v, ret; + int v, ret = -ENOMEM; + + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) - return -ENOMEM; + goto out_unlock; ret = -EINVAL; if (sscanf(buf, "%d", &v) == 1) { attrs->no_numa = !v; - ret = apply_workqueue_attrs(wq, attrs); + ret = apply_workqueue_attrs_locked(wq, attrs); } +out_unlock: + apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } @@ -4914,9 +5026,49 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; +static ssize_t wq_unbound_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int written; + + mutex_lock(&wq_pool_mutex); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", + cpumask_pr_args(wq_unbound_cpumask)); + mutex_unlock(&wq_pool_mutex); + + return written; +} + +static ssize_t wq_unbound_cpumask_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + cpumask_var_t cpumask; + int ret; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpumask_parse(buf, cpumask); + if (!ret) + ret = workqueue_set_unbound_cpumask(cpumask); + + free_cpumask_var(cpumask); + return ret ? 
ret : count; +} + +static struct device_attribute wq_sysfs_cpumask_attr = + __ATTR(cpumask, 0644, wq_unbound_cpumask_show, + wq_unbound_cpumask_store); + static int __init wq_sysfs_init(void) { - return subsys_virtual_register(&wq_subsys, NULL); + int err; + + err = subsys_virtual_register(&wq_subsys, NULL); + if (err) + return err; + + return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr); } core_initcall(wq_sysfs_init); @@ -4948,7 +5100,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) int ret; /* - * Adjusting max_active or creating new pwqs by applyting + * Adjusting max_active or creating new pwqs by applying * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ @@ -5064,6 +5216,9 @@ static int __init init_workqueues(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); |